# bilibili / crawl-bilibili


			
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File    :   get_barrage.py
@Time    :   2019/05/15 17:10:38
@Author  :   Liuyuqi 
@Version :   1.0
@Contact :   liuyuqi.gov@msn.cn
@License :   (C)Copyright 2019
@Desc    :   Danmaku (bullet-comment) crawler. Danmaku refresh every 30 s, so all of a video's danmaku can be fetched simply by varying the time parameter.
'''
from lxml import etree
import requests
import sys
import re

# Browser-style User-Agent string; kept as one literal so it is easy to swap.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'

# Request headers passed as ``headers=`` to the HTTP calls in this file.
head = {'User-Agent': _USER_AGENT}


def _extract_cid(link_values):
    """Return the cid found in the first ``#link2`` value, or None."""
    if not link_values:
        return None
    # Capture only up to the next '&': a greedy ``cid=.+&page`` would also
    # swallow any parameters sitting between ``cid`` and ``page``.
    match = re.search(r'\bcid=([^&]+)', link_values[0])
    return match.group(1) if match else None


def _emit_comments(comments, out):
    """Print each comment and write it to ``out``, one comment per line."""
    for comment in comments:
        print(comment)
        out.write(comment + '\n')


def crawlBarrage(av, out=None, timeout=10):
    """Fetch the danmaku (bullet comments) of a bilibili video and save them.

    Downloads the video page for ``av``, reads the comment id (cid) from the
    ``value`` attribute of the element with id ``link2``, downloads
    ``http://comment.bilibili.com/<cid>.xml`` and, for every ``<d>`` text
    node found there, prints it and writes it to the output file.

    Args:
        av: Video number appended to ``http://bilibili.com/video/av``;
            anything ``str()`` accepts.
        out: Writable text file object that receives one comment per line.
            When omitted, the module-level name ``f`` is used if it exists
            (the historical behaviour of this function); otherwise
            ``<av>.txt`` is created in the current directory.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress and the failure messages ``video not found!`` /
        ``cid not found!`` are printed to stdout.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    url = 'http://bilibili.com/video/av' + str(av)
    print(url)
    page = requests.get(url, headers=head, timeout=timeout)
    selector = etree.HTML(page.text)

    title = selector.xpath('//*[@id="viewbox_report"]/h1/span')
    if not title:
        print('video not found!')
        return
    print(title[0].text)

    cid = _extract_cid(selector.xpath('//*[@id="link2"]/@value'))
    if cid is None:
        # Covers both a missing #link2 element and a value without a cid.
        print('cid not found!')
        return

    comment_url = 'http://comment.bilibili.com/' + cid + '.xml'
    print(comment_url)
    reply = requests.get(comment_url, headers=head, timeout=timeout)
    # One query selecting each <d> below an <i> exactly once; re-running an
    # absolute ``//d`` per <i> element would repeat every comment.
    comments = etree.HTML(reply.content).xpath('//i//d/text()')

    if out is None:
        # Backward compatibility: the script entry point binds ``f`` globally.
        out = globals().get('f')
    if out is not None:
        _emit_comments(comments, out)
    else:
        with open(str(av) + '.txt', 'w', encoding='utf-8') as handle:
            _emit_comments(comments, handle)


if __name__ == '__main__':
    # Strip stray whitespace so it cannot leak into the URL or the filename.
    av = input('input av:').strip()
    # crawlBarrage writes through the module-level name ``f``. The ``with``
    # block binds that name and guarantees the file is flushed and closed,
    # even if the crawl raises.
    with open(av + '.txt', 'w', encoding='utf-8') as f:
        crawlBarrage(av)