1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859 |
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @File : get_barrage.py
- @Time : 2019/05/15 17:10:38
- @Author : Liuyuqi
- @Version : 1.0
- @Contact : liuyuqi.gov@msn.cn
- @License : (C)Copyright 2019
- @Desc : 弹幕爬虫,弹幕每隔30s刷新一次,所以抓取某视频的弹幕只要通过改变时间参数就可以抓取视频所有的弹幕
- '''
- from lxml import etree
- import requests
- import sys
- import re
# HTTP headers passed to every requests.get() call in this script; the
# User-Agent is a desktop Chrome identification string.
head = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/43.0.2357.130 Safari/537.36'
    ),
}
def crawlBarrage(av, out=None):
    """Fetch the danmaku (bullet comments) of one video and save them.

    Downloads the video page for ``av``, reads the comment id (cid) from the
    ``value`` attribute of the element with id ``link2``, downloads
    ``http://comment.bilibili.com/<cid>.xml`` and, for every ``<d>`` text
    node in it, prints the text and writes it as one line to the output file.

    Args:
        av: Video number; it is converted with ``str()`` and appended to
            ``http://bilibili.com/video/av``.
        out: Optional writable text file object. When omitted, the
            module-level ``f`` handle opened by the script entry point is
            used, which is the original behaviour.

    Returns:
        None. The URLs, the title, the comments and the messages
        ``'video not found!'`` / ``'cid not found!'`` are printed to stdout.

    Raises:
        requests.RequestException: If a request fails or exceeds the timeout.
        NameError: If comments were found, ``out`` was omitted and no
            module-level ``f`` exists.
    """
    # Without a timeout requests waits forever on a stalled connection.
    timeout = 10

    url = 'http://bilibili.com/video/av' + str(av)
    print(url)
    html = requests.get(url, headers=head, timeout=timeout)
    selector = etree.HTML(html.text)

    for each in selector.xpath("//html"):
        title = each.xpath('//*[@id="viewbox_report"]/h1/span')
        if not title:
            print('video not found!')
            continue
        print(title[0].text)

        cid_html_1 = each.xpath('//*[@id="link2"]/@value')
        # Capture only the digits after "cid=": the previous greedy
        # 'cid=.+&page' pattern could swallow other query parameters, and
        # indexing its result raised IndexError when nothing matched.
        cid_match = re.search(r'cid=(\d+)', cid_html_1[0]) if cid_html_1 else None
        if cid_match is None:
            print('cid not found!')
            continue

        comment_url = 'http://comment.bilibili.com/' + cid_match.group(1) + '.xml'
        print(comment_url)
        comment_text = requests.get(comment_url, headers=head, timeout=timeout)
        comment_selector = etree.HTML(comment_text.content)

        # Comments are only emitted when the document has an <i> element.
        if not comment_selector.xpath('//i'):
            continue
        # '//d/text()' is an absolute path, so it is evaluated once for the
        # whole document; running it per <i> element duplicated the output.
        comments = comment_selector.xpath('//d/text()')
        if not comments:
            continue

        # Resolve the sink lazily so the "not found" paths never need a file.
        sink = f if out is None else out
        for comment in comments:
            print(comment)
            sink.write(comment + '\n')
if __name__ == '__main__':
    # Script entry point: ask for the video number and dump its comments
    # into "<av>.txt" in the current directory.
    av = input('input av:')
    # crawlBarrage() writes through the module-level name ``f``, so the
    # handle keeps that name; the ``with`` block guarantees the file is
    # flushed and closed even if the crawl raises.
    with open(av + '.txt', 'w', encoding='utf-8') as f:
        crawlBarrage(av)
|