get_barrage.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @File : get_barrage.py
  5. @Time : 2019/05/15 17:10:38
  6. @Author : Liuyuqi
  7. @Version : 1.0
  8. @Contact : liuyuqi.gov@msn.cn
  9. @License : (C)Copyright 2019
  10. @Desc : 弹幕爬虫,弹幕每隔30s刷新一次,所以抓取某视频的弹幕只要通过改变时间参数就可以抓取视频所有的弹幕
  11. '''
  12. from lxml import etree
  13. import requests
  14. import sys
  15. import re
  # HTTP headers sent with every request made by this script.  A desktop
  # browser User-Agent is supplied instead of the default one that the
  # requests library would send.
  head = {
      'User-Agent':
      'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
  }
  def crawlBarrage(av, out=None):
      """Fetch the barrage (bullet-comment) text of one bilibili video.

      Requests the video page for ``av``, prints its title, extracts the
      comment id (cid) from the ``value`` attribute of the ``link2`` element,
      downloads ``http://comment.bilibili.com/<cid>.xml`` and then prints every
      ``<d>`` comment text while also writing it, one per line, to ``out``.

      Args:
          av: Video number; it is appended to ``http://bilibili.com/video/av``.
          out: Writable text stream that receives the comments.  When omitted,
              the module-level file object ``f`` opened by the script entry
              point is used, which is what this function always wrote to
              before the parameter existed.

      Returns:
          None.  Progress and the failure messages ('video not found!',
          'cid not found!') are reported with ``print``.

      Raises:
          requests.exceptions.RequestException: propagated unchanged when a
              request fails or exceeds the timeout.
      """
      if out is None:
          # Legacy behaviour: fall back to the global output file.
          out = f  # noqa: F821

      # Seconds to wait for each HTTP request before giving up, so that a
      # stalled connection cannot hang the script forever.
      timeout = 10

      url = 'http://bilibili.com/video/av' + str(av)
      print(url)
      page = requests.get(url, headers=head, timeout=timeout)
      selector = etree.HTML(page.text)

      # All XPath expressions here are absolute, so they are evaluated once
      # against the document instead of once per matched parent element.
      title = selector.xpath('//*[@id="viewbox_report"]/h1/span')
      if not title:
          print('video not found!')
          return
      print(title[0].text)

      link_values = selector.xpath('//*[@id="link2"]/@value')
      # Capture only the digits that follow "cid=": a greedy ".+" up to
      # "&page" would swallow any parameters that sit in between.
      cid_match = re.search(r'\bcid=(\d+)', link_values[0]) if link_values else None
      if cid_match is None:
          print('cid not found!')
          return
      cid = cid_match.group(1)

      comment_url = 'http://comment.bilibili.com/' + cid + '.xml'
      print(comment_url)
      response = requests.get(comment_url, headers=head, timeout=timeout)
      comment_root = etree.HTML(response.content)

      # Select each <d> below an <i> exactly once; repeating an absolute
      # "//d" query for every <i> element would emit duplicates.
      for comment in comment_root.xpath('//i//d/text()'):
          print(comment)
          out.write(comment + '\n')
  if __name__ == '__main__':
      # Ask for the video number; surrounding whitespace would otherwise end
      # up in both the request URL and the output file name.
      av = input('input av:').strip()
      # The output file must stay bound to the module-level name ``f``
      # because crawlBarrage writes to that global.  The ``with`` block
      # guarantees the file is flushed and closed even if crawling fails.
      with open(av + '.txt', 'w', encoding='utf-8') as f:
          crawlBarrage(av)