get_barrage.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time : 2019/09/07 19:00:20
@Version : 1.0
@License : (C)Copyright 2019
@Desc : Danmu (bullet-comment) downloader for episodes 1-30 of 《都挺好》 (All Is Well)
'''
import requests
import json
import pandas as pd
import time
import random
# Parse the series overview endpoint: extract the suffix ID, view count and
# episode number needed to build each episode's danmu URL.
def parse_base_info(url, headers):
    df = pd.DataFrame()
    html = requests.get(url, headers=headers)
    # The endpoint returns JSONP; strip the jQuery callback wrapper and the
    # trailing ')' before parsing.
    bs = json.loads(html.text[html.text.find('{'):-1])
    for i in bs['results']:
        v_id = i['id']
        title = i['fields']['title']
        view_count = i['fields']['view_all_count']
        episode = int(i['fields']['episode'])
        # Skip entries without an episode number (episode == 0).
        if episode == 0:
            continue
        cache = pd.DataFrame({'id': [v_id], 'title': [title],
                              '播放量': [view_count], '第几集': [episode]})
        df = pd.concat([df, cache])
    return df
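# Illustrative sketch of the JSONP stripping above, on a made-up response
# body (the real payload is much larger):
#   text = 'jQuery191020844423583354543_1554200358596({"results": []})'
#   json.loads(text[text.find('{'):-1])  # -> {'results': []}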
# Given a suffix ID (v_id), request the danmu registration endpoint and
# return [v_id, target_id] for that episode.
def get_episode_danmu(v_id, headers):
    base_url = 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
    pay = {"wRegistType": 2, "vecIdList": [v_id],
           "wSpeSource": 0, "bIsGetUserCfg": 1,
           "mapExtData": {v_id: {"strCid": "wu1e7mrffzvibjy", "strLid": ""}}}
    html = requests.post(base_url, data=json.dumps(pay), headers=headers)
    bs = json.loads(html.text)
    danmu_key = bs['data']['stMap'][v_id]['strDanMuKey']
    # Slice out the digits between 'targetid=' and the '&' before 'vid'.
    target_id = danmu_key[danmu_key.find('targetid') + 9: danmu_key.find('vid') - 1]
    return [v_id, target_id]
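# Hedged sketch of the strDanMuKey slicing above, assuming the key contains
# 'targetid=<digits>&vid=<v_id>' (the values here are made up):
#   key = 'type=8&targetid=3859333226&vid=t00306i1e62'
#   key[key.find('targetid') + 9: key.find('vid') - 1]  # -> '3859333226'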
# Parse a single danmu page. Takes the target_id, the suffix ID (v_id) and
# the episode number (for matching later); returns the comment details.
def parse_danmu(url, target_id, v_id, headers, period):
    html = requests.get(url, headers=headers)
    bs = json.loads(html.text, strict=False)
    df = pd.DataFrame()
    for i in bs['comments']:
        content = i['content']
        name = i['opername']
        upcount = i['upcount']
        user_degree = i['uservip_degree']
        timepoint = i['timepoint']
        comment_id = i['commentid']
        cache = pd.DataFrame({'用户名': [name], '内容': [content], '会员等级': [user_degree],
                              '弹幕时间点': [timepoint], '弹幕点赞': [upcount],
                              '弹幕id': [comment_id], '集数': [period]})
        df = pd.concat([df, cache])
    return df
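# Sketch of one resulting row (values are illustrative, not from a real
# response; assuming opername can be empty for anonymous viewers):
#   用户名='', 内容='不错', 会员等级=0, 弹幕时间点=120, 弹幕点赞=3, 弹幕id='123', 集数=1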
# Build the list of per-page danmu URLs for one episode. Takes the target_id
# and the suffix ID (v_id); paging works by stepping the timestamp parameter
# forward 30 seconds per page.
def format_url(target_id, v_id, end=85):
    urls = []
    base_url = 'https://mfm.video.qq.com/danmu?otype=json&timestamp={}&target_id={}%26vid%3D{}&count=80&second_count=5'
    for num in range(15, end * 30 + 15, 30):
        urls.append(base_url.format(num, target_id, v_id))
    return urls
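# Usage sketch: format_url('3859333226', 'x003061htl5', end=2) returns two
# URLs with timestamp=15 and timestamp=45, i.e. one page per 30-second
# window of the episode (the target_id value is made up):
#   'https://mfm.video.qq.com/danmu?otype=json&timestamp=15&target_id=3859333226%26vid%3Dx003061htl5&count=80&second_count=5'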
def get_all_ids(part1_url, part2_url, headers):
    # Fetch the suffix IDs (v_id) for episodes 1-30 and 31-46 respectively.
    part_1 = parse_base_info(part1_url, headers)
    part_2 = parse_base_info(part2_url, headers)
    df = pd.concat([part_1, part_2])
    df.sort_values('第几集', ascending=True, inplace=True)
    count = 1
    # Collect one [v_id, target_id] pair per episode.
    info_lst = []
    for i in df['id']:
        info = get_episode_danmu(i, headers)
        info_lst.append(info)
        print('Fetching the target_id of episode %d' % count)
        count += 1
        time.sleep(2 + random.random())
    print('Noticed an extra episode? No worries, duplicates are removed below')
    # Merge the target_ids back onto the episode table via the suffix ID.
    info_lst = pd.DataFrame(info_lst)
    info_lst.columns = ['v_id', 'target_id']
    combine = pd.merge(df, info_lst, left_on='id',
                       right_on='v_id', how='inner')
    # Drop duplicate episodes (the source id list repeats one v_id).
    combine = combine.loc[~combine.duplicated('id'), :]
    return combine
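# After the merge and de-duplication, `combine` holds one row per episode
# with columns: id, title, 播放量, 第几集, v_id, target_id (v_id simply
# duplicates id, since the merge keys are equal).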
# Takes the table containing v_id and target_id, plus how many episodes (num)
# and how many pages per episode (page) to crawl.
def crawl_all(combine, num, page, headers):
    c = 1
    final_result = pd.DataFrame()
    # print("Bro, about to start looping over every episode's danmu")
    for v_id, target_id in zip(combine['v_id'][:num], combine['target_id'][:num]):
        count = 1
        urls = format_url(target_id, v_id, page)
        for url in urls:
            result = parse_danmu(url, target_id, v_id, headers, c)
            final_result = pd.concat([final_result, result])
            time.sleep(2 + random.random())
            print('Crawling page %d of episode %d..' % (count, c))
            count += 1
        print('-------------------------------------')
        c += 1
    return final_result
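# Usage sketch: crawl two episodes, ten pages each (values are arbitrary):
#   final_result = crawl_all(combine, num=2, page=10, headers=headers)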
if __name__ == '__main__':
    # URLs for episodes 1-30 and 31-46 of 《都挺好》. To crawl another show,
    # find the source URL that stores the suffix IDs, as described in the
    # accompanying article.
    part1_url = 'https://union.video.qq.com/fcgi-bin/data?otype=json&tid=682&appid=20001238&appkey=6c03bbe9658448a4&idlist=x003061htl5,t00306i1e62,x003061htl5,b0030velala,w0030ilim7z,i0030r7v63u,z003044noq2,m0030sfinyr,c0030u884k7,k0030m5zbr7,l0030e5nglm,h0030b060vn,j003090ci7w,n0030falyoi,s00308u9kwx,p0030fohijf,g00303ob0cx,v0030960y6n,x0030bl84xw,v0030keuav1,t0030kups1i,n0030y2o52i,x0030s52mev,d0030xuekgw,o0030md1a2a,x0030peo3sk,d00303l5j4k,t0030aexmnt,a0030ybi45z,y0030wpe2wu&callback=jQuery191020844423583354543_1554200358596&_=1554200358597'
    part2_url = 'https://union.video.qq.com/fcgi-bin/data?otype=json&tid=682&appid=20001238&appkey=6c03bbe9658448a4&idlist=t0030epjqsi,g003035mi84,n00301fxqbh,h0030zivlrq,d0030qc1yu2,m0030q9ywxj,h0030j0eq19,j0030jks835,a00308xw434,l0030tb319m,x0030xogl32,g0030fju3w3,a0030vrcww0,l0030jzi1mi,c0030mq8yjr,u00302fdo8v,a0030w9g57k,n0030wnj6i8,j0030h91ouj,j00304eu73n,t00305kc1f5,i0030x490o2,u0030jtmlj2,d003031ey5h,w0850w594k6,l0854pfn9lg,f08546r7l7a,d0854s0oq1z,m08546pcd9k,p0854r1nygj&callback=jQuery191020844423583354543_1554200358598&_=1554200358599'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Get every suffix ID, then crawl the target_id for each one.
    combine = get_all_ids(part1_url, part2_url, headers)
    # Set how many episodes to crawl (num) and how many danmu pages per
    # episode (1-85, page). The default below crawls 5 pages of episode 1.
    # To crawl all 30 episodes at 85 pages each: num=30, page=85.
    final_result = crawl_all(combine, num=1, page=5, headers=headers)
    final_result.to_excel('xxx.xlsx')
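# Note: DataFrame.to_excel needs an Excel writer engine installed (e.g.
# openpyxl). A dependency-free alternative, if Excel output is not required:
#   final_result.to_csv('danmu.csv', index=False, encoding='utf-8-sig')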