get_video_info.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File : get_video_info.py
@Time : 2019/05/15 17:09:18
@Author : Liuyuqi
@Version : 1.0
@Contact : liuyuqi.gov@msn.cn
@License : (C)Copyright 2019
@Desc : Crawl Bilibili video metadata. Note that many av ids are dead and
must be checked. Entry point:
https://www.bilibili.com/video/av100500
'''
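# Pipeline: build av-id URLs -> parse each video page for title/category/
# uploader/tag metadata -> query the player interface for playback stats ->
# fetch the comment count from the reply API -> insert one row per video.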
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time
import sys
import re
import json
import pymysql

urls = []
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
}
time1 = time.time()
for i in range(17501, 100000):
    url = 'http://bilibili.com/video/av' + str(i)
    urls.append(url)
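# urls now covers av17501 through av99999; ids that no longer resolve are
# reported by crawlVideo as Error_404.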

# Connect to the database. NOTE: no db= is given here, so a default database
# must be selected before the INSERT below can work; also, sharing one pymysql
# connection across worker threads is not thread-safe.
conn = pymysql.connect(host='localhost',
                       user='root',
                       passwd='',
                       port=3306,
                       charset='utf8')
cur = conn.cursor()
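
# The INSERT in crawlVideo expects a 22-column `video` table. A minimal
# sketch of a matching schema (column names are assumptions, not the
# author's actual DDL):
#
#   CREATE TABLE video (
#       av VARCHAR(16), aid VARCHAR(16), cid VARCHAR(16), title TEXT,
#       tminfo VARCHAR(128), pub_time VARCHAR(32), click INT, danmu INT,
#       coins INT, favourites INT, duration VARCHAR(16), mid VARCHAR(16),
#       name VARCHAR(64), article INT, fans INT, tag1 VARCHAR(64),
#       tag2 VARCHAR(64), tag3 VARCHAR(64), common INT, honor_click INT,
#       honor_coins INT, honor_favourites INT
#   ) DEFAULT CHARSET=utf8;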

def crawlVideo(url):
    html = requests.get(url, headers=head)
    selector = etree.HTML(html.text)
    content = selector.xpath("//html")
    for each in content:
        title = each.xpath('//div[@class="v-title"]/h1/@title')
        if title:
            av = url.replace("http://bilibili.com/video/av", "")
            title = title[0]
            # the three tminfo pieces are joined into one 'a-b-c' string below
            tminfo1_log = each.xpath('//div[@class="tminfo"]/a/text()')
            tminfo2_log = each.xpath('//div[@class="tminfo"]/span[1]/a/text()')
            tminfo3_log = each.xpath('//div[@class="tminfo"]/span[2]/a/text()')
            tminfo1 = tminfo1_log[0] if tminfo1_log else ""
            tminfo2 = tminfo2_log[0] if tminfo2_log else ""
            tminfo3 = tminfo3_log[0] if tminfo3_log else ""
            tminfo = tminfo1 + '-' + tminfo2 + '-' + tminfo3
            time_log = each.xpath('//div[@class="tminfo"]/time/i/text()')
            mid_log = each.xpath('//div[@class="b-btn f hide"]/@mid')
            name_log = each.xpath('//div[@class="usname"]/a/@title')
            article_log = each.xpath(
                '//div[@class="up-video-message"]/div[1]/text()')
            fans_log = each.xpath(
                '//div[@class="up-video-message"]/div[2]/text()')
            # pub_time avoids shadowing the imported time module
            pub_time = time_log[0] if time_log else ""
            mid = mid_log[0] if mid_log else ""
            name = name_log[0] if name_log else ""
            # strip the "uploads:" / "fans:" labels; "-1" marks a missing value
            article = article_log[0].replace(u"投稿:", "") if article_log else "-1"
            fans = fans_log[0].replace(u"粉丝:", "") if fans_log else "-1"
            tag1_log = each.xpath('//ul[@class="tag-list"]/li[1]/a/text()')
            tag2_log = each.xpath('//ul[@class="tag-list"]/li[2]/a/text()')
            tag3_log = each.xpath('//ul[@class="tag-list"]/li[3]/a/text()')
            tag1 = tag1_log[0] if tag1_log else ""
            tag2 = tag2_log[0] if tag2_log else ""
            tag3 = tag3_log[0] if tag3_log else ""
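            # The player/comment id (cid) is embedded either in the scontent
            # iframe src or in an inline script; pull it out with a regex and
            # query the player interface for playback statistics.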
            cid_html_1 = each.xpath('//div[@class="scontent"]/iframe/@src')
            cid_html_2 = each.xpath('//div[@class="scontent"]/script/text()')
            if cid_html_1 or cid_html_2:
                if cid_html_1:
                    cid_html = cid_html_1[0]
                else:
                    cid_html = cid_html_2[0]
                cids = re.findall(r'cid=.+&aid', cid_html)
                cid = cids[0].replace("cid=", "").replace("&aid", "")
                info_url = "http://interface.bilibili.com/player?id=cid:" + \
                    str(cid) + "&aid=" + av
                video_info = requests.get(info_url)
                video_selector = etree.HTML(video_info.text)
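                # The player interface answers with XML; fields such as
                # <click>, <danmu>, <coins>, <favourites>, <duration> and
                # <honor t="..."> are read out below via XPath.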
                for video_each in video_selector:
                    click_log = video_each.xpath('//click/text()')
                    danmu_log = video_each.xpath('//danmu/text()')
                    coins_log = video_each.xpath('//coins/text()')
                    favourites_log = video_each.xpath('//favourites/text()')
                    duration_log = video_each.xpath('//duration/text()')
                    honor_click_log = video_each.xpath(
                        '//honor[@t="click"]/text()')
                    honor_coins_log = video_each.xpath(
                        '//honor[@t="coins"]/text()')
                    honor_favourites_log = video_each.xpath(
                        '//honor[@t="favourites"]/text()')
                    honor_click = honor_click_log[0] if honor_click_log else 0
                    honor_coins = honor_coins_log[0] if honor_coins_log else 0
                    honor_favourites = honor_favourites_log[0] if honor_favourites_log else 0
                    click = click_log[0] if click_log else -1
                    danmu = danmu_log[0] if danmu_log else -1
                    coins = coins_log[0] if coins_log else -1
                    favourites = favourites_log[0] if favourites_log else -1
                    duration = duration_log[0] if duration_log else ""
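                    # The reply API returns JSON; data.page.acount appears to
                    # hold the total comment count ("acount" is the API's own
                    # spelling, not a typo here).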
                    json_url = "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&pn=1&nohot=1&oid=" + av
                    jsoncontent = requests.get(json_url, headers=head).content
                    jsDict = json.loads(jsoncontent)
                    if jsDict['code'] == 0:
                        jsData = jsDict['data']
                        jsPages = jsData['page']
                        common = jsPages['acount']
                        try:
                            cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                        [str(av), str(av), cid, title, tminfo, pub_time, click, danmu, coins, favourites, duration,
                                         mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])
                            conn.commit()  # pymysql does not autocommit by default
                            print("Succeed: av" + str(av))
                        except pymysql.Error as e:
                            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
                    else:
                        print("Error_Json: " + url)
            else:
                print("Error_noCid:" + url)
        else:
            print("Error_404: " + url)


if __name__ == "__main__":
    # Run the crawl on 10 threads
    pool = ThreadPool(10)
    try:
        results = pool.map(crawlVideo, urls)
    except Exception as e:
        print(e)
        time.sleep(300)  # back off for five minutes, then retry the batch
        results = pool.map(crawlVideo, urls)
    pool.close()
    pool.join()
    conn.close()  # close the database connection
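
# Usage sketch (assumptions: a local MySQL server reachable as root with an
# empty password, and a `video` table created in a database selected via the
# db= argument noted above):
#   $ python get_video_info.py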