#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File    :   get_video.py
@Time    :   2019/05/15 17:09:18
@Author  :   Liuyuqi
@Version :   1.0
@Contact :   liuyuqi.gov@msn.cn
@License :   (C)Copyright 2019
@Desc    :   Crawl Bilibili video pages. Many av ids no longer resolve, so every
             response has to be checked. URL pattern:
             https://www.bilibili.com/video/av100500
'''
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time
import sys
import re
import json
import pymysql

urls = []
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
}
time1 = time.time()
for i in range(17501, 100000):
    url = 'http://bilibili.com/video/av' + str(i)
    urls.append(url)

# Connect to the database
conn = pymysql.connect(host='localhost',
                       user='root',
                       passwd='',
                       port=3306,
                       charset='utf8')
cur = conn.cursor()
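
# The INSERT in crawlVideo() binds 22 values into a table named `video`.
# A minimal sketch of a schema that would accept that row (the column names
# below are assumptions inferred from the bound variables, not the original
# table definition):
#
#   CREATE TABLE video (
#       id VARCHAR(20), av VARCHAR(20), cid VARCHAR(32), title TEXT,
#       tminfo VARCHAR(255), pub_time VARCHAR(64), click INT, danmu INT,
#       coins INT, favourites INT, duration VARCHAR(32), mid VARCHAR(20),
#       name VARCHAR(128), article INT, fans INT,
#       tag1 VARCHAR(64), tag2 VARCHAR(64), tag3 VARCHAR(64),
#       common INT, honor_click INT, honor_coins INT, honor_favourites INT
#   );
#
# Note also that pymysql.connect() above does not pass db=..., so the `video`
# table is expected to exist in the connection's default schema.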


def crawlVideo(url):
    html = requests.get(url, headers=head)
    selector = etree.HTML(html.text)
    content = selector.xpath("//html")
    for each in content:
        title = each.xpath('//div[@class="v-title"]/h1/@title')
        if title:
            av = url.replace("http://bilibili.com/video/av", "")
            title = title[0]
            # Category info from the tminfo block
            tminfo1_log = each.xpath('//div[@class="tminfo"]/a/text()')
            tminfo2_log = each.xpath('//div[@class="tminfo"]/span[1]/a/text()')
            tminfo3_log = each.xpath('//div[@class="tminfo"]/span[2]/a/text()')
            tminfo1 = tminfo1_log[0] if tminfo1_log else ""
            tminfo2 = tminfo2_log[0] if tminfo2_log else ""
            tminfo3 = tminfo3_log[0] if tminfo3_log else ""
            tminfo = tminfo1 + '-' + tminfo2 + '-' + tminfo3
            time_log = each.xpath('//div[@class="tminfo"]/time/i/text()')
            mid_log = each.xpath('//div[@class="b-btn f hide"]/@mid')
            name_log = each.xpath('//div[@class="usname"]/a/@title')
            article_log = each.xpath(
                '//div[@class="up-video-message"]/div[1]/text()')
            fans_log = each.xpath(
                '//div[@class="up-video-message"]/div[2]/text()')
            # publish time; named pub_time so it does not shadow the time module
            pub_time = time_log[0] if time_log else ""
            mid = mid_log[0] if mid_log else ""
            name = name_log[0] if name_log else ""
            # strip the "投稿:" (uploads) / "粉丝:" (fans) labels, keep only the counts
            article = article_log[0].replace(u"投稿:", "") if article_log else "-1"
            fans = fans_log[0].replace(u"粉丝:", "") if fans_log else "-1"
            tag1_log = each.xpath('//ul[@class="tag-list"]/li[1]/a/text()')
            tag2_log = each.xpath('//ul[@class="tag-list"]/li[2]/a/text()')
            tag3_log = each.xpath('//ul[@class="tag-list"]/li[3]/a/text()')
            tag1 = tag1_log[0] if tag1_log else ""
            tag2 = tag2_log[0] if tag2_log else ""
            tag3 = tag3_log[0] if tag3_log else ""
            # The cid is embedded either in the player iframe src or in an inline script
            cid_html_1 = each.xpath('//div[@class="scontent"]/iframe/@src')
            cid_html_2 = each.xpath('//div[@class="scontent"]/script/text()')
            if cid_html_1 or cid_html_2:
                if cid_html_1:
                    cid_html = cid_html_1[0]
                else:
                    cid_html = cid_html_2[0]
                cids = re.findall(r'cid=.+&aid', cid_html)
                cid = cids[0].replace("cid=", "").replace("&aid", "")
                # The old player interface carries the stats the XPaths below look for
                info_url = "http://interface.bilibili.com/player?id=cid:" + \
                    str(cid) + "&aid=" + av
                video_info = requests.get(info_url)
                video_selector = etree.HTML(video_info.text)
                for video_each in video_selector:
                    click_log = video_each.xpath('//click/text()')
                    danmu_log = video_each.xpath('//danmu/text()')
                    coins_log = video_each.xpath('//coins/text()')
                    favourites_log = video_each.xpath('//favourites/text()')
                    duration_log = video_each.xpath('//duration/text()')
                    honor_click_log = video_each.xpath(
                        '//honor[@t="click"]/text()')
                    honor_coins_log = video_each.xpath(
                        '//honor[@t="coins"]/text()')
                    honor_favourites_log = video_each.xpath(
                        '//honor[@t="favourites"]/text()')
                    honor_click = honor_click_log[0] if honor_click_log else 0
                    honor_coins = honor_coins_log[0] if honor_coins_log else 0
                    honor_favourites = honor_favourites_log[0] if honor_favourites_log else 0
                    click = click_log[0] if click_log else -1
                    danmu = danmu_log[0] if danmu_log else -1
                    coins = coins_log[0] if coins_log else -1
                    favourites = favourites_log[0] if favourites_log else -1
                    duration = duration_log[0] if duration_log else ""
                json_url = "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&pn=1&nohot=1&oid=" + av
                jsoncontent = requests.get(json_url, headers=head).content
                jsDict = json.loads(jsoncontent)
                if jsDict['code'] == 0:
                    jsData = jsDict['data']
                    jsPages = jsData['page']
                    # total reply count; `acount` is the field name used by the reply API
                    common = jsPages['acount']
                    try:
                        cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                    [str(av), str(av), cid, title, tminfo, pub_time, click, danmu, coins, favourites, duration,
                                     mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])
                        conn.commit()  # persist the row even if autocommit is off
                        print("Succeed: av" + str(av))
                    except pymysql.Error as e:
                        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
                else:
                    print("Error_Json: " + url)
            else:
                print("Error_noCid:" + url)
        else:
            print("Error_404: " + url)


if __name__ == "__main__":
    # Crawl with a pool of 10 worker threads
    pool = ThreadPool(10)
    try:
        results = pool.map(crawlVideo, urls)
    except Exception as e:
        # On a connection error, wait five minutes and retry the whole batch
        print(e)
        time.sleep(300)
        results = pool.map(crawlVideo, urls)
    conn.close()  # close the database connection
    pool.close()
    pool.join()
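
# Note: all 10 worker threads above share the single module-level `conn`/`cur`.
# pymysql connections are not thread-safe, so concurrent inserts can interleave
# badly. A safer variant (a sketch, not the original design) gives every worker
# its own connection via threading.local():
#
#   import threading
#   _local = threading.local()
#
#   def get_cursor():
#       if not hasattr(_local, "conn"):
#           _local.conn = pymysql.connect(host='localhost', user='root', passwd='',
#                                         port=3306, charset='utf8')
#       return _local.conn.cursor()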