123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203 |
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    : get_user.py
@Time    : 2019/05/15 20:28:36
@Author  : Liuyuqi
@Version : 1.0
@Contact : liuyuqi.gov@msn.cn
@License : (C)Copyright 2019
@Desc    : Crawl Bilibili user information. User pages live at
           https://space.bilibili.com/521400; the trailing number is
           enumerated exhaustively to collect every user's name, sex, age
           and other details.
           Data endpoint: http://space.bilibili.com/ajax/member/GetInfo
"""
import datetime
import json
import os
import random
import sys
import time

# Project root. It has to be the working directory and on sys.path before the
# project-local ``utils`` package can be imported below. The location can be
# overridden with the CRAWL_BILIBILI_HOME environment variable; the default is
# the original author's checkout path.
src = os.environ.get(
    "CRAWL_BILIBILI_HOME", "C:/Users/liuyuqi/Desktop/crawl-bilibili"
)
os.chdir(src)
sys.path.append(src)

import pymysql
import requests

import utils.config as conf
from utils.user_agent import getheaders

# NOTE: a leftover ``print(src)`` / ``exit()`` pair used to sit here and
# terminated the script before any of the code below could run.

# Connect to the database; credentials come from the "db1" section of the
# project configuration.
conn = pymysql.connect(
    host=conf.readConf("db1", "host"),
    user=conf.readConf("db1", "user"),
    password=conf.readConf("db1", "pwd"),
    database="bilibili",
    charset="utf8",
)
cur = conn.cursor()

# Browser-like default request headers.
head = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "Referer": "http://space.bilibili.com/45388",
    "Origin": "http://space.bilibili.com",
    "Host": "space.bilibili.com",
    "AlexaToolbar-ALX_NS_PH": "AlexaToolbar/alx-4.0",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}

# Optional HTTP proxy definition.
proxies = {"http": "http://120.26.110.59:8080"}

# Script start time (seconds since the epoch, e.g. 1557920724.447739); used to
# report elapsed time per fetched user.
time1 = time.time()

urls = []

# Pool of User-Agent strings; one is picked at random for every request.
uas = getheaders()
def datetime_to_timestamp_in_milliseconds(dt=None):
    """Return a Unix timestamp in whole milliseconds.

    Args:
        dt: Optional ``datetime.datetime`` to convert. A naive value is
            interpreted in the local time zone (that is how
            ``datetime.timestamp()`` treats it). When omitted or ``None``,
            the current time is used, which is the original behaviour.

    Returns:
        int: Milliseconds since the epoch, rounded to the nearest integer
        (e.g. 1557920582757).
    """
    seconds = time.time() if dt is None else dt.timestamp()
    return int(round(seconds * 1000))
# Seconds to wait on any single HTTP request before giving up, so one stalled
# connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 10

# Parameterized INSERT: the driver escapes every bound value, so quotes inside
# a user's name or signature can neither break the statement nor inject SQL.
# ``rank`` is back-quoted because RANK is a reserved word from MySQL 8.0.2 on.
_INSERT_USER_SQL = (
    "INSERT INTO user(mid, name, sex, `rank`, face, regtime, spacesta, "
    "birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, "
    "vipStatus, toutu, toutuId, coins, following, fans, archiveview, article) "
    "VALUES (" + ", ".join(["%s"] * 21) + ")"
)


def _extract_user_fields(jsData):
    """Return the 17 profile columns taken from a GetInfo ``data`` object.

    Raises KeyError/TypeError when an expected key is missing or a nested
    object has an unexpected shape.
    """
    return (
        jsData["mid"],
        jsData["name"],
        jsData["sex"],
        jsData["rank"],
        jsData["face"],
        # The response carries no "regtime" value, so a fixed placeholder
        # registration time is stored instead.
        "2018-05-06 12:22:23",
        jsData["spacesta"],
        jsData.get("birthday", "nobirthday"),
        jsData["sign"],
        jsData["level_info"]["current_level"],
        jsData["official_verify"]["type"],
        jsData["official_verify"]["desc"],
        jsData["vip"]["vipType"],
        jsData["vip"]["vipStatus"],
        jsData["toutu"],
        jsData["toutuId"],
        jsData["coins"],
    )


def _fetch_user_stats(mid):
    """Return ``(following, fans, archiveview, article)`` for user *mid*.

    The lookup is best-effort: if either request fails or a reply does not
    have the expected layout, all four counters fall back to 0.
    """
    try:
        relation = json.loads(
            requests.get(
                "https://api.bilibili.com/x/relation/stat?vmid="
                + str(mid)
                + "&jsonp=jsonp",
                timeout=_REQUEST_TIMEOUT,
            ).text
        )
        upstat = json.loads(
            requests.get(
                "https://api.bilibili.com/x/space/upstat?mid="
                + str(mid)
                + "&jsonp=jsonp",
                timeout=_REQUEST_TIMEOUT,
            ).text
        )
        return (
            relation["data"]["following"],
            relation["data"]["follower"],
            upstat["data"]["archive"]["view"],
            upstat["data"]["article"]["view"],
        )
    except (requests.RequestException, ValueError, KeyError, TypeError):
        return (0, 0, 0, 0)


def getsource(url, i):
    """Fetch one user's profile and counters and insert them into ``user``.

    Args:
        url: Profile URL of the form ``https://space.bilibili.com/<mid>``;
            the part after the prefix is sent as the ``mid`` form field.
        i: Numeric user id, used to build the ``Referer`` header.

    Returns:
        None. Progress and every failure (network error, unparsable reply,
        unsuccessful status, missing data, database error) are reported with
        ``print`` and the function simply returns, so the caller can carry
        on with the next id.
    """
    payload = {
        "_": datetime_to_timestamp_in_milliseconds(),
        "mid": url.replace("https://space.bilibili.com/", ""),
    }
    head = {
        "User-Agent": random.choice(uas),
        "Referer": "https://space.bilibili.com/"
        + str(i)
        + "?from=search&seid="
        + str(random.randint(10000, 50000)),
    }
    try:
        # The module-level ``proxies`` mapping is deliberately not passed.
        jscontent = requests.post(
            "http://space.bilibili.com/ajax/member/GetInfo",
            headers=head,
            data=payload,
            timeout=_REQUEST_TIMEOUT,
        ).text
    except requests.RequestException as e:
        print(e)
        return
    time2 = time.time()

    try:
        jsDict = json.loads(jscontent)
    except ValueError as e:
        print(e)
        return

    # Equality (not identity) comparison is kept so that a numeric 1 is still
    # accepted as a successful status.
    if not isinstance(jsDict, dict) or jsDict.get("status", False) != True:  # noqa: E712
        print("Error: " + url)
        return
    if "data" not in jsDict:
        # Nothing to store; returning here avoids running the INSERT with
        # unbound values.
        print("no data now")
        return

    try:
        profile = _extract_user_fields(jsDict["data"])
    except (KeyError, TypeError) as e:
        print(e)
        return

    mid = profile[0]
    print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
    stats = _fetch_user_stats(mid)

    try:
        # Every value is bound as a string, which matches what the previous
        # quoted-literal statement sent to the server.
        cur.execute(_INSERT_USER_SQL, tuple(str(v) for v in profile + stats))
        conn.commit()
    except pymysql.MySQLError as e:
        print(e)
def crawlUser(start=521400, stop=521401):
    """Crawl every user id in ``range(start, stop)``.

    Args:
        start: First user id to fetch (inclusive).
        stop: End of the id window (exclusive). The defaults fetch only id
            521400, as before; pass ``stop=521500`` to cover a 100-id window.

    Returns:
        None.
    """
    # TODO: read the starting id ("index head") from the database. The former
    # placeholder ``cur.execute("sql")`` sent the literal text "sql" to the
    # server and failed before any user was crawled, so it has been removed.
    for i in range(start, stop):
        url = "https://space.bilibili.com/" + str(i)
        getsource(url, i)
if __name__ == "__main__":
    # Script entry point. The former reassignment of ``src`` here pointed at
    # a different directory name than the one used at import time and was
    # never read afterwards, so it has been dropped.
    crawlUser()
|