#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    :   get_user.py
@Time    :   2019/05/15 20:28:36
@Author  :   Liuyuqi
@Version :   1.0
@Contact :   liuyuqi.gov@msn.cn
@License :   (C)Copyright 2019
@Desc    :   Crawl bilibili user information.
             User pages look like https://space.bilibili.com/521400; the
             trailing number is enumerated to collect every user's name,
             sex, age and similar details through the endpoint
             http://space.bilibili.com/ajax/member/GetInfo
"""

import sys
import os

# Hard-coded project root. It becomes the working directory and is appended
# to sys.path before the project-local ``utils`` imports below (presumably so
# that they resolve when the script is started from another directory).
src = "C:/Users/liuyuqi/Desktop/crawl-bilibili"
os.chdir(src)
sys.path.append(src)

import utils.config as conf
from utils.user_agent import getheaders
import requests
import json
import random
import pymysql
import datetime
import time

# Connect to the database.
conn = pymysql.connect(
    host=conf.readConf("db1", "host"),
    user=conf.readConf("db1", "user"),
    passwd=conf.readConf("db1", "pwd"),
    db="bilibili",
    charset="utf8",
)
cur = conn.cursor()

# Reference browser-like header set. getsource() builds its own per-request
# headers; this dict is kept as a module-level name but is not sent anywhere
# in this file.
head = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "Referer": "http://space.bilibili.com/45388",
    "Origin": "http://space.bilibili.com",
    "Host": "space.bilibili.com",
    "AlexaToolbar-ALX_NS_PH": "AlexaToolbar/alx-4.0",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}

# Optional HTTP proxy; currently not passed to any request.
proxies = {"http": "http://120.26.110.59:8080"}

# Script start time, used to report elapsed seconds per fetched user.
time1 = time.time()  # e.g. 1557920724.447739

urls = []
# Pool of User-Agent strings; one is picked at random for each request.
uas = getheaders()

# Seconds to wait for any single HTTP request before giving up, so that one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Columns of the ``user`` table, in the order the values are supplied.
_USER_COLUMNS = (
    "mid",
    "name",
    "sex",
    "rank",
    "face",
    "regtime",
    "spacesta",
    "birthday",
    "sign",
    "level",
    "OfficialVerifyType",
    "OfficialVerifyDesc",
    "vipType",
    "vipStatus",
    "toutu",
    "toutuId",
    "coins",
    "following",
    "fans",
    "archiveview",
    "article",
)

# Parameterized INSERT: the driver escapes every value, so quotes inside
# user-supplied text (name, sign, ...) can neither break nor inject SQL.
# Column names are backtick-quoted because RANK is a reserved word in
# MySQL 8.0+.
_INSERT_USER_SQL = "INSERT INTO user({}) VALUES ({})".format(
    ", ".join("`{}`".format(column) for column in _USER_COLUMNS),
    ", ".join(["%s"] * len(_USER_COLUMNS)),
)


def datetime_to_timestamp_in_milliseconds():
    """Return the current Unix time in whole milliseconds."""
    return int(round(time.time() * 1000))  # e.g. 1557920582757


def _extract_profile(jsData):
    """Return the 17 profile values of a GetInfo ``data`` object.

    The values are ordered like the first 17 entries of ``_USER_COLUMNS``.

    Raises:
        KeyError: a required field is missing from ``jsData``.
        TypeError: ``jsData`` or one of its nested objects is not a mapping.
    """
    return (
        jsData["mid"],
        jsData["name"],
        jsData["sex"],
        jsData["rank"],
        jsData["face"],
        # The response carries no "regtime" value, so a fixed placeholder
        # timestamp is stored instead.
        "2018-05-06 12:22:23",
        jsData["spacesta"],
        jsData.get("birthday", "nobirthday"),
        jsData["sign"],
        jsData["level_info"]["current_level"],
        jsData["official_verify"]["type"],
        jsData["official_verify"]["desc"],
        jsData["vip"]["vipType"],
        jsData["vip"]["vipStatus"],
        jsData["toutu"],
        jsData["toutuId"],
        jsData["coins"],
    )


def _fetch_stats(mid):
    """Return ``(following, fans, archiveview, article)`` for user ``mid``.

    The lookup is best-effort: if either request fails or either response
    lacks the expected fields, all four values fall back to 0.
    """
    try:
        res = requests.get(
            "https://api.bilibili.com/x/relation/stat?vmid="
            + str(mid)
            + "&jsonp=jsonp",
            timeout=REQUEST_TIMEOUT,
        ).text
        viewinfo = requests.get(
            "https://api.bilibili.com/x/space/upstat?mid="
            + str(mid)
            + "&jsonp=jsonp",
            timeout=REQUEST_TIMEOUT,
        ).text
        js_fans_data = json.loads(res)
        js_viewdata = json.loads(viewinfo)
        return (
            js_fans_data["data"]["following"],
            js_fans_data["data"]["follower"],
            js_viewdata["data"]["archive"]["view"],
            js_viewdata["data"]["article"]["view"],
        )
    except (requests.RequestException, ValueError, LookupError, TypeError):
        return (0, 0, 0, 0)


def _save_user(row):
    """Insert one user row (values ordered like ``_USER_COLUMNS``) and commit.

    Database errors are printed rather than raised so that a single bad row
    does not stop the crawl.
    """
    try:
        # Values are stringified so the stored representation stays the same
        # as with the previous string-formatted statement.
        cur.execute(_INSERT_USER_SQL, [str(value) for value in row])
        conn.commit()
    except pymysql.MySQLError as e:
        print(e)


def getsource(url, i):
    """Fetch one user's profile and statistics and store them in the database.

    Args:
        url: User space URL, ``https://space.bilibili.com/<mid>``; the part
            after the prefix is sent as the ``mid`` form field.
        i: Numeric user id, used to build the ``Referer`` header.

    Returns:
        None. Network, parsing and database failures are printed and the
        user is skipped.
    """
    payload = {
        "_": datetime_to_timestamp_in_milliseconds(),
        "mid": url.replace("https://space.bilibili.com/", ""),
    }
    request_headers = {
        "User-Agent": random.choice(uas),
        "Referer": "https://space.bilibili.com/"
        + str(i)
        + "?from=search&seid="
        + str(random.randint(10000, 50000)),
    }
    try:
        jscontent = requests.post(
            "http://space.bilibili.com/ajax/member/GetInfo",
            headers=request_headers,
            data=payload,
            timeout=REQUEST_TIMEOUT,
            # proxies=proxies,
        ).text
    except requests.RequestException as e:
        print(e)
        return
    time2 = time.time()

    try:
        jsDict = json.loads(jscontent)
        statusJson = jsDict["status"] if "status" in jsDict else False
        if statusJson != True:  # noqa: E712 - accept JSON true as well as 1
            print("Error: " + url)
            return
        if "data" not in jsDict:
            # Nothing to store for this user.
            print("no data now")
            return
        profile = _extract_profile(jsDict["data"])
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        # Malformed JSON or a response without the expected fields.
        print(e)
        return

    mid = profile[0]
    print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
    _save_user(profile + _fetch_stats(mid))


def crawlUser():
    """Crawl user ids in sequence and store each one via ``getsource``.

    NOTE(review): the range currently covers the single id 521400; widen the
    upper bound (e.g. ``(m + 1) * 100`` for 521400-521499) to crawl more.
    TODO: read the starting index from the database instead of hard-coding it.
    """
    m = 5214
    for i in range(m * 100, (m * 100) + 1):
        url = "https://space.bilibili.com/" + str(i)
        # urls.append(url)
        getsource(url, i)


if __name__ == "__main__":
    try:
        crawlUser()
    finally:
        # Release the database resources opened at import time.
        cur.close()
        conn.close()