# get_user.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File : get_user.py
@Time : 2019/05/15 20:28:36
@Author : Liuyuqi
@Version : 1.0
@Contact : liuyuqi.gov@msn.cn
@License : (C)Copyright 2019
@Desc : Scrape user info. The profile page is https://space.bilibili.com/521400 ;
enumerating the trailing number yields every user's name, sex, age, etc.
Data endpoint: http://space.bilibili.com/ajax/member/GetInfo
"""
import sys
import os
# NOTE(review): hard-coded developer path — the script only runs with this exact
# machine layout; consider deriving it from __file__ instead. TODO confirm.
src = "C:/Users/liuyuqi/Desktop/crawl-bilibili"
os.chdir(src)
sys.path.append(src)
import utils.config as conf
from utils.user_agent import getheaders
import requests
import json
import random
import pymysql
import datetime
import time
  26. print(src)
  27. exit()
# Connect to the MySQL database; credentials come from the [db1] section of the
# project config file.
conn = pymysql.connect(
host=conf.readConf("db1", "host"), user=conf.readConf("db1", "user"), passwd=conf.readConf("db1", "pwd"), db="bilibili", charset="utf8"
)
# Shared cursor used by getsource()/crawlUser() below.
# NOTE(review): MySQL "utf8" is the 3-byte encoding; names/signatures containing
# emoji would need "utf8mb4" — confirm against the table definition.
cur = conn.cursor()
# cur.execute("sql")
# conn.commit()
  35. head = {
  36. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
  37. "X-Requested-With": "XMLHttpRequest",
  38. "Referer": "http://space.bilibili.com/45388",
  39. "Origin": "http://space.bilibili.com",
  40. "Host": "space.bilibili.com",
  41. "AlexaToolbar-ALX_NS_PH": "AlexaToolbar/alx-4.0",
  42. "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
  43. "Accept": "application/json, text/javascript, */*; q=0.01",
  44. }
  45. proxies = {"http": "http://120.26.110.59:8080"}
  46. time1 = time.time() # 1557920724.447739
  47. urls = []
  48. uas = []
  49. uas = getheaders()
  50. def datetime_to_timestamp_in_milliseconds():
  51. return int(round(time.time() * 1000)) # 1557920582757
  52. def getsource(url, i):
  53. payload = {
  54. "_": datetime_to_timestamp_in_milliseconds(),
  55. "mid": url.replace("https://space.bilibili.com/", ""),
  56. }
  57. head = {
  58. "User-Agent": random.choice(uas),
  59. "Referer": "https://space.bilibili.com/"
  60. + str(i)
  61. + "?from=search&seid="
  62. + str(random.randint(10000, 50000)),
  63. }
  64. jscontent = (
  65. requests.session()
  66. .post(
  67. "http://space.bilibili.com/ajax/member/GetInfo",
  68. headers=head,
  69. data=payload,
  70. # proxies=proxies,
  71. )
  72. .text
  73. )
  74. time2 = time.time()
  75. try:
  76. jsDict = json.loads(jscontent)
  77. statusJson = jsDict["status"] if "status" in jsDict.keys() else False
  78. if statusJson == True:
  79. if "data" in jsDict.keys():
  80. jsData = jsDict["data"]
  81. mid = jsData["mid"]
  82. name = jsData["name"]
  83. sex = jsData["sex"]
  84. rank = jsData["rank"]
  85. face = jsData["face"]
  86. # regtimestamp = jsData["regtime"] #没有这个值
  87. # regtime_local = time.localtime(regtimestamp)
  88. regtime = "2018-05-06 12:22:23"
  89. spacesta = jsData["spacesta"]
  90. birthday = (
  91. jsData["birthday"] if "birthday" in jsData.keys(
  92. ) else "nobirthday"
  93. )
  94. sign = jsData["sign"]
  95. level = jsData["level_info"]["current_level"]
  96. OfficialVerifyType = jsData["official_verify"]["type"]
  97. OfficialVerifyDesc = jsData["official_verify"]["desc"]
  98. vipType = jsData["vip"]["vipType"]
  99. vipStatus = jsData["vip"]["vipStatus"]
  100. toutu = jsData["toutu"]
  101. toutuId = jsData["toutuId"]
  102. coins = jsData["coins"]
  103. print("Succeed get user info: " +
  104. str(mid) + "\t" + str(time2 - time1))
  105. try:
  106. res = requests.get(
  107. "https://api.bilibili.com/x/relation/stat?vmid="
  108. + str(mid)
  109. + "&jsonp=jsonp"
  110. ).text
  111. viewinfo = requests.get(
  112. "https://api.bilibili.com/x/space/upstat?mid="
  113. + str(mid)
  114. + "&jsonp=jsonp"
  115. ).text
  116. js_fans_data = json.loads(res)
  117. js_viewdata = json.loads(viewinfo)
  118. following = js_fans_data["data"]["following"]
  119. fans = js_fans_data["data"]["follower"]
  120. archiveview = js_viewdata["data"]["archive"]["view"]
  121. article = js_viewdata["data"]["article"]["view"]
  122. except:
  123. following = 0
  124. fans = 0
  125. archiveview = 0
  126. article = 0
  127. else:
  128. print("no data now")
  129. try:
  130. cur.execute(
  131. 'INSERT INTO user(mid, name, sex, rank, face, regtime, spacesta, \
  132. birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus, \
  133. toutu, toutuId, coins, following, fans ,archiveview, article) \
  134. VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s",\
  135. "%s","%s","%s","%s","%s", "%s","%s","%s","%s","%s","%s")'
  136. % (
  137. mid,
  138. name,
  139. sex,
  140. rank,
  141. face,
  142. regtime,
  143. spacesta,
  144. birthday,
  145. sign,
  146. level,
  147. OfficialVerifyType,
  148. OfficialVerifyDesc,
  149. vipType,
  150. vipStatus,
  151. toutu,
  152. toutuId,
  153. coins,
  154. following,
  155. fans,
  156. archiveview,
  157. article,
  158. )
  159. )
  160. conn.commit()
  161. except Exception as e:
  162. print(e)
  163. else:
  164. print("Error: " + url)
  165. except Exception as e:
  166. print(e)
  167. pass
  168. def crawlUser():
  169. """
  170. 开抓
  171. param :
  172. return:
  173. """
  174. # 获得索引头
  175. cur.execute("sql")
  176. res = conn.commit()
  177. m = 5214
  178. for i in range(m * 100, ((m * 100) + 1)): # range(521400,521500)
  179. url = "https://space.bilibili.com/" + str(i)
  180. # urls.append(url)
  181. getsource(url, i)
  182. if __name__ == "__main__":
  183. src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
  184. crawlUser()