1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @Author : liuyuqi
- @Contact : liuyuqi.gov@msn.cn
- @Time : 2019/08/11 06:41:06
- @Version : 1.0
- @License : (C)Copyright 2019
- @Desc : 获取所有相亲用户数据
- https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&page=3&pageSize=20&sortType=rank
- 总共 447 信息,每页20条,共23页。
- '''
- # import pandas
- # import json
- # import os,sys,re
- # import requests
- import pymongo
- import time
- import threading
- import urllib.request
# Real pin-list endpoint, kept for reference (it ends with "page=" so a page
# number can be appended):
# url_seed = "https://short-msg-ms.juejin.im/v1/pinList/topic?uid=&device_id=&token=&src=web&topicId=5abcaa67092dcb4620ca335c&pageSize=20&sortType=rank&page="
url_seed = "https://baidu.com"
# url_login and url_cache are not referenced by any code visible in this file.
url_login = ""
url_cache = set()

# Headers sent with every page request.
# NOTE(review): the Cookie below is a captured browser session that is
# hard-coded in source; it will presumably expire -- confirm before relying on it.
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36"
headers['Cookie'] = '_ga=GA1.2.543338178.1565470742;_gid=GA1.2.1886010917.1565470742;gr_session_id_89669d96c88aefbc=79649999-e8c0-470a-9677-82496ff889a4;gr_session_id_89669d96c88aefbc_79649999-e8c0-470a-9677-82496ff889a4=true;gr_user_id=5ead73a1-13db-4b49-85bd-ff1ee44188bd;Hm_lpvt_93bbd335a208870aa1f296bcd6842e5e=1565478332;Hm_lvt_93bbd335a208870aa1f296bcd6842e5e=1565471101,1565471795,1565474508,1565478332;ab={};MEIQIA_TRACK_ID=1PFYEEcl0GseQQFT5TpFEZroHGg;QINGCLOUDELB=7c5122b6c6517c59163563fe189d391bab7e48fb3972913efd95d72fe838c4fb|XU9Nv|XU9Nv;'
headers['Host'] = 'juejin.im'
headers['Referer'] = 'https://juejin.im/pins/topic/5abcaa67092dcb4620ca335c'
headers['Sec-Fetch-Mode'] = 'cors'

# MongoDB handle shared by the whole module.
# NOTE(review): the connection string embeds a username and password.
client = pymongo.MongoClient("mongodb://admin:password@localhost:27017/")
db_juejin = client["juejin_date"]
def crawl(page_count=1):
    """Fetch consecutive result pages, starting at page 0.

    Args:
        page_count: Number of pages to request. Defaults to 1, which
            requests page 0 only. The file header notes that the topic
            spans 23 pages of 20 entries each.
    """
    for page in range(page_count):
        getUser(page)
def getUser(page):
    """Request one page of the topic's pin list and print the response.

    The query parameters (including ``page``) are merged into the query
    string of ``url_seed`` and sent with a GET request.

    Args:
        page: Page index to request; sent as the ``page`` query parameter.

    Returns:
        None. The request URL and the response object are printed. Any
        error raised while building or sending the request is printed
        instead of propagated.
    """
    # Local import: the file's import block only brings in urllib.request.
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    params = {
        "uid": "",
        "device_id": "",
        "token": "",
        "src": "web",
        "topicId": "5abcaa67092dcb4620ca335c",
        "page": str(page),
        "pageSize": "20",
        "sortType": "rank",
    }

    # Merge into whatever query url_seed already carries, so the result is a
    # well-formed URL whether or not the seed has a query string of its own.
    seed_parts = urlsplit(url_seed)
    query = dict(parse_qsl(seed_parts.query, keep_blank_values=True))
    query.update(params)
    url_page = urlunsplit(seed_parts._replace(query=urlencode(query)))

    # Leave out the hard-coded Host header so urllib derives Host from the
    # URL actually being requested.
    request_headers = {
        name: value for name, value in headers.items()
        if name.lower() != "host"
    }

    try:
        req = urllib.request.Request(
            url=url_page, headers=request_headers, method="GET")
        # A timeout keeps an unresponsive server from blocking the crawl.
        with urllib.request.urlopen(req, timeout=10) as res:
            print(url_page)
            print(res)
            # TODO: decode res.read() as JSON and hand each entry to
            # saveUser; the original draft expected the entries under
            # ["d"]["list"] -- confirm against a real response.
    except Exception as err:
        # Best effort: report the failure and let the caller carry on.
        print(err)
def saveUser(jsonUser):
    """Save one user record to the ``students`` collection in MongoDB.

    Args:
        jsonUser: Mapping with the user's fields, stored as one document.

    Returns:
        The ``inserted_id`` of the stored document, or None when
        ``jsonUser`` is empty and nothing was inserted.
    """
    if not jsonUser:
        print("saveUser: nothing to save")
        return None

    # insert_one adds an "_id" key to the dict it receives; insert a copy so
    # the caller's object is left unchanged.
    document = dict(jsonUser)
    result = db_juejin.students.insert_one(document)
    print(result.inserted_id)
    return result.inserted_id
if __name__ == "__main__":
    # Run the crawl once and report how many seconds it took.
    began = time.time()
    crawl()
    elapsed = time.time() - began
    print("last time: {} s".format(elapsed))
-
|