#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2021/03/10 11:04:33
@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    :   Certificate crawler for the procurement/tendering personnel
             competency evaluation site.

             70 pages, 6919 records in total, 100 records per page.

             Step 1: loop over and crawl every certificate list page. The
                     request parameters carry a signature (the original note
                     stops here without naming the approach taken).
             Step 2: using the crawled certificate ids, fetch each detail
                     page and download the certificate image, named as
                     name - ID number - professional title level.
'''

import os
import sys
import re
import json
import time
# from spliter import Browser
import requests

# NOTE(review): the credentials, cookie, signature and timestamp below are
# hard-coded captures of one browser session; presumably they expire and the
# server will then reject requests -- confirm and move them to configuration.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    "Authorization": "Basic YXBwOlZtMHhkMUl4YkZoVFdHaFRWMGQ0VjFsWGM=",
    "Cookie": "acw_tc=2760820416153446998782758e6f918d30a3b14e84cdf23375d461edeec5cd",
    "Sign": "ef1bffdaa27c5e0219fedface033b001",
    "Tenant-Id": "000000",
    "Timestamp": "1615345938461",
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get can block forever on a stalled connection.
_REQUEST_TIMEOUT = 10


def _decode_body(res):
    """Return the body of response *res* decoded as UTF-8.

    The raw bytes are decoded directly instead of round-tripping through
    ``res.text`` and ``res.encoding``: ``res.encoding`` can be ``None`` when
    the server declares no charset, which makes ``str.encode`` raise
    ``TypeError``. Undecodable bytes are replaced rather than raising.
    """
    return res.content.decode("utf-8", errors="replace")


def downloadCert():
    """Placeholder for downloading a certificate image; not implemented."""
    pass


def getCert(cert_id="1368587249407119362"):
    """Fetch one certificate detail page and print its body.

    Prints the body decoded as UTF-8, then the raw response bytes.

    :param cert_id: certificate id placed in the ``id`` query parameter;
        defaults to the id that was previously hard-coded.
    """
    url = r"http://tpp.ctba.org.cn/cmsNavDetail/open/certificateDetail?id=" + str(cert_id)
    res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    print(_decode_body(res))
    print(res.content)


def getCertificateListPage(first_page=1, last_page=1):
    """Fetch certificate list pages and print each response body.

    For every page, prints the body decoded as UTF-8, then the raw bytes.

    :param first_page: first page number to request (inclusive).
    :param last_page: last page number to request (inclusive); the default
        range covers page 1 only.
    """
    for page in range(first_page, last_page + 1):
        # with Browser(driver_name='chrome', executable_path="chromedriver.exe") as browser:
        #     browser.visit()
        # NOTE(review): ``keyword`` receives the page number as well as
        # ``current``; this looks unintentional -- confirm before changing.
        url = (
            r'http://tpp.ctba.org.cn/api/ctpsp-public/user-certificate/endpoint/publicity-pager?current='
            + str(page)
            + '&size=100&level=0&status=3&keyword='
            + str(page)
        )
        res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        print(_decode_body(res))
        print(res.content)


def crawl():
    """Entry point: fetch and print the default certificate detail page."""
    getCert()


if __name__ == "__main__":
    crawl()