1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @Contact : liuyuqi.gov@msn.cn
- @Time : 2021/03/10 11:04:33
- @License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
- @Desc : 招采人员能力评价-证书爬虫
- 共70页 6919条 ,每页 100 条
- 第一步: 循环爬取所有CertificateListPage ,参数中有签名,所以采用
- 第二步:根据爬到的Certificateid 爬取详情页,下载证书图片,以 名字-身份证-职称等级
- '''
- import os
- import sys
- import re
- import json
- import time
- # from spliter import Browser
- import requests
# HTTP headers attached to every request this script sends.
# NOTE(review): Authorization, Cookie, Sign and Timestamp are hard-coded
# values (presumably captured from a browser session -- confirm). The module
# docstring says the request parameters are signed, so these probably have to
# be refreshed before a real crawl.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/63.0.3239.132 Safari/537.36"
    ),
    "Authorization": "Basic YXBwOlZtMHhkMUl4YkZoVFdHaFRWMGQ0VjFsWGM=",
    "Cookie": (
        "acw_tc="
        "2760820416153446998782758e6f918d30a3b14e84cdf23375d461edeec5cd"
    ),
    "Sign": "ef1bffdaa27c5e0219fedface033b001",
    "Tenant-Id": "000000",
    "Timestamp": "1615345938461",
}
def downloadCert():
    """Placeholder for the certificate download step; not implemented yet.

    Currently does nothing and returns None.
    """
    return None
def getCert(cert_id="1368587249407119362", timeout=30):
    """Fetch one certificate detail page and print the response body.

    The body is printed twice: first decoded as UTF-8 text, then as the raw
    bytes returned by the server.

    Args:
        cert_id: Value sent as the ``id`` query parameter. Defaults to the
            certificate id that used to be hard-coded in the URL.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the script forever.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    url = "http://tpp.ctba.org.cn/cmsNavDetail/open/certificateDetail"
    res = requests.get(
        url,
        params={"id": cert_id},
        headers=headers,
        timeout=timeout,
    )
    # Decode the raw bytes directly. The previous round trip
    # res.text.encode(res.encoding) raised TypeError whenever the server sent
    # no charset (res.encoding is None). Undecodable bytes are replaced so the
    # debug print never aborts the run.
    print(res.content.decode("utf-8", "replace"))
    print(res.content)
def getCertificateListPage(start_page=1, end_page=2, timeout=30):
    """Fetch certificate list pages and print each response body.

    Each page is requested with ``size=100`` and printed twice: first decoded
    as UTF-8 text, then as the raw bytes returned by the server.

    Args:
        start_page: First page number to request (inclusive).
        end_page: Page number to stop before (exclusive). The defaults
            reproduce the former hard-coded ``range(1, 2)``, i.e. page 1 only.
        timeout: Seconds to wait for the server on each request; without it
            a stalled connection would block the script forever.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    url = (
        "http://tpp.ctba.org.cn/api/ctpsp-public/user-certificate"
        "/endpoint/publicity-pager"
    )
    for page in range(start_page, end_page):
        # A list of pairs keeps the query-string order identical to the
        # original hand-built URL.
        query = [
            ("current", str(page)),
            ("size", "100"),
            ("level", "0"),
            ("status", "3"),
            # NOTE(review): keyword is set to the page number, exactly as the
            # original URL did. This looks unintended, but the request is
            # signed, so confirm before changing it.
            ("keyword", str(page)),
        ]
        res = requests.get(url, params=query, headers=headers, timeout=timeout)
        # Decode the raw bytes directly. The previous round trip
        # res.text.encode(res.encoding) raised TypeError whenever the server
        # sent no charset (res.encoding is None).
        print(res.content.decode("utf-8", "replace"))
        print(res.content)
def crawl():
    """Entry point of the crawler: currently runs only the detail fetch."""
    getCert()
# Start the crawl only when the file is executed as a script, not on import.
if "__main__" == __name__:
    crawl()
|