#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2020/09/06 01:38:09
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    : Scrape driving-licence exam registration data from https://sh.122.gov.cn
'''
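# Dependencies: pandas, requests and selenium, plus a chromedriver binary that
# matches the installed Chrome. The Windows paths below are the author's own;
# adjust them for your machine.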
import os
import re

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
chromepath = r"D:/Program-Files/browser-driver/chromedriver.exe"
phantomjspath = r"D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe"
link = []
save_path = r"download"
link_path = r"data/link.csv"
report_path = r"data/report.csv"
os.makedirs("data", exist_ok=True)  # the CSV writes below expect data/ to exist
# if not os.path.exists(save_path):
#     os.mkdir(save_path)
option = webdriver.ChromeOptions()
option.add_argument("lang=zh_CN.UTF-8")
option.add_argument(
    "User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 "
    "MQQBrowser/8.9 Mobile Safari/537.36")
option.add_argument("--headless")
# do not load images, to speed up page loads
prefs = {'profile.default_content_setting_values.images': 2}
option.add_experimental_option('prefs', prefs)
# driver = webdriver.PhantomJS(executable_path=phantomjspath)  # PhantomJS alternative
driver = webdriver.Chrome(service=Service(chromepath), options=option)
# driver.maximize_window()
def getLink():
    """Collect each month's report link from the notice list and cache it to link.csv."""
    global link
    driver.implicitly_wait(10)
    driver.get(base)  # load the landing page
    if not os.path.exists(link_path):
        for _ in range(5):
            # grab every month listed on the current page
            monthData = driver.find_elements(By.CSS_SELECTOR, "#querylist li a")
            for i in monthData:
                print(i.text, i.get_attribute("href"))
                link.append([i.text, i.get_attribute("href")])
            # find the "next page" button and click it
            driver.find_element(
                By.XPATH, '//*[@id="pppagination"]/ul/li[2]/a').click()
        link = pd.DataFrame(link, columns=["month", "link"])
        link.to_csv(link_path, header=False, index=False)
    else:
        link = pd.read_csv(link_path, names=["month", "link"])
    return link
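# link.csv then holds one "month,link" row per month. The values below are
# illustrative only, not real output:
#   2020年07月,https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&...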
def download(url, save_path):
    """Download url and write the response body to save_path as bytes."""
    try:
        with open(save_path, "wb") as f:
            # .content is the body as bytes; .raw is a file-like object and
            # cannot be written directly
            f.write(requests.get(url).content)
    except Exception as e:
        print(e)
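# download() is not called in the main flow; it is a helper for fetching
# binary attachments. A hypothetical call (the URL is made up):
#   download("https://sh.122.gov.cn/some/attachment.xls", "download/attachment.xls")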
def getCookie():
    """Return the browser session's cookies as a list of "name=value;" strings."""
    cookies = driver.get_cookies()
    cookie_list = []
    for c in cookies:
        cookie_list.append("{0}={1};".format(c['name'], c['value']))
    return cookie_list
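# getCookie() is likewise unused here. If requests ever needs the Selenium
# session, its "name=value;" strings can be joined into a Cookie header, e.g.:
#   requests.get(url, headers={"Cookie": "".join(getCookie())})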
def crawl():
    """Scrape every month's report table and write it to data/report<month>.csv."""
    global link
    link = pd.read_csv(link_path, names=["month", "link"])
    for i in range(len(link)):
        link1 = link.loc[i]["link"]          # report page for this month
        month1 = str(link.loc[i]["month"])   # month label, e.g. 202007
        if not os.path.exists("data/report" + month1 + ".csv"):
            driver.implicitly_wait(10)
            driver.get(link1)
            # read how many records and pages there are from text such as
            # "共2391条记录 1/120页" ("2,391 records in total, page 1 of 120")
            try:
                text = driver.find_element(
                    By.XPATH, '//*[@id="pagination"]/span').text  # may be absent
                pagesize = re.split("[/页]", re.search("/.*页", text).group())[1]
                rows = []
                for page in range(int(pagesize)):
                    # collect the table rows on the current page
                    for row in driver.find_elements(By.TAG_NAME, "tr"):
                        tdlist = row.find_elements(By.TAG_NAME, "td")
                        if tdlist:  # skip header rows, which have no <td>
                            rows.append([col.text for col in tdlist])
                    # click "next page": the button is span/a[2] on the first
                    # page and span/a[3] from the second page onwards
                    if page > 0:
                        driver.find_element(
                            By.XPATH, '//*[@id="pagination"]/span/a[3]').click()
                    else:
                        driver.find_element(
                            By.XPATH, '//*[@id="pagination"]/span/a[2]').click()
                reportData = pd.DataFrame(rows, columns=[
                    "date", "place", "course1", "course2", "course3", "course4"])
                reportData.to_csv("data/report" + month1 + ".csv",
                                  header=False, index=False)
            except Exception as e:
                print(e)
    driver.close()
if __name__ == "__main__":
    crawl()
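# Note: crawl() assumes data/link.csv already exists; run getLink() once
# beforehand to build it:
#   getLink()
#   crawl()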