#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2020/09/06 01:38:09
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    : Crawl driving-licence exam registration data from
           https://sh.122.gov.cn
'''
import os
import re

import pandas as pd
import requests
from selenium import webdriver

base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
chromepath = r"D:/Program-Files/browser-driver/chromedriver.exe"
phantomjspath = r"D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe"
save_path = r"download"
link_path = r"data/link.csv"
report_path = r"data/report.csv"

# if not os.path.exists(save_path):
#     os.mkdir(save_path)

option = webdriver.ChromeOptions()
option.add_argument("--lang=zh-CN")
option.add_argument("--user-agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
option.add_argument("--headless")
# Disable image loading to speed up page rendering
prefs = {'profile.default_content_setting_values.images': 2}
option.add_experimental_option('prefs', prefs)

# driver = webdriver.PhantomJS(executable_path=phantomjspath)
driver = webdriver.Chrome(executable_path=chromepath, options=option)
# driver.maximize_window()


def getLink():
    """Collect the month links (text + href) and cache them to link_path."""
    driver.implicitly_wait(10)
    driver.get(base)  # load the page
    if not os.path.exists(link_path):
        links = []
        for _ in range(5):
            # all month entries on the current page
            monthData = driver.find_elements_by_css_selector(
                "#querylist li a")
            for item in monthData:
                print(item.text, item.get_attribute("href"))
                links.append([item.text, item.get_attribute("href")])
            # find the next-page button and click it
            driver.find_element_by_xpath(
                '//*[@id="pppagination"]/ul/li[2]/a').click()
        # cookie = getCookie()
        # print(cookie)
        link = pd.DataFrame(links)
        link.to_csv(link_path, header=False)
    else:
        link = pd.read_csv(link_path, names=["month", "link"])
    return link


def download(url, save_path):
    try:
        with open(save_path, "wb") as f:
            # .content holds the response body as bytes
            f.write(requests.get(url).content)
    except Exception as e:
        print(e)


def getCookie():
    cookies = driver.get_cookies()
    cookie_list = []
    for c in cookies:
        cookie_list.append("{0}={1};".format(c['name'], c['value']))
    return cookie_list


def crawl():
    link = pd.read_csv(link_path, names=["month", "link"])
    for i in range(len(link)):
        link1 = link.loc[i]["link"]          # detail-page URL
        month1 = str(link.loc[i]["month"])   # month label
        if not os.path.exists("data/report" + month1 + ".csv"):
            driver.implicitly_wait(10)
            driver.get(link1)
            try:
                # Pager text looks like "共2391条记录 1/120页"; pull out the
                # page count after the slash.
                text = driver.find_element_by_xpath(
                    '//*[@id="pagination"]/span').text
                pagesize = int(re.search(r"/(\d+)页", text).group(1))
                rows = []
                for page in range(pagesize):
                    # scrape every table row on the current page; skip rows
                    # (e.g. the header) that lack the six data cells
                    for row in driver.find_elements_by_tag_name("tr"):
                        tmp = [col.text for col in
                               row.find_elements_by_tag_name("td")]
                        if len(tmp) == 6:
                            rows.append(tmp)
                    if page == pagesize - 1:
                        break  # last page: there is no next button
                    # Click "next page". Caveat: on the first page the button
                    # is span/a[2]; from the second page on it is span/a[3].
                    if page > 0:
                        driver.find_element_by_xpath(
                            '//*[@id="pagination"]/span/a[3]').click()
                    else:
                        driver.find_element_by_xpath(
                            '//*[@id="pagination"]/span/a[2]').click()
                reportData = pd.DataFrame(
                    rows, columns=["date", "place", "course1",
                                   "course2", "course3", "course4"])
                reportData.to_csv("data/report" + month1 + ".csv",
                                  header=False)
            except Exception as e:
                print(e)
    driver.close()


if __name__ == "__main__":
    crawl()
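
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original workflow: one way to merge
# the per-month files that crawl() writes into a single DataFrame. It assumes
# the data/report<month>.csv naming used above and that each file was written
# with header=False, so its leading column is the saved row index.
# merge_reports and its data_dir parameter are hypothetical names.
def merge_reports(data_dir="data"):
    import glob  # local import keeps this optional helper self-contained
    cols = ["date", "place", "course1", "course2", "course3", "course4"]
    frames = [
        # with fewer names than columns, pandas treats the extra leading
        # column (the saved index) as the DataFrame index
        pd.read_csv(path, names=cols)
        for path in sorted(glob.glob(os.path.join(data_dir, "report*.csv")))
    ]
    return pd.concat(frames, ignore_index=True)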