#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2020/09/06 01:38:09
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    : Scrape driving-licence exam registration data from
           https://sh.122.gov.cn
'''
import os
import re

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
chromepath = r"D:/Program-Files/browser-driver/chromedriver.exe"
phantomjspath = r"/opt/phantomjs/bin/phantomjs"

link = []
res = []
save_path = r"download"
link_path = r"data/link.csv"
report_path = r"data/report.csv"
# if not os.path.exists(save_path):
#     os.mkdir(save_path)

UA = ("Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) "
      "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 "
      "MQQBrowser/8.9 Mobile Safari/537.36")

option = webdriver.ChromeOptions()
option.add_argument("lang=zh_CN.UTF-8")
option.add_argument("User-Agent=" + UA)
option.add_argument("--headless")
# do not load images, to speed up page rendering
prefs = {'profile.default_content_setting_values.images': 2}
option.add_experimental_option('prefs', prefs)

desired_cap = DesiredCapabilities.PHANTOMJS.copy()
desired_cap['phantomjs.page.settings.userAgent'] = UA
driver = webdriver.PhantomJS(
    executable_path=phantomjspath, desired_capabilities=desired_cap)
# driver = webdriver.Chrome(executable_path=chromepath, options=option)
# driver.maximize_window()


def getLink():
    """Collect (month, link) pairs from the notice list and cache them in link_path."""
    global link
    driver.implicitly_wait(10)
    driver.get(base)  # load the page
    if not os.path.exists(link_path):
        for _ in range(5):
            # all month entries on the current page
            monthData = driver.find_elements_by_css_selector("#querylist li a")
            for item in monthData:
                print(item.text, item.get_attribute("href"))
                link.append([item.text, item.get_attribute("href")])
            # find the "next page" button and click it
            driver.find_element_by_xpath(
                '//*[@id="pppagination"]/ul/li[2]/a').click()
        # cookie = getCookie()
        # print(cookie)
        link = pd.DataFrame(link)
        link.to_csv(link_path, header=False)
    else:
        link = pd.read_csv(link_path, names=["month", "link"])
    return link


def download(url, save_path):
    """Save the body of url to save_path (response.content holds the raw bytes)."""
    try:
        with open(save_path, "wb") as file:
            file.write(requests.get(url).content)
    except Exception as e:
        print(e)


def getCookie():
    """Return the driver's cookies as a list of "name=value;" strings."""
    cookie_dict = []
    for c in driver.get_cookies():
        cookie_dict.append("{0}={1};".format(c['name'], c['value']))
    return cookie_dict
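

# A minimal sketch (an addition, not part of the original script): the
# "name=value;" strings produced by getCookie() can be joined into a single
# Cookie header so that plain requests calls share the Selenium session.
# The function name and the idea of mixing requests with the driver session
# are assumptions for illustration only.
def fetch_with_driver_cookies(url):
    """Sketch: perform a requests GET that reuses the Selenium session cookies."""
    headers = {"Cookie": " ".join(getCookie()), "User-Agent": UA}
    return requests.get(url, headers=headers)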
def crawl():
    """Visit each cached month link and scrape its report table into a CSV."""
    global link
    link = pd.read_csv(link_path, names=["month", "link"])
    for i in range(len(link)):
        link1 = link.loc[i]["link"]    # detail-page link
        month1 = link.loc[i]["month"]  # month label
        if not os.path.exists("data/report" + month1 + ".csv"):
            driver.implicitly_wait(10)
            driver.get(link1)
            # read the pager text to find the record and page counts,
            # e.g. "共2391条记录 1/120页" = 2391 records, page 1 of 120
            try:
                text = driver.find_element_by_xpath(
                    '//*[@id="pagination"]/span').text  # may raise
                pagesize = re.split(
                    "[/页]", re.search("/.*页", text).group())[1]
                columns = ["date", "place", "course1",
                           "course2", "course3", "course4"]
                rows = []
                for page in range(int(pagesize)):
                    # parse the table on the current page
                    trlist = driver.find_elements_by_tag_name("tr")
                    for row in trlist:
                        tdlist = row.find_elements_by_tag_name("td")
                        tmp = [col.text for col in tdlist]
                        if len(tmp) == len(columns):  # skip header/blank rows
                            rows.append(tmp)  # keep one data row
                    # click "next page", but not after the last page; note the
                    # quirk: on the first page the button is span/a[2], from
                    # the second page on it is span/a[3]
                    if page < int(pagesize) - 1:
                        if page > 0:
                            driver.find_element_by_xpath(
                                '//*[@id="pagination"]/span/a[3]').click()
                        else:
                            driver.find_element_by_xpath(
                                '//*[@id="pagination"]/span/a[2]').click()
                reportData = pd.DataFrame(rows, columns=columns)
                reportData.to_csv("data/report" + month1 + ".csv", header=False)
            except Exception as e:
                print(e)
    driver.quit()  # quit() also terminates the PhantomJS process


if __name__ == "__main__":
    crawl()
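

# Standalone sketch (an addition, not part of the original script): the pager
# text scraped in crawl() looks like "共2391条记录 1/120页" (2391 records,
# page 1 of 120). A single anchored regex is a sturdier way to pull out the
# page count than the split/search pair used above; parse_page_count is a
# hypothetical helper shown for illustration.
def parse_page_count(pager_text):
    """Return the total page count parsed from the pager text, or 1 if absent."""
    match = re.search(r"/(\d+)页", pager_text)
    return int(match.group(1)) if match else 1


# Example: parse_page_count("共2391条记录 1/120页") == 120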