#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2020/09/07 16:38:41
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    : Driving-test registration report crawler
'''
import os
import re
import logging

import pandas as pd
from selenium import webdriver


class Enum(tuple):
    '''Minimal enum: attribute access returns the member's index.'''
    __getattr__ = tuple.index


BrowserType = Enum(['FIREFOX', 'CHROME', 'IE', 'SAFARI', 'PHANTOMJS'])


class CrawlCar:

    def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
        self.__site = site
        self.__save_folder = save_folder
        self.__chapter_list = []
        if not os.path.exists(self.__save_folder):
            os.mkdir(self.__save_folder)
        if BrowserType.FIREFOX == browser:
            self.__browser = webdriver.Firefox()
        elif BrowserType.CHROME == browser:
            option = webdriver.ChromeOptions()
            option.add_argument("lang=zh_CN.UTF-8")
            option.add_argument(
                "User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
            # option.add_argument("--headless")
            # Do not load images, to speed up page loads.
            prefs = {'profile.default_content_setting_values.images': 2}
            option.add_experimental_option('prefs', prefs)
            self.__browser = webdriver.Chrome(
                executable_path=driver, options=option)
            # Hide the navigator.webdriver flag so the site cannot
            # trivially detect Selenium.
            self.__browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
                """})
        elif BrowserType.IE == browser:
            self.__browser = webdriver.Ie(driver)
        elif BrowserType.SAFARI == browser:
            self.__browser = webdriver.Safari(driver)
        elif BrowserType.PHANTOMJS == browser:
            self.__browser = webdriver.PhantomJS(driver)
        else:
            raise TypeError('UNKNOWN BROWSER TYPE: %s' % browser)
        logging.basicConfig(
            format='[%(asctime)s] %(levelname)s::%(module)s::%(funcName)s() %(message)s',
            level=logging.INFO)

    def __del__(self):
        self.__browser.quit()

    def getLink(self):
        '''Collect the per-month report links and cache them in link.csv.'''
        self.__browser.implicitly_wait(10)
        self.__browser.get(self.__site)  # load the start page
        link_path = self.__save_folder + "/link.csv"
        if not os.path.exists(link_path):
            links = []
            for page in range(5):  # walk the first five pages of month links
                monthData = self.__browser.find_elements_by_css_selector(
                    "#querylist li a")
                # Collect every month link on the current page.
                for a in monthData:
                    print(a.text, a.get_attribute("href"))
                    links.append([a.text, a.get_attribute("href")])
                # Find and click the next-page button.
                self.__browser.find_element_by_xpath(
                    '//*[@id="pppagination"]/ul/li[2]/a').click()
            # cookie = self.getCookie()
            # print(cookie)
            link = pd.DataFrame(links, columns=["month", "link"])
            link.to_csv(link_path, header=False)
        else:
            link = pd.read_csv(link_path, names=["month", "link"])
        return link

    def download(self, url, save_path):
        pass

    def getCookie(self):
        '''Return the browser cookies as a list of "name=value;" strings.'''
        cookie = self.__browser.get_cookies()
        cookie_dict = []
        for c in cookie:
            ck = "{0}={1};".format(c['name'], c['value'])
            cookie_dict.append(ck)
        return cookie_dict

    def crawl(self):
        '''Visit each month link and scrape its report table into a CSV.'''
        link_path = self.__save_folder + "/link.csv"
        link = pd.read_csv(link_path, names=["month", "link"])
        for i in range(len(link)):
            link1 = link.loc[i]["link"]    # report URL
            month1 = link.loc[i]["month"]  # month label
            if not os.path.exists(self.__save_folder + "/report" + month1 + ".csv"):
                self.__browser.implicitly_wait(10)
                self.__browser.get(link1)
                reportData = pd.DataFrame(
                    columns=["date", "place", "course1", "course2", "course3", "course4"])
                # Work out how many records and pages there are.
                try:
                    text = self.__browser.find_element_by_xpath(
                        '//*[@id="pagination"]/span').text  # raises if the element is missing
                    # e.g. "共2391条记录 1/120页" (2391 records, page 1 of 120)
                    pagesize = re.split(
                        "[/页]", re.search("/.*页", text).group())[1]
                    for page in range(int(pagesize)):
                        # Scrape the table rows on the current page.
                        trlist = self.__browser.find_elements_by_tag_name("tr")
                        for row in trlist:
                            tdlist = row.find_elements_by_tag_name("td")
                            tmp = [col.text for col in tdlist]
                            # Skip rows whose cell count doesn't match the
                            # expected columns (e.g. rows built from th cells).
                            if len(tmp) == len(reportData.columns):
                                reportData = reportData.append(
                                    pd.Series(tmp, index=reportData.columns),
                                    ignore_index=True)  # append one row
                        # Click "next": on the first page the link is
                        # span/a[2]; from the second page on it is span/a[3].
                        if page > 0:
                            self.__browser.find_element_by_xpath(
                                '//*[@id="pagination"]/span/a[3]').click()
                        else:
                            self.__browser.find_element_by_xpath(
                                '//*[@id="pagination"]/span/a[2]').click()
                except Exception as e:
                    logging.error(e)
                reportData.to_csv(self.__save_folder + "/report" +
                                  month1 + ".csv", header=False)

    def merge(self):
        '''Merge the per-month report CSV files into one res.csv.'''
        df = pd.DataFrame()
        for parent, dirnames, filenames in os.walk(self.__save_folder):
            for filename in filenames:
                if filename.startswith("report"):
                    df1 = pd.read_csv(os.path.join(parent, filename))
                    df = pd.concat([df, df1])
        # Drop the duplicated header rows scraped along with the data
        # ("日期" is the date column header on the site).
        df = df.drop(df[df["日期"] == "日期"].index).reset_index()
        df.to_csv(self.__save_folder + "/res.csv", header=False)

    def start(self):
        self.crawl()
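

# A minimal usage sketch, not part of the original script: the site URL and
# chromedriver path below are hypothetical placeholders and must be replaced
# with the real registration-report page and a locally installed driver.
if __name__ == "__main__":
    crawler = CrawlCar(
        site="https://example.com/jiakao/report",  # hypothetical URL
        save_folder="data",
        browser=BrowserType.CHROME,
        driver="./chromedriver",  # assumed path to a local chromedriver
    )
    crawler.getLink()  # cache the month links in data/link.csv
    crawler.start()    # scrape each month into data/report<month>.csv
    crawler.merge()    # combine everything into data/res.csv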