#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2020/09/07 16:38:41
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    : Crawler for driving-test registration reports
'''
import logging
import os
import re

import pandas as pd
from selenium import webdriver


class Enum(tuple):
    __getattr__ = tuple.index


BrowserType = Enum(['FIREFOX', 'CHROME', 'IE', 'SAFARI', 'PHANTOMJS'])
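
# The tuple-backed Enum above turns attribute access into an index lookup:
# BrowserType.FIREFOX == 0, BrowserType.CHROME == 1, and so on.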


class CrawlCar:

    def __init__(self, site, save_folder="data", browser=BrowserType.FIREFOX, driver=None):
        self.__site = site
        self.__save_folder = save_folder
        self.__chapter_list = []
        if not os.path.exists(self.__save_folder):
            os.mkdir(self.__save_folder)
        if BrowserType.FIREFOX == browser:
            self.__browser = webdriver.Firefox()
        elif BrowserType.CHROME == browser:
            option = webdriver.ChromeOptions()
            option.add_argument("lang=zh_CN.UTF-8")
            option.add_argument(
                "user-agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
            # option.add_argument("--headless")
            # Do not load images, to speed up page fetches
            prefs = {
                'profile.default_content_setting_values.images': 2
            }
            option.add_experimental_option('prefs', prefs)
            self.__browser = webdriver.Chrome(
                executable_path=driver, options=option)
            # Clear window.navigator.webdriver so the site cannot flag Selenium
            self.__browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined
                    })
                """})
        elif BrowserType.IE == browser:
            self.__browser = webdriver.Ie(driver)
        elif BrowserType.SAFARI == browser:
            self.__browser = webdriver.Safari(driver)
        elif BrowserType.PHANTOMJS == browser:
            self.__browser = webdriver.PhantomJS(driver)
        else:
            raise TypeError('UNKNOWN BROWSER TYPE: %s' % browser)
        logging.basicConfig(
            format='[%(asctime)s] %(levelname)s::%(module)s::%(funcName)s() %(message)s',
            level=logging.INFO)

    def __del__(self):
        self.__browser.quit()

    def getLink(self):
        self.__browser.implicitly_wait(10)
        self.__browser.get(self.__site)  # load the page
        link_path = self.__save_folder + "/link.csv"
        if not os.path.exists(link_path):
            link = []
            # walk through the first five pages of month links
            for _ in range(5):
                monthData = self.__browser.find_elements_by_css_selector(
                    "#querylist li a")
                # collect every month link on the current page
                for item in monthData:
                    print(item.text, item.get_attribute("href"))
                    link.append([item.text, item.get_attribute("href")])
                # find the "next page" button and click it
                self.__browser.find_element_by_xpath(
                    '//*[@id="pppagination"]/ul/li[2]/a').click()
            # cookie = self.getCookie()
            # print(cookie)
            link = pd.DataFrame(link)
            link.to_csv(link_path, header=False)
        else:
            link = pd.read_csv(link_path, names=["month", "link"])
        return link

    def download(self, url, save_path):
        pass

    def getCookie(self):
        cookie = self.__browser.get_cookies()
        cookie_dict = []
        for c in cookie:
            ck = "{0}={1};".format(c['name'], c['value'])
            cookie_dict.append(ck)
        return cookie_dict
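
    # getCookie() returns fragments like ["name1=value1;", "name2=value2;"]
    # (illustrative names only); "".join(...) on the result yields a string
    # usable as a Cookie header outside Selenium.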

    def crawl(self):
        link_path = self.__save_folder + "/link.csv"
        link = pd.read_csv(link_path, names=["month", "link"])
        for i in range(len(link)):
            link1 = link.loc[i]["link"]    # report link for this month
            month1 = link.loc[i]["month"]  # month label
            if not os.path.exists(self.__save_folder + "/report" + month1 + ".csv"):
                self.__browser.implicitly_wait(10)
                self.__browser.get(link1)
                # work out how many records and pages there are
                try:
                    text = self.__browser.find_element_by_xpath(
                        '//*[@id="pagination"]/span').text  # raises if missing
                    # e.g. "共2391条记录 1/120页" ("2391 records, page 1/120")
                    pagesize = re.search(r"/(\d+)页", text).group(1)
                    reportData = pd.DataFrame(
                        columns=["date", "place", "course1", "course2", "course3", "course4"])
                    for page in range(int(pagesize)):
                        # scrape the table rows on the current page
                        trlist = self.__browser.find_elements_by_tag_name("tr")
                        for row in trlist:
                            tdlist = row.find_elements_by_tag_name("td")
                            tmp = []
                            for col in tdlist:
                                tmp.append(col.text)
                            # append one row to the report
                            reportData = reportData.append(
                                pd.Series(tmp, index=reportData.columns), ignore_index=True)
                        # click "next page": the button is span/a[2] on the first
                        # page but span/a[3] from the second page on
                        if page > 0:
                            self.__browser.find_element_by_xpath(
                                '//*[@id="pagination"]/span/a[3]').click()
                        else:
                            self.__browser.find_element_by_xpath(
                                '//*[@id="pagination"]/span/a[2]').click()
                    reportData.to_csv(self.__save_folder +
                                      "/report" + month1 + ".csv", header=False)
                except Exception as e:
                    print(e)

    def merge(self):
        '''
        Merge the per-month CSV files into a single result file.
        '''
        df = pd.DataFrame()
        for parent, dirnames, filenames in os.walk(self.__save_folder):
            for filename in filenames:
                if filename.startswith("report"):
                    df1 = pd.read_csv(os.path.join(parent, filename))
                    df = pd.concat([df, df1])
        # drop repeated header rows (cells that literally read "日期", i.e. "date")
        df = df.drop(df[df["日期"] == "日期"].index).reset_index()
        df.to_csv(self.__save_folder + "/res.csv", header=False)

    def start(self):
        self.crawl()
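

# A minimal usage sketch: the URL and the chromedriver path below are
# placeholder assumptions, so substitute the real values for your setup.
if __name__ == "__main__":
    crawler = CrawlCar(
        "http://example.com/driving-test-reports",  # hypothetical target URL
        save_folder="data",
        browser=BrowserType.CHROME,
        driver="./chromedriver",  # path to a local chromedriver binary
    )
    crawler.getLink()  # collect per-month report links into data/link.csv
    crawler.start()    # crawl each month into data/report<month>.csv
    crawler.merge()    # combine the monthly files into data/res.csv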