crwal122.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Contact : liuyuqi.gov@msn.cn
  5. @Time : 2020/09/06 01:38:09
  6. @License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
  7. @Desc : 爬取 https://sh.122.gov.cn 驾照考试报名数据
  8. '''
  9. import pandas as pd
  10. import numpy as np
  11. import requests
  12. from selenium import webdriver
  13. from selenium.common.exceptions import NoSuchElementException
  14. from selenium.webdriver.common.keys import Keys
  15. from selenium.webdriver import ActionChains
  16. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  17. import os
  18. import re
  19. import sys
  20. import time
# Entry URL: monthly notice page for Shanghai (沪A) driving-test report data.
base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
chormepath = r"D:/Program-Files/browser-driver/chromedriver.exe"  # local ChromeDriver path (Windows)
phantomjspath = r"/opt/phantomjs/bin/phantomjs"  # PhantomJS binary path (Linux)
link = []  # accumulates [month, href] pairs scraped by getLink()
res = []   # NOTE(review): appears unused in the visible code
save_path = r"download"
link_path = r"data/link.csv"      # CSV cache of the month links
report_path = r"data/report.csv"  # NOTE(review): appears unused; crawl() writes data/report<month>.csv
# if not os.path.exists(save_path):
# os.mkdir(save_path)
option = webdriver.ChromeOptions()
option.add_argument("lang=zh_CN.UTF-8")
option.add_argument("User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
option.add_argument("--headless")
# Disable image loading (speeds up page fetches)
prefs = {
    'profile.default_content_setting_values.images': 2
}
option.add_experimental_option('prefs', prefs)
# Spoof a mobile user agent for the PhantomJS session as well.
desired_cap = DesiredCapabilities.PHANTOMJS.copy()
desired_cap['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36'
# Module-level driver shared by getLink()/getCookie()/crawl().
# NOTE(review): webdriver.PhantomJS was removed in Selenium 4 — confirm the
# pinned selenium version supports it, or switch to the Chrome line below.
driver = webdriver.PhantomJS(
    executable_path=phantomjspath, desired_capabilities=desired_cap)
# driver = webdriver.Chrome(executable_path=chormepath, options=option)
# driver.maximize_window()
  46. def getLink():
  47. driver.implicitly_wait(10)
  48. driver.get(base) # 加载页面
  49. if not os.path.exists(link_path):
  50. for i in range(5):
  51. monthData = driver.find_elements_by_css_selector(
  52. "#querylist li a")
  53. # # 获取本页所有月份
  54. for i in monthData:
  55. print(i.text, i.get_attribute("href"))
  56. link.append([i.text, i.get_attribute("href")])
  57. # 获取下一步按钮,点击
  58. driver.find_element_by_xpath(
  59. '//*[@id="pppagination"]/ul/li[2]/a').click()
  60. # cookie = getCookie()
  61. # print(cookie)
  62. link = pd.DataFrame(link)
  63. link.to_csv(link_path, header=False)
  64. else:
  65. link = pd.read_csv(link_path, names=["month", "link"])
  66. return link
  67. def download(url, save_path):
  68. try:
  69. with open(save_path, "wb") as file:
  70. file.write(requests.get(url).raw)
  71. except Exception as e:
  72. print(e)
  73. def getCookie():
  74. cookie = driver.get_cookies()
  75. cookie_dict = []
  76. for c in cookie:
  77. ck = "{0}={1};".format(c['name'], c['value'])
  78. cookie_dict.append(ck)
  79. return cookie_dict
  80. def crawl():
  81. global link
  82. link = pd.read_csv(link_path, names=["month", "link"])
  83. for i in range(len(link)):
  84. link1 = link.loc[i]["link"] # 链接
  85. month1 = link.loc[i]["month"] # 月份
  86. if not os.path.exists("/data/report" + month1 + ".csv"):
  87. driver.implicitly_wait(10)
  88. driver.get(link1)
  89. # # 找出多少条,多少页
  90. try:
  91. text = driver.find_element_by_xpath(
  92. '//*[@id="pagination"]/span').text # 有异常
  93. # 共2391条记录 1/120页
  94. pagesize = re.split(
  95. "[/页]", re.search("/.*页 ", text).group())[1]
  96. reportData = pd.DataFrame(
  97. columns=["date", "place", "course1", "course2", "course3", "course4"])
  98. for i in range(int(pagesize)):
  99. # 找出本页table
  100. trlist = driver.find_elements_by_tag_name("tr")
  101. for row in trlist:
  102. tdlist = row.find_elements_by_tag_name("td")
  103. tmp = []
  104. for col in tdlist:
  105. tmp.append(col.text)
  106. reportData = reportData.append(
  107. pd.Series(tmp, index=reportData.columns), ignore_index=True) # 增加一行
  108. # 点击下一步 ,这里有一个问题,第一页 span/a[2] 第二页之后就是 span/a[3]
  109. if i > 0:
  110. driver.find_element_by_xpath(
  111. '//*[@id="pagination"]/span/a[3]').click()
  112. else:
  113. driver.find_element_by_xpath(
  114. '//*[@id="pagination"]/span/a[2]').click()
  115. except Exception as e:
  116. print(e)
  117. reportData.to_csv("data/report" + month1 + ".csv", header=False)
  118. driver.close()
# Script entry point: run the full crawl (assumes data/link.csv already exists).
if __name__ == "__main__":
    crawl()