#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2020/09/06 01:38:09
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    : Crawl driving-licence exam registration data from
           https://sh.122.gov.cn
'''
import os
import re

import pandas as pd
import requests
from selenium import webdriver

base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
chromepath = r"D:/Program-Files/browser-driver/chromedriver.exe"
phantomjspath = r"D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe"
save_path = r"download"
link_path = r"data/link.csv"
report_path = r"data/report.csv"

# if not os.path.exists(save_path):
#     os.mkdir(save_path)

option = webdriver.ChromeOptions()
option.add_argument("--lang=zh-CN")
option.add_argument("--user-agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
option.add_argument("--headless")
# Disable image loading to speed up page rendering
prefs = {'profile.default_content_setting_values.images': 2}
option.add_experimental_option('prefs', prefs)

# driver = webdriver.PhantomJS(executable_path=phantomjspath)
driver = webdriver.Chrome(executable_path=chromepath, options=option)
# driver.maximize_window()


def getLink():
    """Collect the month links (text + href) and cache them to link_path."""
    driver.implicitly_wait(10)
    driver.get(base)  # load the page
    if not os.path.exists(link_path):
        links = []
        for _ in range(5):
            # all month entries on the current page
            monthData = driver.find_elements_by_css_selector(
                "#querylist li a")
            for item in monthData:
                print(item.text, item.get_attribute("href"))
                links.append([item.text, item.get_attribute("href")])
            # find the next-page button and click it
            driver.find_element_by_xpath(
                '//*[@id="pppagination"]/ul/li[2]/a').click()
        # cookie = getCookie()
        # print(cookie)
        link = pd.DataFrame(links)
        link.to_csv(link_path, header=False)
    else:
        link = pd.read_csv(link_path, names=["month", "link"])
    return link


def download(url, save_path):
    try:
        with open(save_path, "wb") as f:
            # .content holds the response body as bytes
            f.write(requests.get(url).content)
    except Exception as e:
        print(e)


def getCookie():
    cookies = driver.get_cookies()
    cookie_list = []
    for c in cookies:
        cookie_list.append("{0}={1};".format(c['name'], c['value']))
    return cookie_list


def crawl():
    link = pd.read_csv(link_path, names=["month", "link"])
    for i in range(len(link)):
        link1 = link.loc[i]["link"]          # detail-page URL
        month1 = str(link.loc[i]["month"])   # month label
        if not os.path.exists("data/report" + month1 + ".csv"):
            driver.implicitly_wait(10)
            driver.get(link1)
            try:
                # Pager text looks like "共2391条记录 1/120页"; pull out the
                # page count after the slash.
                text = driver.find_element_by_xpath(
                    '//*[@id="pagination"]/span').text
                pagesize = int(re.search(r"/(\d+)页", text).group(1))
                rows = []
                for page in range(pagesize):
                    # scrape every table row on the current page; skip rows
                    # (e.g. the header) that lack the six data cells
                    for row in driver.find_elements_by_tag_name("tr"):
                        tmp = [col.text for col in
                               row.find_elements_by_tag_name("td")]
                        if len(tmp) == 6:
                            rows.append(tmp)
                    if page == pagesize - 1:
                        break  # last page: there is no next button
                    # Click "next page". Caveat: on the first page the button
                    # is span/a[2]; from the second page on it is span/a[3].
                    if page > 0:
                        driver.find_element_by_xpath(
                            '//*[@id="pagination"]/span/a[3]').click()
                    else:
                        driver.find_element_by_xpath(
                            '//*[@id="pagination"]/span/a[2]').click()
                reportData = pd.DataFrame(
                    rows, columns=["date", "place", "course1",
                                   "course2", "course3", "course4"])
                reportData.to_csv("data/report" + month1 + ".csv",
                                  header=False)
            except Exception as e:
                print(e)
    driver.close()


if __name__ == "__main__":
    crawl()
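
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original workflow: one way to merge
# the per-month files that crawl() writes into a single DataFrame. It assumes
# the data/report<month>.csv naming used above and that each file was written
# with header=False, so its leading column is the saved row index.
# merge_reports and its data_dir parameter are hypothetical names.
def merge_reports(data_dir="data"):
    import glob  # local import keeps this optional helper self-contained
    cols = ["date", "place", "course1", "course2", "course3", "course4"]
    frames = [
        # with fewer names than columns, pandas treats the extra leading
        # column (the saved index) as the DataFrame index
        pd.read_csv(path, names=cols)
        for path in sorted(glob.glob(os.path.join(data_dir, "report*.csv")))
    ]
    return pd.concat(frames, ignore_index=True)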