crwal122.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Contact : liuyuqi.gov@msn.cn
  5. @Time : 2020/09/06 01:38:09
  6. @License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
  7. @Desc : 爬取 https://sh.122.gov.cn 驾照考试报名数据
  8. '''
  9. import pandas as pd
  10. import numpy as np
  11. import requests
  12. from selenium import webdriver
  13. from selenium.common.exceptions import NoSuchElementException
  14. from selenium.webdriver.common.keys import Keys
  15. from selenium.webdriver import ActionChains
  16. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  17. import os
  18. import re
  19. import sys
  20. import time
# Entry URL: monthly notice page for Shanghai (沪A) driving-test report data.
base = r"https://sh.122.gov.cn/#/noticeDetail?fzjg=%E6%B2%AAA&tjyf=202007&fwdmgl=6003"
chormepath = r"D:/Program-Files/browser-driver/chromedriver.exe"  # local ChromeDriver path (Windows)
phantomjspath = r"/opt/phantomjs/bin/phantomjs"  # PhantomJS binary path (Linux)
link = []  # accumulates [month, href] pairs scraped by getLink()
res = []   # NOTE(review): appears unused in the visible code
save_path = r"download"
link_path = r"data/link.csv"      # CSV cache of the month links
report_path = r"data/report.csv"  # NOTE(review): appears unused; crawl() writes data/report<month>.csv
# if not os.path.exists(save_path):
# os.mkdir(save_path)
option = webdriver.ChromeOptions()
option.add_argument("lang=zh_CN.UTF-8")
option.add_argument("User-Agent=Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36")
option.add_argument("--headless")
# Disable image loading (speeds up page fetches)
prefs = {
    'profile.default_content_setting_values.images': 2
}
option.add_experimental_option('prefs', prefs)
# Spoof a mobile user agent for the PhantomJS session as well.
desired_cap = DesiredCapabilities.PHANTOMJS.copy()
desired_cap['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36'
# Module-level driver shared by getLink()/getCookie()/crawl().
# NOTE(review): webdriver.PhantomJS was removed in Selenium 4 — confirm the
# pinned selenium version supports it, or switch to the Chrome line below.
driver = webdriver.PhantomJS(
    executable_path=phantomjspath, desired_capabilities=desired_cap)
# driver = webdriver.Chrome(executable_path=chormepath, options=option)
# driver.maximize_window()
  46. def getLink():
  47. driver.implicitly_wait(10)
  48. driver.get(base) # 加载页面
  49. if not os.path.exists(link_path):
  50. for i in range(5):
  51. monthData = driver.find_elements_by_css_selector(
  52. "#querylist li a")
  53. # # 获取本页所有月份
  54. for i in monthData:
  55. print(i.text, i.get_attribute("href"))
  56. link.append([i.text, i.get_attribute("href")])
  57. # 获取下一步按钮,点击
  58. driver.find_element_by_xpath(
  59. '//*[@id="pppagination"]/ul/li[2]/a').click()
  60. # cookie = getCookie()
  61. # print(cookie)
  62. link = pd.DataFrame(link)
  63. link.to_csv(link_path, header=False)
  64. else:
  65. link = pd.read_csv(link_path, names=["month", "link"])
  66. return link
  67. def download(url, save_path):
  68. try:
  69. with open(save_path, "wb") as file:
  70. file.write(requests.get(url).raw)
  71. except Exception as e:
  72. print(e)
  73. def getCookie():
  74. cookie = driver.get_cookies()
  75. cookie_dict = []
  76. for c in cookie:
  77. ck = "{0}={1};".format(c['name'], c['value'])
  78. cookie_dict.append(ck)
  79. return cookie_dict
  80. def crawl():
  81. global link
  82. link = pd.read_csv(link_path, names=["month", "link"])
  83. for i in range(len(link)):
  84. link1 = link.loc[i]["link"] # 链接
  85. month1 = link.loc[i]["month"] # 月份
  86. if not os.path.exists("/data/report" + month1 + ".csv"):
  87. driver.implicitly_wait(10)
  88. driver.get(link1)
  89. # # 找出多少条,多少页
  90. try:
  91. text = driver.find_element_by_xpath(
  92. '//*[@id="pagination"]/span').text # 有异常
  93. # 共2391条记录 1/120页
  94. pagesize = re.split(
  95. "[/页]", re.search("/.*页 ", text).group())[1]
  96. reportData = pd.DataFrame(
  97. columns=["date", "place", "course1", "course2", "course3", "course4"])
  98. for i in range(int(pagesize)):
  99. # 找出本页table
  100. trlist = driver.find_elements_by_tag_name("tr")
  101. for row in trlist:
  102. tdlist = row.find_elements_by_tag_name("td")
  103. tmp = []
  104. for col in tdlist:
  105. tmp.append(col.text)
  106. reportData = reportData.append(
  107. pd.Series(tmp, index=reportData.columns), ignore_index=True) # 增加一行
  108. # 点击下一步 ,这里有一个问题,第一页 span/a[2] 第二页之后就是 span/a[3]
  109. if i > 0:
  110. driver.find_element_by_xpath(
  111. '//*[@id="pagination"]/span/a[3]').click()
  112. else:
  113. driver.find_element_by_xpath(
  114. '//*[@id="pagination"]/span/a[2]').click()
  115. except Exception as e:
  116. print(e)
  117. reportData.to_csv("data/report" + month1 + ".csv", header=False)
  118. driver.close()
# Script entry point: run the full crawl (assumes data/link.csv already exists).
if __name__ == "__main__":
    crawl()