get_luosi.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time : 2020/04/06 22:13:01
@Version : 1.0
@License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc : Scrape Tmall listings for luosifen (river snail rice noodles)
'''
# Import the required packages
from selenium import webdriver
import parsel
import re
import time
import pandas as pd
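# Note (editor's assumption, not part of the original script): the
# `find_element_by_*` / `find_element_by_css_selector` calls below use the
# Selenium 3 API; Selenium 4 deprecated and later removed them in favour of
# `find_element(By.XPATH, ...)`, so this script assumes a Selenium 3.x install.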
def login_taobao_acount():
    # Open the browser
    global browser
    browser = webdriver.Chrome()
    # Login page URL
    login_url = 'https://login.taobao.com/member/login.jhtml'
    # Open the login page
    browser.get(login_url)
    # Switch to Alipay login
    browser.find_element_by_class_name('alipay-login').click()
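# A minimal sketch, not part of the original flow: the search step assumes the
# account is already logged in, so a caller could block here until the Alipay /
# QR login has been completed in the opened window. The helper name is
# illustrative only and is not called by the original __main__ block.
def wait_for_manual_login():
    # Block until the user confirms the login finished in the browser window;
    # an explicit WebDriverWait on a post-login element would be more robust.
    input('Finish logging in to Taobao in the browser, then press Enter...')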
def get_assigned_page(key_words):
    # Taobao home page URL
    tb_url = 'https://www.taobao.com/'
    # Open the Taobao home page
    browser.get(tb_url)
    # Locate the search box and type the keywords
    s_bar = browser.find_element_by_xpath('//*[@id="q"]')
    s_bar.send_keys('{}'.format(key_words))
    # Click the search button
    browser.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
def get_one_page():
    # Parse the page source of the current result page
    html = parsel.Selector(browser.page_source)
    # Extract the fields
    goods_name = html.xpath('//div[@class="grid g-clearfix"]//img/@alt').extract()
    shop_name = html.xpath('//div[@class="grid g-clearfix"]//div[@class="shop"]/a/span[2]/text()').extract()
    price = html.xpath('//div[@class="grid g-clearfix"]//div[contains(@class,"price")]/strong/text()').extract()
    purchase_num = [re.findall(r'<div class="deal-cnt">(.*?)</div>', i)
                    for i in html.xpath('//div[@class="grid g-clearfix"]//div[@class="row row-1 g-clearfix"]').extract()]
    location = html.xpath('//div[@class="grid g-clearfix"]//div[@class="location"]/text()').extract()
    # Collect the fields of this page into a DataFrame
    df_one = pd.DataFrame({
        'goods_name': goods_name,
        'shop_name': shop_name,
        'price': price,
        'purchase_num': purchase_num,
        'location': location
    })
    return df_one
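# Note (editor's observation, not in the original): the DataFrame constructor
# above requires all five lists to have the same length; if a result card is
# missing one of the fields, pandas raises a ValueError, so the lists may need
# to be truncated to the shortest length before building the frame.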
def get_all_page(page_num):
    df_all = pd.DataFrame()
    # Loop over the result pages
    for i in range(1, page_num):
        # Scrape the current page
        df_one = get_one_page()
        # Append it to the accumulated result
        df_all = df_all.append(df_one, ignore_index=True)
        # Stop paging on the last page (page 100) instead of clicking "next"
        if i == page_num - 1:
            break
        else:
            # Click "next page"
            browser.find_element_by_css_selector('#mainsrp-pager > div > div > div > ul > li.item.next > a').click()
        # Print progress
        print('Fetching data from page {}'.format(i))
        # Sleep for 10 seconds between pages
        time.sleep(10)
    return df_all
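# Note (editor's assumption, not part of the original script): `DataFrame.append`
# as used above was removed in pandas 2.0; on newer pandas the same accumulation
# can be written as `df_all = pd.concat([df_all, df_one], ignore_index=True)`.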
if __name__ == '__main__':
    # Run the login function first
    login_taobao_acount()
    # Then run the search
    get_assigned_page(key_words='螺蛳粉')
    # Then page through the results and collect the data
    df_all = get_all_page(page_num=101)
    # Save the data to an Excel file
    df_all.to_excel('data/螺蛳粉店铺数据.xlsx', index=False)