123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @Author : liuyuqi
- @Contact : liuyuqi.gov@msn.cn
- @Time : 2020/04/06 22:13:01
- @Version : 1.0
- @License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
- @Desc : 爬取螺蛳粉天猫数据
- '''
- # 导入所需包
- from selenium import webdriver
- import parsel
- import re
- import time
- import pandas as pd
def login_taobao_acount():
    """Open Chrome on the Taobao login page and select the Alipay login entry.

    Creates a new ``webdriver.Chrome`` instance and stores it in the
    module-level global ``browser``, which the other functions in this file
    read. The function returns right after clicking the Alipay entry; it does
    not wait for the login itself to complete.
    """
    global browser
    browser = webdriver.Chrome()

    login_url = 'https://login.taobao.com/member/login.jhtml'
    browser.get(login_url)

    # find_element(by, value) replaces find_element_by_class_name: the
    # find_element_by_* helpers were removed in Selenium 4.3, while this form
    # is available on both Selenium 3 and 4. 'class name' is the locator
    # strategy string that selenium exposes as By.CLASS_NAME.
    browser.find_element('class name', 'alipay-login').click()
def get_assigned_page(key_words):
    """Open the Taobao home page and submit a search for *key_words*.

    Uses the module-level ``browser`` (set by login_taobao_acount()), so that
    function must have been called first.

    Args:
        key_words: Search term; it is converted with ``str()`` before being
            typed into the search box.
    """
    tb_url = 'https://www.taobao.com/'
    browser.get(tb_url)

    # find_element('xpath', ...) instead of find_element_by_xpath: the
    # find_element_by_* helpers were removed in Selenium 4.3; this form works
    # on both Selenium 3 and 4.
    search_box = browser.find_element('xpath', '//*[@id="q"]')
    search_box.send_keys(str(key_words))

    # Click the search form's submit button.
    browser.find_element('xpath', '//*[@id="J_TSearchForm"]/div[1]/button').click()
def get_one_page():
    """Parse the page currently loaded in ``browser`` into a DataFrame.

    Reads ``browser.page_source``, runs a fixed set of XPath queries under the
    ``grid g-clearfix`` container and returns one column per query:
    ``goods_name``, ``shop_name``, ``price``, ``purchase_num`` and
    ``location``.

    Each ``purchase_num`` entry is the list returned by ``re.findall`` for one
    ``row row-1 g-clearfix`` element (the captured ``deal-cnt`` contents), not
    a plain string.

    NOTE(review): the columns are extracted independently; pandas raises
    ValueError when the lists differ in length, so this presumably relies on
    every listing exposing all five fields -- confirm against the live page.
    """
    page = parsel.Selector(browser.page_source)
    deal_cnt_pattern = re.compile(r'<div class="deal-cnt">(.*?)</div>')

    def extract_all(query):
        # Run one XPath query against the parsed page and return all matches.
        return page.xpath(query).extract()

    titles = extract_all('//div[@class="grid g-clearfix"]//img/@alt')
    shops = extract_all('//div[@class="grid g-clearfix"]//div[@class="shop"]/a/span[2]/text()')
    prices = extract_all('//div[@class="grid g-clearfix"]//div[contains(@class,"price")]/strong/text()')
    row_blocks = extract_all('//div[@class="grid g-clearfix"]//div[@class="row row-1 g-clearfix"]')
    deal_counts = [deal_cnt_pattern.findall(row_html) for row_html in row_blocks]
    places = extract_all('//div[@class="grid g-clearfix"]//div[@class="location"]/text()')

    columns = {
        'goods_name': titles,
        'shop_name': shops,
        'price': prices,
        'purchase_num': deal_counts,
        'location': places,
    }
    return pd.DataFrame(columns)
def get_all_page(page_num):
    """Scrape consecutive result pages and return them as one DataFrame.

    Starting from the page currently shown in the module-level ``browser``,
    calls get_one_page() for each page and clicks the pager's "next" link
    between pages.

    Args:
        page_num: Exclusive upper bound of the 1-based page counter, i.e.
            ``page_num - 1`` pages are scraped (``page_num=101`` scrapes 100
            pages). Values <= 1 scrape nothing.

    Returns:
        A DataFrame holding the rows of every scraped page with a fresh
        0-based index, or an empty DataFrame when no page was scraped.
    """
    last_page = page_num - 1
    frames = []

    for i in range(1, page_num):
        frames.append(get_one_page())
        print('我正在获取第{}页的数据'.format(i))

        # Stop after the last requested page. The previous check compared the
        # loop-invariant page_num with 100, so it never fired for
        # page_num=101 (one extra "next" click after the final page) and
        # aborted after the first page for page_num=100.
        if i == last_page:
            break

        # find_element('css selector', ...) replaces the
        # find_element_by_css_selector helper removed in Selenium 4.3.
        browser.find_element(
            'css selector',
            '#mainsrp-pager > div > div > div > ul > li.item.next > a',
        ).click()

        # Pause 10 seconds between page loads.
        time.sleep(10)

    if not frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()

    # A single concat replaces repeated DataFrame.append, which was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)
if __name__ == '__main__':
    # Script entry point: log in, search, scrape the result pages and save
    # them to an Excel workbook.
    import os  # local import: only the script entry point needs it

    output_path = 'data/螺蛳粉店铺数据.xlsx'
    try:
        # Log in first; this also creates the global `browser`.
        login_taobao_acount()
        # Run the search.
        get_assigned_page(key_words='螺蛳粉')
        # Walk the result pages (page_num=101 scrapes pages 1..100).
        df_all = get_all_page(page_num=101)
        # to_excel does not create missing directories, so make sure the
        # target folder exists before writing.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df_all.to_excel(output_path, index=False)
    finally:
        # Always release the Chrome session, even when a step above fails.
        # `browser` only exists once login_taobao_acount() got far enough to
        # create the driver.
        if 'browser' in globals():
            browser.quit()
|