#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author  :   liuyuqi
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2020/04/06 22:13:01
@Version :   1.0
@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    :   Scrape Tmall/Taobao listing data for "luosifen" (river-snail rice noodles).
'''

# Required packages
import os
import re
import time

import pandas as pd
import parsel
from selenium import webdriver
from selenium.webdriver.common.by import By

# Shared WebDriver instance; created by login_taobao_acount() and used by
# every other function in this module.
browser = None

# The paging loop never goes past this page (the original code intended to
# "break at page 100").
MAX_RESULT_PAGES = 100

# Pattern used to pull the purchase count out of each result row's HTML.
# NOTE(review): as received, the pattern is just "newline, lazy group, newline".
# It looks like HTML tags surrounding the capture group may have been stripped
# from this file at some point -- confirm against the live page markup.
_PURCHASE_NUM_PATTERN = re.compile(r'\n(.*?)\n')

# XPath prefix shared by every field extracted from the result grid.
_GRID_XPATH = '//div[@class="grid g-clearfix"]'

_NEXT_PAGE_SELECTOR = (
    '#mainsrp-pager > div > div > div > ul > li.item.next > a'
)


def login_taobao_acount():
    """Start Chrome, open the Taobao login page and pick Alipay login.

    Stores the new WebDriver in the module-level ``browser`` variable.

    NOTE(review): this returns right after clicking the Alipay option; nothing
    here waits for the login to actually complete -- confirm that the caller
    gives the user enough time to finish logging in.
    """
    global browser
    # Open the browser
    browser = webdriver.Chrome()
    # Open the login URL
    login_url = 'https://login.taobao.com/member/login.jhtml'
    browser.get(login_url)
    # Choose "log in with Alipay".
    # find_element(By..., ...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.3.
    browser.find_element(By.CLASS_NAME, 'alipay-login').click()


def get_assigned_page(key_words):
    """Open the Taobao home page and submit a search for ``key_words``.

    Args:
        key_words: Search term; it is converted to text before being typed
            into the search box.
    """
    # Open the Taobao home page
    tb_url = 'https://www.taobao.com/'
    browser.get(tb_url)
    # Locate the search box and type the query
    search_box = browser.find_element(By.XPATH, '//*[@id="q"]')
    search_box.send_keys('{}'.format(key_words))
    # Click the search button
    browser.find_element(
        By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button'
    ).click()


def get_one_page():
    """Parse the result page currently loaded in the browser.

    Returns:
        pandas.DataFrame with the columns ``goods_name``, ``shop_name``,
        ``price``, ``purchase_num`` and ``location``. Each ``purchase_num``
        cell holds the list returned by ``findall`` for that row.

    Raises:
        ValueError: raised by pandas when the extracted columns do not all
            have the same number of entries.
    """
    html = parsel.Selector(browser.page_source)

    # Extract each field from the result grid
    goods_name = html.xpath(_GRID_XPATH + '//img/@alt').extract()
    shop_name = html.xpath(
        _GRID_XPATH + '//div[@class="shop"]/a/span[2]/text()'
    ).extract()
    price = html.xpath(
        _GRID_XPATH + '//div[contains(@class,"price")]/strong/text()'
    ).extract()
    row_html = html.xpath(
        _GRID_XPATH + '//div[@class="row row-1 g-clearfix"]'
    ).extract()
    purchase_num = [_PURCHASE_NUM_PATTERN.findall(row) for row in row_html]
    location = html.xpath(
        _GRID_XPATH + '//div[@class="location"]/text()'
    ).extract()

    # Store the data
    df_one = pd.DataFrame({
        'goods_name': goods_name,
        'shop_name': shop_name,
        'price': price,
        'purchase_num': purchase_num,
        'location': location,
    })
    return df_one


def get_all_page(page_num, wait_seconds=10):
    """Scrape consecutive result pages and combine them into one DataFrame.

    Args:
        page_num: Exclusive upper bound on the page counter, i.e. pages
            ``1 .. page_num - 1`` are scraped (``page_num=101`` gives 100
            pages). Paging never goes beyond ``MAX_RESULT_PAGES``.
        wait_seconds: Seconds to sleep after clicking "next page" so the new
            page can load. Defaults to 10.

    Returns:
        pandas.DataFrame containing the rows of every scraped page, re-indexed
        from 0. Empty when no page was scraped.
    """
    last_page = min(page_num - 1, MAX_RESULT_PAGES)
    frames = []

    for i in range(1, last_page + 1):
        # Report progress
        print('我正在获取第{}页的数据'.format(i))
        frames.append(get_one_page())

        # Stop on the last requested page (or at the page cap) instead of
        # clicking "next" one more time.
        if i == last_page:
            break

        # Click "next page" and give it time to load
        browser.find_element(By.CSS_SELECTOR, _NEXT_PAGE_SELECTOR).click()
        time.sleep(wait_seconds)

    if not frames:
        return pd.DataFrame()
    # One concat at the end; DataFrame.append was removed in pandas 2.0.
    return pd.concat(frames, ignore_index=True)


if __name__ == '__main__':
    try:
        # Log in first
        login_taobao_acount()
        # Then run the search
        get_assigned_page(key_words='螺蛳粉')
        # Then page through the results
        df_all = get_all_page(page_num=101)
        # Write the data out, creating the output directory if needed
        output_path = 'data/螺蛳粉店铺数据.xlsx'
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df_all.to_excel(output_path, index=False)
    finally:
        # Always release the browser, even if scraping failed part-way.
        if browser is not None:
            browser.quit()