#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author  :   liuyuqi
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2020/04/06 21:18:40
@Version :   1.0
@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
@Desc    :   Scrape product reviews from Tmall and export them to Excel.
'''

# Required libraries
import json
import os
import re
import time
from pprint import pprint

import pandas as pd
import requests
from jsonpath import jsonpath

# Pages requested by default (inclusive range) and the pause between requests.
FIRST_PAGE = 1
LAST_PAGE = 100
PAGE_DELAY_SECONDS = 5
# Upper bound for a single HTTP request so a stalled connection cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 10

# Destination workbook for the collected reviews.
OUTPUT_PATH = 'data/李子柒螺蛳粉评论.xlsx'

# Review-list endpoint; "{}" is filled with the page number (currentPage).
RATE_URL_TEMPLATE = (
    "https://rate.tmall.com/list_detail_rate.htm"
    "?itemId=598614273525&spuId=0&sellerId=3965833216&order=3"
    "&currentPage={}&append=0&content=1&tagId=&posi=&picture=&groupId="
    "&ua=098%23E1hvFvvpvoUvUvCkvvvvvjiPn25y0jtRRsFh6jEUPmPwzj3URFdWtjibRLFOtjrPdphvmpmvqRkJvvvWUghCvCWpvREwsDsNzYGUTnAYAZrqv6ruRphvCvvvphvPvpvhvv2MMQhCvvOv9hCvvvvEvpCWvhDi4Bz6VXu4hAx%2F0jZ7%2Bu0Owmz6%2Ff8r58t%2Bm7zydigXe5xLD76fd34AVAllY2%2FAdXQaWXxr58TJ%2B3%2BuQjZL%2Bu6fjLVxfBKK5FGDNdyCvm9vvhCvvvvvvvvvBJZvvUVavvCHtpvv9ZUvvhcDvvmCp9vvBJZvvUHmuphvmvvvpLvEk6nskphvC9hvpyP9Q8wCvvpvvhHh3QhvCvmvphmrvpvEvvEN7GOvvvExRphvCvvvphmrvpvEvvjKMngvvEbp9phvHnMS01gH7rMNz15bMH1btqjN%2FnsvRphvCvvvphv%3D"
    "&needFold=0&_ksTS=1585965084590_619&callback=jsonp620"
)

# NOTE(review): the cookie below is a hard-coded browser session; it looks like
# a personal login and will presumably expire. Consider loading it from
# configuration or the environment instead of keeping it in source control.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    'referer': 'https://detail.tmall.com/item.htm?spm=a230r.1.14.6.62982a9d1ZzQky&id=598614273525&cm_id=140105335569ed55e27b&abbucket=10&skuId=4416255191724',
    'cookie': (
        'cna=6aIDF0UBwyMCAd9IWWP0ve2T; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=352faf1e2c7459bb9be6dcfb1a686859_1585650890558; _m_h5_tk_enc=f98da3ebe54c27135860fad3eb0dd30f; enc=7ukbCamgQiUCkbWFKHSyWs3%2FuaYf2BwLnE%2FrqYLwkifjMrzkW7Z9o9ZnyLldLtq72TZm4k67jLO3g3pY8WCUTg%3D%3D; t=fc84a8e7475ba65cba01390893141481; tracknick=%5Cu738B%5Cu771F%5Cu8FBE; lgc=%5Cu738B%5Cu771F%5Cu8FBE; _tb_token_=ee45eb07d7e8a; cookie2=108528103da2fb86166fe94f4ec6d16c; '
        'x5sec=7b22726174656d616e616765723b32223a223664363835376639313134633235396364656133353961346431343332383436434f4c4c6e2f514645497659677553626973487066513d3d227d; dnk=%5Cu738B%5Cu771F%5Cu8FBE; uc1=cookie15=VFC%2FuZ9ayeYq2g%3D%3D&existShop=false&tag=8&pas=0&cookie16=UtASsssmPlP%2Ff1IHDsDaPRu%2BPw%3D%3D&lng=zh_CN&cookie14=UoTUP2uetwoTUg%3D%3D&cookie21=V32FPkk%2FgPzW; uc3=id2=W8g1q36CK3mT&nk2=rpB%2B19XZ&lg2=URm48syIIVrSKA%3D%3D&vt3=F8dBxdAXsSeyaM9RvMI%3D; _l_g_=Ug%3D%3D; uc4=nk4=0%40rMpGHW%2BbjZbmUWy3pSf%2FTjQ%3D&id4=0%40WeuSdm3pqpKqyJu1mxC59Ozb5YU%3D; unb=816423751; cookie1=VvaOTBfnXLrXE%2FHlUE7SD0YJwBoh4uUMhGIHJ9cs6KA%3D; login=true; cookie17=W8g1q36CK3mT; _nk_=%5Cu738B%5Cu771F%5Cu8FBE; sgcookie=E61hfOKws3lgRAqU1v%2Fvg; sg=%E8%BE%BE19; csg=0a7d2065; l=dBM9vnp4Qcg5Mq6wBOfgCkjmkJ_t6IRf1sPzt2XL0ICP_H5JLsJNWZfuxtTvCnGVn6y6R35mgkfgBjLtxy4EhZXRFJXn9MpOLd8h.; isg=BNPTDT2niX35UEVSwcvpZWTrYlf9iGdK353nFoXw2_JOBPCmDVoIm3gSPnRqpL9C'
    ),
}

# The endpoint wraps its JSON in the callback named in the URL (jsonp620).
JSONP_PATTERN = re.compile(r'jsonp620\((.*)\)')

# Output column -> JSONPath expression that collects it from the payload.
REVIEW_FIELDS = {
    'UserNick': '$..rateList..displayUserNick',
    'comment_time': '$..rateList..rateDate',
    'content': '$..rateList..rateContent',
    'auctionSku': '$..rateList..auctionSku',
}


def fetch_page(page: int) -> str:
    """Download the raw JSONP text of one review page.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(
        RATE_URL_TEMPLATE.format(page),
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.text


def parse_jsonp(text: str):
    """Strip the ``jsonp620(...)`` wrapper from *text* and decode the JSON.

    Raises:
        ValueError: if the wrapper is missing or the payload is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    match = JSONP_PATTERN.search(text)
    if match is None:
        raise ValueError('response is not the expected jsonp620(...) payload')
    return json.loads(match.group(1))


def extract_reviews(payload) -> pd.DataFrame:
    """Build a DataFrame with one column per entry of ``REVIEW_FIELDS``.

    Returns an empty DataFrame (with the expected columns) when the payload
    contains no reviews at all.
    """
    # jsonpath() signals "no match" with a falsy value rather than an empty
    # list; normalise that to None so pandas never receives a bare False.
    columns = {
        name: (jsonpath(payload, expression) or None)
        for name, expression in REVIEW_FIELDS.items()
    }
    if all(values is None for values in columns.values()):
        return pd.DataFrame(columns=list(REVIEW_FIELDS))
    return pd.DataFrame(columns)


def crawl_reviews(first_page: int = FIRST_PAGE,
                  last_page: int = LAST_PAGE,
                  delay: float = PAGE_DELAY_SECONDS) -> pd.DataFrame:
    """Fetch review pages ``first_page..last_page`` and combine them.

    Crawling stops early, keeping what was collected so far, when a page
    cannot be fetched or parsed, or when a page contains no reviews.
    """
    frames = []
    for page in range(first_page, last_page + 1):
        try:
            payload = parse_jsonp(fetch_page(page))
        except (requests.RequestException, ValueError) as err:
            print('Stopping at page {}: {}'.format(page, err))
            break

        page_frame = extract_reviews(payload)
        if page_frame.empty:
            print('No reviews on page {}; stopping.'.format(page))
            break
        frames.append(page_frame)

        # Progress report
        print('我正在获取第{}页的信息'.format(page))
        # Pause between requests to avoid hammering the server
        time.sleep(delay)

    if not frames:
        # pd.concat() rejects an empty list, so return an empty table instead.
        return pd.DataFrame(columns=list(REVIEW_FIELDS))
    # A single concat avoids re-copying the accumulated frame on every page
    # (and DataFrame.append no longer exists in pandas >= 2.0).
    return pd.concat(frames, ignore_index=True)


def main() -> None:
    """Crawl the reviews and write them to ``OUTPUT_PATH``."""
    reviews = crawl_reviews()
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        # to_excel() does not create missing parent directories.
        os.makedirs(output_dir, exist_ok=True)
    # Write the collected data out
    reviews.to_excel(OUTPUT_PATH, index=False)


if __name__ == '__main__':
    main()