get_comment.py 3.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Author : liuyuqi
  5. @Contact : liuyuqi.gov@msn.cn
  6. @Time : 2020/04/06 21:18:40
  7. @Version : 1.0
  8. @License : Copyright © 2017-2020 liuyuqi. All Rights Reserved.
  9. @Desc : 天猫爬取商品评论
  10. '''
  11. # 导入所需库
  12. import pandas as pd
  13. import requests
  14. import re
  15. import time
  16. import json
  17. from jsonpath import jsonpath
  18. from pprint import pprint
  19. df_all = pd.DataFrame()
  20. for i in range(1, 101):
  21. true_url = "https://rate.tmall.com/list_detail_rate.htm?itemId=598614273525&spuId=0&sellerId=3965833216&order=3&currentPage={}&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvFvvpvoUvUvCkvvvvvjiPn25y0jtRRsFh6jEUPmPwzj3URFdWtjibRLFOtjrPdphvmpmvqRkJvvvWUghCvCWpvREwsDsNzYGUTnAYAZrqv6ruRphvCvvvphvPvpvhvv2MMQhCvvOv9hCvvvvEvpCWvhDi4Bz6VXu4hAx%2F0jZ7%2Bu0Owmz6%2Ff8r58t%2Bm7zydigXe5xLD76fd34AVAllY2%2FAdXQaWXxr58TJ%2B3%2BuQjZL%2Bu6fjLVxfBKK5FGDNdyCvm9vvhCvvvvvvvvvBJZvvUVavvCHtpvv9ZUvvhcDvvmCp9vvBJZvvUHmuphvmvvvpLvEk6nskphvC9hvpyP9Q8wCvvpvvhHh3QhvCvmvphmrvpvEvvEN7GOvvvExRphvCvvvphmrvpvEvvjKMngvvEbp9phvHnMS01gH7rMNz15bMH1btqjN%2FnsvRphvCvvvphv%3D&needFold=0&_ksTS=1585965084590_619&callback=jsonp620".format(i)
  22. headers ={
  23. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
  24. 'referer': 'https://detail.tmall.com/item.htm?spm=a230r.1.14.6.62982a9d1ZzQky&id=598614273525&cm_id=140105335569ed55e27b&abbucket=10&skuId=4416255191724',
  25. 'cookie': 'cna=6aIDF0UBwyMCAd9IWWP0ve2T; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=352faf1e2c7459bb9be6dcfb1a686859_1585650890558; _m_h5_tk_enc=f98da3ebe54c27135860fad3eb0dd30f; enc=7ukbCamgQiUCkbWFKHSyWs3%2FuaYf2BwLnE%2FrqYLwkifjMrzkW7Z9o9ZnyLldLtq72TZm4k67jLO3g3pY8WCUTg%3D%3D; t=fc84a8e7475ba65cba01390893141481; tracknick=%5Cu738B%5Cu771F%5Cu8FBE; lgc=%5Cu738B%5Cu771F%5Cu8FBE; _tb_token_=ee45eb07d7e8a; cookie2=108528103da2fb86166fe94f4ec6d16c; x5sec=7b22726174656d616e616765723b32223a223664363835376639313134633235396364656133353961346431343332383436434f4c4c6e2f514645497659677553626973487066513d3d227d; dnk=%5Cu738B%5Cu771F%5Cu8FBE; uc1=cookie15=VFC%2FuZ9ayeYq2g%3D%3D&existShop=false&tag=8&pas=0&cookie16=UtASsssmPlP%2Ff1IHDsDaPRu%2BPw%3D%3D&lng=zh_CN&cookie14=UoTUP2uetwoTUg%3D%3D&cookie21=V32FPkk%2FgPzW; uc3=id2=W8g1q36CK3mT&nk2=rpB%2B19XZ&lg2=URm48syIIVrSKA%3D%3D&vt3=F8dBxdAXsSeyaM9RvMI%3D; _l_g_=Ug%3D%3D; uc4=nk4=0%40rMpGHW%2BbjZbmUWy3pSf%2FTjQ%3D&id4=0%40WeuSdm3pqpKqyJu1mxC59Ozb5YU%3D; unb=816423751; cookie1=VvaOTBfnXLrXE%2FHlUE7SD0YJwBoh4uUMhGIHJ9cs6KA%3D; login=true; cookie17=W8g1q36CK3mT; _nk_=%5Cu738B%5Cu771F%5Cu8FBE; sgcookie=E61hfOKws3lgRAqU1v%2Fvg; sg=%E8%BE%BE19; csg=0a7d2065; l=dBM9vnp4Qcg5Mq6wBOfgCkjmkJ_t6IRf1sPzt2XL0ICP_H5JLsJNWZfuxtTvCnGVn6y6R35mgkfgBjLtxy4EhZXRFJXn9MpOLd8h.; isg=BNPTDT2niX35UEVSwcvpZWTrYlf9iGdK353nFoXw2_JOBPCmDVoIm3gSPnRqpL9C'
  26. }
  27. # 发起请求
  28. data = requests.get(true_url, headers=headers).text
  29. # 提取内容
  30. json_data = re.findall(r'jsonp620\((.*)\)', data)[0]
  31. # 解析数据
  32. js_data = json.loads(json_data)
  33. # 获取数据
  34. UserNick = jsonpath(js_data, '$..rateList..displayUserNick')
  35. comment_time = jsonpath(js_data, '$..rateList..rateDate')
  36. content = jsonpath(js_data, '$..rateList..rateContent')
  37. auctionSku = jsonpath(js_data, '$..rateList..auctionSku')
  38. df_one = pd.DataFrame({
  39. 'UserNick': UserNick,
  40. 'comment_time': comment_time,
  41. 'content': content,
  42. 'auctionSku': auctionSku
  43. })
  44. # 循环追加
  45. df_all = df_all.append(df_one, ignore_index=True)
  46. # 休眠5秒
  47. time.sleep(5)
  48. # 打印进度
  49. print('我正在获取第{}页的信息'.format(i))
  50. # 读出数据
  51. df_all.to_excel('data/李子柒螺蛳粉评论.xlsx', index=False)