#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
taobao_comment.py

Created on Tue Jan 8 09:42:18 2019
@author: yura
"""
import json
import random
import re
import time
import warnings
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
  16. name=[]
  17. content=[]
  18. comment_date=[]
  19. reply=[]
  20. append_comment=[]
  21. x=1
  22. #更改User-Agent、cookies、url里面的id、保存的文件名
  23. headers = {
  24. 'User-Agent': '',
  25. 'Connection':'keep-alive',
  26. 'accept-encoding': 'gzip, deflate, br',
  27. 'accept-language': 'zh-CN,zh;q=0.9'}
  28. cookies={'cookie':''}
  29. url=''
  30. for i in range(100):
  31. print('正在爬取第'+str(i+1)+'页')
  32. detail_url=url.format(i)
  33. res=requests.get(detail_url,headers=headers,cookies=cookies)
  34. data=re.findall(r'{.*}',res.text)[0]
  35. # data=res.text[13:-1]
  36. data=json.loads(data)
  37. # print(data)
  38. # data=json.loads(data)
  39. for item in data['rateDetail']['rateList']:
  40. name.append(item['displayUserNick'])
  41. content.append(item['rateContent'])
  42. comment_date.append(item['rateDate'])
  43. reply.append(item['reply'])
  44. x+=1
  45. #判断是否有追评
  46. if(item['appendComment']):
  47. append_comment.append(item['appendComment']['content'])
  48. else:
  49. append_comment.append('')
  50. print('第'+str(i+1)+'页爬取完成')
  51. time.sleep(random.random()*30)
  52. result={'名字':name,'评价日期':comment_date,'评价':content,'追评':append_comment,'回复':reply}
  53. results=pd.DataFrame(result)
  54. results.info()
  55. results.to_excel('产品名字_评价.xlsx')