taobao_id.py

from bs4 import BeautifulSoup
import requests
import warnings
import re
from datetime import datetime
import json
import pandas as pd
import random
import time
#from selenium import webdriver
# The cookies have to be re-pasted before every run.
# The URL is the main search-results page URL, with the pagination offset rewritten as s={}.
# Change the output file name as needed.
headers = {
    'User-Agent': '',          # paste your browser's User-Agent string here
    'Referer': 'https://www.taobao.com/',
    'Connection': 'keep-alive'}
url = 'https://s.taobao.com/search?q=%E5%8F%A3%E7%BA%A2&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&sort=sale-desc&bcoffset=0&p4ppushleft=%2C44&s={}'  # q=口红 ("lipstick"), sorted by sales
cookies = {'cookie': ''}  # paste the logged-in Cookie header string here
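# Optional helper (illustrative sketch, not part of the original script): requests'
# `cookies=` argument expects a dict of individual cookie names and values, so a raw
# "Cookie:" header string copied from the browser can be split into one like this.
# The name `raw_cookie_string` is a placeholder for the pasted value.
def parse_cookie_string(raw_cookie_string):
    # "a=1; b=2" -> {"a": "1", "b": "2"}
    return dict(
        pair.split('=', 1)
        for pair in raw_cookie_string.split('; ')
        if '=' in pair
    )
# cookies = parse_cookie_string('paste the Cookie header string here')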
price = []
name = []
address = []
count = []
title = []
id_ = []
detail_url = []
sale = []
ji = 1  # page counter for progress output
for i in range(0, 51):            # number of result pages to scrape; adjust as needed
    value = 44 * i                # each results page holds 44 items, so s = 44*i is the offset
    url_range = url.format(value)
    res = requests.get(url_range, headers=headers, cookies=cookies, timeout=10)
    res.encoding = 'utf-8'
    # Pull the embedded g_page_config JSON out of the page source with a regex.
    print('Scraping page ' + str(ji))
    data = re.findall('g_page_config =(.*?)g_srp_loadCss', res.text, re.S)[0].strip()[:-1]
    content = json.loads(data)    # res.text is already a str, so no encoding argument is needed
    list_ = content['mods']['itemlist']['data']['auctions']
    for item in list_:
        name.append(item['nick'])
        price.append(item['view_price'])
        address.append(item['item_loc'])
        count.append(item['view_sales'].replace('人收货', ''))  # strip the "people received" suffix
        title.append(item['raw_title'])
        id_.append(item['nid'])  # nid is the item id
        detail_url.append(item['detail_url'])
    ji += 1
    time.sleep(random.random() * 100 + 3)  # wait 3-103 seconds between pages to avoid being blocked
print('Scraping finished')
# Excel column headers: 店铺名称 = shop name, 商品标题 = item title, 价格 = price, 地址 = location,
# 商品编号 = item id, 收货人数 = number of purchases, 详情页网址 = detail page URL
result = {'店铺名称': name, '商品标题': title, '价格': price, '地址': address, '商品编号': id_, '收货人数': count, '详情页网址': detail_url}
#result={'店铺名称':name[:177],'商品标题':title[:177],'价格':price[:177],'地址':address[:177],'商品编号':id_[:177],'收货人数':count[:177],'详情页网址':detail_url[:177]}
results = pd.DataFrame(result)
results.info()
results.to_excel('口红ID.xlsx')  # 口红ID = "lipstick IDs"
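
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): a minimal, self-contained
# demo of the g_page_config extraction used in the loop above. The sample_page
# string below is a made-up stand-in for a real Taobao search page; it only mirrors
# the field names the script reads (nick, view_price, item_loc, view_sales,
# raw_title, nid, detail_url).
def _demo_g_page_config():
    sample_page = (
        'g_page_config = {"mods": {"itemlist": {"data": {"auctions": ['
        '{"nick": "demo-shop", "view_price": "59.00", "item_loc": "上海", '
        '"view_sales": "1000人收货", "raw_title": "demo lipstick", '
        '"nid": "123456", "detail_url": "//item.taobao.com/item.htm?id=123456"}'
        ']}}}};\n'
        'g_srp_loadCss();'
    )
    # Same regex as above: capture everything between "g_page_config =" and
    # "g_srp_loadCss", then drop the trailing ";" so json.loads accepts it.
    raw = re.findall('g_page_config =(.*?)g_srp_loadCss', sample_page, re.S)[0].strip()[:-1]
    parsed = json.loads(raw)
    for item in parsed['mods']['itemlist']['data']['auctions']:
        print(item['nick'], item['view_price'], item['view_sales'].replace('人收货', ''))
# _demo_g_page_config()  # uncomment to run the demo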