getGanJiData.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Author : liuyuqi
  5. @Contact : liuyuqi.gov@msn.cn
  6. @Time : 2019/11/18 03:19:28
  7. @Version : 1.0
  8. @License : (C)Copyright 2019
  9. @Desc : Crawler for second-hand listings on Ganji.com
  10. '''
  11. import requests
  12. import os
  13. from bs4 import BeautifulSoup
  14. class GanJi():
  15. """docstring for GanJi"""
  16. def __init__(self):
  17. super(GanJi, self).__init__()
  18. def get(self,url):
  19. user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
  20. headers = {'User-Agent':user_agent}
  21. webData = requests.get(url + 'o1',headers=headers).text
  22. soup = BeautifulSoup(webData,'lxml')
  23. sum = soup.find('span',class_="num").text.replace("套","")
  24. ave = int(sum) / 32
  25. forNum = int(ave)
  26. if forNum < ave:
  27. forNum = forNum + 1
  28. for x in range(forNum):
  29. webData = requests.get(url + 'o' + str(x + 1),headers=headers).text
  30. soup = BeautifulSoup(webData,'lxml')
  31. find_list = soup.find('div',class_="f-main-list").find_all('div',class_="f-list-item ershoufang-list")
  32. for dl in find_list:
  33. print(dl.find('a',class_="js-title value title-font").text,end='|') # 名称
  34. # 中间 5 个信息
  35. tempDD = dl.find('dd',class_="dd-item size").find_all('span')
  36. for tempSpan in tempDD:
  37. if not tempSpan.text == '' :
  38. print(tempSpan.text.replace("\n", ""),end='|')
  39. print(dl.find('span',class_="area").text.replace(" ","").replace("\n",""),end='|') # 地址
  40. print(dl.find('div',class_="price").text.replace(" ","").replace("\n",""),end='|') # 价钱
  41. print(dl.find('div',class_="time").text.replace(" ","").replace("\n",""),end="|") # 平均
  42. print("http://chaozhou.ganji.com" + dl['href'],end="|") # 地址
  43. print(str(x + 1))
  44. if __name__ == '__main__':
  45. temp = GanJi()
  46. temp.get("http://chaozhou.ganji.com/fang5/xiangqiao/")