#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : liuyuqi
@Contact : liuyuqi.gov@msn.cn
@Time : 2019/11/18 03:19:28
@Version : 1.0
@License : (C)Copyright 2019
@Desc : Ganji.com second-hand housing listings crawler
'''
import math

import requests
from bs4 import BeautifulSoup


class GanJi():
    """Crawler for second-hand housing listings on ganji.com."""

    def __init__(self):
        super(GanJi, self).__init__()
    def get(self, url):
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
        headers = {'User-Agent': user_agent}

        # Fetch the first result page to read the total number of listings.
        webData = requests.get(url + 'o1', headers=headers).text
        soup = BeautifulSoup(webData, 'lxml')

        # The "num" span holds the total count, e.g. "1234套" ("套" = listings);
        # each result page shows 32 listings, so round up to get the page count.
        total = int(soup.find('span', class_="num").text.replace("套", ""))
        pageCount = math.ceil(total / 32)

        for x in range(pageCount):
            # Result pages are paginated as .../o1, .../o2, ...
            webData = requests.get(url + 'o' + str(x + 1), headers=headers).text
            soup = BeautifulSoup(webData, 'lxml')
            find_list = soup.find('div', class_="f-main-list").find_all(
                'div', class_="f-list-item ershoufang-list")
            for dl in find_list:
                titleLink = dl.find('a', class_="js-title value title-font")
                print(titleLink.text, end='|')  # listing title

                # The five pieces of info in the middle of the listing
                tempDD = dl.find('dd', class_="dd-item size").find_all('span')
                for tempSpan in tempDD:
                    if not tempSpan.text == '':
                        print(tempSpan.text.replace("\n", ""), end='|')

                print(dl.find('span', class_="area").text.replace(" ", "").replace("\n", ""), end='|')   # address
                print(dl.find('div', class_="price").text.replace(" ", "").replace("\n", ""), end='|')   # price
                print(dl.find('div', class_="time").text.replace(" ", "").replace("\n", ""), end='|')    # average
                # Listing URL; the href is taken from the title link rather than the item container.
                print("http://chaozhou.ganji.com" + titleLink['href'], end='|')
                print(str(x + 1))  # page number
if __name__ == '__main__':
    temp = GanJi()
    temp.get("http://chaozhou.ganji.com/fang5/xiangqiao/")
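
# Example usage (a sketch; actual field values depend on the live page markup):
#   $ python main.py > listings.txt
# Each listing is printed as one pipe-delimited line, in the order of the
# print calls above:
#   title|<detail spans>|address|price|average|listing URL|page number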