beike.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2023/05/29 22:03:35
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc :
'''
import argparse
import csv
import logging
import os
import re
import sys

import pandas as pd
import requests
import xlwt
from lxml import etree


class Beike(object):
    ''' Scraper for housing data on Beike (ke.com) '''

    def __init__(self, city: str = 'cd', save_type: str = 'csv'):
        ''' Set up the HTTP session, logging, and CLI arguments.
        city defaults to Chengdu ('cd'), matching the draft zufang URL below. '''
        self.city = city
        self.save_type = save_type
        self.sess = requests.session()
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        self.formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.ch = logging.StreamHandler()
        self.ch.setLevel(logging.INFO)
        self.ch.setFormatter(self.formatter)
        self.logger.addHandler(self.ch)
        self.args = self.get_args()

    def get_proxy(self):
        ''' Fetch a proxy address (not implemented) '''
        pass

    def set_proxy(self):
        ''' Apply a proxy to the session (not implemented) '''
        pass
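        # A minimal sketch of what set_proxy might eventually do, using the
        # standard requests proxies mapping on the session; proxy_url is a
        # hypothetical value that get_proxy would return:
        #
        #   proxy_url = self.get_proxy()
        #   self.sess.proxies = {'http': proxy_url, 'https': proxy_url}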

    def get_args(self):
        ''' Parse command-line arguments '''
        parser = argparse.ArgumentParser(description='Beike housing data scraper')
        parser.add_argument('-c', '--city', type=str, help='city code, e.g. cd')
        parser.add_argument('-p', '--page', type=int, default=1, help='page number')
        return parser.parse_args()

    def run(self, citys: dict, type: str = 'ershoufang'):
        ''' Crawl pages 1-100 of the given listing type for every city '''
        for city in citys.values():
            for page in range(1, 101):
                if type == 'zufang':
                    self.get_zufang(city, page)
                elif type == 'ershoufang':
                    self.get_ershoufang()
                elif type == 'xinfang':
                    self.get_xinfang()

    def get_zufang(self, city: str, page: int = 1):
        ''' Scrape rental listings for one page of one city '''
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            # session cookie captured from a logged-in browser; replace with your own
            'Cookie': 'select_city=510100; lianjia_uuid=db980944-7c31-4bcc-8aae-78cdd9ba138d; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22181a93da768e3e-047b67455463da-26021a51-1327104-181a93da769efd%22%2C%22%24device_id%22%3A%22181a93da768e3e-047b67455463da-26021a51-1327104-181a93da769efd%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E4%BB%98%E8%B4%B9%E5%B9%BF%E5%91%8A%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Fother.php%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E8%B4%9D%E5%A3%B3%E6%89%BE%E6%88%BF%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wychengdu%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; GUARANTEE_BANNER_SHOW=true; login_ucid=2000000245569411; lianjia_ssid=712b2a49-fc10-4962-8fea-379234735b80; lianjia_token=2.0013a6ea83788090ec020bc3b2663bffe3; lianjia_token_secure=2.0013a6ea83788090ec020bc3b2663bffe3; security_ticket=R2sd09nxk8Sm81pfJkvABUssf/StkYIdArkOZN0QqdFQvHLLFF7LBqExRYZbcQYf29gNoQKxH9O7MpAHGu2v73TQnJFksR9KIw/NJ+itit98LyB2Ncs/DgCZmgm3w0ypGjqFK49ik+KFpzJtvr4ukZOE94RSX5+eLymj/Vb+D18=; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiYTRkYjMxYmRhYTM3YTU1ZWI3NDg5OGFiNTVhNTk3ZGQxY2ZhMDkyNTg2YWYwM2NiZDlhMWMwNzM3YmQ2ZmI1ZmZiODA4YmQ4ZTVkMWU4NDNjMjU2M2FhMGZhMzlkM2U4MjkzYTE4Y2U4NjVkNGU3Nzc5NTM5ZmFkNzlmMTE3ZGVkZmFlZDdkNmQxNjc4ZTA0ZDhiOTNiNzhhMDEyM2Q1Y2M0N2QzMDRlYjhkYjliOGM4MGQ2ZTdiMjc0N2UxMzViNjRkYjcwNjA5ZmVkMmIyMjhjMzc2NDQyN2E5YTkzNTY0MjkwYjU2NGE1OGZmYjcxNjMzZDAwZjMxNDBkY2YwNlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJjODFiYmNhYVwifSIsInIiOiJodHRwczovL2NkLnp1LmtlLmNvbS96dWZhbmciLCJvcyI6IndlYiIsInYiOiIwLjEifQ=='
        }
        url = f'https://{city}.zu.ke.com/zufang/pg{page}/#contentList'
        resp = self.sess.get(url=url, headers=headers)
        # parse the response with lxml's HTML parser
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.HTML(resp.text, parser=parser)
        # fields of the first listing card (exploratory queries)
        price = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/div/span/em/text()')
        # print(price)
        address = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/div/p[2]/a[1]/text()')
        # print(address)
        access = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/div/p[2]/a[2]/text()')
        # print(access)
        area = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/div/p[2]/a[3]/text()')
        # print(area)
        # the full column of listing titles and prices
        titles = tree.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[1]/a/text()')
        # print(titles)
        prices = tree.xpath('//*[@id="content"]/div[1]/div[1]/div/div/span/em/text()')
        # print(prices)
        # grab every listing card block that holds the fields we need
        all_list = tree.xpath('//*[@id="content"]/div[1]/div[1]/div')
        # print(all_list)
        list_box = []
        # walk each card and extract its fields
        for card in all_list:
            title = card.xpath('./div/p[1]/a/text()')
            price = card.xpath('./div/span/em/text()')
            path = card.xpath('./div/p[2]/a[1]/text()')
            sub = card.xpath('./div/p[2]/a[2]/text()')
            home = card.xpath('./div/p[2]/a[3]/text()')
            jx = card.xpath('./div/p[2]/text()[1]')[0]
            # "精选" (featured) cards carry one extra text node, which shifts the offsets
            if '精选' in jx:
                area = card.xpath('./div/p[2]/text()[6]')
                toward = card.xpath('./div/p[2]/text()[7]')
                house_type = card.xpath('./div/p[2]/text()[8]')
            else:
                area = card.xpath('./div/p[2]/text()[5]')
                toward = card.xpath('./div/p[2]/text()[6]')
                house_type = card.xpath('./div/p[2]/text()[7]')
            list_box.append([title, price, path, sub, home, area, toward, house_type])
        # strip stray whitespace from each extracted field (sub is left as-is)
        for row in list_box:
            for k in (0, 1, 2, 4, 5, 6, 7):
                if row[k]:
                    row[k][0] = row[k][0].strip()
        for row in list_box:
            print(row)
        # paging sketch (run() now drives the page loop):
        # pages = range(1, 3)
        # for page in pages:
        #     url = f'https://cd.zu.ke.com/zufang/pg{page}/#contentList'
        # storage option 1: save to a comma-separated csv file
        # writer = csv.writer(open('data1.csv', 'w', encoding='utf-8'))
        # writer.writerow(['title', 'price', 'path', 'sub', 'home', 'area', 'toward', 'house_type'])
        # writer.writerows(list_box)
        # storage option 2: write to an excel file
        wb = xlwt.Workbook()                   # create the workbook
        sheet = wb.add_sheet('data')           # create the sheet
        titles = ('title', 'price', 'path', 'sub', 'home', 'area', 'toward', 'house_type')
        for index, title in enumerate(titles):
            sheet.write(0, index, title)       # row 0 holds the column headers
        for i, item in enumerate(list_box):    # i: row index, item: one row of fields
            for j, data in enumerate(item):    # j: column index, data: one field (a list)
                # xlwt cannot write a list, so unwrap the single-element field
                sheet.write(i + 1, j, data[0] if data else '')
        wb.save('house.xls')

    def get_ershoufang(self):
        ''' Scrape second-hand housing data (not implemented) '''
        pass

    def get_xinfang(self):
        ''' Scrape new housing data (not implemented) '''
        pass
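
    # By analogy with the zufang URL used above, the second-hand and new-home
    # pages presumably live under URLs like the following (unverified
    # assumption, not confirmed by this file):
    #   https://{city}.ke.com/ershoufang/pg{page}/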

    @staticmethod
    def save(df: pd.DataFrame, filename: str):
        ''' Save data, appending to the file if it already exists '''
        if os.path.exists(filename):
            # pandas' to_excel has no append mode (and no encoding argument),
            # so emulate appending by reading the old sheet and concatenating
            old = pd.read_excel(filename)
            pd.concat([old, df], ignore_index=True).to_excel(filename, index=False)
        else:
            df.to_excel(filename, index=False)
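
    # A hedged usage sketch for save(), assuming a rows list shaped like
    # list_box in get_zufang (the names here are illustrative):
    #
    #   df = pd.DataFrame(rows, columns=['title', 'price', 'path', 'sub',
    #                                    'home', 'area', 'toward', 'house_type'])
    #   Beike.save(df, 'house.xlsx')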


if __name__ == '__main__':
    citys = {
        '成都': 'cd',
        '上海': 'sh',
        '绵阳': 'mianyang',
        '吉安': 'jian',
    }
    beike = Beike()  # city defaults to 'cd'; run() iterates every city below
    beike.run(citys, 'ershoufang')
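    # Example invocation from the command line, using the flags defined in
    # get_args (the parsed flags are not yet wired into run()):
    #   python beike.py --city cd --page 1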