12345678910111213141516171819202122232425262728293031 |
# coding=utf-8
'''
Comparison of parsing HTML with lxml and bs4.

Example: print every Baidu product listed on the "more" page, once with
BeautifulSoup and once with lxml XPath.

Created on 2017-07-03
@version: python3.6
@author: liuyuqi
'''
import requests
from bs4 import BeautifulSoup
from lxml import etree

url = "https://www.baidu.com/more/"

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page.
res = requests.get(url, timeout=10)
res.raise_for_status()

# Force UTF-8 decoding of the body. This replaces the original
# text.encode(res.encoding).decode('utf-8') round trip, which raises
# TypeError when the server sends no charset and res.encoding is None.
res.encoding = 'utf-8'
html = res.text

# Parse with BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
titles = soup.find_all('div', {'class': 'con'})
print(len(titles))
for title in titles:
    # Search inside the current <div class="con">, not the whole document;
    # otherwise the same anchor is printed on every iteration.
    anchors = title.find_all('a')
    if len(anchors) > 1:  # skip blocks that have no second anchor
        print(anchors[1].text)

# Parse with lxml
# Example XPaths of the product links:
# //*[@id="content"]/div[1]/div[2]/a
# //*[@id="content"]/div[2]/div[2]/a
selector = etree.HTML(html)
product_names = selector.xpath('//*[@id="content"]/div/div/a/text()')
for name in product_names:
    print(name)
|