compare-lxml-beautiful.py 789 B

12345678910111213141516171819202122232425262728293031
  # coding=utf-8
"""
Compare HTML parsing with lxml and BeautifulSoup (bs4).

Example: fetch the Baidu "more products" page and print the product
names twice, once through BeautifulSoup and once through an lxml XPath
query.

Created on 2017-07-03
@version: python3.6
@author: liuyuqi
"""
import requests
from bs4 import BeautifulSoup
from lxml import etree

URL = "https://www.baidu.com/more/"
# requests waits indefinitely unless a timeout is given explicitly.
REQUEST_TIMEOUT = 10  # seconds


def fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page source as a ``str``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    # Decode the raw bytes directly instead of re-encoding ``res.text`` with
    # ``res.encoding``: that attribute can be None (TypeError) and the
    # round-trip only works when the guessed codec is lossless.
    return res.content.decode("utf-8")


def print_with_bs4(html):
    """Print the link texts found inside every ``<div class="con">``.

    The number of matching ``div`` blocks is printed first, then the text
    of each ``<a>`` nested in those blocks.

    Args:
        html: Page source to parse.
    """
    soup = BeautifulSoup(html, "lxml")
    blocks = soup.find_all("div", {"class": "con"})
    print(len(blocks))
    for block in blocks:
        # Search within the current block; querying ``soup`` here would
        # print the same document-wide link on every iteration.
        for link in block.find_all("a"):
            print(link.text)


def print_with_lxml(html):
    """Print the link texts selected by an XPath query on ``#content``.

    Args:
        html: Page source to parse.
    """
    # Sample element paths the generic query below was derived from:
    #   //*[@id="content"]/div[1]/div[2]/a
    #   //*[@id="content"]/div[2]/div[2]/a
    selector = etree.HTML(html)
    for name in selector.xpath('//*[@id="content"]/div/div/a/text()'):
        print(name)


def main():
    """Fetch the page once and print the products with both parsers."""
    html = fetch_html(URL)
    # Parse with BeautifulSoup.
    print_with_bs4(html)
    # Parse with lxml.
    print_with_lxml(html)


if __name__ == "__main__":
    main()