# coding=utf-8
'''
lxml和bs4解析html对比
例子,通过两种方法,把百度所有产品打印出来。
Created on 2017年7月3日
@version: python3.6
@author: liuyuqi
'''
import requests
from bs4 import BeautifulSoup
from lxml import etree
url = "https://www.baidu.com/more/"
# A timeout keeps the script from hanging forever on a stalled connection.
res = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
# Force UTF-8 decoding of the body. Re-encoding res.text with res.encoding
# (the previous approach) raises TypeError when res.encoding is None and
# breaks whenever the guessed codec does not round-trip the original bytes.
res.encoding = 'utf-8'
html = res.text

# --- Parse with BeautifulSoup ---
soup = BeautifulSoup(html, 'lxml')
titles = soup.find_all('div', class_='con')
print(len(titles))
for title in titles:
    # Search inside the current block, not the whole document; querying
    # `soup` here printed the same document-level anchor on every iteration.
    links = title.find_all('a')
    # The second anchor is the one the original indexed ([1]); presumably the
    # first anchor wraps a logo and the second holds the product name -- TODO
    # confirm against the live page. Blocks with fewer anchors are skipped
    # rather than raising IndexError.
    if len(links) > 1:
        print(links[1].text)

# --- Parse with lxml ---
# Example XPaths of the target anchors:
# //*[@id="content"]/div[1]/div[2]/a
# //*[@id="content"]/div[2]/div[2]/a
selector = etree.HTML(html)
# Collect the text nodes of every matching anchor under #content.
titles = selector.xpath('//*[@id="content"]/div/div/a/text()')
for title in titles:
    print(title)