main.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Author :liuyuqi.gov@msn.cn
@Time   :2018/4/11 1:07
@File   :main.py
'''
import re
import bs4
import urllib.request

url_home = 'http://menshijian8.com'  # the site to crawl
url_set = set()        # URLs waiting to be crawled
url_cache = set()      # URLs already visited
url_count = 0          # number of articles collected so far
url_maxCount = 1000000  # maximum number of pages to collect
# Regex matching article pages; this could still be written in a better way (one possible sketch follows).
url_pattern = url_home + r'([\s\S]*)\.html'
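
# A minimal sketch of one possible tightening of the pattern, as an illustrative
# assumption rather than part of the original script: escape the base URL so any
# regex metacharacters in it are treated literally, and require the ".html" suffix
# at the end of the href. The name url_pattern_strict is hypothetical; the script
# below keeps using url_pattern.
url_pattern_strict = re.escape(url_home) + r'/.+\.html$'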

# Collect the href links on a page that match the article-page pattern.
def spiderURL(url, pattern):
    html = urllib.request.urlopen(url).read().decode('utf8')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=re.compile(pattern))
    for link in links:
        # Queue links that have not been visited yet and skip empty hrefs.
        if link['href'] not in url_cache and link['href'] != '':
            url_set.add(link['href'])
    return soup

# Placeholder: intended to extract images from the parsed page; not implemented yet.
def findImages(dom):
    img = ""
    return img
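
# A minimal sketch of what findImages might do, assuming `dom` is the
# BeautifulSoup object returned by spiderURL. The helper name findImagesSketch
# is an illustrative assumption and is not called anywhere in the original script.
def findImagesSketch(dom):
    # Collect the src attribute of every <img> tag on the page.
    return [img.get('src', '') for img in dom.find_all('img')]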

def crawl():
    # The crawling loop. Exception handling is still rough, and sites with
    # anti-scraping measures also need request headers (a sketch follows
    # after this function); both are left for a later revision.
    global url_count
    spiderURL(url_home, url_pattern)
    while len(url_set) != 0:
        try:
            url = url_set.pop()
            url_cache.add(url)
            soup = spiderURL(url, url_pattern)
            page = soup.find('div', {'class': 'content'})
            title = page.find('h1').get_text()
            author = page.find('h4').get_text()
            content = page.find('article').get_text()
            print(title, author, url)
        except Exception as e:
            print(url, e)
            continue
        else:
            url_count += 1
        finally:
            if url_count == url_maxCount:
                break
    print('Collected ' + str(url_count) + ' articles in total')
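
# A minimal sketch of the header handling mentioned above, assuming a plain
# urllib.request.Request with a browser-like User-Agent is enough for the
# target site. The helper name fetchWithHeaders and the header value are
# illustrative assumptions, not part of the original script; spiderURL could
# call this in place of urllib.request.urlopen(url).read().decode('utf8').
def fetchWithHeaders(url):
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (compatible; simple-spider)'}
    )
    return urllib.request.urlopen(req).read().decode('utf8')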

if __name__ == '__main__':
    crawl()