crawl-menshijian8-images-splinter.py 986 B

123456789101112131415161718192021222324252627282930313233343536
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. '''
  4. @Author :liuyuqi.gov@msn.cn
  5. @Time :2018/4/11 15:07
  6. @File :crawl-menshijian8-images-splinter.py
  7. '''
  8. import splinter
  9. url_home = 'http://menshijian8.com'
  10. user_agent = "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en)"
  11. url_set = set()
  12. url_cache = set()
  13. browser = splinter.Browser("phantomjs", executable_path="D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe",
  14. user_agent=user_agent)
  15. def spiderURL(url):
  16. global url_set, url_cache
  17. browser.visit(url=url)
  18. browser.driver.set_window_size(1366, 768)
  19. links = browser.find_link_by_partial_href(url_home)
  20. for link in links:
  21. if link["href"] not in url_cache:
  22. url_set.add(link["href"], 0)
  23. print("title", link["href"][0])
  24. return url_set
  25. def crawl():
  26. global url_set
  27. if len(url_set) == 0:
  28. url_set = spiderURL(url_home)
  29. if __name__ == '__main__':
  30. print("------start--------")
  31. crawl()