123456789101112131415161718192021222324252627282930313233343536 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- '''
- @Author :liuyuqi.gov@msn.cn
- @Time :2018/4/11 15:07
- @File :crawl-menshijian8-images-splinter.py
- '''
- import splinter
# Root of the site to crawl; also used as the href filter when collecting links.
url_home = 'http://menshijian8.com'

# User-agent string handed to the headless browser.
user_agent = "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en)"

# URLs collected so far.
url_set = set()

# URLs that link collection skips; nothing visible in this file adds to it.
url_cache = set()

# Shared PhantomJS-backed browser session, created when the module is loaded.
# NOTE(review): the executable path is a machine-specific Windows location.
browser = splinter.Browser(
    "phantomjs",
    executable_path="D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe",
    user_agent=user_agent,
)
def spiderURL(url):
    """Visit *url* and collect the links that point back into the site.

    Every anchor whose href contains ``url_home`` is added to the
    module-level ``url_set`` unless that href is already in ``url_cache``.

    :param url: address of the page to load in the shared ``browser``.
    :return: the module-level ``url_set`` (mutated in place).
    """
    browser.visit(url=url)
    browser.driver.set_window_size(1366, 768)
    for link in browser.find_link_by_partial_href(url_home):
        # Read the attribute once instead of repeating the element lookup.
        href = link["href"]
        if href in url_cache:
            continue
        # set.add() takes exactly one element.
        url_set.add(href)
        # NOTE(review): this prints only the first character of the href
        # under the label "title" - confirm whether the link text was meant.
        print("title", href[0])
    return url_set
def crawl():
    """Populate the module-level ``url_set`` from the home page when empty.

    Does nothing if ``url_set`` already holds at least one URL; otherwise
    rebinds it to the result of ``spiderURL(url_home)``.
    """
    global url_set
    if len(url_set) > 0:
        # Links were already collected; nothing to do.
        return
    url_set = spiderURL(url_home)
# Script entry point: announce the run, then start crawling.
if __name__ == "__main__":
    print("------start--------")
    crawl()
|