#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Collect the links found on the menshijian8.com home page using splinter
driving a PhantomJS browser.

@Author :liuyuqi.gov@msn.cn
@Time   :2018/4/11 15:07
@File   :crawl-menshijian8-images-splinter.py
"""
import splinter

# Start page of the crawl; also used as the substring that link hrefs must
# contain to be collected.
url_home = 'http://menshijian8.com'
user_agent = "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en)"

# Hrefs collected so far.
url_set = set()
# Hrefs that must be skipped when collecting.
# NOTE(review): nothing in this file ever adds to url_cache, so the filter in
# spiderURL() currently never excludes anything -- confirm the intended use.
url_cache = set()

browser = splinter.Browser(
    "phantomjs",
    executable_path="D:/Program-Files/phantomjs-2.1.1-windows/bin/phantomjs.exe",
    user_agent=user_agent,
)


def spiderURL(url):
    """Visit *url* and add every matching link on the page to ``url_set``.

    A link matches when its href contains ``url_home``; hrefs already present
    in ``url_cache`` are skipped.

    :param url: address of the page to load in the shared ``browser``.
    :return: the module-level ``url_set`` (mutated in place).
    """
    browser.visit(url=url)
    browser.driver.set_window_size(1366, 768)

    for link in browser.find_link_by_partial_href(url_home):
        # Read the attribute once instead of once per use.
        href = link["href"]
        if href in url_cache:
            continue
        # set.add() accepts exactly one element; the original passed a stray
        # second argument (0), which raised TypeError on the first new link.
        url_set.add(href)
        # The original printed href[0], i.e. only the first character of the
        # URL; print the whole href instead.
        print("title", href)
    return url_set


def crawl():
    """Populate ``url_set`` from the home page if it is still empty."""
    global url_set
    if not url_set:
        url_set = spiderURL(url_home)


if __name__ == '__main__':
    print("------start--------")
    try:
        crawl()
    finally:
        # Shut the PhantomJS process down even if the crawl raises, so it is
        # not left running after the script exits.
        browser.quit()