# meizitu.py (1.3 KB)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.loader import ItemLoader, Identity
from scrapy.selector import Selector

from fun.items import MeizituItem
  6. class MeizituSpider(scrapy.Spider):
  7. name = "meizitu"
  8. allowed_domains = ["meizitu.com"]
  9. start_urls = (
  10. 'http://www.meizitu.com/',
  11. )
  12. def parse(self, response):
  13. sel = Selector(response)
  14. for link in sel.xpath('//h2/a/@href').extract():
  15. request = scrapy.Request(link, callback=self.parse_item)
  16. yield request
  17. pages = sel.xpath("//div[@class='navigation']/div[@id='wp_page_numbers']/ul/li/a/@href").extract()
  18. print('pages: %s' % pages)
  19. if len(pages) > 2:
  20. page_link = pages[-2]
  21. page_link = page_link.replace('/a/', '')
  22. request = scrapy.Request('http://www.meizitu.com/a/%s' % page_link, callback=self.parse)
  23. yield request
  24. def parse_item(self, response):
  25. l = ItemLoader(item=MeizituItem(), response=response)
  26. l.add_xpath('name', '//h2/a/text()')
  27. l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
  28. l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
  29. l.add_value('url', response.url)
  30. return l.load_item()