# coser.py — Scrapy spider for bcy.net coser galleries
# (copy-paste artifacts — file-size header and pasted line numbers — removed)
# -*- coding: utf-8 -*-
import scrapy
# NOTE(review): original imported from scrapy.contrib.loader, a path removed
# in Scrapy 2.0; ItemLoader has lived in scrapy.loader since Scrapy 1.0.
from scrapy.loader import ItemLoader
from scrapy.selector import Selector

from fun.items import CoserItem
  6. class CoserSpider(scrapy.Spider):
  7. name = "coser"
  8. allowed_domains = ["bcy.net"]
  9. start_urls = (
  10. 'http://bcy.net/cn125101',
  11. 'http://bcy.net/cn126487',
  12. 'http://bcy.net/cn126173'
  13. )
  14. def parse(self, response):
  15. sel = Selector(response)
  16. for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
  17. link = 'http://bcy.net%s' % link
  18. request = scrapy.Request(link, callback=self.parse_item)
  19. yield request
  20. def parse_item(self, response):
  21. l = ItemLoader(item=CoserItem(), response=response)
  22. l.add_xpath('name', "//h1[@class='js-post-title']/text()")
  23. l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
  24. urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
  25. urls = [url.replace('/w650', '') for url in urls]
  26. l.add_value('image_urls', urls)
  27. l.add_value('url', response.url)
  28. return l.load_item()