# -*- coding: UTF-8 -*-
"""Scrapy spider that walks the Hikvision Europe firmware portal and
emits one BasicItem per downloadable firmware file.

NOTE (from the original author's report, translated): the crawl looked
incomplete — as if ``callback=self.parse`` never ran. The real cause was
that Scrapy downloads the *entire* response body before invoking the
callback, so requesting huge firmware archives timed out. The fix below
pre-filters links by file extension and emits items for firmware files
WITHOUT downloading them; only HTML directory pages are fetched.
"""
import scrapy
from scrapy.spiders import Spider

from mycrawler.items import BasicItem

# Spoofed desktop-browser User-Agent; the portal serves directory
# listings normally to ordinary browsers.
header = {
    'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) "
                  "Gecko/20100101 Firefox/39.0"}


class DlinkSpider(Spider):
    """Crawl the portal's directory listing recursively.

    HTML pages are followed; links whose extension matches ``suffix``
    are emitted as items directly (no download — see module NOTE).
    """

    name = "hikvision1"

    start_urls = [
        # "http://www.hikvisioneurope.com/portal/index.php?dir=Product%20Firmware/"
        "http://www.hikvisioneurope.com/portal/index.php?dir=Product%20Firmware/Cameras/DS-2CD2X22FWD%2C2X42FWD%2C2X52F/"
    ]

    # Firmware file extensions we collect — must be lower case.
    suffix = ["zip", "obj", "exe", "drv", "com", "lan", "dlf", "tar",
              "tgz", "gz", "iso", "img", "dmg", "bin"]

    def parse(self, response):
        """Parse one response: follow directory pages, emit firmware items.

        :param response: scrapy Response for a portal URL.
        :yields: ``scrapy.Request`` for sub-directories, ``BasicItem``
                 for firmware files.
        """
        # Under Python 3 header values are bytes; split in bytes space,
        # then decode the media subtype ("text/html; charset=..." -> "html").
        content_type = response.headers.get('Content-Type') or b''
        subtype = (content_type.split(b'/')[-1]
                   .split(b';')[0].decode('ascii', 'replace'))
        self.logger.debug("content-type=%s headers=%r",
                          subtype, response.headers)

        if subtype == 'html':
            # [1:] skips the first anchor (the "parent directory" link).
            for anchor in response.css('table table a')[1:]:
                href = anchor.xpath('@href').extract_first()
                if not href:
                    # anchor without href — nothing to follow
                    continue
                url = response.urljoin(href)
                filename = anchor.css('a::text').extract()
                ext = url.rsplit('.', 1)[-1].lower()
                if ext in self.suffix:
                    # Firmware file: emit the item directly instead of
                    # downloading a huge archive just to read its
                    # Content-Type (the original timeout bug).
                    yield self._make_item(url, filename)
                else:
                    yield scrapy.Request(url, callback=self.parse,
                                         headers=header,
                                         meta={'filename': filename})
        elif subtype in self.suffix:
            # Fallback: a file we did fetch (URL carried no recognizable
            # extension) but whose Content-Type marks it as firmware.
            yield self._make_item(response.url,
                                  response.meta.get('filename'))

    def _make_item(self, url, filename):
        """Build a BasicItem describing one firmware download link."""
        item = BasicItem()
        item["Firm"] = "Hikvision"
        item["Link"] = url
        item["Rawlink"] = url
        item["Filename"] = filename
        item["Title"] = item["Filename"]
        item["Info"] = {}
        # Original code only printed the item; yielding it from parse()
        # is what actually hands it to the item pipelines.
        return item
我终于明白了,其实不是 callback 没有执行,而是 Scrapy 框架惹的祸:它每次都会先把响应内容完整下载下来再交给回调判断,下载大固件文件时就造成了读取内容超时而失败。