scrapy爬虫递归调用失效
# -*- coding: UTF-8 -*-
from scrapy.spiders import Spider
from mycrawler.items import BasicItem
import scrapy
# HTTP headers passed to the follow-up requests issued by the spider;
# the only entry presents a desktop Firefox User-Agent string.
header = {
    'User-Agent': (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) "
        "Gecko/20100101 Firefox/39.0"
    ),
}
class DlinkSpider(Spider):
    """Recursively crawl a Hikvision firmware listing and emit firmware items.

    Every response is handled by ``parse``. HTML responses are scanned for
    links inside nested tables and each link is requested again with
    ``parse`` as its callback. A non-HTML response whose Content-Type
    subtype appears in ``suffix`` is converted into a ``BasicItem``.
    """

    name = "hikvision1"
    start_urls = [
        # Alternative entry point (portal root), currently disabled:
        # "http://www.hikvisioneurope.com/portal/index.php?dir=Product%20Firmware/"
        "http://www.hikvisioneurope.com/portal/index.php?dir=Product%20Firmware/Cameras/DS-2CD2X22FWD%2C2X42FWD%2C2X52F/"
    ]
    # Content-Type subtypes treated as firmware. Entries must be lower case
    # because the subtype is lower-cased before the membership test.
    suffix = ["zip", "obj", "exe", "drv", "com", "lan",
              "dlf", "tar", "tgz", "gz", "iso", "img", "dmg", "bin"]

    def parse(self, response):
        """Follow links on HTML pages and yield an item for firmware files.

        Args:
            response: The downloaded Scrapy response. ``response.meta`` may
                carry ``'filename'``, the anchor text captured when the
                request was scheduled.

        Yields:
            ``scrapy.Request`` objects for each link found on an HTML page,
            or a populated ``BasicItem`` when the response's Content-Type
            subtype is listed in ``suffix``.
        """
        # The header may be absent, and it is bytes on Python 3; normalise
        # it to text before extracting the subtype (the part after '/').
        raw_type = response.headers.get('Content-Type') or b''
        if isinstance(raw_type, bytes):
            raw_type = raw_type.decode('latin-1')
        subtype = raw_type.split('/')[-1].split(';')[0].strip().lower()
        self.logger.debug("%s content-type=%r", response.headers, subtype)

        if subtype == 'html':
            # The first matched anchor is skipped, as in the original code
            # (presumably a parent-directory link -- TODO confirm).
            for anchor in response.css('table table a')[1:]:
                hrefs = anchor.xpath('@href').extract()
                if not hrefs:
                    # Anchor without an href: nothing to follow.
                    continue
                yield scrapy.Request(
                    response.urljoin(hrefs[0]),
                    callback=self.parse,
                    headers=header,
                    meta={'filename': anchor.css('a::text').extract()},
                )
        elif subtype in self.suffix:
            # NOTE(review): the whole file body has already been downloaded
            # by the time this callback runs, so large firmware images may
            # hit the download timeout before reaching this point -- confirm
            # and tune the download settings if that is the case.
            item = BasicItem()
            item["Firm"] = "Hikvision"
            item["Link"] = response.url
            item["Rawlink"] = response.url
            item["Filename"] = response.meta.get('filename')
            item["Title"] = item["Filename"]
            item["Info"] = {}
            self.logger.debug("firmware item: %r", item)
            # Hand the item to the pipelines; the original only printed it.
            yield item
运行不完整:爬虫只处理了起始页面,通过 callback=self.parse 指定的回调似乎没有被执行。
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
我已经明白了,其实回调并不是没有执行,而是 scrapy 框架的机制导致的:它每次都会先把响应内容完整下载下来,再交给回调去判断类型,所以在下载较大的文件时会读取超时,请求失败。