python连接信号未被调用

发布于 2025-02-12 17:45:29 字数 1760 浏览 0 评论 0原文

我有以下文件和代码

import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging:
    """Scrapy extension that logs spider open/close events and item-count
    progress.  Wired to the crawler's signal bus in from_crawler()."""

    def __init__(self, item_count):
        # Log a progress message every `item_count` scraped items.
        self.item_count = item_count
        # Running total of items scraped so far.
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from the crawler and connect signal handlers.

        Returns the connected extension instance.
        """
        print('Hey I am called')
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        # if not crawler.settings.getbool('MYEXT_ENABLED'):
        #     raise NotConfigured

        # get the number of items from settings
        item_count = 1000  # crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # BUG FIX: the original called cls(crawler.settings, crawler.stats),
        # passing two arguments to a one-argument __init__.  That raised a
        # TypeError inside from_crawler, so no signal was ever connected.
        ext = cls(item_count)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        # return the extension object
        return ext

    def spider_opened(self, spider):
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Called once per scraped item; log at every item_count-th item.
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)

,我有更改设置

MYEXT_ENABLED = True 
EXTENSIONS = {
     'project.custom_extension.SpiderOpenCloseLogging': 300
}

，但信号没有被触发。我检查过设置里给出的路径没有问题，蜘蛛也确实在运行，

甚至我加的 print 语句也没有任何输出，

请问有人能指出我遗漏了什么吗？

感谢

I have below file and code

import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging:
    """Scrapy extension that logs spider open/close events and item-count
    progress.  Wired to the crawler's signal bus in from_crawler()."""

    def __init__(self, item_count):
        # Log a progress message every `item_count` scraped items.
        self.item_count = item_count
        # Running total of items scraped so far.
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from the crawler and connect signal handlers.

        Returns the connected extension instance.
        """
        print('Hey I am called')
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        # if not crawler.settings.getbool('MYEXT_ENABLED'):
        #     raise NotConfigured

        # get the number of items from settings
        item_count = 1000  # crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # BUG FIX: the original called cls(crawler.settings, crawler.stats),
        # passing two arguments to a one-argument __init__.  That raised a
        # TypeError inside from_crawler, so no signal was ever connected.
        ext = cls(item_count)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        # return the extension object
        return ext

    def spider_opened(self, spider):
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Called once per scraped item; log at every item_count-th item.
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)

and I have change the settings

MYEXT_ENABLED = True 
EXTENSIONS = {
     'project.custom_extension.SpiderOpenCloseLogging': 300
}

But no signal is being called, I have checked the path being given in settings, spiders are being called

even the print I have added is not being logged

Can someone please suggest what I am missing

Thanks

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

银河中√捞星星 2025-02-19 17:45:29

在我对你脚本的改编版本中，所有信号都能正常触发。你犯了几个错误；由于没有把参数对应清楚，这些写法是讲不通的。这就是你收不到信号、反而得到错误的原因：

一些错误:

1。

    def __init__(self, item_count, stats):
        self.item_count = item_count
        #self.items_scraped = 0 --- change this
        self.items_scraped = stats
    def item_scraped(self, item, spider):
        # self.items_scraped += 1 --- You could do this but then you would not need `crawler.stats`
        # if self.items_scraped % self.item_count == 0: --- these should be the other way around
            logger.info("scraped %d items", self.items_scraped)
# additional note:
# --- self.item_count was never given the local item_count computed in
# from_crawler: the call cls(crawler.settings, crawler.stats) means
# self.item_count receives crawler.settings rather than item_count,
# so the original one-argument __init__ raises a TypeError.

我。
通过更新,我们有以下更正:

 def __init__(self, item_count, stats): # if you want to include crawler.stats
        # Holds crawler.settings until spider_opened() converts it to an int.
        self.item_count = item_count
        # Holds crawler.stats until spider_opened() converts it to an int.
        self.items_scraped = stats

II。

    def spider_opened(self, spider):
        self.items_scraped = self.items_scraped.get_value('item_scraped_count') # use crawler.stats to read the stored item count
        if self.items_scraped is None:
            self.items_scraped = 0 # stats value is None on a fresh run; start from 0
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000) # get your item count from settings (default 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

iii。

    def item_scraped(self, item, spider):
        logger.info(f"scraped few {self.items_scraped} items")
        self.items_scraped += 1
        if  self.item_count % self.items_scraped == 0: # operands flipped vs. the question: logs when item_count is an exact multiple of items_scraped
            logger.info(f"scraped increments {self.items_scraped} items")

示例完全放置此:


import logging
from scrapy import signals
import scrapy

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging(scrapy.Spider):
    """Demo spider that doubles as its own extension: from_crawler()
    connects its methods to the crawler's signal bus, and they log
    open/close and per-item progress.

    NOTE(review): __init__ receives crawler.settings as `item_count` and
    crawler.stats as `items_scraped`; both attributes are replaced by
    plain ints inside spider_opened().
    """

    name = 'log_signals'

    start_urls =  [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]

    def __init__(self, item_count, stats):
        # Initially a Settings object; spider_opened() turns it into an int.
        self.item_count = item_count
        # Initially a StatsCollector; spider_opened() turns it into an int.
        self.items_scraped = stats
        #self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        # Build the spider and subscribe its handlers to the signal bus.
        ext = cls(crawler.settings,crawler.stats)

        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        return ext

    def spider_opened(self, spider):
        # Swap the stats object for its stored counter (None on a fresh run).
        self.items_scraped = self.items_scraped.get_value('item_scraped_count')
        if self.items_scraped is None:
            self.items_scraped = 0
        # Swap the Settings object for the configured threshold (default 1000).
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Fired once per scraped item.
        logger.info(f"scraped few {self.items_scraped} items")
        self.items_scraped += 1
        # NOTE(review): operands look inverted — this logs whenever
        # item_count is an exact multiple of items_scraped (divisors of 1000),
        # matching the "scraped increments 200" line in the sample output.
        if  self.item_count % self.items_scraped == 0:
            #print(f"scraped increments {self.items_scraped} items")
            logger.info(f"scraped increments {self.items_scraped} items")
    
    def start_requests(self):
        # Explicitly schedule every start URL through self.parse.
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )
    def parse(self, response):
        # Yield the href of each link found inside the page's main row.
        content = response.xpath('//div[@class = "row"]//div')
        for items in content:
            yield {
                'some_items_links':items.xpath(".//a//@href").get()
            }

输出:

.
.
.
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 194 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/author/C-S-Lewis'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 195 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/christianity/page/1/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 196 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/love/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 197 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 198 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 199 items
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped increments 200 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/tag/truth/page/1/'}
...

All the signals are called in my adaptation of your script. There were a few mistakes that you made which did not make sense, as the arguments were never matched up to anything specific. This is why you were getting no signals but errors instead:

A few mistakes:

1.

    def __init__(self, item_count, stats):
        self.item_count = item_count
        #self.items_scraped = 0 --- change this
        self.items_scraped = stats
    def item_scraped(self, item, spider):
        # self.items_scraped += 1 --- You could do this but then you would not need `crawler.stats`
        # if self.items_scraped % self.item_count == 0: --- these should be the other way around
            logger.info("scraped %d items", self.items_scraped)
# additional note:
# --- self.item_count was never given the local item_count computed in
# from_crawler: the call cls(crawler.settings, crawler.stats) means
# self.item_count receives crawler.settings rather than item_count,
# so the original one-argument __init__ raises a TypeError.

i.
By updating, we have the following corrections:

 def __init__(self, item_count, stats): # if you want to include crawler.stats
        # Holds crawler.settings until spider_opened() converts it to an int.
        self.item_count = item_count
        # Holds crawler.stats until spider_opened() converts it to an int.
        self.items_scraped = stats

ii.

    def spider_opened(self, spider):
        self.items_scraped = self.items_scraped.get_value('item_scraped_count') # use crawler.stats to read the stored item count
        if self.items_scraped is None:
            self.items_scraped = 0 # stats value is None on a fresh run; start from 0
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000) # get your item count from settings (default 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

iii.

    def item_scraped(self, item, spider):
        logger.info(f"scraped few {self.items_scraped} items")
        self.items_scraped += 1
        if  self.item_count % self.items_scraped == 0: # operands flipped vs. the question: logs when item_count is an exact multiple of items_scraped
            logger.info(f"scraped increments {self.items_scraped} items")

Example when putting this altogether:


import logging
from scrapy import signals
import scrapy

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging(scrapy.Spider):
    """Demo spider that doubles as its own extension: from_crawler()
    connects its methods to the crawler's signal bus, and they log
    open/close and per-item progress.

    NOTE(review): __init__ receives crawler.settings as `item_count` and
    crawler.stats as `items_scraped`; both attributes are replaced by
    plain ints inside spider_opened().
    """

    name = 'log_signals'

    start_urls =  [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]

    def __init__(self, item_count, stats):
        # Initially a Settings object; spider_opened() turns it into an int.
        self.item_count = item_count
        # Initially a StatsCollector; spider_opened() turns it into an int.
        self.items_scraped = stats
        #self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        # Build the spider and subscribe its handlers to the signal bus.
        ext = cls(crawler.settings,crawler.stats)

        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        return ext

    def spider_opened(self, spider):
        # Swap the stats object for its stored counter (None on a fresh run).
        self.items_scraped = self.items_scraped.get_value('item_scraped_count')
        if self.items_scraped is None:
            self.items_scraped = 0
        # Swap the Settings object for the configured threshold (default 1000).
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Fired once per scraped item.
        logger.info(f"scraped few {self.items_scraped} items")
        self.items_scraped += 1
        # NOTE(review): operands look inverted — this logs whenever
        # item_count is an exact multiple of items_scraped (divisors of 1000),
        # matching the "scraped increments 200" line in the sample output.
        if  self.item_count % self.items_scraped == 0:
            #print(f"scraped increments {self.items_scraped} items")
            logger.info(f"scraped increments {self.items_scraped} items")
    
    def start_requests(self):
        # Explicitly schedule every start URL through self.parse.
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )
    def parse(self, response):
        # Yield the href of each link found inside the page's main row.
        content = response.xpath('//div[@class = "row"]//div')
        for items in content:
            yield {
                'some_items_links':items.xpath(".//a//@href").get()
            }

Output:

.
.
.
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 194 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/author/C-S-Lewis'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 195 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/christianity/page/1/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 196 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/love/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 197 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 198 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 199 items
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped increments 200 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/tag/truth/page/1/'}
...
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文