以脚本方式运行 scrapy-splash
我正在尝试使用 splash 运行 scrapy 脚本,因为我想抓取基于 javascript 的网页,但没有结果。当我使用 python 命令执行此脚本时,出现以下错误:crochet._eventloop.TimeoutError。此外,parse 方法中的 print 语句从未执行,因此我认为 SplashRequest 有问题。我为此编写的代码如下:
import logging
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.item import Item, Field
from scrapy.signalmanager import dispatcher
from scrapy_splash import SplashRequest
from crochet import setup, wait_for
# Initialise crochet before any @wait_for-decorated function is called.
setup()
# Optional: uncomment to stop Scrapy's log records from propagating upwards.
# logging.getLogger('scrapy').propagate = False
class GooglePatentsSpider(scrapy.spiders.Spider):
    """Spider that fetches Google Patents pages through Splash.

    Each URL in ``start_urls`` is sent to Splash's ``execute`` endpoint
    together with the Lua ``script`` below; ``parse`` is the callback for
    the resulting response.
    """

    name = "google_patents_spider"
    allowed_domains = ['patents.google.com']
    # Lua script executed by Splash: disable private mode, open the URL,
    # wait, then return the page HTML.
    script = '''
function main(splash, args)
splash.private_mode_enabled = false
assert(splash:go(args.url))
splash:wait(5)
return splash:html()
end
'''

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and connect ``item_scraped`` to the crawler signal."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
        return spider

    def item_scraped(self, item):
        """Receive the ``item_scraped`` signal and return the item unchanged."""
        return item

    def start_requests(self):
        """Yield one ``SplashRequest`` per start URL, running ``script`` on it."""
        for url in self.start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                endpoint='execute',
                args={
                    'lua_source': self.script,
                },
            )

    def parse(self, response):
        """Print a marker and return a placeholder item for the response."""
        print('from parse')
        item = {}
        item['status'] = 'Hello world'
        return item
@wait_for(timeout=50.0)
async def run_spider():
    """Crawl the Google Patents search page via Splash and return one item.

    The ``wait_for`` decorator makes the caller block for up to 50 seconds;
    if the crawl has not finished by then, crochet raises ``TimeoutError``.

    Returns:
        The first scraped item, or the string
        ``'This publication number cannot be retrieved'`` when no item was
        scraped.
    """
    results = []

    def crawler_results(signal, sender, item, response, spider):
        # Collect every item announced through the item_scraped signal.
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_scraped)

    runner = CrawlerRunner(settings={
        'BOT_NAME': 'web_page_crawler',
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
        'ROBOTSTXT_OBEY': False,
        # NOTE: this must be the address of a running, reachable Splash
        # instance. A stopped Splash or a wrong IP has been reported to end
        # in crochet's TimeoutError with `parse` never being called.
        'SPLASH_URL': 'http://192.168.59.103:8050',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
    })

    await runner.crawl(
        GooglePatentsSpider,
        start_urls=['https://patents.google.com/?q=CL%3dgenistein'],
    )

    if results:
        return results[0]
    return 'This publication number cannot be retrieved'
# Start the crawl when the script runs; the returned item is not used here.
run_spider()
完整的回溯:
Traceback (most recent call last):
File "hits_scraper.py", line 89, in <module>
run_spider()
File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 461, in wrapper
return eventual_result.wait(timeout)
File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 196, in wait
result = self._result(timeout)
File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 175, in _result
raise TimeoutError()
crochet._eventloop.TimeoutError
I am trying to run a scrapy script with splash, as I want to scrape a javascript-based webpage, but I get no results. When I execute this script with the python command, I get this error: crochet._eventloop.TimeoutError. In addition, the print statement in the parse method is never executed, so I suspect something is wrong with SplashRequest. The code I wrote to implement this is:
import logging
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.item import Item, Field
from scrapy.signalmanager import dispatcher
from scrapy_splash import SplashRequest
from crochet import setup, wait_for
# Initialise crochet before any @wait_for-decorated function is called.
setup()
# Optional: uncomment to stop Scrapy's log records from propagating upwards.
# logging.getLogger('scrapy').propagate = False
class GooglePatentsSpider(scrapy.spiders.Spider):
    """Spider that fetches Google Patents pages through Splash.

    Each URL in ``start_urls`` is sent to Splash's ``execute`` endpoint
    together with the Lua ``script`` below; ``parse`` is the callback for
    the resulting response.
    """

    name = "google_patents_spider"
    allowed_domains = ['patents.google.com']
    # Lua script executed by Splash: disable private mode, open the URL,
    # wait, then return the page HTML.
    script = '''
function main(splash, args)
splash.private_mode_enabled = false
assert(splash:go(args.url))
splash:wait(5)
return splash:html()
end
'''

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and connect ``item_scraped`` to the crawler signal."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
        return spider

    def item_scraped(self, item):
        """Receive the ``item_scraped`` signal and return the item unchanged."""
        return item

    def start_requests(self):
        """Yield one ``SplashRequest`` per start URL, running ``script`` on it."""
        for url in self.start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                endpoint='execute',
                args={
                    'lua_source': self.script,
                },
            )

    def parse(self, response):
        """Print a marker and return a placeholder item for the response."""
        print('from parse')
        item = {}
        item['status'] = 'Hello world'
        return item
@wait_for(timeout=50.0)
async def run_spider():
    """Crawl the Google Patents search page via Splash and return one item.

    The ``wait_for`` decorator makes the caller block for up to 50 seconds;
    if the crawl has not finished by then, crochet raises ``TimeoutError``.

    Returns:
        The first scraped item, or the string
        ``'This publication number cannot be retrieved'`` when no item was
        scraped.
    """
    results = []

    def crawler_results(signal, sender, item, response, spider):
        # Collect every item announced through the item_scraped signal.
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_scraped)

    runner = CrawlerRunner(settings={
        'BOT_NAME': 'web_page_crawler',
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
        'ROBOTSTXT_OBEY': False,
        # NOTE: this must be the address of a running, reachable Splash
        # instance. A stopped Splash or a wrong IP has been reported to end
        # in crochet's TimeoutError with `parse` never being called.
        'SPLASH_URL': 'http://192.168.59.103:8050',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
    })

    await runner.crawl(
        GooglePatentsSpider,
        start_urls=['https://patents.google.com/?q=CL%3dgenistein'],
    )

    if results:
        return results[0]
    return 'This publication number cannot be retrieved'
# Start the crawl when the script runs; the returned item is not used here.
run_spider()
The full traceback:
Traceback (most recent call last):
File "hits_scraper.py", line 89, in <module>
run_spider()
File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 461, in wrapper
return eventual_result.wait(timeout)
File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 196, in wait
result = self._result(timeout)
File "/home/shared/projects/siftlink/scrapers/.scrapers-api/lib/python3.8/site-packages/crochet/_eventloop.py", line 175, in _result
raise TimeoutError()
crochet._eventloop.TimeoutError
如果你对这篇内容有疑问,欢迎到本站社区发帖提问、参与讨论,获取更多帮助,或者扫描二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定您的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
当我在运行代码之前没有启动 splash 时,我遇到了同样的错误。如果我运行 splash(作为 docker 镜像),也会收到此错误,因为它的 IP 不同;但如果在 'SPLASH_URL' 中使用正确的 IP,代码就可以正常工作。在 Linux 上,我查到了正在运行的镜像的 IP,不过代码似乎也适用于通用 IP 0.0.0.0。
I got the same error when I didn't start splash before running the code. If I run splash (as a docker image) then I also get this error because it has a different IP, but if I use the correct IP in 'SPLASH_URL' then it works. On Linux I looked up the IP of the running image (for example with `docker inspect`), but it seems the code also works with the universal IP 0.0.0.0.