Links are not in URL format, so Scrapy cannot follow them

Posted on 2025-01-15 05:10:16


This is my code:


import scrapy
from scrapy import Spider
from scrapy.http import FormRequest

class ProvinciaSpider(Spider):
    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        data = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': '2022',
            'CMc1_Numero': '96',
            'CG_cadu': '046',
            'viat': '1',
        }

        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias', formdata=data, callback=self.parse_form_page)

    def parse_form_page(self, response):
        table = response.xpath('/html/body/form[1]//td[@class="beta"]/table')
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            puerto_llegada = tr.xpath('.//td[1]/text()').extract_first().strip()
            pais = tr.xpath('.//td[1]/text()').extract_first().strip()
            bl = tr.xpath('.//td[3]/text()').extract_first().strip()
            peso = tr.xpath('.//td[8]/text()').extract_first().strip()
            bultos = tr.xpath('.//td[9]/text()').extract_first().strip()
            consignatario = tr.xpath('.//td[12]/text()').extract_first().strip()
            embarcador = tr.xpath('.//td[13]/text()').extract_first().strip()
            links = tr.xpath('.//td[4]/a/@href')
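            # hrefs here look like javascript:jsDetalle2('154'),
            # which is what makes the follow() below fail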

            yield response.follow(links.get(),
                                 callback=self.parse_categories,
                                 meta={'puerto_llegada': puerto_llegada,
                                       'pais': pais,
                                       'bl': bl,
                                       'peso': float("".join(peso.split(','))),
                                       'bultos': float("".join(bultos.split(','))),
                                       'consignatario': consignatario,
                                       'embarcador': embarcador})

    def parse_categories(self, response):
        puerto_llegada = response.meta['puerto_llegada']
        pais = response.meta['pais']
        bl = response.meta['bl']
        peso = response.meta['peso']
        bultos = response.meta['bultos']
        consignatario = response.meta['consignatario']
        embarcador = response.meta['embarcador']


        tabla_des = response.xpath('/html/body/form//td[@class="beta"]/table')
        trs3 = tabla_des.xpath('.//tr')[1:]
        for tr3 in trs3:
            descripcion = tr3.xpath('.//td[7]/text()').extract_first().strip()

            yield {'puerto_llegada': puerto_llegada,
                   'pais': pais,
                   'bl': bl,
                   'peso': peso,
                   'bultos': bultos,
                   'consignatario': consignatario,
                   'embarcador': embarcador,
                   'descripcion': descripcion}

And I get this error:

ValueError: Missing scheme in request url: javascript:jsDetalle2('154');

Every link that I want to extract data from has that format, so my code for extracting the data inside each link doesn't work.

The link format is like javascript:jsDetalle2('154'), only the numbers change.

The problem is that the href is neither an absolute URL (http://...) nor a relative one (/manifiesto...). In the first case you only have to follow the link and that's all; in the second case you have to join the second part of the URL with the first response URL. But this case is neither, so I don't know how to make it work.
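For reference, this is how I would handle the two cases that do work (a minimal sketch; the hrefs are placeholders of each shape):

# absolute URL: can be followed directly
yield response.follow('http://www.aduanet.gob.pe/some/page', callback=self.parse_categories)

# relative URL: response.follow() joins it with the current response URL
yield response.follow('/cl-ad-itconsmanifiesto/manifiestoITS01Alias', callback=self.parse_categories)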

How can I write it in order to work?


北方的巷 2025-01-22 05:10:16


I checked this link in the browser: when I click the link with the text 154, it sends a POST with many values, one of which is 'CMc2_NumDet': '154'. So I can take this number from the link and use it in the POST.

In the browser you can see 'CMc2_Numero': "+++96", but in the code you need a space instead of each +, like " 96" (Scrapy will encode the space back to +), or you can simply remove all the +, like "96".
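To see why, note that form data is URL-encoded and spaces in values become + (a quick standard-library check):

from urllib.parse import urlencode

# urlencode() uses quote_plus, so each space becomes '+':
# this is how a ' 96'-style value shows up as '+++96' in the browser
print(urlencode({'CMc2_Numero': '   96'}))   # CMc2_Numero=+++96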

BTW: I put all the values into meta as item: {...}, so later I can get them all in one line with meta['item'].
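As an aside, if you prefer to take the number from the href instead of the link text, a regular expression works too (a small standalone sketch; the href value is copied from the error message):

import re

# in the spider this would come from tr.xpath('.//td[4]/a/@href').get()
href = "javascript:jsDetalle2('154');"
m = re.search(r"jsDetalle2\('(\d+)'\)", href)
number = m.group(1) if m else None   # -> '154'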

        number = tr.xpath('.//td[4]/a/text()').get()

        data = {
            'accion': "consultaManifExpProvinciaDetalle",
            'CMc2_Anno': "2022",
            'CMc2_Numero': "96",    # <--- without `+`
            'CG_cadu': "046",
            'CMc2_viatra': "1",
            'CMc2_numcon': "",
            'CMc2_NumDet': number,  # <---
            'tipo_archivo': "",
            'reporte': "ExpPro",
            'backPage': "ConsulManifExpPro",
        }

        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
                          formdata=data,
                          callback=self.parse_categories,
                          meta={"item": {'puerto_llegada': puerto_llegada,
                                         'pais': pais,
                                         'bl': bl,
                                         'peso': float("".join(peso.split(','))),
                                         'bultos': float("".join(bultos.split(','))),
                                         'consignatario': consignatario,
                                         'embarcador': embarcador}})
    
    def parse_categories(self, response):
        print('[parse_categories] url:', response.url)

        item = response.meta['item']

        tabla_des = response.xpath('/html/body/form//td[@class="beta"]/table')
        trs3 = tabla_des.xpath('.//tr')[1:]
        for tr3 in trs3:   # trs3[:1]: for single result
            item['descripcion'] = tr3.xpath('.//td[7]/text()').extract_first().strip()
            yield item

Full working code.

The page with the categories may have many rows in its table (with different Peso Bruto values, which you don't use), so it may produce many rows in the CSV.

If you need only one row, then use trs3[:1]: instead of trs3:

I used a different xpath to find the table with "Descripcion", because the previous version didn't check whether the table had a Descripcion column and could match 3 tables instead of one.
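A quick way to see what that xpath does (a standalone sketch using scrapy's Selector on toy HTML):

from scrapy.selector import Selector

html = '''
<table><tr><th>Otro</th></tr></table>
<table><tr><th>Descripcion</th></tr><tr><td>data</td></tr></table>
'''

# only the table whose header row contains "Descripcion" matches
sel = Selector(text=html)
print(len(sel.xpath('//table[./tr/th[contains(text(), "Descripcion")]]')))   # 1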

import scrapy
from scrapy import Spider
from scrapy.http import FormRequest

class ProvinciaSpider(Spider):
    
    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    def parse(self, response):
        payload = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': '2022',
            'CMc1_Numero': '96',
            'CG_cadu': '046',
            'viat': '1'
        }

        yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
                          formdata=payload,
                          callback=self.parse_form_page)

    def parse_form_page(self, response):
        print('[parse_form_page] url:', response.url)
        
        table = response.xpath('/html/body/form[1]//td[@class="beta"]/table')
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            item = {
                'puerto_llegada': tr.xpath('.//td[1]/text()').extract_first().strip(),
                'pais': tr.xpath('.//td[1]/text()').extract_first().strip(),
                'bl': tr.xpath('.//td[3]/text()').extract_first().strip(),
                'peso': tr.xpath('.//td[8]/text()').extract_first().strip().replace(',', ''),    # <---
                'bultos': tr.xpath('.//td[9]/text()').extract_first().strip().replace(',', ''),  # <---
                'consignatario': tr.xpath('.//td[12]/text()').extract_first().strip(),
                'embarcador': tr.xpath('.//td[13]/text()').extract_first().strip(),
            }

            number = tr.xpath('.//td[4]/a/text()').get().strip()
            print(number)
            
            payload = {
                'accion': "consultaManifExpProvinciaDetalle",
                'CMc2_Anno': "2022",
                'CMc2_Numero': "96",     # without `+` or use `space` instead of `+`
                'CG_cadu': "046",
                'CMc2_viatra': "1",
                'CMc2_numcon': "",
                'CMc2_NumDet': number,   # <---
                'tipo_archivo': "",
                'reporte': "ExpPro",
                'backPage': "ConsulManifExpPro",
            }

            yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
                              formdata=payload,
                              callback=self.parse_categories,
                              meta={"item": item})
        
    def parse_categories(self, response):
        print('[parse_categories] url:', response.url)

        item = response.meta['item']

        table = response.xpath('//table[./tr/th[contains(text(), "Descripcion")]]')
        print('len(table):', len(table))

        trs = table.xpath('.//tr')[1:]
        print('len(trs):', len(trs))
        
        for tr in trs:   # trs[:1]: for single result
            item['descripcion'] = tr.xpath('.//td[7]/text()').extract_first().strip()
            yield item

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ProvinciaSpider)
c.start() 
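(If you run the spider inside a regular Scrapy project instead, the equivalent is scrapy crawl provincia -o output.csv.)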

Result (with trs[:1])

puerto_llegada,pais,bl,peso,bultos,consignatario,embarcador,descripcion
BEANR,BEANR,MAEU216473186,47420.00,2160,AGROFAIR BENELUX BV,COOPERATIVA AGRARIA APPBOSA,YT GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS.
NLRTM,NLRTM,MAEU216473104,83890.00,5280,AGROFAIR BENELUX BV.,TULIPAN NARANJA S.A.C.,FYT GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS.
BEANR,BEANR,MAEU216307459,19980.00,285,"Greencof B.V.,",COOPERATIVA AGRARIA RODRIGUEZ DE MENDOZA,285 BAGS OF 69 KG NET OF PERU ORGANIC GREEN COFFEE FAIRTRADE CERTIFIED
JPYOK,JPYOK,MAEU1KT407500,21320.00,709,"HOWA SHOJI CO., LTD",GEALE AGROTRADING E.I.R.L.,GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS. BAN
ITCVV,ITCVV,MAEU913779677,66950.00,3240,BATTAGLIO SPA,IREN PERU SOCIEDAD ANONIMA CERRADA - IREN PERU S.A,GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS. BAN
NLRTM,NLRTM,MAEU913798070,24700.00,5544,FRUTOS TROPICALES EUROPE B.V.,FRUTOS TROPICALES PERU EXPORT SOCIEDAD ANONIMA CER,"FRESH MANGOES NET WEIGHT: 22,176.00 KG P.A.: 0804.50.20.00 TR.: JKXYA0"
BEANR,BEANR,MAEU216473141,23710.00,1080,AGROFAIR BENELUX BV.,TULIPAN NARANJA S.A.C.,FYT GREEN ORGANIC FRESH BANANAS CARTON BOXES AND IN POLYETHYLENE BAGS.
BEANR,BEANR,MAEU216632137,22270.00,1080,FYFFES INTERNATIONAL,AGRO PACHA S.A.,"GREEN FRESH ORGANIC BANANAS, PACKED IN CARTON BOXES AND POLYETHILENE B"
KRPUS,KRPUS,MAEU913722041,24480.00,1175,TO THE ORDER,PERUPEZ S.A.C.,"NET WEIGHT: 23,500 KG GROSS WEIGHT: 24,480 KG 1,175 SACKS 23,500 KG FR"
NLRTM,NLRTM,MAEU216473211,22520.00,1080,AgroFair Benelux BV,COOPERATIVA AGRARIA DE USUARIOS RIO Y VALLE,ORGANIC FAIRTRADE BANANAS GREEN FRESH CAVENDISH PACKED CARDBOARD BOXES