Scraping data with Scrapy

Posted 2025-02-11 14:33:51

import requests
import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        }


    def parse(self, response):
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)
            
    def parse_book(self, response):
        detail=response.xpath("//div[@class='line_list_K']")
        for i in range(len(detail)):
            title=detail[i].xpath("//span[contains(text(), 'Status:')]//div").get()
            print(title)

I am trying to grab the status and the email data, but it gives me None. This is the page link: https://rejestradwokatow.pl/adwokat/abaewicz-dominik-49965


Comments (2)

暮年慕年 2025-02-18 14:33:51

Try:

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        }


    def parse(self, response):
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)
            
    def parse_book(self, response):
        # the email address is split across two data attributes on the page
        e1 = response.xpath('(//*[@class="address_e"])[1]//@data-ea').get()
        e2 = response.xpath('(//*[@class="address_e"])[1]//@data-eb').get()
        if e1 and e2:
            yield {
                # step from the 'Status:' label span up to its parent, then into the value <div>
                'status': response.xpath("//span[contains(text(), 'Status:')]/../div/text()").get(),
                'email': f'{e1}@{e2}',
                'url': response.url,
                }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(TestSpider)
    process.start()

These XPath expressions will help you pull all the data from all 5 rows; a sketch of how to use them follows the expressions:

//span[contains(text(), 'Status:')]/../following-sibling::div[1]/div

//span[contains(text(), 'Status:')]/../following-sibling::div[1]/span
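
For instance, a minimal sketch of a parse_book that walks several labelled rows with these expressions. 'Status:' and 'Email:' are the two labels that appear in this thread; the dict keys and the child-element union are illustrative assumptions about the page layout:

    def parse_book(self, response):
        item = {'url': response.url}
        # 'Status:' comes from the question; 'Email:' appears in the other answer
        for label in ('Status:', 'Email:'):
            row = response.xpath(
                f"//span[contains(text(), '{label}')]/../following-sibling::div[1]")
            # a row's value may sit in a child <div> or <span>, per the two expressions above
            texts = row.xpath('./div//text() | ./span//text()').getall()
            item[label.rstrip(':').lower()] = ' '.join(t.strip() for t in texts if t.strip())
        yield item
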
芸娘子的小脾气 2025-02-18 14:33:51

I will show an example without using Scrapy. I hope you understand it and can adapt it to your code. The only difficulty is that the email consists of 2 parts stored inside attributes:

import requests
from bs4 import BeautifulSoup

url = "https://rejestradwokatow.pl/adwokat/abaewicz-dominik-49965"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
# the value lives in the <div> that follows each label <span>
status = soup.find('span', string='Status:').findNext('div').getText()
# the email is split across two data attributes on the <div> after 'Email:'
data_ea = soup.find('span', string='Email:').findNext('div').get('data-ea')
data_eb = soup.find('span', string='Email:').findNext('div').get('data-eb')
email = f"{data_ea}@{data_eb}"

print(status, email)

OUTPUT:

Wykonujący zawód [email protected]
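
If you want to stay inside the Scrapy spider, the same two-attribute trick translates directly to Scrapy selectors. A minimal sketch, assuming the value <div> sits next to each label <span>, as the XPath in the other answer implies:

    def parse_book(self, response):
        status = response.xpath("//span[contains(text(), 'Status:')]/../div/text()").get()
        email_div = response.xpath("//span[contains(text(), 'Email:')]/../div")
        ea = email_div.xpath('@data-ea').get()
        eb = email_div.xpath('@data-eb').get()
        # join the two halves with '@' only when both attributes are present
        email = f'{ea}@{eb}' if ea and eb else None
        yield {'status': status, 'email': email, 'url': response.url}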