使用 Scrapy 抓取数据
import requests
import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup
class TestSpider(scrapy.Spider):
    """Follow the detail links on the listing page and print each Status value.

    ``parse`` collects the links found in ``td.icon_link`` cells of the page
    in ``start_urls``; ``parse_book`` handles every linked detail page.
    """

    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    # One request per domain at a time, a one-second download delay, and a
    # browser-like user agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Yield one request per detail-page link found on the listing page."""
        profile_links = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for link in profile_links:
            # Links may be relative, so resolve them against the page URL.
            yield Request(response.urljoin(link), callback=self.parse_book)

    def parse_book(self, response):
        """Print the text next to the "Status:" label of every detail block.

        Blocks that contain no "Status:" label print ``None``.
        """
        for block in response.xpath("//div[@class='line_list_K']"):
            # The leading "." keeps the query relative to this block; a bare
            # "//" would search the whole document on every iteration.
            # NOTE(review): assumes the value <div> is the next sibling of the
            # label <span>, not nested inside it - confirm against the page
            # markup.
            status = block.xpath(
                ".//span[contains(text(), 'Status:')]"
                "/following-sibling::div[1]/text()"
            ).get()
            print(status)
我正在尝试获取 Status
和 Email
这两个字段的数据,但得到的结果是 None。这是页面链接:https://rejestradwokatow.pl/adwokat/abaewicz-dominik-49965
import requests
import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup
class TestSpider(scrapy.Spider):
    """Follow the detail links on the listing page and print each Status value.

    ``parse`` collects the links found in ``td.icon_link`` cells of the page
    in ``start_urls``; ``parse_book`` handles every linked detail page.
    """

    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    # One request per domain at a time, a one-second download delay, and a
    # browser-like user agent string.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Yield one request per detail-page link found on the listing page."""
        profile_links = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for link in profile_links:
            # Links may be relative, so resolve them against the page URL.
            yield Request(response.urljoin(link), callback=self.parse_book)

    def parse_book(self, response):
        """Print the text next to the "Status:" label of every detail block.

        Blocks that contain no "Status:" label print ``None``.
        """
        for block in response.xpath("//div[@class='line_list_K']"):
            # The leading "." keeps the query relative to this block; a bare
            # "//" would search the whole document on every iteration.
            # NOTE(review): assumes the value <div> is the next sibling of the
            # label <span>, not nested inside it - confirm against the page
            # markup.
            status = block.xpath(
                ".//span[contains(text(), 'Status:')]"
                "/following-sibling::div[1]/text()"
            ).get()
            print(status)
I am trying to grab the data from Status
and the data from Email,
but it gives me None. This is the page link: https://rejestradwokatow.pl/adwokat/abaewicz-dominik-49965
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
data:image/s3,"s3://crabby-images/d5906/d59060df4059a6cc364216c4d63ceec29ef7fe66" alt="扫码二维码加入Web技术交流群"
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
尝试:
此XPath表达式将帮助您拉出所有5行的所有数据
Try:
This XPath expression will help you pull all the data from all 5 rows, like:
我将展示一个不使用 Scrapy 的示例。希望您能理解并将其应用到您的代码中。唯一的难点是该电子邮件由属性中的 2 个部分组成。
输出:
I will show an example without using Scrapy. I hope you understand and can adapt it to your code. The only difficulty is that the email consists of 2 parts inside the attributes.
OUTPUT: