scrapy.Request returns <GET url> without scraping anything


I wanted to scrape the feed of sitepoint.com. This is my code:

import scrapy
from urllib.parse import urljoin


class SitepointSpider(scrapy.Spider):
    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    def parse(self, response):
        data = []
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()
            url = urljoin("https://sitepoint.com", href)
            text = scrapy.Request(url, callback=self.parse_article)

            data.append(
                {"title": title, "href": href, "img": img, "time": time, "text": text}
            )
        yield data

    def parse_article(self, response):
        text = response.xpath(
           '//*[@id="main-content"]/article/div/div/div[1]/section/text()'
        ).extract()
        yield text

And this is the output I get:

[{'title': 'How to Build an MVP with React and Firebase', 
'href': '/react-firebase-build-mvp/', 
'img': 'https://uploads.sitepoint.com/wp-content/uploads/2021/09/1632802723react-firebase-mvp-app.jpg', 
'time': 'September 28, 2021', 
'text': <GET https://sitepoint.com/react-firebase-build-mvp/>}]

It just does not scrape the URLs. I followed everything said in this question but still could not make it work.


Answer from 她比我温柔:


You have to visit the detail page from the listing page in order to scrape the article text.

In that case, you have to yield a Request for each article URL first and then yield the scraped data from the request's callback.
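
As a minimal sketch of that flow (not the answer's final code: it uses response.follow, which resolves relative hrefs itself, and cb_kwargs, which Scrapy 1.7+ recommends over meta for passing data to callbacks):

import scrapy


class SitepointSketchSpider(scrapy.Spider):
    name = "sitepoint_sketch"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["https://www.sitepoint.com/javascript/"]

    def parse(self, response):
        for article in response.css("article"):
            href = article.css("a.t12xxw3g::attr(href)").get()
            if not href:
                continue
            # The Request must be yielded so Scrapy schedules and downloads it;
            # assigning it to a variable (as in the question) never sends it.
            yield response.follow(href, callback=self.parse_article,
                                  cb_kwargs={"href": href})

    def parse_article(self, response, href):
        # cb_kwargs entries arrive as keyword arguments on the callback.
        yield {"href": href, "text": response.css("section ::text").getall()}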

Also, the XPath //*[@id="main-content"]/article/div/div/div[1]/section/text() won't return you any text, because the text sits in HTML elements nested under the section tag rather than directly inside it.
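
You can verify this in scrapy shell; a quick sketch (the article URL is taken from the question's output, and the XPath is assumed to still match the page):

# scrapy shell https://www.sitepoint.com/react-firebase-build-mvp/
# Direct text() children of the section are mostly whitespace:
response.xpath('//*[@id="main-content"]/article/div/div/div[1]/section/text()').getall()
# The descendant axis //text() reaches the text nested in <p>, <h2>, <a> and so on:
response.xpath('//*[@id="main-content"]/article/div/div/div[1]/section//text()').getall()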

One solution is to scrape all of the HTML inside the section tag and clean it afterwards to get the article text.

Here is the full working code:

import re

import scrapy
from urllib.parse import urljoin


class SitepointSpider(scrapy.Spider):
    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    def clean_text(self, raw_html):
        """
        :param raw_html: this will take raw html code
        :return: text without html tags
        """
        cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        return re.sub(cleaner, '', raw_html)

    def parse(self, response):
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()
            url = urljoin("https://sitepoint.com", href)
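            # Hand the listing fields to the detail-page callback via meta;
            # they are read back from response.request.meta in parse_article.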
            yield scrapy.Request(url, callback=self.parse_article, meta={"title": title,
                                                                         "href": href,
                                                                         "img": img,
                                                                         "time": time})

    def parse_article(self, response):
        title = response.request.meta["title"]
        href = response.request.meta["href"]
        img = response.request.meta["img"]
        time = response.request.meta["time"]
        all_data = {}
        article_html = response.xpath('//*[@id="main-content"]/article/div/div/div[1]/section').get()
        all_data["title"] = title
        all_data["href"] = href
        all_data["img"] = img
        all_data["time"] = time
        all_data["text"] = self.clean_text(article_html)

        yield all_data
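
As a side note, a regex is a blunt tool for stripping HTML. Scrapy already depends on the w3lib package, so one alternative (a sketch doing the same job as the clean_text method above) is to let w3lib handle the markup and the entities:

from w3lib.html import remove_tags, replace_entities


def clean_text(self, raw_html):
    # remove_tags strips the markup; replace_entities decodes HTML
    # entities such as &amp;, which the regex covered in one pattern.
    return replace_entities(remove_tags(raw_html))

With the spider saved in a Scrapy project, scrapy crawl sitepoint -O articles.json runs it and writes the scraped items to a JSON file (the -O overwrite flag needs Scrapy 2.1+; on older versions use -o).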