Web Crawler - Iterating over Postgres database results with Scrapy

Posted 2025-01-11 19:46:45

I'm trying to write a scraper that gets domains from a database result. I'm able to get the data from the database, but I can't wrap my head around how to feed it to Scrapy. I've looked around and found many suggestions, but none of them match what I'm doing. When I run the code below, nothing happens, not even an error.

scraper.py

#import json
import json

#import database library
import psycopg2

#import scrapy library
import scrapy

#create database connection
conn = psycopg2.connect(
    host="localhost",
    database="mydb",
    user="dbuser",
    password="postgres",
    port=5432
)

#create cursor from database
#cursor() is python equivalent to query() to fetch the rows
query = conn.cursor()

#execute query from database
query.execute('SELECT info FROM domains')

#create scrapy class
class MySpider(scrapy.Spider):  
    name = "scrap_domains"

    #start_requests with scrapy
    def start_requests(self):

        #iterate over database result
        for url in query:

            #iterate over each json object
            for item in url:

                #get domain name
                domain_name = item['domain']

                #grab information from url
                yield scrapy.Request()

        #print response
        def parse(self, response):
            print(response)

# we close the cursor and conn both
query.close()
conn.close()


Comments (1)

故人如初 2025-01-18 19:46:46

I finally got my scraper working. The problem was caused by closing the cursor and database connection on every iteration. Python is not async like Node, as I've been learning. A function should detect when the iteration is finished and only then proceed with further tasks, but for the purposes of this example we simply comment those calls out at the bottom of the file. I'm posting a detailed answer for future reference.

Notes: I use this scraper to crawl a list of 300 million records stored in my database. Just change the per-page limit and the code below will do the rest until it's all done. When it's finished, grab your JSON file and upload it to your database. I suffered so that you don't have to.
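
A minimal sketch of that idea, using Scrapy's built-in spider lifecycle hook: a spider's closed(reason) method runs exactly once when the crawl ends, which is a natural place to release the connection. (The full spider below keeps the original module-level style instead.)

import psycopg2
import scrapy

class MySpider(scrapy.Spider):
    name = "domains"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        #connection parameters mirror the ones in the full spider below
        self.conn = psycopg2.connect(host="localhost", database="mydb",
                                     user="username", password="postgres",
                                     port=5432)
        self.cursor = self.conn.cursor()

    def closed(self, reason):
        #Scrapy calls closed() once when the spider finishes, so the
        #cursor and connection are released only after the crawl is done
        self.cursor.close()
        self.conn.close()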

I'm using PostgreSQL and store the data as JSONB. My table has only 2 columns and looks like this:

id (int) | info (jsonb)
1        | {"domain": "weerstation-aarschot.be","timestamp":1646106745}
2        | {"domain": "wereldvakanties.be","timestamp":1646106746}
3        | {"domain": "welzijnscentrum.be","timestamp":1646106747}
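
Since the domain lives inside the jsonb column, Postgres can also unpack it in SQL with the ->> operator, so the spider would receive plain strings instead of whole JSON objects. A small sketch of that variation, using the same cursor as in the script below (the script itself keeps the original SELECT info style):

#select the domain key straight out of the jsonb column
query.execute("SELECT info->>'domain' FROM domains ORDER BY id ASC")
for (domain,) in query:
    print(domain)  #e.g. weerstation-aarschot.be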

As per the Scrapy documentation, copy/paste the code below and run this command in your terminal to write all domains to a JSON file:

scrapy runspider scraper.py -o domains.json
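
Note that in recent Scrapy releases -o appends to an existing file while -O (capital) overwrites it. The same feed can also be declared on the spider itself via the FEEDS setting; a sketch, assuming Scrapy 2.4+ where the overwrite feed option exists:

#equivalent to the -o flag, configured in code instead of on the command line
class MySpider(scrapy.Spider):
    name = "domains"
    custom_settings = {
        "FEEDS": {
            "domains.json": {"format": "json", "overwrite": True},
        },
    }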

Use Selectors to extract HTML data from the response body.

scraper.py

#import datetime
from datetime import datetime

#calendar and time for unix timestamps
import calendar
import time

#import json
import json

#import database library
import psycopg2

#import scrapy library
import scrapy

#create database connection
conn = psycopg2.connect(
    host="localhost",
    database="mydb",
    user="username",
    password="postgres",
    port=5432
)

#create cursor from database
#a cursor is psycopg2's equivalent of query(): it executes SQL and fetches rows
query = conn.cursor()

#spiderclass
class MySpider(scrapy.Spider):  
    name = "domains"

    def start_requests(self):

        #database pagination
        #loop through extremely large datasets automatically
        offset = 0
        limit = 1000000
        flag = True

        #while there are rows left
        while flag:

            #execute query from database (parametrized instead of string concatenation)
            query.execute("SELECT info FROM domains ORDER BY id ASC LIMIT %s OFFSET %s", (limit, offset))

            #fetch the whole page at once; fetchone() followed by iterating
            #the cursor would silently drop the first row of every page
            rows = query.fetchall()

            #condition: an empty page means we are done
            if not rows:
                flag = False
            else:

                #advance to the next page
                offset += limit

                #iterate over database result
                for row in rows:

                    #each row is a 1-tuple holding one jsonb object
                    for item in row:

                        #variables from result
                        hostname = item['domain']
                        https_url = "https://" + hostname
                        http_url = "http://" + hostname  #plain-http fallback, unused below

                        #fetch http request
                        yield scrapy.Request(url=https_url, callback=self.parse)

    #print response
    def parse(self, response):

        #current date
        currDate = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        timestamp = calendar.timegm(time.gmtime())
        date_created = currDate

        #variables from response
        url = response.url
        status = response.status
        headers = response.headers
        request = response.request
        body = response.body

        #header information
        content_language = ""
        protocol = ""

        #meta tags
        favicon = response.css('link[rel~=icon]::attr(href)').get()
        title = response.xpath('//title/text()').get()
        description = response.xpath('//meta[@name="description"]/@content').get()
        keywords = response.xpath('//meta[@name="keywords"]/@content').get()
        author = response.xpath('//meta[@name="author"]/@content').get()
        content_type = response.headers.get('Content-Type', b'').decode()

        #open graph tags
        og_title = ""
        og_type = ""
        og_url = ""
        og_image = ""
        og_site_name = ""
        og_description = ""

        #get all the links to follow
        links = response.css('a::attr(href)').getall()

        #get text from every header tag (<h>)
        h1_text = response.css('h1::text').getall()
        h2_text = response.css('h2::text').getall()
        h3_text = response.css('h3::text').getall()

        #get span text
        span_text = response.css('span::text').getall()

        #get text from every paragraph tag (<p>)
        p_text = response.css('p::text').getall()

        #get text from every div tag (<div>)
        div_text = response.css('div::text').getall()

        #get every image
        images = response.css('img').xpath('@src').getall()

        #get every video
        videos = []

        #category, score
        websiteScore = ""

        #grab information from url
        yield {
            'url': url,
            'status': status,
            "score": websiteScore,
            "type": content_type,
            "category": "",
            "industry": "",
            "timestamp": timestamp,
            "date_created": date_created,
            "headers": {
                "content_language": content_language,
                "protocol": protocol,
            },
            "metas": [
                {
                    "favicon": favicon,
                    "title": title,
                    "description": description,
                    "keywords": keywords,
                    "author": author,
                }
            ],
            "open_graph": [
                {
                    "og_title": og_title,
                    "og_type": og_type,
                    "og_url": og_url,
                    "og_image": og_image,
                    "og_site_name": og_site_name,
                    "og_description": og_description,
                }
            ],
            "links": links,
            "h1_text": h1_text,
            "h2_text": h2_text,
            "h3_text": h3_text,
            "div_text": div_text,
            "p_text": p_text,
            "span_text": span_text,
            "images": images,
            "videos": videos,
        }

        #print result
        #print(request)

# close the cursor and connection only after the crawl has finished;
# running these at import time is what broke the original spider
#query.close()
#conn.close()
#scrapy runspider scraper.py -o domains.json
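
One more note on the pagination: LIMIT ... OFFSET makes Postgres scan and discard every skipped row, so each page gets slower than the last over 300 million records. A keyset variant, a sketch of my own and not part of the original answer, pages by the last seen id instead and stays fast throughout:

    def start_requests(self):
        last_id = 0
        page_size = 10000  #hypothetical page size
        while True:
            #seek straight to the next page through the primary-key index
            query.execute(
                "SELECT id, info FROM domains WHERE id > %s ORDER BY id ASC LIMIT %s",
                (last_id, page_size),
            )
            rows = query.fetchall()
            if not rows:
                break
            for row_id, item in rows:
                last_id = row_id
                yield scrapy.Request(url="https://" + item['domain'], callback=self.parse)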

domains.json (example output)

[
{"url": "https://weerstation-aarschot.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535167.804621, "date_created": "2022-03-05 21:52:47", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
{"url": "https://wereldvakanties.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535168.069924, "date_created": "2022-03-05 21:52:48", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
{"url": "https://welzijnscentrum.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535168.096689, "date_created": "2022-03-05 21:52:48", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
]
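
To get the finished file back into the database, one option is a short loader script; a minimal sketch, assuming a hypothetical results table with a single jsonb column named info:

import json
import psycopg2

conn = psycopg2.connect(host="localhost", database="mydb",
                        user="username", password="postgres", port=5432)
cur = conn.cursor()

#read the feed Scrapy produced
with open("domains.json") as f:
    records = json.load(f)

#store each scraped record as one jsonb row
for record in records:
    cur.execute("INSERT INTO results (info) VALUES (%s)", (json.dumps(record),))

conn.commit()
cur.close()
conn.close()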
