How can I use multithreading to improve my web scraping code?

Posted on 2025-01-10 15:20:15

This is my code. It scrapes the site page by page and writes the extracted data to a CSV file (GEM.csv). It finds the next-page link by extracting the anchor tag in the current page's pagination.

Currently it is slow; can someone please help me speed it up using multithreading or some other approach?

import csv

import requests
from bs4 import BeautifulSoup as bs
from urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Open the output CSV and write the header row.
f = csv.writer(open('GEM.csv', 'w', newline=''))
f.writerow(['Bidnumber', 'Items', 'Quantity', 'Department', 'Enddate', 'pageNumber'])


def scrap_bid_data():
    page_no = 1
    url = ''
    while page_no <= 532:
        print('Hold on, creating URL to fetch data for page ' + str(page_no))
        # The first two page URLs are hardcoded; later ones are rebuilt from
        # the "next" link discovered on the previous page.
        if page_no == 2:
            url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + 'AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI'
        if page_no == 1:
            url = 'https://bidplus.gem.gov.in/bidlists?bidlists'

        print('URL created: ' + url)
        scraped_data = requests.get(url, verify=False)
        soup_data = bs(scraped_data.text, 'lxml')

        # Pagination token for the next page, taken from the <a rel="next"> link.
        nextlink = soup_data.find('a', {'rel': 'next'})
        if nextlink is None:
            break
        nxt = nextlink['href'].split('=')[1]

        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
        if extracted_data is None or len(extracted_data) == 0:
            break
        # Every second child of the container holds one bid listing.
        for idx in range(len(extracted_data)):
            if idx % 2 == 1:
                bid_data = extracted_data.contents[idx].text.strip().split('\n')
                if len(bid_data) > 1:
                    print(page_no)
                    if len(bid_data[8]) > 1 and len(bid_data[10].split(':')) > 1:
                        bidno = bid_data[0].split(':')[-1]
                        items = bid_data[9].strip().split('Items:')[-1]
                        qnty = int(bid_data[10].split(':')[1].strip())
                        dept = (bid_data[11] + bid_data[16].strip()).split(':')[-1]
                        edate = bid_data[21].split('End Date:')[-1]
                        f.writerow([bidno, items, qnty, dept, edate, page_no])

        page_no += 1
        url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + nxt
        print('Next URL: ' + url)


scrap_bid_data()
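Because each page's URL is only discovered from the previous page's rel="next" link, the GET requests themselves can't simply be fanned out across threads. Below is a minimal sketch, not a drop-in answer, of one way to still get some overlap: the main thread walks the pagination sequentially over a reused requests.Session (connection keep-alive is itself a speedup), while a ThreadPoolExecutor parses each downloaded page and writes rows under a lock. The parse_page/crawl names, the worker count of 8, and the field indices carried over from the code above are illustrative assumptions.

import csv
import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup as bs
from urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

BASE_URL = 'https://bidplus.gem.gov.in/bidlists?bidlists'
csv_lock = threading.Lock()  # csv.writer is not thread-safe; serialise writes


def parse_page(html, page_no, writer):
    """Parse one page of listings and append its rows to the shared CSV."""
    soup = bs(html, 'lxml')
    container = soup.find('div', {'id': 'pagi_content'})
    if container is None:
        return
    for idx in range(len(container.contents)):
        if idx % 2 != 1:  # every second child holds one bid listing
            continue
        bid_data = container.contents[idx].text.strip().split('\n')
        if len(bid_data) > 1 and len(bid_data[8]) > 1 and len(bid_data[10].split(':')) > 1:
            row = [
                bid_data[0].split(':')[-1],                            # Bidnumber
                bid_data[9].strip().split('Items:')[-1],               # Items
                int(bid_data[10].split(':')[1].strip()),               # Quantity
                (bid_data[11] + bid_data[16].strip()).split(':')[-1],  # Department
                bid_data[21].split('End Date:')[-1],                   # Enddate
                page_no,
            ]
            with csv_lock:
                writer.writerow(row)


def crawl(max_pages=532, workers=8):
    with open('GEM.csv', 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(['Bidnumber', 'Items', 'Quantity', 'Department',
                         'Enddate', 'pageNumber'])
        session = requests.Session()  # keep-alive: reuse one TCP/TLS connection
        url, page_no = BASE_URL, 1
        with ThreadPoolExecutor(max_workers=workers) as pool:
            while url and page_no <= max_pages:
                resp = session.get(url, verify=False)
                # Hand the HTML to a worker so parsing and CSV writing
                # overlap with the next network round trip.
                pool.submit(parse_page, resp.text, page_no, writer)
                # The fetch itself stays sequential: the next URL is only
                # known after this page has been downloaded.
                nextlink = bs(resp.text, 'lxml').find('a', {'rel': 'next'})
                url = (BASE_URL + '&page_no=' + nextlink['href'].split('=')[1]
                       if nextlink else None)
                page_no += 1


crawl()

Two caveats with this sketch: rows are written as workers finish, so they may not land in strict page order (the pageNumber column preserves the mapping), and the big win of fully parallel GETs would require the page_no tokens to be enumerable up front, which this token-based pagination prevents. The verify=False flag is kept from the original but disables TLS certificate verification.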

