如何使用多线程改进我的网页抓取代码?
这是我的代码。它逐页抓取网页并将数据提取到 CSV 文件(可用 Excel 打开)。它通过提取当前页面分页中存在的锚标记来获取下一页链接。
目前速度很慢;有人可以通过使用多线程或其他方式帮助加快速度吗?
import requests
from urllib3.exceptions import InsecureRequestWarning
import csv
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
# Open the output CSV once at import time; newline='' lets the csv writer
# control line endings (per the csv module docs). The original leaked the
# file object -- keep a reference and close it cleanly at interpreter exit.
import atexit

_gem_file = open('GEM.csv', 'w', newline='')
atexit.register(_gem_file.close)
f = csv.writer(_gem_file)
# NOTE(review): 'Quantitiy' typo kept byte-for-byte -- downstream consumers
# of GEM.csv may depend on this exact header row.
f.writerow(['Bidnumber', 'Items', 'Quantitiy', 'Department', 'Enddate','pageNumber'])
def scrap_bid_data():
page_no = 1
url = ""
while page_no <= 532:
print('Hold on creating URL to fetch data for...'+str(page_no))
if page_no == 2:
url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + "AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI"
if page_no == 1:
url = 'https://bidplus.gem.gov.in/bidlists?bidlists'
print('URL created: ' + url)
scraped_data = requests.get(url, verify=False)
soup_data = bs(scraped_data.text, 'lxml')
nextlink = soup_data.find('a', {'rel': 'next'})
nxt = nextlink['href'].split('=')[1]
extracted_data = soup_data.find('div', {'id': 'pagi_content'})
if len(extracted_data) == 0:
break
else:
for idx in range(len(extracted_data)):
if (idx % 2 == 1):
bid_data = extracted_data.contents[idx].text.strip().split('\n')
if (len(bid_data) > 1):
print(page_no)
if (len(bid_data[8]) > 1 and len(bid_data[10].split(':')) > 1):
bidno = bid_data[0].split(":")[-1]
items = bid_data[9].strip().split('Items:')[-1]
qnty = int(bid_data[10].split(':')[1].strip())
dept = (bid_data[11] + bid_data[16].strip()).split(":")[-1]
edate = bid_data[21].split("End Date:")[-1]
f.writerow([bidno, items, qnty, dept, edate,page_no])
page_no=page_no+1
url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' +nxt
print('printing the next url')
print(url)
scrap_bid_data()
This is my code. It scrapes the site page by page and writes the extracted data to a CSV file (openable in Excel). It obtains the next-page link by extracting the anchor tag present in the pagination of the current page.
Currently it is slow; can someone please help to make it fast by using multithreading or anything else?
import requests
from urllib3.exceptions import InsecureRequestWarning
import csv
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
# Open the output CSV once at import time; newline='' lets the csv writer
# control line endings (per the csv module docs). The original leaked the
# file object -- keep a reference and close it cleanly at interpreter exit.
import atexit

_gem_file = open('GEM.csv', 'w', newline='')
atexit.register(_gem_file.close)
f = csv.writer(_gem_file)
# NOTE(review): 'Quantitiy' typo kept byte-for-byte -- downstream consumers
# of GEM.csv may depend on this exact header row.
f.writerow(['Bidnumber', 'Items', 'Quantitiy', 'Department', 'Enddate','pageNumber'])
def scrap_bid_data():
page_no = 1
url = ""
while page_no <= 532:
print('Hold on creating URL to fetch data for...'+str(page_no))
if page_no == 2:
url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + "AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI"
if page_no == 1:
url = 'https://bidplus.gem.gov.in/bidlists?bidlists'
print('URL created: ' + url)
scraped_data = requests.get(url, verify=False)
soup_data = bs(scraped_data.text, 'lxml')
nextlink = soup_data.find('a', {'rel': 'next'})
nxt = nextlink['href'].split('=')[1]
extracted_data = soup_data.find('div', {'id': 'pagi_content'})
if len(extracted_data) == 0:
break
else:
for idx in range(len(extracted_data)):
if (idx % 2 == 1):
bid_data = extracted_data.contents[idx].text.strip().split('\n')
if (len(bid_data) > 1):
print(page_no)
if (len(bid_data[8]) > 1 and len(bid_data[10].split(':')) > 1):
bidno = bid_data[0].split(":")[-1]
items = bid_data[9].strip().split('Items:')[-1]
qnty = int(bid_data[10].split(':')[1].strip())
dept = (bid_data[11] + bid_data[16].strip()).split(":")[-1]
edate = bid_data[21].split("End Date:")[-1]
f.writerow([bidno, items, qnty, dept, edate,page_no])
page_no=page_no+1
url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' +nxt
print('printing the next url')
print(url)
scrap_bid_data()
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论