使用 BeautifulSoup 抓取多个网页的问题
我正在抓取一个网址(例如: https://bitinfocharts.com/ top-100-richest-dogecoin-addresses-4.html),URL 末尾的数字是页码。我正在尝试抓取多个页面,因此我使用以下代码来循环访问多个页面:
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
当我在脚本中运行代码并打印页面时,它返回 4、5 和 6,这意味着它应该可以工作。然而,每当我运行完整的代码时,它只给出第六页的结果。
我认为可能发生的情况是,代码正在最终确定最后一个数字并将其格式化为 URL,而每当它应该将每个数字格式化为 URL 时。
我尝试查看其他有类似问题的人,但未能找到解决方案。我相信这可能是代码格式错误,但我不太确定。非常感谢任何建议。谢谢。
这是我的代码的其余部分:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl
# define 1-1-2020 as a datetime object
after_date = datetime(2021, 1, 1)
with requests.Session() as s:
s.headers = {"User-Agent": "Safari/537.36"}
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
soup = bs(r.content, 'lxml')
# select all tr elements (minus the first one, which is the header)
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
children = element.contents # get children of table element
url = children[1].a['href']
last_out_str = children[8].text
if last_out_str != "": # check to make sure the date field isn't empty
last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z") # load date into datetime object for comparison
if last_out > after_date: # if check to see if the date is after last_out
address_links.append(url + '-full') #add adddress_links to the list, -full makes the link show all data
print(address_links)
for url in address_links: #loop through the urls in address_links list
r = s.get(url)
soup = bs(r.content, 'lxml')
ad2 = (soup.title.string) #grab the web title which is used for the filename
ad2 = ad2.replace('Dogecoin', '')
ad2 = ad2.replace('Address', '')
ad2 = ad2.replace('-', '')
filename = ad2.replace(' ', '')
sections = soup.find_all(class_='table-striped')
for section in sections: #This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
oldprofit = section.find_all('td')[11].text #Get the profit
removetext = oldprofit.replace('USD', '')
removetext = removetext.replace(' ', '')
removetext = removetext.replace(',', '')
profit = float(removetext)
balance = section.find_all('td')[0].text #Get the wallet balance
amount_recieved = section.find_all('td')[3].text #Get amount recieved
ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')] #Filter out text from
# amount recieved
ins = ins.replace('ins', '')
ins = ins.replace(' ', '')
ins = float(ins)
first_recieved = section.find_all('td')[4].text #Get the data of the first incoming transaction
fr = first_recieved.replace('first', '')
fr = fr.replace(':', '')
fr = fr.replace(' ', '')
last_recieved = section.find_all('td')[5].text #Get the date of the last incoming transaction
lr = last_recieved.replace('last', '')
lr = lr.replace(':', '')
lr = lr.replace(' ', '')
amount_sent = section.find_all('td')[7].text #Get the amount sent
outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')] #Filter out the text
outs = outs.replace('outs', '')
outs = outs.replace(' ', '')
outs = float(outs)
first_sent = section.find_all('td')[8].text #Get the first outgoing transaction date
fs = first_sent.replace('first', '') #clean up first outgoing transaction date
fs = fs.replace(':', '')
fs = fs.replace(' ', '')
last_sent = section.find_all('td')[9].text #Get the last outgoing transaction date
ls = last_sent.replace('last', '') #Clean up last outgoing transaction date
ls = ls.replace(':', '')
ls = ls.replace(' ', '')
dbalance = section.find_all('td')[0].select('b') #get the balance of doge
dusd = section.find_all('td')[0].select('span')[1] #get balance of USD
for data in dbalance: #used to clean the text up
balance = data.text
for data1 in dusd: #used to clean the text up
usd = data1.text
# Compare profit to goal, if profit doesn't meet the goal, the URL is not scraped
goal = float(30000)
if profit < goal:
continue
#Select wallets with under 2000 transactions
trans = float(ins + outs) #adds the amount of incoming and outgoing transactions
trans_limit = float(2000)
if trans > trans_limit:
continue
# Create Info Dataframe using the data from above
info = {
'Balance': [balance],
'USD Value': [usd],
'Wallet Profit': [profit],
'Amount Recieved': [amount_recieved],
'First Recieved': [fr],
'Last Recieved': [lr],
'Amount Sent': [amount_sent],
'First Sent': [fs],
'Last Sent': [ls],
}
gf = pd.DataFrame(info)
a = 'a'
if a:
df = \
pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
index_col=None, header=[0])[0] #uses pandas to read the dataframe and save it
directory = '/Users/chris/Desktop/Files' #directory for the file to go to
file = f'{filename}.xlsx'
writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
with pd.ExcelWriter(writer) as writer:
df.to_excel(writer, sheet_name='transactions')
gf.to_excel(writer, sheet_name='info')
I am scraping a URL (example: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-4.html) and the number on the end of the URL is the page number. I am trying to scrape multiple pages, so I used the following code to loop through the multiple pages:
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
When I run the code in my script and print the page, it returns 4, 5 and 6, meaning that it should be working. However whenever I run the full code, it only gives me the results for the 6th page.
What I think may be happening is the code is finalizing on the last number and formatting that into the URL, whenever it should formatting each number into the URL instead.
I have tried looking at other people with similar issues but haven't been able to find a solution. I believe this may be a code formatting error but I am not exactly sure. Any advice is greatly appreciated. Thank you.
Here is the remainder of my code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl
# define 1-1-2020 as a datetime object
after_date = datetime(2021, 1, 1)
with requests.Session() as s:
s.headers = {"User-Agent": "Safari/537.36"}
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
soup = bs(r.content, 'lxml')
# select all tr elements (minus the first one, which is the header)
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
children = element.contents # get children of table element
url = children[1].a['href']
last_out_str = children[8].text
if last_out_str != "": # check to make sure the date field isn't empty
last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z") # load date into datetime object for comparison
if last_out > after_date: # if check to see if the date is after last_out
address_links.append(url + '-full') #add adddress_links to the list, -full makes the link show all data
print(address_links)
for url in address_links: #loop through the urls in address_links list
r = s.get(url)
soup = bs(r.content, 'lxml')
ad2 = (soup.title.string) #grab the web title which is used for the filename
ad2 = ad2.replace('Dogecoin', '')
ad2 = ad2.replace('Address', '')
ad2 = ad2.replace('-', '')
filename = ad2.replace(' ', '')
sections = soup.find_all(class_='table-striped')
for section in sections: #This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
oldprofit = section.find_all('td')[11].text #Get the profit
removetext = oldprofit.replace('USD', '')
removetext = removetext.replace(' ', '')
removetext = removetext.replace(',', '')
profit = float(removetext)
balance = section.find_all('td')[0].text #Get the wallet balance
amount_recieved = section.find_all('td')[3].text #Get amount recieved
ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')] #Filter out text from
# amount recieved
ins = ins.replace('ins', '')
ins = ins.replace(' ', '')
ins = float(ins)
first_recieved = section.find_all('td')[4].text #Get the data of the first incoming transaction
fr = first_recieved.replace('first', '')
fr = fr.replace(':', '')
fr = fr.replace(' ', '')
last_recieved = section.find_all('td')[5].text #Get the date of the last incoming transaction
lr = last_recieved.replace('last', '')
lr = lr.replace(':', '')
lr = lr.replace(' ', '')
amount_sent = section.find_all('td')[7].text #Get the amount sent
outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')] #Filter out the text
outs = outs.replace('outs', '')
outs = outs.replace(' ', '')
outs = float(outs)
first_sent = section.find_all('td')[8].text #Get the first outgoing transaction date
fs = first_sent.replace('first', '') #clean up first outgoing transaction date
fs = fs.replace(':', '')
fs = fs.replace(' ', '')
last_sent = section.find_all('td')[9].text #Get the last outgoing transaction date
ls = last_sent.replace('last', '') #Clean up last outgoing transaction date
ls = ls.replace(':', '')
ls = ls.replace(' ', '')
dbalance = section.find_all('td')[0].select('b') #get the balance of doge
dusd = section.find_all('td')[0].select('span')[1] #get balance of USD
for data in dbalance: #used to clean the text up
balance = data.text
for data1 in dusd: #used to clean the text up
usd = data1.text
# Compare profit to goal, if profit doesn't meet the goal, the URL is not scraped
goal = float(30000)
if profit < goal:
continue
#Select wallets with under 2000 transactions
trans = float(ins + outs) #adds the amount of incoming and outgoing transactions
trans_limit = float(2000)
if trans > trans_limit:
continue
# Create Info Dataframe using the data from above
info = {
'Balance': [balance],
'USD Value': [usd],
'Wallet Profit': [profit],
'Amount Recieved': [amount_recieved],
'First Recieved': [fr],
'Last Recieved': [lr],
'Amount Sent': [amount_sent],
'First Sent': [fs],
'Last Sent': [ls],
}
gf = pd.DataFrame(info)
a = 'a'
if a:
df = \
pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
index_col=None, header=[0])[0] #uses pandas to read the dataframe and save it
directory = '/Users/chris/Desktop/Files' #directory for the file to go to
file = f'{filename}.xlsx'
writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
with pd.ExcelWriter(writer) as writer:
df.to_excel(writer, sheet_name='transactions')
gf.to_excel(writer, sheet_name='info')
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
检查您的缩进 - 在您的问题中,循环位于同一级别,因此发出请求的循环正在所有页面上迭代,但在迭代完成之前永远不会处理结果。这就是为什么它只适用于最后一页。
移动循环,它应该处理响应并将元素提取到第一个循环中:
Check your indentation - In your question the loops are on the same level, so loop that make the requests is iterating over all the pages but results are never processed until iterating is done. That is why it only works for the last page.
Move your loops, that should handle the response and extract elements into your first loop: