How can I use Python multiprocessing in web scraping and then append the scraped text to a pandas DataFrame?
I'm currently web-scraping about 10,000 products from a retail website (e.g., Amazon) every day to keep track of price history.
Scraping consists of two parts. First, I collect the "listings" of products, where I get basic information such as the product name, price, id, and url of each listing, and save it to a pandas DataFrame. Second, using each product's url, I collect more detailed information about the product and attach it, one by one, to the other columns of the DataFrame. I have no issue with the first part (it takes less than 10 minutes), but the second part usually takes more than 15 hours to complete.
Below is a sample. It is not the real code, but the actual code is just an extended version of it.
import re        # needed for re.findall below
import time      # needed for time.sleep below
import requests
import pandas as pd
from user_agent import generate_user_agent
df_dic = {
    "product_name": ['product1', 'product2', 'product3', 'product4', 'product5'],
    "product_price": ['500', '800', '300', '700', '1000'],
    "product_id": ['1000', '1001', '1002', '1003', '1004'],
    "product_url": ['url1', 'url2', 'url3', 'url4', 'url5'],
}
# df is the data scraped from the first part
df = pd.DataFrame(df_dic)
df['product_chracter1'] = ""
df['product_chracter2'] = ""
df['product_chracter3'] = ""
df['product_chracter4'] = ""
df['product_chracter5'] = ""
df['product_chracter6'] = ""
df['product_chracter7'] = ""
df['product_chracter8'] = ""
df['product_chracter9'] = ""
df['product_chracter10'] = ""
# Below is the second part, where detailed product characteristics (more than 50 in the real code) are attached to the dataframe
for i_url in df['product_url']:
    try:
        mask = df['product_url'] == i_url  # fixed typo: the column is 'product_url', not 'prouct_url'
        product_id = df.loc[mask, 'product_id'].iloc[0]  # take the scalar id, not a one-element Series
        params = {'productSeq': product_id}
        headers = {'User-Agent': generate_user_agent(device_type='smartphone', navigator='chrome')}
        baseline_url = r'https://www.something.com'
        html = requests.post(baseline_url, headers=headers, params=params).text
        time.sleep(0.3)
        # characteristics 1-5 come from the first page; assign with df.loc[mask, col]
        # rather than chained indexing, which pandas warns about
        df.loc[mask, 'product_chracter1'] = re.findall(r'var product_chracter1 = "(.+?)";', html)[0]
        df.loc[mask, 'product_chracter2'] = re.findall(r'var product_chracter2 = "(.+?)";', html)[0]
        df.loc[mask, 'product_chracter3'] = re.findall(r'var product_chracter3 = "(.+?)";', html)[0]
        df.loc[mask, 'product_chracter4'] = re.findall(r'var product_chracter4 = "(.+?)";', html)[0]
        df.loc[mask, 'product_chracter5'] = re.findall(r'var product_chracter5 = "(.+?)";', html)[0]
        baseline_url_2 = r'https://www.something_2.com'
        html = requests.post(baseline_url_2, headers=headers, params=params).text
        time.sleep(0.3)
        # characteristics 6-10 come from the second page
        df.loc[mask, 'product_chracter6'] = re.findall(r'var product_chracter6 = "(.+?)";', html)[0]
        df.loc[mask, 'product_chracter7'] = re.findall(r'var product_chracter7 = "(.+?)";', html)[0]
        df.loc[mask, 'product_chracter8'] = re.findall(r'var product_chracter8 = "(.+?)";', html)[0]
        df.loc[mask, 'product_chracter9'] = re.findall(r'var product_chracter9 = "(.+?)";', html)[0]
        df.loc[mask, 'product_chracter10'] = re.findall(r'var product_chracter10 = "(.+?)";', html)[0]
    except Exception as e:
        print(f"Some error happened at {i_url}")
        print(e)
        continue

filename = 'site_date'
df.to_pickle(f'{filename}.pkl')
I used try and except because some products get sold out while scraping, or do not show certain characteristics, which causes errors. I'm only using requests.get and requests.post, not selenium. My question is: how can I use multiprocessing in Python to make my code run faster? I read that ThreadPoolExecutor from the concurrent.futures library can help, but I really don't know how to implement it in my case.
Any help, different approaches, and comments would be greatly appreciated. Thank you for your time and consideration in reading this.
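For context, this is the minimal ThreadPoolExecutor pattern I gathered from the concurrent.futures documentation; scrape_one is only a placeholder name for the per-product work, not a real function from my code:

import concurrent.futures

def scrape_one(url):
    # placeholder: fetch the product page for this url and return the parsed fields
    ...

# run scrape_one over all urls with a pool of threads;
# executor.map returns results in the same order as the input urls
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    results = list(executor.map(scrape_one, urls))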
Updated: 2022-07-02
Thanks to @GohKohHan's comment, I was able to parallelize my code with ThreadPoolExecutor, and it now takes roughly 1/max_workers of the time the previous code took. I know my code is not perfect, and many people here are far better at Python than I am, so I hope you will point out any mistakes or things that could be improved in the following code.
import re
import time
import requests
import pandas as pd
from user_agent import generate_user_agent
import concurrent.futures

start_time = time.time()  # fixed: time.time was assigned without being called

# df comes from the first part of the scraper
urls = df['product_url']
def return_dataframe(i_url):
    try:
        unique = []
        product_chracter1 = []; product_chracter6 = []
        product_chracter2 = []; product_chracter7 = []
        product_chracter3 = []; product_chracter8 = []
        product_chracter4 = []; product_chracter9 = []
        product_chracter5 = []; product_chracter10 = []

        # fixed typo ('prouct_url') and take the scalar id rather than a Series
        product_id = df.loc[df['product_url'] == i_url, 'product_id'].iloc[0]
        params = {'productSeq': product_id}
        headers = {'User-Agent': generate_user_agent(device_type='smartphone', navigator='chrome')}
        unique.append(i_url)

        baseline_url = r'https://www.something.com'
        html = requests.post(baseline_url, headers=headers, params=params).text
        time.sleep(0.3)
        product_chracter1.append(re.findall(r'var product_chracter1 = "(.+?)";', html)[0])
        product_chracter2.append(re.findall(r'var product_chracter2 = "(.+?)";', html)[0])
        product_chracter3.append(re.findall(r'var product_chracter3 = "(.+?)";', html)[0])
        product_chracter4.append(re.findall(r'var product_chracter4 = "(.+?)";', html)[0])
        product_chracter5.append(re.findall(r'var product_chracter5 = "(.+?)";', html)[0])

        baseline_url_2 = r'https://www.something_2.com'
        html = requests.post(baseline_url_2, headers=headers, params=params).text
        time.sleep(0.3)
        product_chracter6.append(re.findall(r'var product_chracter6 = "(.+?)";', html)[0])
        product_chracter7.append(re.findall(r'var product_chracter7 = "(.+?)";', html)[0])
        product_chracter8.append(re.findall(r'var product_chracter8 = "(.+?)";', html)[0])
        product_chracter9.append(re.findall(r'var product_chracter9 = "(.+?)";', html)[0])
        product_chracter10.append(re.findall(r'var product_chracter10 = "(.+?)";', html)[0])

        i_df_detailed = pd.DataFrame(list(zip(unique, product_chracter1, product_chracter2, product_chracter3,
                                              product_chracter4, product_chracter5, product_chracter6,
                                              product_chracter7, product_chracter8, product_chracter9,
                                              product_chracter10)),
                                     columns=['unique', 'product_chracter1', 'product_chracter2', 'product_chracter3',
                                              'product_chracter4', 'product_chracter5', 'product_chracter6',
                                              'product_chracter7', 'product_chracter8', 'product_chracter9',
                                              'product_chracter10'])
        return i_df_detailed
    except Exception as e:
        print(f"Some error happened at {i_url}")
        print(e)
        return None  # fixed: 'continue' is only valid inside a loop
detailed_agg = []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    future_to_url = {executor.submit(return_dataframe, i_url): i_url for i_url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            result = future.result()
            if result is not None:  # skip urls that failed inside return_dataframe
                detailed_agg.append(result)
        except Exception as exc:
            print('%s generated an exception: %s' % (future_to_url[future], exc))

df_detailed = pd.concat(detailed_agg)
df_agg = pd.merge(df, df_detailed, how='left', left_on=['product_url'], right_on=['unique'])

# How long did it take?
print("--- %.1f minutes ---" % ((time.time() - start_time) / 60))
Any suggestions would be greatly appreciated!