How can I use Python multiprocessing in web scraping and then attach the scraped text to a pandas DataFrame?


I'm now web-scraping about 10,000 products from a retail website (e.g., Amazon) every day to keep track of price history.

Scraping consists of two parts. First, I collect the "listings" of products, where I get basic information such as the product name, price, id, and url of each listing, and save it to a pandas DataFrame. Second, using each product's url, I collect more detailed information about the product and attach it one-by-one to the other columns of the DataFrame. I have no issue with the first part (it takes <10 minutes), but it usually takes more than 15 hours to complete the second part.

Below is sample code. It is not the real code, but the actual code is just a prolonged version of it.

import re
import time
import requests
import pandas as pd
from user_agent import generate_user_agent

df_dic = {
    "product_name": ['product1','product2','product3','product4','product5'],
    "product_price": ['500','800','300','700','1000'],
    "product_id": ['1000','1001','1002','1003','1004'],
    "product_url": ['url1','url2','url3','url4','url5'],
    
}

# df is the data scraped from the first part
df = pd.DataFrame(df_dic) 

df['product_chracter1'] = ""
df['product_chracter2'] = ""
df['product_chracter3'] = ""
df['product_chracter4'] = ""
df['product_chracter5'] = ""

df['product_chracter6'] = ""
df['product_chracter7'] = ""
df['product_chracter8'] = ""
df['product_chracter9'] = ""
df['product_chracter10'] = ""


# Below is the beginning of the second part, where detailed product characteristics (more than 50) are attached to the dataframe

for i_url in df['product_url']:
    try:
        # .iloc[0] extracts the scalar id; the boolean mask alone returns a Series
        product_id = df.loc[df['product_url'] == i_url, 'product_id'].iloc[0]
        params = {'productSeq': product_id}        
        headers = {'User-Agent': generate_user_agent(device_type='smartphone', navigator='chrome')}
        
        baseline_url = r'https://www.something.com'
        html = requests.post(baseline_url, headers = headers, params = params).text
        time.sleep(0.3)
        # assign with a single df.loc[row_mask, column] call to avoid chained indexing
        df.loc[df['product_url'] == i_url, 'product_chracter1'] = re.findall(r'var product_chracter1 = "(.+?)";', html)[0]
        df.loc[df['product_url'] == i_url, 'product_chracter2'] = re.findall(r'var product_chracter2 = "(.+?)";', html)[0]
        df.loc[df['product_url'] == i_url, 'product_chracter3'] = re.findall(r'var product_chracter3 = "(.+?)";', html)[0]
        df.loc[df['product_url'] == i_url, 'product_chracter4'] = re.findall(r'var product_chracter4 = "(.+?)";', html)[0]
        df.loc[df['product_url'] == i_url, 'product_chracter5'] = re.findall(r'var product_chracter5 = "(.+?)";', html)[0]

        baseline_url_2 = r'https://www.something_2.com'
        html = requests.post(baseline_url_2, headers = headers, params = params).text
        time.sleep(0.3)
        df.loc[df['product_url'] == i_url, 'product_chracter6'] = re.findall(r'var product_chracter6 = "(.+?)";', html)[0]
        df.loc[df['product_url'] == i_url, 'product_chracter7'] = re.findall(r'var product_chracter7 = "(.+?)";', html)[0]
        df.loc[df['product_url'] == i_url, 'product_chracter8'] = re.findall(r'var product_chracter8 = "(.+?)";', html)[0]
        df.loc[df['product_url'] == i_url, 'product_chracter9'] = re.findall(r'var product_chracter9 = "(.+?)";', html)[0]
        df.loc[df['product_url'] == i_url, 'product_chracter10'] = re.findall(r'var product_chracter10 = "(.+?)";', html)[0]

    except Exception as e:
        print(f"Some error happened at {i_url}")
        print(e)
        continue

filename = 'site_date'
df.to_pickle(f'{filename}.pkl')

I used try/except because some products sell out while being scraped, or do not show some characteristics, which causes errors. I'm only using requests.get and requests.post, not selenium. My question is: how can I use multiprocessing in Python to make my code run faster? I read that ThreadPoolExecutor from the concurrent.futures library can help, but I really don't know how to implement it in my case.
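
From what I've read, the general pattern seems to be roughly the following (a minimal sketch with a hypothetical fetch_one function and placeholder urls, not my actual scraper):

import concurrent.futures

def fetch_one(url):
    # hypothetical: download one product page and return its parsed fields
    ...

urls = ['url1', 'url2', 'url3']
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # map() runs fetch_one on several urls concurrently and yields
    # the results in the same order as the input urls
    results = list(executor.map(fetch_one, urls))

What I'm not sure about is how to get from this pattern to filling the columns of my DataFrame.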

Any help, different approach, and any comments would be greatly appreciated. Thank you for your time and consideration for reading this.

Updated: 2022-07-02

Thanks to @GohKohHan's comment, I was able to apply multiprocessing to my code, and it now takes roughly 1/max_workers of the time the previous code took. I know my code is not perfect, and a lot of people here are Python masters, so I hope you will point out any mistakes or things that could be improved in the following code.

import requests,re
import time
import pandas as pd
from user_agent import generate_user_agent
import concurrent.futures


start_time = time.time()  # call the function; time.time alone is just a reference to it
urls = df['product_url']
def return_dataframe(i_url):
    try:
        unique = []  
        product_chracter1=[];  product_chracter6=[]
        product_chracter2=[];  product_chracter7=[]
        product_chracter3=[];  product_chracter8=[]
        product_chracter4=[];  product_chracter9=[]
        product_chracter5=[];  product_chracter10=[]

        # .iloc[0] extracts the scalar id; the boolean mask alone returns a Series
        product_id = df.loc[df['product_url'] == i_url, 'product_id'].iloc[0]
        params = {'productSeq': product_id}        
        headers = {'User-Agent':generate_user_agent(device_type='smartphone', navigator='chrome')}
        
        unique.append(i_url)
        baseline_url = r'https://www.something.com'
        html = requests.post(baseline_url, headers = headers, params = params).text
        time.sleep(0.3)
        product_chracter1.append(re.findall(r'var product_chracter1 = "(.+?)";', html)[0])
        product_chracter2.append(re.findall(r'var product_chracter2 = "(.+?)";', html)[0])
        product_chracter3.append(re.findall(r'var product_chracter3 = "(.+?)";', html)[0])
        product_chracter4.append(re.findall(r'var product_chracter4 = "(.+?)";', html)[0])
        product_chracter5.append(re.findall(r'var product_chracter5 = "(.+?)";', html)[0])

        baseline_url_2 = r'https://www.something_2.com'
        html = requests.post(baseline_url_2, headers = headers, params = params).text
        time.sleep(0.3)
        product_chracter6.append(re.findall(r'var product_chracter6 = "(.+?)";', html)[0])
        product_chracter7.append(re.findall(r'var product_chracter7 = "(.+?)";', html)[0])
        product_chracter8.append(re.findall(r'var product_chracter8 = "(.+?)";', html)[0])
        product_chracter9.append(re.findall(r'var product_chracter9 = "(.+?)";', html)[0])
        product_chracter10.append(re.findall(r'var product_chracter10 = "(.+?)";', html)[0])
       
        i_df_detailed = pd.DataFrame(list(zip(unique, product_chracter1, product_chracter2, product_chracter3, 
                      product_chracter4, product_chracter5, product_chracter6,
                      product_chracter7, product_chracter8, product_chracter9,
                      product_chracter10)),
                      columns = ['unique', 'product_chracter1', 'product_chracter2', 'product_chracter3',
                       'product_chracter4', 'product_chracter5', 'product_chracter6',
                       'product_chracter7', 'product_chracter8', 'product_chracter9',
                       'product_chracter10'])

        return i_df_detailed

    except Exception as e:
        print(f"Some error happened at {i_url}")
        print(e)
        return None  # 'continue' is only valid inside a loop; return None instead

detailed_agg = []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:    
    future_to_url = {executor.submit(return_dataframe, i_url): i_url for i_url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            result = future.result()
            if result is not None:  # skip urls that failed inside return_dataframe
                detailed_agg.append(result)
        except Exception as exc:
            print(f'{future_to_url[future]} generated an exception: {exc}')

 

df_detailed = pd.concat(detailed_agg)
df_agg = pd.merge(df, df_detailed, how='left', left_on=['product_url'], right_on=['unique'])

# How long did it take?
print("---  %.1f minutes ---" % (int(time.time() - start_time)/60)) 

Any suggestions would be greatly appreciated!
