使用ThreadPoolManager和Selenium从URL列表中抓取网站数据

发布于 2025-02-07 01:52:15 字数 3155 浏览 2 评论 0原文

我正在尝试使用 Selenium 从一个 URL 列表中抓取航班价格。我的 URL 列表非常大，因此我最初的实现（在循环中依次从每个 URL 抓取一个元素）需要 24 小时以上才能完成。于是我决定尝试给它加速。我做了一些研究，认为使用多线程可能会有所帮助。以下代码的目的是把这些 URL 分配给 3 个线程，但它无法正常工作。我猜可能是网页没有加载出来？我想请教这种方法是否可行；如果不可行，更好的策略是什么。谢谢！

#import libraries
from time import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor
from threading import current_thread
from threading import get_ident
from threading import get_native_id

## initialize drivers
# Three independent Chrome sessions; get_cost() picks one per worker thread.
# Raw string: in a normal literal '\P' and '\c' are invalid escape sequences
# (DeprecationWarning, and SyntaxWarning from Python 3.12 on).
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
# `service=Service(path)` -- confirm against the installed Selenium version.
CHROMEDRIVER_PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver0 = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
driver1 = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
driver2 = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)

def get_cost(url):
    """Load *url* in the calling worker's Chrome driver and return the price text.

    The driver is picked from the module-level ``driver0``/``driver1``/
    ``driver2`` using the worker index that ``ThreadPoolExecutor`` appends to
    its thread names (``ThreadPoolExecutor-<pool>_<index>``).

    Args:
        url: Google Flights search URL to open.

    Returns:
        The ``textContent`` of the price element, or ``"-"`` when the element
        cannot be read within 20 seconds or the calling thread has no driver
        assigned to it.
    """
    thread = current_thread()
    print(f'Worker thread: name={thread.name}, idnet={get_ident()}, id={get_native_id()}')

    # Derive the worker index from the thread name instead of comparing with
    # the literal 'ThreadPoolExecutor-0_N': the pool counter in the middle is
    # not 0 when another executor was created earlier in the process.
    prefix, _, suffix = thread.name.rpartition('_')
    if not (prefix.startswith('ThreadPoolExecutor-') and suffix.isdecimal()):
        print("error")
        return "-"
    index = int(suffix)
    drivers = (driver0, driver1, driver2)
    if index >= len(drivers):
        print("error")
        return "-"
    print(f'- Thread {index}')

    # Keep the driver object itself: ``driver.get()`` returns None, so the
    # old ``driver = driverN.get(url)`` handed None to WebDriverWait.
    driver = drivers[index]
    driver.get(url)

    # Local import: the file header does not import the Selenium exceptions.
    from selenium.common.exceptions import WebDriverException

    # No fixed sleep is needed (and ``time`` is bound to the ``time.time``
    # function here, so ``time.sleep`` raised AttributeError): WebDriverWait
    # already polls until the element is present or the timeout expires.
    #find cost
    try:
        element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH,'/html/body/c-wiz[2]/div/div[2]/c-wiz/div/c-wiz/c-wiz/div[2]/div[2]/ul[1]/li[1]/div/div[2]/div/div[9]/div[2]/span'))
        )
        cost = element.get_attribute('textContent')
    except WebDriverException:
        # Best effort: timeouts and other Selenium failures yield the
        # placeholder, while programming errors are no longer swallowed.
        cost = "-"

    print('url: ',url)
    print('cost: ',cost)
    return cost

# Google Flights one-way searches "to Paphos from Vienna", one URL per
# departure date (note that 2022-07-31 is not in the list).
urls = [
    f'https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%20{day}%20one%20way'
    for day in (
        '2022-07-25',
        '2022-07-26',
        '2022-07-27',
        '2022-07-28',
        '2022-07-29',
        '2022-07-30',
        '2022-08-01',
        '2022-08-02',
        '2022-08-03',
    )
]


## MAIN
# Fan the URLs out over three worker threads.
try:
    with ThreadPoolExecutor(max_workers=3) as exe:
        # Executor.map() only re-raises a worker's exception when its result
        # is retrieved, so drain the iterator; otherwise failures inside
        # get_cost() disappear silently.
        for _ in exe.map(get_cost, urls):
            pass
finally:
    ## close drivers
    # Shut the browsers down even if a worker raised.
    driver0.quit()
    driver1.quit()
    driver2.quit()

原文

I am trying to scrape flight prices using Selenium from a list of URLs. The list of URLs I have is very large, so my initial implementation, which simply grabbed an element from each URL in a loop, would take upwards of 24 hours to complete. So I decided to take a stab at speeding it up. I did some research and decided that using threading might help. The goal of the code below is to divide the URLs between 3 threads; however, it is not working. I think the webpages might just not be loading? I am looking for advice on whether this is a feasible approach, and if it isn't, what a better strategy might be. Thank you!

#import libraries
from time import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor
from threading import current_thread
from threading import get_ident
from threading import get_native_id

## initialize drivers
# Three independent Chrome sessions; get_cost() picks one per worker thread.
# Raw string: in a normal literal '\P' and '\c' are invalid escape sequences
# (DeprecationWarning, and SyntaxWarning from Python 3.12 on).
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
# `service=Service(path)` -- confirm against the installed Selenium version.
CHROMEDRIVER_PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver0 = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
driver1 = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
driver2 = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)

def get_cost(url):
    """Load *url* in the calling worker's Chrome driver and return the price text.

    The driver is picked from the module-level ``driver0``/``driver1``/
    ``driver2`` using the worker index that ``ThreadPoolExecutor`` appends to
    its thread names (``ThreadPoolExecutor-<pool>_<index>``).

    Args:
        url: Google Flights search URL to open.

    Returns:
        The ``textContent`` of the price element, or ``"-"`` when the element
        cannot be read within 20 seconds or the calling thread has no driver
        assigned to it.
    """
    thread = current_thread()
    print(f'Worker thread: name={thread.name}, idnet={get_ident()}, id={get_native_id()}')

    # Derive the worker index from the thread name instead of comparing with
    # the literal 'ThreadPoolExecutor-0_N': the pool counter in the middle is
    # not 0 when another executor was created earlier in the process.
    prefix, _, suffix = thread.name.rpartition('_')
    if not (prefix.startswith('ThreadPoolExecutor-') and suffix.isdecimal()):
        print("error")
        return "-"
    index = int(suffix)
    drivers = (driver0, driver1, driver2)
    if index >= len(drivers):
        print("error")
        return "-"
    print(f'- Thread {index}')

    # Keep the driver object itself: ``driver.get()`` returns None, so the
    # old ``driver = driverN.get(url)`` handed None to WebDriverWait.
    driver = drivers[index]
    driver.get(url)

    # Local import: the file header does not import the Selenium exceptions.
    from selenium.common.exceptions import WebDriverException

    # No fixed sleep is needed (and ``time`` is bound to the ``time.time``
    # function here, so ``time.sleep`` raised AttributeError): WebDriverWait
    # already polls until the element is present or the timeout expires.
    #find cost
    try:
        element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH,'/html/body/c-wiz[2]/div/div[2]/c-wiz/div/c-wiz/c-wiz/div[2]/div[2]/ul[1]/li[1]/div/div[2]/div/div[9]/div[2]/span'))
        )
        cost = element.get_attribute('textContent')
    except WebDriverException:
        # Best effort: timeouts and other Selenium failures yield the
        # placeholder, while programming errors are no longer swallowed.
        cost = "-"

    print('url: ',url)
    print('cost: ',cost)
    return cost

# Google Flights one-way searches "to Paphos from Vienna", one URL per
# departure date (note that 2022-07-31 is not in the list).
urls = [
    f'https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%20{day}%20one%20way'
    for day in (
        '2022-07-25',
        '2022-07-26',
        '2022-07-27',
        '2022-07-28',
        '2022-07-29',
        '2022-07-30',
        '2022-08-01',
        '2022-08-02',
        '2022-08-03',
    )
]


## MAIN
# Fan the URLs out over three worker threads.
try:
    with ThreadPoolExecutor(max_workers=3) as exe:
        # Executor.map() only re-raises a worker's exception when its result
        # is retrieved, so drain the iterator; otherwise failures inside
        # get_cost() disappear silently.
        for _ in exe.map(get_cost, urls):
            pass
finally:
    ## close drivers
    # Shut the browsers down even if a worker raised.
    driver0.quit()
    driver1.quit()
    driver2.quit()

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

懒猫 2025-02-14 01:52:15

我使用 SeleniumBase 创建了一个快速解决方案，它通过 pytest 运行，而 pytest 已经具备多线程能力：

pytest test_scrape_google_flights.py -n 4 --headless

from parameterized import parameterized
from seleniumbase import BaseCase

class GoogleTests(BaseCase):
    """Print the priced lines of several Google Flights searches, one test per URL."""

    @parameterized.expand(
        [
            ['https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%202022-07-25%20one%20way'],
            ['https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%202022-07-26%20one%20way'],
            ['https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%202022-07-27%20one%20way'],
            ['https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%202022-07-28%20one%20way'],
        ]
    )
    def test_parameterized_google_search(self, url):
        # Open the search page and wait until a "$" shows up in the main region.
        main_region = 'div[role="main"]'
        self.open(url)
        self.wait_for_text("$", main_region)
        # Keep only the lines of the region's text that mention a price.
        priced_lines = (
            line for line in self.get_text(main_region).split("\n") if "$" in line
        )
        for line in priced_lines:
            # The departure date is the third-from-last "%20"-separated token.
            travel_date = url.split(r"%20")[-3]
            self._print(f"{travel_date} - {line}\n")

这是运行的结果：

pytest test_scrape_google_flights.py -n 4 --headless
=================================== test session starts ===================================
platform darwin -- Python 3.10.1, pytest-7.1.2, pluggy-1.0.0
rootdir: /Users/michael/github/SeleniumBase/examples, configfile: pytest.ini
plugins: html-2.0.1, xdist-2.5.0, forked-1.4.0, metadata-2.0.1, rerunfailures-10.2, ordering-0.6, cov-3.0.0, seleniumbase-3.2.9
gw0 [5] / gw1 [5] / gw2 [5] / gw3 [5]
2022-07-27 - $80
2022-07-27 - $235
2022-07-27 - $267
2022-07-27 - $216
2022-07-27 - $471
2022-07-27 - $516
2022-07-27 - $601
2022-07-27 - $626
2022-07-27 - $827
.2022-07-26 - Travel on Jul 27 for $80
2022-07-26 - $453
2022-07-26 - $515
2022-07-26 - $572
2022-07-26 - $596
2022-07-26 - $601
2022-07-26 - $314
2022-07-26 - $316
2022-07-26 - $480
.2022-07-28 - Travel on Jul 29 for $71
2022-07-28 - $96
2022-07-28 - $301
2022-07-28 - $616
2022-07-28 - $340
2022-07-28 - $347
2022-07-28 - $412
2022-07-28 - $462
2022-07-28 - $478
2022-07-28 - $669
2022-07-28 - $1,165
.2022-07-25 - Travel on Jul 27 for $80
2022-07-25 - $130
2022-07-25 - $422
2022-07-25 - $573
2022-07-25 - $615
2022-07-25 - $340
2022-07-25 - $384
2022-07-25 - $501
2022-07-25 - $646

（完整披露：我构建了该自动化框架）

I created a quick solution using SeleniumBase that runs with pytest, which already includes multithreading abilities:

pytest test_scrape_google_flights.py -n 4 --headless

from parameterized import parameterized
from seleniumbase import BaseCase

class GoogleTests(BaseCase):
    """Scrape Google Flights prices, one parameterized test per search URL."""

    @parameterized.expand(
        [
            ['https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%202022-07-25%20one%20way'],
            ['https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%202022-07-26%20one%20way'],
            ['https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%202022-07-27%20one%20way'],
            ['https://www.google.com/travel/flights?q=Flights%20to%20Paphos%20from%20Vienna%20on%202022-07-28%20one%20way'],
        ]
    )
    def test_parameterized_google_search(self, url):
        self.open(url)
        # Wait until a price (a "$" sign) is shown in the main region.
        # The "$" literal had been mangled to `"quot;` by the page extraction,
        # which made this line a syntax error.
        self.wait_for_text("$", 'div[role="main"]')
        content = self.get_text('div[role="main"]')
        items = content.split("\n")
        for item in items:
            # Print only the lines that mention a price.
            if "$" in item:
                # The departure date is the third-from-last "%20"-separated
                # token of the URL.
                self._print("%s - %s\n" % (url.split(r"%20")[-3], item))

This was the result of running that:

pytest test_scrape_google_flights.py -n 4 --headless
=================================== test session starts ===================================
platform darwin -- Python 3.10.1, pytest-7.1.2, pluggy-1.0.0
rootdir: /Users/michael/github/SeleniumBase/examples, configfile: pytest.ini
plugins: html-2.0.1, xdist-2.5.0, forked-1.4.0, metadata-2.0.1, rerunfailures-10.2, ordering-0.6, cov-3.0.0, seleniumbase-3.2.9
gw0 [5] / gw1 [5] / gw2 [5] / gw3 [5]
2022-07-27 - $80
2022-07-27 - $235
2022-07-27 - $267
2022-07-27 - $216
2022-07-27 - $471
2022-07-27 - $516
2022-07-27 - $601
2022-07-27 - $626
2022-07-27 - $827
.2022-07-26 - Travel on Jul 27 for $80
2022-07-26 - $453
2022-07-26 - $515
2022-07-26 - $572
2022-07-26 - $596
2022-07-26 - $601
2022-07-26 - $314
2022-07-26 - $316
2022-07-26 - $480
.2022-07-28 - Travel on Jul 29 for $71
2022-07-28 - $96
2022-07-28 - $301
2022-07-28 - $616
2022-07-28 - $340
2022-07-28 - $347
2022-07-28 - $412
2022-07-28 - $462
2022-07-28 - $478
2022-07-28 - $669
2022-07-28 - $1,165
.2022-07-25 - Travel on Jul 27 for $80
2022-07-25 - $130
2022-07-25 - $422
2022-07-25 - $573
2022-07-25 - $615
2022-07-25 - $340
2022-07-25 - $384
2022-07-25 - $501
2022-07-25 - $646

(Full disclosure: I built that automation framework)

回复收藏 0 原文

~没有更多了~