使用 url 范围时如何在 Selenium Python 中添加多线程?

发布于 2025-01-14 12:08:47 字数 1224 浏览 8 评论 0原文

我的想法是:如果我添加一些代码,把 url 范围拆分成 5 份,再让 5 个 chromedriver 实例各自处理其中一份,抓取速度应该会快很多。这是我最大的问题。不过,是让每个 chromedriver 写入各自的 csv 文件更好,还是需要添加一些东西把所有抓取结果汇总到同一个文件中?我在这里真的很茫然,这已经接近我能力的极限了。哪怕只是关于如何让多线程跑起来的具体帮助,我都感激不尽。谢谢你!

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Visit every record page in the id range with one Chrome instance and append
# one (id, name) row per page to the CSV file.
path_to_file = "test1.csv"
# The context manager guarantees buffered rows are flushed and the file is
# closed even if the loop is interrupted part-way through.
with open(path_to_file, 'a', encoding="utf-8", newline='') as csvFile:
    csvWriter = csv.writer(csvFile)
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    header_added = False
    try:
        time.sleep(3)
        for i in range(1, 153512):
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(By.XPATH,"//table[@id='recordServiceData']//tr[contains(.,'Name:')]").text.replace("Name:", "")
            except Exception:
                # Best effort: a page without a readable "Name:" row yields an
                # empty name. Unlike a bare `except:`, this does not swallow
                # KeyboardInterrupt/SystemExit, so Ctrl-C still stops the run.
                Name = ''
            csvWriter.writerow([i, Name])
            print(Name)
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()

So my thought is that if I add something that splits the url-range into 5 and then gives each of 5 chromedriver instances its own split of the url-range to handle, it would make scraping much faster. And that's my biggest question. But maybe then it's better if each chromedriver had its own csv file, or would I need to add something that pools all the scraping in one file? I'm really at a loss here and I'm already pushing my skill level. I am eternally grateful for any concrete help on at least how to get multithreading working. Thank you!

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Visit every record page in the id range with one Chrome instance and append
# one (id, name) row per page to the CSV file.
path_to_file = "test1.csv"
# The context manager guarantees buffered rows are flushed and the file is
# closed even if the loop is interrupted part-way through.
with open(path_to_file, 'a', encoding="utf-8", newline='') as csvFile:
    csvWriter = csv.writer(csvFile)
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    header_added = False
    try:
        time.sleep(3)
        for i in range(1, 153512):
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(By.XPATH,"//table[@id='recordServiceData']//tr[contains(.,'Name:')]").text.replace("Name:", "")
            except Exception:
                # Best effort: a page without a readable "Name:" row yields an
                # empty name. Unlike a bare `except:`, this does not swallow
                # KeyboardInterrupt/SystemExit, so Ctrl-C still stops the run.
                Name = ''
            csvWriter.writerow([i, Name])
            print(Name)
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

允世 2025-01-21 12:08:47

试试这个:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Shared output: a single module-level writer appends rows to test1.csv.
path_to_file = "test1.csv"
csvFile = open(
    path_to_file,
    mode='a',
    encoding="utf-8",
    newline='',
)
csvWriter = csv.writer(csvFile)

header_added = False  # flag is set here but not read in the visible code
time.sleep(3)  # fixed 3-second pause before any work starts


    
import threading

# Serializes writes to the shared module-level csvWriter across worker threads.
_csv_write_lock = threading.Lock()


def init_driver_worker(_range_task):
    """Scrape one slice of record ids with a dedicated headless Chrome.

    Starts a new Chrome instance, loads the record page for every id in
    ``_range_task``, extracts the text of the "Name:" table row and appends
    ``[id, name]`` to the module-level ``csvWriter``. The browser is always
    shut down when the slice is finished or an error escapes.

    Args:
        _range_task: iterable of integer record ids to visit.
    """
    ##### init driver
    options = webdriver.ChromeOptions()
    # Multiple Chrome instances cannot share the same profile, so either
    # create a new profile for each instance or use incognito mode.
    options.add_argument("--incognito")
    options.add_argument("--headless")  # headless browser (no GUI) to be faster
    driver = webdriver.Chrome(options=options)
    ##### do the task
    try:
        for i in _range_task:
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(By.XPATH,"//table[@id='recordServiceData']//tr[contains(.,'Name:')]").text.replace("Name:", "")
            except Exception:
                # Best effort: a page without a readable "Name:" row yields an
                # empty name. Unlike a bare `except:`, this does not swallow
                # KeyboardInterrupt/SystemExit.
                Name = ''
            # All workers share one writer/file, so write one row at a time.
            with _csv_write_lock:
                csvWriter.writerow([i, Name])
            print(Name)
    finally:
        # Without this every worker would leave a Chrome process running.
        driver.quit()
    # No explicit exit() needed: the thread ends when the function returns.
    
    
    
def split_range(_range, parts):
    """Split a sliceable sequence into at most ``parts`` contiguous chunks.

    Chunk sizes differ by at most one element, and the chunks concatenated in
    order reproduce ``_range``. Flooring the chunk size instead would produce
    ``parts + 1`` chunks whenever the length is not divisible by ``parts``,
    and a zero step when ``parts`` exceeds the length.

    Args:
        _range: sequence supporting ``len()`` and slicing (e.g. ``range``, list).
        parts: desired number of chunks; must be a positive integer.

    Returns:
        A list of ``min(parts, len(_range))`` non-empty slices of ``_range``;
        an empty list when ``_range`` is empty.

    Raises:
        ValueError: if ``parts`` is less than 1.
    """
    if parts < 1:
        raise ValueError("parts must be a positive integer")
    total = len(_range)
    base_size, remainder = divmod(total, parts)
    chunks = []
    start = 0
    # Never emit empty chunks: with fewer items than parts, one item per chunk.
    for index in range(min(parts, total)):
        # The first `remainder` chunks absorb one extra element each.
        stop = start + base_size + (1 if index < remainder else 0)
        chunks.append(_range[start:stop])
        start = stop
    return chunks

# Fan the id range out over worker threads, one Chrome instance per chunk.
my_range = range(1,153512)
chunks = split_range(my_range, 10)  # request 10 chunks, one per worker

from threading import Thread
thread_workers = []
for chunk in chunks:
    t = Thread(target=init_driver_worker, args=(chunk,))
    thread_workers.append(t)
    t.start()

# wait for the thread_workers to finish
for t in thread_workers:
    t.join()

# Every worker is done, so no more rows will be written: flush and close the
# shared CSV file instead of leaving it open until interpreter exit.
csvFile.close()

Try this:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Shared output: a single module-level writer appends rows to test1.csv.
path_to_file = "test1.csv"
csvFile = open(
    path_to_file,
    mode='a',
    encoding="utf-8",
    newline='',
)
csvWriter = csv.writer(csvFile)

header_added = False  # flag is set here but not read in the visible code
time.sleep(3)  # fixed 3-second pause before any work starts


    
import threading

# Serializes writes to the shared module-level csvWriter across worker threads.
_csv_write_lock = threading.Lock()


def init_driver_worker(_range_task):
    """Scrape one slice of record ids with a dedicated headless Chrome.

    Starts a new Chrome instance, loads the record page for every id in
    ``_range_task``, extracts the text of the "Name:" table row and appends
    ``[id, name]`` to the module-level ``csvWriter``. The browser is always
    shut down when the slice is finished or an error escapes.

    Args:
        _range_task: iterable of integer record ids to visit.
    """
    ##### init driver
    options = webdriver.ChromeOptions()
    # Multiple Chrome instances cannot share the same profile, so either
    # create a new profile for each instance or use incognito mode.
    options.add_argument("--incognito")
    options.add_argument("--headless")  # headless browser (no GUI) to be faster
    driver = webdriver.Chrome(options=options)
    ##### do the task
    try:
        for i in _range_task:
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(By.XPATH,"//table[@id='recordServiceData']//tr[contains(.,'Name:')]").text.replace("Name:", "")
            except Exception:
                # Best effort: a page without a readable "Name:" row yields an
                # empty name. Unlike a bare `except:`, this does not swallow
                # KeyboardInterrupt/SystemExit.
                Name = ''
            # All workers share one writer/file, so write one row at a time.
            with _csv_write_lock:
                csvWriter.writerow([i, Name])
            print(Name)
    finally:
        # Without this every worker would leave a Chrome process running.
        driver.quit()
    # No explicit exit() needed: the thread ends when the function returns.
    
    
    
def split_range(_range, parts):
    """Split a sliceable sequence into at most ``parts`` contiguous chunks.

    Chunk sizes differ by at most one element, and the chunks concatenated in
    order reproduce ``_range``. Flooring the chunk size instead would produce
    ``parts + 1`` chunks whenever the length is not divisible by ``parts``,
    and a zero step when ``parts`` exceeds the length.

    Args:
        _range: sequence supporting ``len()`` and slicing (e.g. ``range``, list).
        parts: desired number of chunks; must be a positive integer.

    Returns:
        A list of ``min(parts, len(_range))`` non-empty slices of ``_range``;
        an empty list when ``_range`` is empty.

    Raises:
        ValueError: if ``parts`` is less than 1.
    """
    if parts < 1:
        raise ValueError("parts must be a positive integer")
    total = len(_range)
    base_size, remainder = divmod(total, parts)
    chunks = []
    start = 0
    # Never emit empty chunks: with fewer items than parts, one item per chunk.
    for index in range(min(parts, total)):
        # The first `remainder` chunks absorb one extra element each.
        stop = start + base_size + (1 if index < remainder else 0)
        chunks.append(_range[start:stop])
        start = stop
    return chunks

# Fan the id range out over worker threads, one Chrome instance per chunk.
my_range = range(1,153512)
chunks = split_range(my_range, 10)  # request 10 chunks, one per worker

from threading import Thread
thread_workers = []
for chunk in chunks:
    t = Thread(target=init_driver_worker, args=(chunk,))
    thread_workers.append(t)
    t.start()

# wait for the thread_workers to finish
for t in thread_workers:
    t.join()

# Every worker is done, so no more rows will be written: flush and close the
# shared CSV file instead of leaving it open until interpreter exit.
csvFile.close()
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文