Trying to scrape titles with Selenium

Posted on 2025-02-01 16:04:12 · 695 characters · 2 views · 0 comments

I am trying to scrape titles: the scraper should go into every link on the page and scrape each title, but it throws an error.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from time import sleep

PATH="C:\Program Files (x86)\chromedriver.exe"
url='https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
driver =webdriver.Chrome(PATH)
wait = WebDriverWait(driver, 20)
driver.get(url)
list_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
title=driver.find_element_by_xpath('h1').text()
print(title)



1 answer

滥情哥ㄟ 2025-02-08 16:04:12

You need to change the selector to get the h1 tag text.

In this snippet, the scraper will visit the first link and print the title:

# click the single link
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
time.sleep(2)

# parse the h1 tag text
title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
print(title)
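The fixed time.sleep(2) pauses above work, but they are fragile: they wait too long on fast pages and not long enough on slow ones. Selenium's WebDriverWait does timeout-bounded polling internally; as a stdlib-only illustration of that pattern (the wait_for helper name is my own sketch, not part of Selenium's API):

```python
import time


def wait_for(condition, timeout=10, poll=0.5):
    """Poll `condition` until it returns a truthy value or `timeout` expires.

    Sketch of the explicit-wait pattern: call the condition repeatedly,
    sleeping `poll` seconds between attempts, instead of one fixed sleep.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        result = condition()
        if result:
            return result
        time.sleep(poll)
    raise TimeoutError(f"condition not met within {timeout} seconds")


# With Selenium this would be used roughly as:
# title = wait_for(lambda: driver.find_element(By.CSS_SELECTOR,
#                                              'h1.productView-title').text)
```

In real code you would keep using WebDriverWait/expected_conditions as the answer does for the click; the sketch only shows why polling beats a hard-coded sleep.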

The driver will visit every link and scrape each title:

# parse all the links
page_links = [element.get_attribute('href') for element in
              driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")]

# visit all the links
for link in page_links:
    driver.get(link)
    time.sleep(2)
    title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text

    # parse title for all the links
    print(title)
    time.sleep(2)
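get_attribute('href') normally returns absolute URLs, but if the collected list ever contains relative paths, duplicates, or None entries, a small stdlib helper can clean it before the loop. This is my own sketch, not part of the answer's code:

```python
from urllib.parse import urljoin


def normalize_links(base_url, hrefs):
    """Resolve hrefs against the page URL and drop duplicates, keeping order.

    dict.fromkeys preserves first-seen order while deduplicating;
    falsy entries (None, empty string) are skipped.
    """
    absolute = (urljoin(base_url, h) for h in hrefs if h)
    return list(dict.fromkeys(absolute))
```

You would call it as `page_links = normalize_links(url, page_links)` right after collecting the hrefs.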

Full code with both snippets included -


import time

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)


def supplyvan_scraper():
    with chrome_driver as driver:
        driver.implicitly_wait(15)
        URL = 'https://www.supplyvan.com/power-tools/cordless-powertools/cordless-drills.html'
        driver.get(URL)
        time.sleep(3)

        # opt #1 visit first link, print the title uncomment to see
        # click the single link
        # WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "h4.card-title"))).click()
        # time.sleep(2)
        #
        # # parse the h1 tag text
        # title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text
        # print(title)

        # opt #2 visit all links, print titles
        # parse all the links
        page_links = [element.get_attribute('href') for element in
                      driver.find_elements(By.CSS_SELECTOR, "h4.card-title > a")]

        # visit all the links
        for link in page_links:
            driver.get(link)
            time.sleep(2)
            title = driver.find_element(By.CSS_SELECTOR, 'h1.productView-title').text

            # parse title for all the links
            print(title)
            # driver.back()
            time.sleep(2)

        time.sleep(2)
        driver.quit()


supplyvan_scraper()

Output for all the visited links -

Bosch Professional Cordless Drill, GSR-120-Li, 12V, Blue/Black
Makita LXT Cordless Drill Driver, DDF481RTJ, 18V, 13MM
Bosch Cordless Drill, GSR-1000, 10.8V
.....
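If you want to keep the titles instead of only printing them, appending each one to a list inside the loop and writing a CSV afterwards is a small extension (save_titles is a hypothetical helper, not in the original answer):

```python
import csv


def save_titles(titles, path="titles.csv"):
    """Write the scraped titles to a one-column CSV file with a header row."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["title"])          # header
        writer.writerows([t] for t in titles)  # one row per title
```

Inside the scraping loop you would do `titles.append(title)` instead of `print(title)`, then call `save_titles(titles)` once after the loop.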

