Web Scraping 返回空列表的问题

发布于 2025-02-11 10:41:30 字数 585 浏览 1 评论 0原文

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver

# Launch a local Chrome browser session via Selenium.
driver = webdriver.Chrome()

# Load review page 2 of the product and parse whatever HTML the
# browser has rendered at this instant.
driver.get('https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2')
soup = BeautifulSoup(driver.page_source, 'lxml')

# Look for the review containers in the parsed snapshot.
soup.find_all('div', class_='content-review')

# it always returns an empty list

# I want to scrape all review contents, e.g. "<div class="content-review" id="325243269"> Super Shampoo, meine Haare glänzt und sind sehr weich. </div>"

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
# Start a Chrome WebDriver session (requires chromedriver to be installed).
driver = webdriver.Chrome()


driver.get('https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2')
# Parse the page source captured immediately after navigation — at this
# point dynamically injected content may not be present yet.
soup = BeautifulSoup(driver.page_source, 'lxml')

# NOTE(review): returns [] — presumably the review widget injects the
# 'content-review' nodes asynchronously after the initial page load,
# so the snapshot taken above does not contain them yet; verify by
# waiting for the elements before reading page_source.
soup.find_all('div',class_='content-review')

# it always returns an empty list

# I want to scrape all review contents, e.g. "<div class="content-review" id="325243269"> Super Shampoo, meine Haare glänzt und sind sehr weich. ???? </div>"

I tried multiple approaches but it always returns an empty list.
What should I do to solve this problem?

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(2)

物价感观 2025-02-18 10:41:30

您需要等到页面完全加载后再读取页面源码:

driver.get(url)
timeout = 5  # seconds to wait for the reviews to be injected
try:
    # The reviews are loaded asynchronously after the initial page load,
    # so block until at least one 'content-review' node exists in the DOM.
    element_present = EC.presence_of_element_located((By.CLASS_NAME, 'content-review'))
    WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
    print("Timed out waiting for page to load")
# page_source now contains the dynamically injected review markup.
soup = BeautifulSoup(driver.page_source, 'lxml')
for review in soup.find_all('div', class_='content-review'):
    print(review.getText().strip())

添加必要的libs:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

输出:

Super Shampoo, meine Haare glänzt und sind sehr weich. 

You need to wait until the page has completely loaded:

driver.get(url)
timeout = 5
# Block until at least one review node is present in the DOM; the review
# widget injects them asynchronously after the initial page load.
try:
    waiter = WebDriverWait(driver, timeout)
    waiter.until(EC.presence_of_element_located((By.CLASS_NAME, 'content-review')))
except TimeoutException:
    print("Timed out waiting for page to load")
# Re-parse the now fully rendered page and print each review's text.
soup = BeautifulSoup(driver.page_source, 'lxml')
for review in soup.find_all('div', class_='content-review'):
    print(review.getText().strip())

Add necessary libs:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

OUTPUT:

Super Shampoo, meine Haare glänzt und sind sehr weich. ????
Ich verwende dieses Produkt seit kurzem und ich bin begeistert, so ein pflegendes Shampoo habe ich noch nie gehabt. Er gibt meinen Haar Glanz, Geschmeidigkeit und Fülle. Ich kann es nur empfehlen.
Zufrieden
Tolles Shampoo
Sehr gut

Second option - find request with reviews and get data:

import requests  # was missing in the original snippet — NameError without it

# Second option: call the Yotpo batch endpoint that serves the review
# widget HTML directly, skipping the browser entirely.
url = "https://staticw2.yotpo.com/batch/1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR/80053469-250"

# URL-encoded body: requests the 'main_widget' method for page 2 of
# product 80053469-250 (app_key identifies the shop's Yotpo account).
payload='methods=%5B%7B%22method%22%3A%22main_widget%22%2C%22params%22%3A%7B%22pid%22%3A%2280053469-250%22%2C%22page%22%3A2%2C%22order_metadata_fields%22%3A%7B%7D%2C%22widget_product_id%22%3A%2280053469-250%22%7D%7D%5D&app_key=1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR'
# requests.post is the idiomatic form of requests.request("POST", ...);
# the timeout keeps the script from hanging forever on a stalled connection.
response = requests.post(url, data=payload, timeout=30)
response.raise_for_status()  # fail loudly on HTTP errors instead of parsing junk

# The endpoint returns a JSON array whose first element carries the
# rendered widget HTML under 'result'.
soup = BeautifulSoup(response.json()[0]['result'], 'lxml')
for review in soup.find_all('div', class_='content-review'):
    print(review.getText().strip())

With same output

梦在深巷 2025-02-18 10:41:30

这里的主要问题是,您需要关闭位于 shadow DOM 中的“接受 Cookies”弹窗:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

driver = webdriver.Chrome()
# OR: driver = webdriver.Chrome(executable_path='D:\Downloads\chromedriver\chromedriver.exe')
url = 'https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2'
driver.get(url)

webdriverWaiter = WebDriverWait(driver, 20)

# Wait for the cookie-consent overlay to open: the site adds the
# 'overflowHidden' class to <body> while the popup is displayed.
webdriverWaiter.until(EC.text_to_be_present_in_element_attribute((By.CSS_SELECTOR, "body"), "class" ,"overflowHidden"))
# The consent dialog is rendered inside a shadow DOM, so it must be
# located through the host element's shadow_root, not the main document.
shadow_host = driver.find_element(By.CSS_SELECTOR, '#usercentrics-root')
shadow_root = shadow_host.shadow_root

accept_cookies_button_css = "button[data-testid='uc-accept-all-button']"

# wait for accept cookies button to appear (manual polling is used here
# because standard expected conditions cannot search inside a shadow root)
accept_cookies_button = None
while not accept_cookies_button:
    try:
        accept_cookies_button = shadow_root.find_element(By.CSS_SELECTOR, accept_cookies_button_css)
    except NoSuchElementException:
        time.sleep(1)

# click accept cookies button, retrying until it becomes interactable
clicked = False
while not clicked:
    try:
        accept_cookies_button.click()
        clicked = True
    except ElementNotInteractableException:
        time.sleep(1)

# With the popup dismissed the review elements become visible; wait for
# one, then print the text of every review on the page.
content_review_css = ".content-review"
webdriverWaiter.until(EC.visibility_of_element_located((By.CSS_SELECTOR, content_review_css)))
reviews = driver.find_elements(By.CSS_SELECTOR, content_review_css)
for rev in reviews:
    print(rev.text)

弹出图像:

Main issue here is that you need to close 'accept cookies' popup which is located in shadow DOM.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

driver = webdriver.Chrome()
# OR: driver = webdriver.Chrome(executable_path='D:\Downloads\chromedriver\chromedriver.exe')
url = 'https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2'
driver.get(url)

wait = WebDriverWait(driver, 20)

# The consent popup is open while <body> carries the 'overflowHidden' class.
wait.until(EC.text_to_be_present_in_element_attribute((By.CSS_SELECTOR, "body"), "class", "overflowHidden"))
# The dialog lives inside a shadow DOM under #usercentrics-root, so it
# must be searched via the host's shadow_root rather than the document.
consent_root = driver.find_element(By.CSS_SELECTOR, '#usercentrics-root').shadow_root

accept_selector = "button[data-testid='uc-accept-all-button']"

# Poll the shadow root until the accept button has been rendered...
accept_button = None
while accept_button is None:
    try:
        accept_button = consent_root.find_element(By.CSS_SELECTOR, accept_selector)
    except NoSuchElementException:
        time.sleep(1)

# ...then keep trying to click it until it is actually interactable.
while True:
    try:
        accept_button.click()
        break
    except ElementNotInteractableException:
        time.sleep(1)

# With the popup dismissed, wait for a visible review node and print
# the text of every review on the page.
review_selector = ".content-review"
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, review_selector)))
for review in driver.find_elements(By.CSS_SELECTOR, review_selector):
    print(review.text)

Popup image:
enter image description here

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文