Selenium Python:元素不可交互(element not interactable)

发布于 2025-02-11 17:30:45 字数 2401 浏览 1 评论 0原文

我正在尝试从这个网站抓取信息:示例网站

我需要选择 2021 版本并按代码进行搜索。这是我的代码:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions

# Exceptions the explicit waits below tolerate while polling: a lookup that
# raises one of these is retried until the 10-second timeout expires.
ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)

options = Options()
options.add_argument('--disable-extensions')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--remote-debugging-port=9515')
options.add_argument('--disable-setuid-sandbox')
options.add_argument("--start-maximized")

driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)
url = "https://noc.esdc.gc.ca/"

try:
    driver.get(url)

    # A single explicit wait shared by every lookup below.
    wait = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions)

    # Wait for elements to be clickable (visible and enabled), not merely
    # present in the DOM: an element that is present but still hidden cannot
    # be clicked or typed into and fails as "element not interactable".
    search_by_code = wait.until(expected_conditions.element_to_be_clickable(
        (By.XPATH, "/html/body/main/div[2]/div/div/div/div/div/div/div/div/ul/li[2]/a")))
    # click to activate this option
    search_by_code.click()

    # The form controls are only usable once the option above is active,
    # so each one is waited for rather than looked up immediately.
    text_area = wait.until(expected_conditions.element_to_be_clickable(
        (By.XPATH, "/html/body/main/div[2]/div/div/div/div/div/div/div/div/div/details[2]/div/div/form/div/div[2]/div/input")))

    version = Select(wait.until(expected_conditions.element_to_be_clickable(
        (By.XPATH, "/html/body/main/div[2]/div/div/div/div/div/div/div/div/div/details[2]/div/div/form/div/div[1]/select"))))

    search_button = wait.until(expected_conditions.element_to_be_clickable(
        (By.XPATH, '/html/body/main/div[2]/div/div/div/div/div/div/div/div/div/details[2]/div/div/form/div/div[2]/div/div/button')))

    # select version 2021
    version.select_by_value('2021.0')

    # click on text area
    text_area.click()

    # type the text
    text_area.send_keys("10010  –  Financial managers")

    # click the button
    search_button.click()

    # print() accepts no `source` keyword argument, so the URL is stored in a
    # variable first and then printed.
    source = driver.current_url
    print(source)
finally:
    # Always shut the browser down, even when a wait times out, so no
    # Chrome/chromedriver process is left running.
    driver.quit()

我不确定我遗漏了什么?我添加了一些注释来描述代码的逻辑。

I am trying to scrape information from this website: example website

I need to get the version 2021 and search by code.
Here is my code:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions

# Exceptions the explicit waits below tolerate while polling: a lookup that
# raises one of these is retried until the 10-second timeout expires.
ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)

options = Options()
options.add_argument('--disable-extensions')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--remote-debugging-port=9515')
options.add_argument('--disable-setuid-sandbox')
options.add_argument("--start-maximized")

driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)
url = "https://noc.esdc.gc.ca/"

try:
    driver.get(url)

    # A single explicit wait shared by every lookup below.
    wait = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions)

    # Wait for elements to be clickable (visible and enabled), not merely
    # present in the DOM: an element that is present but still hidden cannot
    # be clicked or typed into and fails as "element not interactable".
    search_by_code = wait.until(expected_conditions.element_to_be_clickable(
        (By.XPATH, "/html/body/main/div[2]/div/div/div/div/div/div/div/div/ul/li[2]/a")))
    # click to activate this option
    search_by_code.click()

    # The form controls are only usable once the option above is active,
    # so each one is waited for rather than looked up immediately.
    text_area = wait.until(expected_conditions.element_to_be_clickable(
        (By.XPATH, "/html/body/main/div[2]/div/div/div/div/div/div/div/div/div/details[2]/div/div/form/div/div[2]/div/input")))

    version = Select(wait.until(expected_conditions.element_to_be_clickable(
        (By.XPATH, "/html/body/main/div[2]/div/div/div/div/div/div/div/div/div/details[2]/div/div/form/div/div[1]/select"))))

    search_button = wait.until(expected_conditions.element_to_be_clickable(
        (By.XPATH, '/html/body/main/div[2]/div/div/div/div/div/div/div/div/div/details[2]/div/div/form/div/div[2]/div/div/button')))

    # select version 2021
    version.select_by_value('2021.0')

    # click on text area
    text_area.click()

    # type the text
    text_area.send_keys("10010  –  Financial managers")

    # click the button
    search_button.click()

    # print() accepts no `source` keyword argument, so the URL is stored in a
    # variable first and then printed.
    source = driver.current_url
    print(source)
finally:
    # Always shut the browser down, even when a wait times out, so no
    # Chrome/chromedriver process is left running.
    driver.quit()

I am not sure what I have missed? I added some comments to describe the logic.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

吻安 2025-02-18 17:30:45

您可以对这个网站使用 python-requests 库。我在下面写了一个小脚本来发送请求并获取数据。您可以把查询内容作为 SearchCriteria.CodeSearch 的值发送。您可以使用 BS4、LXML 或 Scrapy 的 Selector 类,并通过 XPath 或 CSS 选择器来定位所需的值。

import re
import requests

# Cookies sent with the search request.
# NOTE(review): these look like values copied from one browser session
# (session id, load-balancer and analytics cookies); the session id will
# presumably expire -- replace with fresh values if the request is rejected.
cookies = {
    'ASP.NET_SessionId': 'u4qffsgfreddkgvdphaxo3pc',
    'BIGipServernoc_esdc_gc_ca_http_443.app~noc_esdc_gc_ca_http_443_pool': '639114412.20480.0000',
    'gpv_pthl': 'blank%20theme',
    'gpv_pc': 'Employment%20and%20Social%20Development%20Canada',
    'gpv_pqs': 'blank%20query%20string',
    'gpv_pu': 'noc.esdc.gc.ca%2FSearch%2FQuickSearchJobTitleResults',
    'gpv_pt': 'Search%20by%20job%20title%20-%20Results%20-%20Canada.ca',
    's_plt': '9.44',
    's_tp': '1206',
    'gpv_url': 'noc.esdc.gc.ca%2FSearch%2FQuickSearchJobTitleResults',
    's_ips': '741',
}

# Browser-like request headers. No 'Cookie' header is set here: the cookie
# values are passed separately through the cookies= argument of the POST.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-PK,en;q=0.9,ur-PK;q=0.8,ur;q=0.7,en-GB;q=0.6,en-US;q=0.5,sv;q=0.4,it;q=0.3',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Origin': 'https://noc.esdc.gc.ca',
    'Pragma': 'no-cache',
    'Referer': 'https://noc.esdc.gc.ca/Search/QuickSearchJobTitleResults',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Form fields of the search: the classification version and the code to
# look up. Change 'SearchCriteria.CodeSearch' to search for another code.
data = {
    'SearchCriteria.CodeVersion': '2021.0',
    'SearchCriteria.CodeSearch': '10010  –  Financial managers',
    'btn-submitSearchNOC': 'Search',
}

# Submit the search form. The timeout keeps the script from hanging forever
# on an unresponsive server, and raise_for_status() stops on an HTTP error
# instead of parsing an error page.
r = requests.post('https://noc.esdc.gc.ca/Search/QuickSearchJobTitleResults', cookies=cookies, headers=headers, data=data, timeout=30)
r.raise_for_status()

# The results page links to the profile through an "objectid=...&" query
# parameter; take the first occurrence and fail with a clear message when
# the page contains none (e.g. no result, or the session was rejected).
match = re.search(r'objectid=(.*?)&', r.text)
if match is None:
    raise RuntimeError('no objectid found in the search results page')
id_ = match.group(1)

# Fetch the profile page for that object id and print its HTML.
url = f'https://noc.esdc.gc.ca/Structure/NocProfile?objectid={id_}'
r = requests.get(url=url, headers=headers, timeout=30)
r.raise_for_status()

print(r.text)

You can use the python-requests library for this website. I have done a little script below to send requests and get the data. You can send your query to SearchCriteria.CodeSearch value. You can use BS4, LXML, or Scrapy Selector class and target the values by XPATH or a CSS selector.

import re
import requests

# Cookies sent with the search request.
# NOTE(review): these look like values copied from one browser session
# (session id, load-balancer and analytics cookies); the session id will
# presumably expire -- replace with fresh values if the request is rejected.
cookies = {
    'ASP.NET_SessionId': 'u4qffsgfreddkgvdphaxo3pc',
    'BIGipServernoc_esdc_gc_ca_http_443.app~noc_esdc_gc_ca_http_443_pool': '639114412.20480.0000',
    'gpv_pthl': 'blank%20theme',
    'gpv_pc': 'Employment%20and%20Social%20Development%20Canada',
    'gpv_pqs': 'blank%20query%20string',
    'gpv_pu': 'noc.esdc.gc.ca%2FSearch%2FQuickSearchJobTitleResults',
    'gpv_pt': 'Search%20by%20job%20title%20-%20Results%20-%20Canada.ca',
    's_plt': '9.44',
    's_tp': '1206',
    'gpv_url': 'noc.esdc.gc.ca%2FSearch%2FQuickSearchJobTitleResults',
    's_ips': '741',
}

# Browser-like request headers. No 'Cookie' header is set here: the cookie
# values are passed separately through the cookies= argument of the POST.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-PK,en;q=0.9,ur-PK;q=0.8,ur;q=0.7,en-GB;q=0.6,en-US;q=0.5,sv;q=0.4,it;q=0.3',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Origin': 'https://noc.esdc.gc.ca',
    'Pragma': 'no-cache',
    'Referer': 'https://noc.esdc.gc.ca/Search/QuickSearchJobTitleResults',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Form fields of the search: the classification version and the code to
# look up. Change 'SearchCriteria.CodeSearch' to search for another code.
data = {
    'SearchCriteria.CodeVersion': '2021.0',
    'SearchCriteria.CodeSearch': '10010  –  Financial managers',
    'btn-submitSearchNOC': 'Search',
}

# Submit the search form. The timeout keeps the script from hanging forever
# on an unresponsive server, and raise_for_status() stops on an HTTP error
# instead of parsing an error page.
r = requests.post('https://noc.esdc.gc.ca/Search/QuickSearchJobTitleResults', cookies=cookies, headers=headers, data=data, timeout=30)
r.raise_for_status()

# The results page links to the profile through an "objectid=...&" query
# parameter; take the first occurrence and fail with a clear message when
# the page contains none (e.g. no result, or the session was rejected).
match = re.search(r'objectid=(.*?)&', r.text)
if match is None:
    raise RuntimeError('no objectid found in the search results page')
id_ = match.group(1)

# Fetch the profile page for that object id and print its HTML.
url = f'https://noc.esdc.gc.ca/Structure/NocProfile?objectid={id_}'
r = requests.get(url=url, headers=headers, timeout=30)
r.raise_for_status()

print(r.text)
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文