How do I loop through multiple pages to download Excel files using Selenium and Python?

Asked 2025-02-09 05:05:19


I am trying to build a web scraper that goes through a website's pages and downloads an Excel file from a dropdown menu at the bottom of each page.

The webpages only let me download the 50 locations displayed on each page; I cannot download everything at once.

I am able to download the first page's Excel file, but the following pages yield nothing else.

I get the following output after running the code provided below.

Skipped a page.
No more pages.

If I exclude the lines that trigger the download, the loop goes through each page until the end successfully.

The code below shows what I am trying to accomplish.

I would appreciate any help and advice! Thank you!

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

state = 'oklahoma'
rent_to_own = 'rent to own'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx')

industry = driver.find_element(By.ID, "txtKeyword") 
industry.send_keys(rent_to_own)

location = driver.find_element(By.ID, "txtLocation")
location.send_keys(state)

driver.find_element(By.ID, "btnSubmit").click()

driver.implicitly_wait(3)
        
def web_scrape():
        more_drawer = driver.find_element(By.XPATH, "//div[@class='more-drawer']//a[@href='/toolkit/jobs/find-businesses.aspx?keyword="+rent_to_own+"&ajax=0&location="+state+"&lang=en&Desfillall=y#Des']")
        more_drawer.click()

        driver.implicitly_wait(5)

        get_50 = Select(driver.find_element(By.ID, 'ViewPerPage'))
        get_50.select_by_value('50')

        driver.implicitly_wait(5)

        filter_description = driver.find_element(By.XPATH, "//ul[@class='filters-list']//a[@href='/toolkit/jobs/find-businesses.aspx?keyword="+rent_to_own+"&ajax=0&location="+state+"&lang=en&Desfillall=y&pagesize=50&currentpage=1&descfilter=Furniture~B~Renting ~F~ Leasing']")
        filter_description.click()
        
        while True:
            try:
                download_excel = Select(driver.find_element(By.ID, 'ResultsDownload'))
                download_excel.select_by_value('Excel')
                driver.implicitly_wait(20)
                first_50 = driver.find_element(By.XPATH, "//div[@id='relatedOccupations']//a[@onclick='hideMoreRelatedOccupations()']")
                first_50.click()
                driver.implicitly_wait(20)
                next_page = driver.find_element(By.XPATH, "//div[@class='pagination-wrap']//div//a[@class='next-page']")
                next_page.click()
                driver.implicitly_wait(20)
                print("Skipped a page.")
            except:
                print("No more pages.")
                return
web_scrape()
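
A note on the loop above: implicitly_wait only configures how long the driver polls when locating elements, it does not pause the script, and the bare except hides whatever actually fails on later pages. Below is a minimal sketch of the same paging-and-download flow using explicit waits and a narrowed exception; the locators are copied from the code above, the related-occupations ("first_50") click is omitted, the driver setup is assumed to be the same as above, and it has not been tested against the live site.

from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

wait = WebDriverWait(driver, 20)  # driver from the setup above

while True:
    # trigger the Excel download for the 50 rows on the current page
    download = Select(wait.until(
        EC.presence_of_element_located((By.ID, 'ResultsDownload'))))
    download.select_by_value('Excel')

    try:
        # advance to the next page; stop once the "next" link no longer appears
        next_page = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//div[@class='pagination-wrap']//div//a[@class='next-page']")))
    except TimeoutException:
        print("No more pages.")
        break
    next_page.click()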


Answer (爱的故事, 2025-02-16 05:05:19)


Below is something that works, though the approach could certainly be improved. I stuck with Selenium, but you don't even need to open the webpage: you can scrape directly with the correct URL parameters and Beautiful Soup. Writing every item into Excel one cell at a time is also probably not the fastest way; a better approach would be to collect the data and create the Excel workbook at the end with pandas (a rough sketch of that idea follows the code). Anyway, if you have any questions, let me know.

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import openpyxl as xl
import os
import math

cwd = os.getcwd()  # or whatever directory you want the file saved in
filename = 'test123.xlsx'  # no leading backslash: '\t...' would start with a tab character

location = 'oklahoma'
keyword = 'rent to own'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx?keyword=' + keyword + '&ajax=0&location=' + location + '&radius=50&pagesize=50&currentpage=1&lang=en')

driver.implicitly_wait(3)

wb = xl.Workbook()
ws = wb.worksheets[0]

# total record count -> number of 50-row pages
ret = driver.find_element(By.ID, 'recordNumber')
lp = math.ceil(float(ret.text) / 50)
r = 1  # next worksheet row to write

for i in range(1, lp + 1):  # range(1, lp) would skip the last page
    
    print(i)
    driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx?keyword=' + keyword + '&ajax=0&location=' + location + '&radius=50&pagesize=50&currentpage=' + str(i) + '&lang=en')
    # first (and only) results table on the page
    table_id = driver.find_elements(By.CLASS_NAME, 'res-table')[0]
    rows = table_id.find_elements(By.TAG_NAME, "tr")

    for count, row in enumerate(rows, start=1):
        if count >= 0:  # always true, so no rows are skipped
            cols = row.find_elements(By.TAG_NAME, "td")
            refs = row.find_elements(By.TAG_NAME, "a")
            # write each link as an Excel HYPERLINK formula
            for c, ref in enumerate(refs, start=1):
                ws.cell(row=r, column=c).value = '=HYPERLINK("{}", "{}")'.format(ref.get_attribute("href"), ref.text)
            # write the text of the remaining cells after the link column
            for c, col in enumerate(cols, start=1):
                if c > 1:
                    ws.cell(row=r, column=c).value = col.text
        r += 1

wb.save(os.path.join(cwd, filename))
print('done')

This returns an Excel file with 750+ rows of data, links included.
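
To illustrate the two suggestions above (skipping the browser entirely and building the workbook once at the end), here is a minimal sketch using requests, BeautifulSoup, and pandas. It assumes the results table (class 'res-table') and the 'recordNumber' element are present in the plain HTML returned for the same URL parameters used above; the fetch helper, the column names, and the output filename are placeholders, and it has not been run against the live site.

import math

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE = 'https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx'
PARAMS = {'keyword': 'rent to own', 'ajax': '0', 'location': 'oklahoma',
          'radius': '50', 'pagesize': '50', 'lang': 'en'}

def fetch(page):
    # request one 50-row results page and return the parsed HTML
    resp = requests.get(BASE, params={**PARAMS, 'currentpage': page}, timeout=30)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')

first = fetch(1)
total = float(first.find(id='recordNumber').get_text(strip=True))
pages = math.ceil(total / 50)

records = []
for page in range(1, pages + 1):
    soup = first if page == 1 else fetch(page)
    for row in soup.find(class_='res-table').find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        link = row.find('a')
        if cells:
            records.append({'name': cells[0],
                            'link': link.get('href', '') if link else '',
                            'details': ' | '.join(cells[1:])})

# build the workbook in one go instead of writing cell by cell
pd.DataFrame(records).to_excel('businesses.xlsx', index=False)
print('wrote {} rows'.format(len(records)))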
