How to scrape Glassdoor salary data with BeautifulSoup and Selenium


I am trying to scrape salary data from the Glassdoor website. However, the data is not being collected during the process for some reason, and I am not sure where the error is.

email = "" # your email here
password = "" # your password here

# Manual options for the city, num pages to scrape, and URL
pages = 700
cityName = "United-Kingdom"
cityURL = "https://www.glassdoor.co.uk/Salaries/uk-systems-engineer-salary-SRCH_IL.0,2_IN2_KO3,19.htm?clickSource=searchBtn"

def obj_dict(obj):
    return obj.__dict__
#enddef

def json_export(data):
    jsonFile = open(cityName + ".json", "w")
    jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
    jsonFile.close()
#enddef

def init_driver():
    driver = webdriver.Chrome("C:/Users/mm/Desktop/glassdoor/chromedriver.exe")
    driver.wait = WebDriverWait(driver, 10)
    return driver
#enddef

def login(driver, email, password):
    driver.get("http://www.glassdoor.com/profile/login_input.htm")
    try:
        user_field = driver.wait.until(EC.presence_of_element_located(
            (By.NAME, "email")))
        pw_field = driver.find_element_by_class_name("signin-password")
        login_button = driver.find_element_by_id("signInBtn")
        user_field.send_keys(email)
        user_field.send_keys(Keys.TAB)
        time.sleep(1)
        pw_field.send_keys(password)
        time.sleep(1)
        login_button.click()
    except TimeoutException:
        print("TimeoutException! Email/password field or login button not found on glassdoor.com")
#enddef

def parse_salaries_HTML(salaries, data):
    for salary in salaries:
        jobTitle = "-"
        company = "-"
        meanPay = "-"
        jobTitle = salary.find("a", { "class" : "jobTitle"}).getText().strip()
        company = salary.find("div", { "class" : "i-emp"}).getText().strip()
        try:
            meanPay = salary.find("div", { "class" : "meanPay"}).find("strong").getText().strip()
        except:
            meanPay = 'xxx'
        r = Salary.Salary(jobTitle, company, meanPay)
        data.append(r)
    return data
#enddef

def get_data(driver, URL, startPage, endPage, data, refresh):
    if (startPage > endPage):
        return data
    #endif
    print ("\nPage " + str(startPage) + " of " + str(endPage))
    currentURL = URL + "_IP" + str(startPage) + ".htm"
    time.sleep(2)
    if (refresh):
        driver.get(currentURL)
        print ("Getting " + currentURL)
    #endif
    time.sleep(2)
    HTML = driver.page_source
    soup = BeautifulSoup(HTML, "html.parser")
    salaries = soup.find("div", { "class" : ["salaryChartModule"] })
    if salaries is not None:
        salaries = salaries.find_all("div", { "class" : ["salaryRow"] }) 
    # Process further
    if (salaries):
        data = parse_salaries_HTML(salaries, data)
        print ("Page " + str(startPage) + " scraped.")
        if (startPage % 10 == 0):
            print ("\nTaking a breather for a few seconds ...")
            time.sleep(10)
        #endif
        get_data(driver, URL, startPage + 1, endPage, data, True)
    else:
        print ("Waiting ... page still loading or CAPTCHA input required")
        time.sleep(3)
        get_data(driver, URL, startPage, endPage, data, False)
    #endif
    return data
#enddef

if __name__ == "__main__":
    driver = init_driver()
    time.sleep(3)
    print ("Logging into Glassdoor account ...")
    login(driver, email, password)
    time.sleep(10)
    print ("\nStarting data scraping ...")
    data = get_data(driver, cityURL[:-4], 1, pages, [], True)
    print ("\nExporting data to " + cityName + ".json")
    json_export(data)
    driver.quit()
#endif

After running the code, the Glassdoor page opens and logs into my account. After that I get the following output:

"Logging into Glassdoor account ...
TimeoutException! Email/password field or login button not found on glassdoor.com

Starting data scraping ...

Page 1 of 700
Getting https://www.glassdoor.co.uk/Salaries/uk-systems-engineer-salary-SRCH_IL.0,2_IN2_KO3,19.htm?clickSource=searc_IP1.htm
Waiting ... page still loading or CAPTCHA input required

Page 1 of 700
Waiting ... page still loading or CAPTCHA input required"

So the problem is that it never gets any data from the website and never goes to the next page; I just keep getting the last two lines of the output over and over. I would really appreciate it if someone could tell me where the error is and how to fix it.


1 Answer

久光 (2025-01-18 14:12:43)


Your error shows that the login step is failing:

"Logging into Glassdoor account ... TimeoutException! 
Email/password field or login button not found on glassdoor.com

You are simply searching for the wrong elements in the login form.

There is no name="email"; it is name="username" (or id="inlineUserEmail").

There is no class="signin-password"; it is name="password" (or id="inlineUserPassword").

There is no id="signInBtn"; it is name="submit".

#user_field = driver.wait.until(EC.presence_of_element_located(
#    (By.NAME, "email")))  # <-- wrong
user_field = driver.wait.until(EC.presence_of_element_located(
    (By.NAME, "username")))

#pw_field = driver.find_element_by_class_name("signin-password")  # <-- wrong
#pw_field = driver.find_element_by_id("inlineUserPassword")
pw_field = driver.find_element_by_name("password")

#login_button = driver.find_element_by_id("signInBtn")  # <-- wrong
login_button = driver.find_element_by_name("submit")
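
As a side note, Selenium 4 removed the find_element_by_* helper methods, so on a current Selenium install the same lookups would use By locators. The element names below are the ones suggested above, so treat them as assumptions that can break whenever Glassdoor changes its login page:

from selenium.webdriver.common.by import By

user_field = driver.wait.until(EC.presence_of_element_located(
    (By.NAME, "username")))
pw_field = driver.find_element(By.NAME, "password")    # assumed field name, as above
login_button = driver.find_element(By.NAME, "submit")  # assumed button name, as above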

EDIT:

After login you may also be using wrong values to get the data, but I don't have a login/password to access it and check whether you are using the correct values.

And if you can see the data without logging in, then you could remove the login code and show what the real problem is.


It seems you are also using completely wrong elements for the salary data: there is no salaryChartModule and no salaryRow on the page.

You should delete this part and start from scratch.

It is also a bad idea to call get_data() inside get_data(): it creates recursion, which uses more memory and makes it hard to get the results back to the first call (every call would have to return get_data(...) to pass the data back up). A simple loop avoids this, as in the sketch below.
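
Here is a minimal sketch of get_data() rewritten as a loop instead of recursion. It keeps the asker's parse_salaries_HTML() helper and page-number URL scheme, and it keeps the original selectors only as placeholders, since, as noted above, they most likely no longer match the page:

def get_data(driver, URL, startPage, endPage, data, max_retries=5):
    page = startPage
    retries = 0
    while page <= endPage:
        driver.get(URL + "_IP" + str(page) + ".htm")
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # The asker's selectors, kept as placeholders; replace them with
        # whatever the current page actually uses.
        module = soup.find("div", {"class": "salaryChartModule"})
        salaries = module.find_all("div", {"class": "salaryRow"}) if module else []
        if salaries:
            data = parse_salaries_HTML(salaries, data)
            print("Page " + str(page) + " scraped.")
            page += 1       # advance only after a successful scrape
            retries = 0
        elif retries < max_retries:
            retries += 1    # page still loading or CAPTCHA shown; retry it
            time.sleep(3)
        else:
            break           # give up instead of retrying forever
    return data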


To get to the next page you can do

driver.find_element_by_xpath('//button[@label="Next"]').click()

but you would need to run all the code in a while True loop to repeat it for the next pages. And you would need try/except to catch the error when it can't click the link to the next page, and use break to exit the loop:

while True:
    # ... code for scraping page ...

    try:
        driver.find_element_by_xpath('//button[@label="Next"]').click()
    except Exception as ex:
        print('Exception:', ex)
        break