如何使用 beautifulsoup 和 selenium 抓取 glassdoor 薪资数据
我正在尝试从 glassdoor 网站上抓取薪资数据。但问题是,在这个过程中,由于某些原因,数据没有被收集。而且我不确定错误在哪里。
email = "" # your email here
password = "" # your password here
# Manual options for the city, num pages to scrape, and URL
pages = 700
cityName = "United-Kingdom"
cityURL = "https://www.glassdoor.co.uk/Salaries/uk-systems-engineer-salary-SRCH_IL.0,2_IN2_KO3,19.htm?clickSource=searchBtn"
def obj_dict(obj):
return obj.__dict__
#enddef
def json_export(data):
jsonFile = open(cityName + ".json", "w")
jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
jsonFile.close()
#enddef
def init_driver():
driver = webdriver.Chrome("C:/Users/mm/Desktop/glassdoor/chromedriver.exe")
driver.wait = WebDriverWait(driver, 10)
return driver
#enddef
def login(driver, email, password):
driver.get("http://www.glassdoor.com/profile/login_input.htm")
try:
user_field = driver.wait.until(EC.presence_of_element_located(
(By.NAME, "email")))
pw_field = driver.find_element_by_class_name("signin-password")
login_button = driver.find_element_by_id("signInBtn")
user_field.send_keys(email)
user_field.send_keys(Keys.TAB)
time.sleep(1)
pw_field.send_keys(password)
time.sleep(1)
login_button.click()
except TimeoutException:
print("TimeoutException! Email/password field or login button not found on glassdoor.com")
#enddef
def parse_salaries_HTML(salaries, data):
for salary in salaries:
jobTitle = "-"
company = "-"
meanPay = "-"
jobTitle = salary.find("a", { "class" : "jobTitle"}).getText().strip()
company = salary.find("div", { "class" : "i-emp"}).getText().strip()
try:
meanPay = salary.find("div", { "class" : "meanPay"}).find("strong").getText().strip()
except:
meanPay = 'xxx'
r = Salary.Salary(jobTitle, company, meanPay)
data.append(r)
return data
#enddef
def get_data(driver, URL, startPage, endPage, data, refresh):
if (startPage > endPage):
return data
#endif
print ("\nPage " + str(startPage) + " of " + str(endPage))
currentURL = URL + "_IP" + str(startPage) + ".htm"
time.sleep(2)
#endif
if (refresh):
driver.get(currentURL)
print ("Getting " + currentURL)
#endif
time.sleep(2)
HTML = driver.page_source
soup = BeautifulSoup(HTML, "html.parser")
salaries = soup.find("div", { "class" : ["salaryChartModule"] })
if salaries is not None:
salaries = salaries.find_all("div", { "class" : ["salaryRow"] })
# Process futher
if (salaries):
data = parse_salaries_HTML(salaries, data)
print ("Page " + str(startPage) + " scraped.")
if (startPage % 10 == 0):
print ("\nTaking a breather for a few seconds ...")
time.sleep(10)
#endif
get_data(driver, URL, startPage + 1, endPage, data, True)
else:
print ("Waiting ... page still loading or CAPTCHA input required")
time.sleep(3)
get_data(driver, URL, startPage, endPage, data, False)
#endif
return data
#enddef
if __name__ == "__main__":
driver = init_driver()
time.sleep(3)
print ("Logging into Glassdoor account ...")
login(driver, email, password)
time.sleep(10)
print ("\nStarting data scraping ...")
data = get_data(driver, cityURL[:-4], 1, pages, [], True)
print ("\nExporting data to " + cityName + ".json")
json_export(data)
driver.quit()
#endif
运行代码后,Glassdoor 页面将打开并登录我的帐户。之后我得到以下结果:
“登录 Glassdoor 帐户...... 超时异常!在 glassdoor.com 上找不到电子邮件/密码字段或登录按钮
开始数据抓取 ...
第 1 页,共 700 页 获取 https://www.glassdoor.co.uk/Salaries/uk-systems-engineer-salary-SRCH_IL.0,2_IN2_KO3,19.htm?clickSource=searc_IP1.htm 正在等待...页面仍在加载或需要输入验证码
第 1 页,共 700 页 等待...页面仍在加载或需要验证码输入“
所以问题是它永远不会从网站获取任何数据,也永远不会进入下一页。我只是一遍又一遍地获取结果的最后两行。我真的会如果有人能告诉我错误在哪里以及如何解决它,我将不胜感激。
I am trying to scrape salary data from glassdoor website. However, the problem is that during the process the data is not collected for some reasons. And I am not sure where the error is.
email = "" # your email here
password = "" # your password here
# Manual options for the city, num pages to scrape, and URL
pages = 700
cityName = "United-Kingdom"
cityURL = "https://www.glassdoor.co.uk/Salaries/uk-systems-engineer-salary-SRCH_IL.0,2_IN2_KO3,19.htm?clickSource=searchBtn"
def obj_dict(obj):
return obj.__dict__
#enddef
def json_export(data):
jsonFile = open(cityName + ".json", "w")
jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
jsonFile.close()
#enddef
def init_driver():
driver = webdriver.Chrome("C:/Users/mm/Desktop/glassdoor/chromedriver.exe")
driver.wait = WebDriverWait(driver, 10)
return driver
#enddef
def login(driver, email, password):
driver.get("http://www.glassdoor.com/profile/login_input.htm")
try:
user_field = driver.wait.until(EC.presence_of_element_located(
(By.NAME, "email")))
pw_field = driver.find_element_by_class_name("signin-password")
login_button = driver.find_element_by_id("signInBtn")
user_field.send_keys(email)
user_field.send_keys(Keys.TAB)
time.sleep(1)
pw_field.send_keys(password)
time.sleep(1)
login_button.click()
except TimeoutException:
print("TimeoutException! Email/password field or login button not found on glassdoor.com")
#enddef
def parse_salaries_HTML(salaries, data):
for salary in salaries:
jobTitle = "-"
company = "-"
meanPay = "-"
jobTitle = salary.find("a", { "class" : "jobTitle"}).getText().strip()
company = salary.find("div", { "class" : "i-emp"}).getText().strip()
try:
meanPay = salary.find("div", { "class" : "meanPay"}).find("strong").getText().strip()
except:
meanPay = 'xxx'
r = Salary.Salary(jobTitle, company, meanPay)
data.append(r)
return data
#enddef
def get_data(driver, URL, startPage, endPage, data, refresh):
if (startPage > endPage):
return data
#endif
print ("\nPage " + str(startPage) + " of " + str(endPage))
currentURL = URL + "_IP" + str(startPage) + ".htm"
time.sleep(2)
#endif
if (refresh):
driver.get(currentURL)
print ("Getting " + currentURL)
#endif
time.sleep(2)
HTML = driver.page_source
soup = BeautifulSoup(HTML, "html.parser")
salaries = soup.find("div", { "class" : ["salaryChartModule"] })
if salaries is not None:
salaries = salaries.find_all("div", { "class" : ["salaryRow"] })
# Process futher
if (salaries):
data = parse_salaries_HTML(salaries, data)
print ("Page " + str(startPage) + " scraped.")
if (startPage % 10 == 0):
print ("\nTaking a breather for a few seconds ...")
time.sleep(10)
#endif
get_data(driver, URL, startPage + 1, endPage, data, True)
else:
print ("Waiting ... page still loading or CAPTCHA input required")
time.sleep(3)
get_data(driver, URL, startPage, endPage, data, False)
#endif
return data
#enddef
if __name__ == "__main__":
driver = init_driver()
time.sleep(3)
print ("Logging into Glassdoor account ...")
login(driver, email, password)
time.sleep(10)
print ("\nStarting data scraping ...")
data = get_data(driver, cityURL[:-4], 1, pages, [], True)
print ("\nExporting data to " + cityName + ".json")
json_export(data)
driver.quit()
#endif
After running the code the Glassdoor page is opening and logging in my account. After that I get the following result:
"Logging into Glassdoor account ...
TimeoutException! Email/password field or login button not found on glassdoor.com
Starting data scraping ...
Page 1 of 700
Getting https://www.glassdoor.co.uk/Salaries/uk-systems-engineer-salary-SRCH_IL.0,2_IN2_KO3,19.htm?clickSource=searc_IP1.htm
Waiting ... page still loading or CAPTCHA input required
Page 1 of 700
Waiting ... page still loading or CAPTCHA input required"
So the problem is that it never gets any data from the website and never goes to the next page. I just keep getting last two lined of the result over and over. I would really appreciate if someone could tell me where the error is and how to fix that.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
您的错误表明登录有问题:
您只需在登录表单中搜索错误的元素即可。
没有
name="email"
,而是name="username"
(或id="inlineUserEmail"
)。没有
class="signin-password"
但name="password"
(或id="inlineUserPassword"
)没有
name="signInBtn"
但name="submit"
编辑:
登录后,您也可能使用错误的值来获取数据 - 但我没有
login
/password
访问它并检查您是否使用了正确的值。如果您无需登录即可查看数据,那么您可以删除登录代码并显示真正的问题是什么。
看来您使用了完全错误的元素。
没有
salaryChartModule
,也没有salaryRow
。您应该删除这部分并从头开始。
在
get_data()
内部运行get_data()
是错误的想法 - 它会进行递归,并且可能会使用更多内存,并且很难回到开头。它总是需要返回get_data()
来将数据发送到开头。要获取下一页,您可以这样做
,但需要在某个循环
while True
中运行所有代码,以便在下一页重复它。当无法点击下一页的链接时,您需要try/ except
来捕获错误,并使用break
退出循环Your error shows that it has problem to login:
You simply search wrong elements on in login form.
There is no
name="email"
butname="username"
(orid="inlineUserEmail"
).There is no
class="signin-password"
butname="password"
(orid="inlineUserPassword"
)There is no
name="signInBtn"
butname="submit"
EDIT:
After login you may use also wrong values to get data - but I don't have
login
/password
access it and to check if you use correct values.And if you can see data without login then you could remove code for login and show what is the real problem.
It seems you use totally wrong elements.
There is no
salaryChartModule
, and there is nosalaryRow
.You should delete this part and start from the beginning.
And it is wrong idea to run
get_data()
insideget_data()
- it make recursion and it may use more memory, and it hard to get back to the beginning. It need alwaysreturn get_data()
to send data to the beginning.To get next page you can do
but it would need to run all code in some loop
while True
to repeate it for next pages. And you would needtry/except
to catch error when it can't click link to next page, and usebreak
to exit loop