Web Scraping Airbnb Price Data with Python

Published 2025-01-23 19:02:54


I have been trying to web scrape the Airbnb website to obtain the price, without much luck. I have successfully been able to bring in the other areas of interest (home description, home location, reviews, etc.). Below is what I've tried unsuccessfully. I suspect my issue is that the "price" on the web page is a span class, as opposed to the other fields, which are div classes, but I'm speculating.
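As a quick sanity check (using toy markup, not the real Airbnb page), `find(class_=...)` treats `span` and `div` tags the same, so the tag type itself should not be the problem:

```python
from bs4 import BeautifulSoup

# toy markup: one div class and one span class, mirroring the page structure
html = '<div class="_12nksyy">desc</div><span class="_tyxjp1">$100</span>'
soup = BeautifulSoup(html, 'html.parser')

# find(class_=...) matches any tag type, span included
print(soup.find(class_='_tyxjp1').get_text())  # prints $100
```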

The URL I'm using is: https://www.airbnb.com/rooms/52361296?category_tag=Tag%3A8173&adults=4&children=0&infants=0&check_in=2022-12-11&check_out=2022-12-18&federated_search_id=6174a078-a823-4fad-827a-7ca652b5e786&source_impression_id=p3_1645454076_foOVSAshSYvdbpbS

This can be placed as the input in the below code.

Any assistance would be greatly appreciated.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from bs4 import BeautifulSoup
import requests
from IPython.display import IFrame

input_string = input("""Enter URLs for AirBnB sites that you want webscraped AND separate by a ',' : """)
airbnb_list = []
try:
    airbnb_list = input_string.split(",")
    x = 0
    y = len(airbnb_list)
    while y >= x:
        print(x+1 , '.) ' , airbnb_list[x])
        x=x+1
        if y == x:
            break
    #print(airbnb_list[len(airbnb_list)])
except:
    print("""Please separate list by a ','""")

a = pd.DataFrame([{"Title":'', "Stars": '', "Size":'', "Check In":'', "Check Out":'', "Rules":'',
               "Location":'', "Home Type":'', "House Desc":''}])

for x in range(len(airbnb_list)):
        url = airbnb_list[x]
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        stars = soup.find(class_='_c7v1se').get_text()
        desc = soup.find(class_='_12nksyy').get_text()
        size = soup.find(class_='_jro6t0').get_text()
        #checkIn = soup.find(class_='_1acx77b').get_text()
        checkIn = soup.find(class_='_12aeg4v').get_text()
        #checkOut = soup.find(class_='_14tl4ml5').get_text()
        checkOut = soup.find(class_='_12aeg4v').get_text()
        Rules = soup.find(class_='cihcm8w dir dir-ltr').get_text()
        #location = soup.find(class_='_9ns6hl').get_text()
        location = soup.find(class_='_152qbzi').get_text()
        HomeType = soup.find(class_='_b8stb0').get_text()
        title = soup.title.string

        print('Stars: ', stars)
        print('')
        #Home Type
        print('Home Type: ', HomeType)
        print('')
        #Space Description
        print('Description: ', desc)
        print('')
        print('Rental size: ',size)
        print('')
        #CheckIn
        print('Check In: ', checkIn)
        print('')
        #CheckOut
        print('Check Out: ', checkOut)
        print('')
        #House Rules
        print('House Rules: ',Rules)
        print('')
        #print(soup.find("button", {"id":"#Id name of the button"}))
        #Home Location
        print('Home location: ', location)
        #Dates available
        #print('Dates available: ', soup.find(class_='_1yhfti2').get_text())
        print('===================================================================================')

        df = pd.DataFrame([{"Title":title, "Stars": stars, "Size":size, "Check In":checkIn, "Check Out":checkOut, "Rules":Rules,
                       "Location":location, "Home Type":HomeType, "House Desc":desc}])
        a = a.append(df)

        #Attemping to print the price tag on the website
        print(soup.find_all('span', {'class': '_tyxjp1'}))
        print(soup.find(class_='_tyxjp1').get_text())


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-10-2d9689dbc836> in <module>
      1 #print(soup.find_all('span', {'class': '_tyxjp1'}))
----> 2 print(soup.find(class_='_tyxjp1').get_text())

AttributeError: 'NoneType' object has no attribute 'get_text'
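The `AttributeError` above just means `find` returned `None`: the `_tyxjp1` span is not present in the HTML that `requests` fetched. A guard avoids the crash, though it still won't produce a price if the element is absent:

```python
from bs4 import BeautifulSoup

# simulate the static HTML requests receives: no price span present
soup = BeautifulSoup('<div class="other">no price here</div>', 'html.parser')

price_tag = soup.find('span', class_='_tyxjp1')
price = price_tag.get_text() if price_tag is not None else None
print(price)  # prints None instead of raising AttributeError
```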


Comments (1)

你在我安 2025-01-30 19:02:54


I see you are using the requests module to scrape Airbnb.
That module is versatile and works well on websites with static content.
However, it has one major drawback: it does not render content created by JavaScript.
This is a problem, because most websites these days create additional HTML elements with JavaScript once the user lands on the page.

The Airbnb price block is created exactly that way: with JavaScript.

There are many ways to scrape that kind of content.
My favourite way is to use selenium.
It's basically a library that allows you to launch a real browser and communicate with it using your programming language of choice.

Here's how you can easily use selenium.

First, set it up. Notice the headless option, which can be toggled on and off; turn it off if you want to watch the browser load the web page.

# set up Selenium (I am using Chrome here, so Chrome has to be installed on your system)
chromedriver_autoinstaller.install()
options = Options()
# comment the next line out if you want to watch the Chrome window load Airbnb - useful for debugging
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

Then, navigate to the website

# navigate to airbnb
driver.get(url)

Next, wait until the price block loads.
It might appear near-instantaneous to us, but depending on the speed of your internet connection it can take a few seconds.

# wait until the price block loads
timeout = 10
expectation = EC.presence_of_element_located((By.CSS_SELECTOR, '._tyxjp1'))
price_element = WebDriverWait(driver, timeout).until(expectation)

And finally, print the price

# print the price
print(price_element.get_attribute('innerHTML'))
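If you want the price as a number rather than markup, a regex strip works on the returned string (the `'$1,250'` value here is a hypothetical example of what `innerHTML` might contain):

```python
import re

price_html = '$1,250'  # hypothetical innerHTML value from the price element
price = float(re.sub(r'[^\d.]', '', price_html))  # strip everything but digits and dots
print(price)  # prints 1250.0
```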

I added my code to your example so you could play around with it

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import pandas as pd
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.common.by import By

input_string = input("""Enter URLs for AirBnB sites that you want webscraped AND separate by a ',' : """)
airbnb_list = []
try:
    airbnb_list = input_string.split(",")
    x = 0
    y = len(airbnb_list)
    while y >= x:
        print(x+1 , '.) ' , airbnb_list[x])
        x=x+1
        if y == x:
            break
    #print(airbnb_list[len(airbnb_list)])
except:
    print("""Please separate list by a ','""")

a = pd.DataFrame([{"Title":'', "Stars": '', "Size":'', "Check In":'', "Check Out":'', "Rules":'',
               "Location":'', "Home Type":'', "House Desc":''}])

# set up Selenium (I am using Chrome here, so Chrome has to be installed on your system)
chromedriver_autoinstaller.install()
options = Options()
# comment the next line out if you want to watch the Chrome window load Airbnb - useful for debugging
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

for x in range(len(airbnb_list)):
        url = airbnb_list[x]
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')

        # navigate to airbnb
        driver.get(url)

        # wait until the price block loads
        timeout = 10
        expectation = EC.presence_of_element_located((By.CSS_SELECTOR, '._tyxjp1'))
        price_element = WebDriverWait(driver, timeout).until(expectation)

        # print the price
        print(price_element.get_attribute('innerHTML'))

# close the browser when finished
driver.quit()

Keep in mind that your IP might eventually get banned for scraping Airbnb.
To work around that, it is always a good idea to use proxy IPs and rotate them.
Follow a rotating-proxies tutorial to avoid getting blocked.
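A minimal sketch of that rotation idea, assuming a pool of hypothetical proxy addresses; `itertools.cycle` hands out the next proxy for each request:

```python
import itertools

# hypothetical proxy addresses - substitute real ones from your provider
proxy_pool = itertools.cycle([
    'http://proxy1.example.com:8080',
    'http://proxy2.example.com:8080',
])

def next_proxies():
    """Return a requests-style proxies dict using the next proxy in the pool."""
    proxy = next(proxy_pool)
    return {'http': proxy, 'https': proxy}

# each call rotates to the next proxy in the pool, e.g.:
# requests.get(url, proxies=next_proxies(), timeout=10)
print(next_proxies()['http'])
print(next_proxies()['http'])
```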

Hope that helps!
