从已抓取的URL继续抓取(嵌套抓取)

发布于 2025-02-05 16:37:16 字数 1161 浏览 1 评论 0原文

在抓取的第一部分中,获取公园名称及详细信息,其中包括指向公园页面的链接(URL)。我想从已抓取的URL(链接)中获取电话号码,并将它们全部一起显示。

from bs4 import BeautifulSoup
import requests
import re

def get_parknames():
    """Scrape the USA trampoline-park listing and, for each park, fetch its
    detail page to print the name, state, country, link, and phone number(s).

    Side effects: performs HTTP GET requests and prints to stdout.
    """
    html_text = requests.get('http://www.jump-parks.com/en/trampoline-parks/usa/').text
    soup = BeautifulSoup(html_text, 'lxml')
    parks = soup.find_all('div', class_='grid__item')

    for park in parks:

        park_name = park.find('h3', class_='card__title').text
        state = park.find('span', class_="address__country_long")
        country = park.find('span', {'itemprop': 'addressCountry'}).text
        link = park.find('a', attrs={'href': re.compile("^https://")})

        # BUG FIX: requests.get() needs the URL string, not the <a> Tag,
        # and the response's .text must be passed to BeautifulSoup.
        html_text2 = requests.get(link['href']).text
        soup2 = BeautifulSoup(html_text2, 'lxml')
        # BUG FIX: search the detail page (soup2), not the listing page (soup).
        phones = soup2.find_all('div', class_='single-meta')

        # Collect every phone anchor's text instead of keeping only the last
        # Tag (the original also raised NameError when `phones` was empty).
        phone_numbers = []
        for phone in phones:
            anchor = phone.find('a', attrs={'href': re.compile("")})
            if anchor is not None:
                phone_numbers.append(anchor.text.strip())

        print(f'''
        Park Name: {park_name}
        State: {state}
        Country: {country}
        Link: {link['href']}
        Phone: {', '.join(phone_numbers)}
        ''')

if __name__ == '__main__':
    get_parknames()

In the first part of scraping, getting park names with details including links (URL) to the park pages. I want to get phone numbers from scraped URL's (link) and show them all together.

from bs4 import BeautifulSoup
import requests
import re

def get_parknames():
    """Scrape the USA trampoline-park listing and, for each park, fetch its
    detail page to print the name, state, country, link, and phone number(s).

    Side effects: performs HTTP GET requests and prints to stdout.
    """
    html_text = requests.get('http://www.jump-parks.com/en/trampoline-parks/usa/').text
    soup = BeautifulSoup(html_text, 'lxml')
    parks = soup.find_all('div', class_='grid__item')

    for park in parks:

        park_name = park.find('h3', class_='card__title').text
        state = park.find('span', class_="address__country_long")
        country = park.find('span', {'itemprop': 'addressCountry'}).text
        link = park.find('a', attrs={'href': re.compile("^https://")})

        # BUG FIX: requests.get() needs the URL string, not the <a> Tag,
        # and the response's .text must be passed to BeautifulSoup.
        html_text2 = requests.get(link['href']).text
        soup2 = BeautifulSoup(html_text2, 'lxml')
        # BUG FIX: search the detail page (soup2), not the listing page (soup).
        phones = soup2.find_all('div', class_='single-meta')

        # Collect every phone anchor's text instead of keeping only the last
        # Tag (the original also raised NameError when `phones` was empty).
        phone_numbers = []
        for phone in phones:
            anchor = phone.find('a', attrs={'href': re.compile("")})
            if anchor is not None:
                phone_numbers.append(anchor.text.strip())

        print(f'''
        Park Name: {park_name}
        State: {state}
        Country: {country}
        Link: {link['href']}
        Phone: {', '.join(phone_numbers)}
        ''')

if __name__ == '__main__':
    get_parknames()

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

春花秋月 2025-02-12 16:37:17

您看到的数据已加载来自不同URL的JavaScript。要获取所有页面,您可以使用下一个示例:

import json
import requests
from bs4 import BeautifulSoup


# FacetWP AJAX endpoint that serves the listing HTML the page loads via JS.
api_url = "https://www.jump-parks.com/en/wp-json/facetwp/v1/refresh"

# Request body mirroring what the site's own JavaScript posts.
payload = {
    "action": "facetwp_refresh",
    "data": {
        "extras": {"sort": "default"},
        "facets": {"listings_counts": [], "listings_pager": []},
        "first_load": 0,
        "frozen_facets": {},
        "http_params": {
            "archive_args": {"taxonomy": "job_listing_category", "term": "usa"},
            "get": [],
            "uri": "en/trampoline-parks/usa",
            "url_vars": [],
        },
        "is_bfcache": 1,
        "paged": 1,
        "soft_refresh": 1,
        "template": "listings",
    },
}


for page_number in range(1, 5):  # <-- increase number of pages here
    payload["data"]["paged"] = page_number

    response = requests.post(api_url, json=payload)
    listing_soup = BeautifulSoup(response.json()["template"], "html.parser")

    # to print all returned data uncomment next line:
    # print(json.dumps(data, indent=4))

    for card in listing_soup.select("article"):
        name = card.h3.text
        link = card.a["href"]
        state = card.select_one('[itemprop="addressRegion"]').text

        print("{:<50} {:<15} {}".format(name, state, link))

打印:

Above All Trampoline Park Liberty                  Missouri        https://www.jump-parks.com/en/trampoline-park/above-all-trampoline-park-liberty/
Adrenaline Indoor Adventure Park LLC Fishers       Indiana         https://www.jump-parks.com/en/trampoline-park/adrenaline-indoor-adventure-park-llc-fishers/
Adventure Action Park Knoxville Knoxville          Tennessee       https://www.jump-parks.com/en/trampoline-park/adventure-action-park-knoxville-knoxville/
Adventure Air Sports Kennesaw                      Georgia         https://www.jump-parks.com/en/trampoline-park/adventure-air-sports-kennesaw/
Adventure Air Sports Rock Hill                     South Carolina  https://www.jump-parks.com/en/trampoline-park/adventure-air-sports-rock-hill/
Aerosports Trampoline Parks Corona                 California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-corona/
Aerosports Trampoline Parks Fresno                 California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-fresno/
Aerosports Trampoline Parks Murrieta               California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-murrieta/
Air Insanity Indoor Trampoline Park Rochester      Minnesota       https://www.jump-parks.com/en/trampoline-park/air-insanity-indoor-trampoline-park-rochester/

... and so on.

编辑:获取电话号码:

import json
import requests
from bs4 import BeautifulSoup


# FacetWP AJAX endpoint that serves the listing HTML the page loads via JS.
api_url = "https://www.jump-parks.com/en/wp-json/facetwp/v1/refresh"

# Request body mirroring what the site's own JavaScript posts.
payload = {
    "action": "facetwp_refresh",
    "data": {
        "extras": {"sort": "default"},
        "facets": {"listings_counts": [], "listings_pager": []},
        "first_load": 0,
        "frozen_facets": {},
        "http_params": {
            "archive_args": {"taxonomy": "job_listing_category", "term": "usa"},
            "get": [],
            "uri": "en/trampoline-parks/usa",
            "url_vars": [],
        },
        "is_bfcache": 1,
        "paged": 1,
        "soft_refresh": 1,
        "template": "listings",
    },
}


# Pass 1: collect (name, state, link) for every park across the pages.
all_data = []
for payload["data"]["paged"] in range(1, 5):
    print(f'Page {payload["data"]["paged"]}')

    data = requests.post(api_url, json=payload).json()
    soup = BeautifulSoup(data["template"], "html.parser")

    # to print all returned data uncomment next line:
    # print(json.dumps(data, indent=4))

    for article in soup.select("article"):
        name, link, state = (
            article.h3.text,
            article.a["href"],
            article.select_one('[itemprop="addressRegion"]').text,
        )
        all_data.append((name, state, link))


# Pass 2: visit each park's detail page and extract its phone number.
for name, state, link in all_data:
    soup = BeautifulSoup(requests.get(link).content, "html.parser")
    # Guard against pages with no telephone element (original raised
    # AttributeError on .text when select_one returned None).
    phone_tag = soup.select_one('[itemprop="telephone"]')
    phone = phone_tag.text if phone_tag is not None else "N/A"
    # BUG FIX: the original format string had only three placeholders, so
    # the `link` argument was silently dropped; add a fourth placeholder.
    print("{:<50} {:<15} {:<15} {}".format(name, state, phone, link))

The data you see is loaded with JavaScript from different URL. To get all pages you can use next example:

import json
import requests
from bs4 import BeautifulSoup


# FacetWP AJAX endpoint that serves the listing HTML the page loads via JS.
api_url = "https://www.jump-parks.com/en/wp-json/facetwp/v1/refresh"

# Request body mirroring what the site's own JavaScript posts.
payload = {
    "action": "facetwp_refresh",
    "data": {
        "extras": {"sort": "default"},
        "facets": {"listings_counts": [], "listings_pager": []},
        "first_load": 0,
        "frozen_facets": {},
        "http_params": {
            "archive_args": {"taxonomy": "job_listing_category", "term": "usa"},
            "get": [],
            "uri": "en/trampoline-parks/usa",
            "url_vars": [],
        },
        "is_bfcache": 1,
        "paged": 1,
        "soft_refresh": 1,
        "template": "listings",
    },
}


for page_number in range(1, 5):  # <-- increase number of pages here
    payload["data"]["paged"] = page_number

    response = requests.post(api_url, json=payload)
    listing_soup = BeautifulSoup(response.json()["template"], "html.parser")

    # to print all returned data uncomment next line:
    # print(json.dumps(data, indent=4))

    for card in listing_soup.select("article"):
        name = card.h3.text
        link = card.a["href"]
        state = card.select_one('[itemprop="addressRegion"]').text

        print("{:<50} {:<15} {}".format(name, state, link))

Prints:

Above All Trampoline Park Liberty                  Missouri        https://www.jump-parks.com/en/trampoline-park/above-all-trampoline-park-liberty/
Adrenaline Indoor Adventure Park LLC Fishers       Indiana         https://www.jump-parks.com/en/trampoline-park/adrenaline-indoor-adventure-park-llc-fishers/
Adventure Action Park Knoxville Knoxville          Tennessee       https://www.jump-parks.com/en/trampoline-park/adventure-action-park-knoxville-knoxville/
Adventure Air Sports Kennesaw                      Georgia         https://www.jump-parks.com/en/trampoline-park/adventure-air-sports-kennesaw/
Adventure Air Sports Rock Hill                     South Carolina  https://www.jump-parks.com/en/trampoline-park/adventure-air-sports-rock-hill/
Aerosports Trampoline Parks Corona                 California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-corona/
Aerosports Trampoline Parks Fresno                 California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-fresno/
Aerosports Trampoline Parks Murrieta               California      https://www.jump-parks.com/en/trampoline-park/aerosports-trampoline-parks-murrieta/
Air Insanity Indoor Trampoline Park Rochester      Minnesota       https://www.jump-parks.com/en/trampoline-park/air-insanity-indoor-trampoline-park-rochester/

... and so on.

EDIT: To get phone numbers:

import json
import requests
from bs4 import BeautifulSoup


# FacetWP AJAX endpoint that serves the listing HTML the page loads via JS.
api_url = "https://www.jump-parks.com/en/wp-json/facetwp/v1/refresh"

# Request body mirroring what the site's own JavaScript posts.
payload = {
    "action": "facetwp_refresh",
    "data": {
        "extras": {"sort": "default"},
        "facets": {"listings_counts": [], "listings_pager": []},
        "first_load": 0,
        "frozen_facets": {},
        "http_params": {
            "archive_args": {"taxonomy": "job_listing_category", "term": "usa"},
            "get": [],
            "uri": "en/trampoline-parks/usa",
            "url_vars": [],
        },
        "is_bfcache": 1,
        "paged": 1,
        "soft_refresh": 1,
        "template": "listings",
    },
}


# Pass 1: collect (name, state, link) for every park across the pages.
all_data = []
for payload["data"]["paged"] in range(1, 5):
    print(f'Page {payload["data"]["paged"]}')

    data = requests.post(api_url, json=payload).json()
    soup = BeautifulSoup(data["template"], "html.parser")

    # to print all returned data uncomment next line:
    # print(json.dumps(data, indent=4))

    for article in soup.select("article"):
        name, link, state = (
            article.h3.text,
            article.a["href"],
            article.select_one('[itemprop="addressRegion"]').text,
        )
        all_data.append((name, state, link))


# Pass 2: visit each park's detail page and extract its phone number.
for name, state, link in all_data:
    soup = BeautifulSoup(requests.get(link).content, "html.parser")
    # Guard against pages with no telephone element (original raised
    # AttributeError on .text when select_one returned None).
    phone_tag = soup.select_one('[itemprop="telephone"]')
    phone = phone_tag.text if phone_tag is not None else "N/A"
    # BUG FIX: the original format string had only three placeholders, so
    # the `link` argument was silently dropped; add a fourth placeholder.
    print("{:<50} {:<15} {:<15} {}".format(name, state, phone, link))
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文