How do I debug this script to write web-scraped content to a CSV under the field names? (Python, Beautiful Soup)

Posted on 2025-01-11 21:52:12

I'm hoping for help in debugging this script. When I simply print the results in the iterative loop, I don't have any issues. How would I now write that content to a CSV file under the proper field names? The script is below.

'''

from bs4 import BeautifulSoup
import csv, requests, re

# The following code sets the Python ssl default cipher list - needed if the server has an outdated OpenSSL
try:
    requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST += 'HIGH:!DH:!aNULL'
except AttributeError:
    # no pyopenssl support used / needed / available
    pass
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'

# Set HTTP headers to emulate a Firefox browser and avoid bot detection
# Possibly unnecessary, but helps ensure the script functions properly
html_headers = {    
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }


# Download BMO branch entries
base_url = "https://www.yellowpages.ca/search/si/{}/Bmo+Banque+De+Montr%C3%A9al/Canada" # URL structure of entries
first_page_request = requests.get(base_url.format("1"), headers=html_headers)   # Format method used to insert page number into greater URL
first_page_content = BeautifulSoup(first_page_request.content, "html.parser")   # Grabs all page content using Beautiful Soup module and standard Python parser

# Get total HTML page count for BMO entries
page_count_class = first_page_content.find_all(class_="pageCount")[0] 
total_pages = re.search("([0-9]+)</span>", str(page_count_class)).group(1)  # Extract the last page number from the total page count at the bottom of the page

# Initial CSV handling
f = open("bmo_data.csv", "w", newline="")    # Chose w mode to write to a file
field_names = ["name", "street_address", "city", "province", "postal_code"]
writer = csv.DictWriter(f, fieldnames=field_names)   # Setting the writer and field names
writer.writeheader()

# Running iterative loop through each page of the yellowpages listings for BMO
for page in range(1, int(total_pages) + 1):
    page_request = requests.get(base_url.format(page), headers=html_headers)
    page_html_content = BeautifulSoup(page_request.content, "html.parser")

    entry_names = page_html_content.find_all(class_="jsListingName")    # Get store name
    entry_addresses = page_html_content.find_all("span", itemprop="streetAddress")  # Get street address HTML object
    entry_cities = page_html_content.find_all("span", itemprop="addressLocality")   # Get city HTML object
    entry_provinces = page_html_content.find_all("span", itemprop="addressRegion")  # Get province HTML object
    entry_postal = page_html_content.find_all("span", itemprop="postalCode")    # Get postal code HTML object

    for name, address, city, province, postalCode in zip(entry_names, entry_addresses, entry_cities, entry_provinces, entry_postal):
        # Problem line: csv.DictWriter.writerow() expects a single dict keyed by the field names
        writer.writerow(page, name.get_text(), address.get_text(), city.get_text(), province.get_text(), postalCode.get_text())

f.close()

'''

Comments (1)

似最初 2025-01-18 21:52:12

One problem with your approach is that if an element is missing, all the lists will be out of sync when you try to write them. (As written, the script also raises a TypeError: csv.DictWriter.writerow() takes a single dict keyed by the field names, not positional arguments.) A better approach is to find an HTML element that encompasses one result and then get the sub-elements for just that one entry. Each result can then be written out a row at a time.

As some items are not always present, you can use a helper function that returns an empty string whenever .find() fails to locate a suitable value.

For example:

from bs4 import BeautifulSoup
import csv, requests, re


def get_text(soup):
    ''' Return the text if available else an empty string '''
    return soup.get_text(strip=True) if soup else ''
    
    
# The following code sets the Python ssl default cipher list - needed if the server has an outdated OpenSSL
try:
    requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST += 'HIGH:!DH:!aNULL'
except AttributeError:
    # no pyopenssl support used / needed / available
    pass
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'

# Set HTTP headers to emulate a Firefox browser and avoid bot detection
# Possibly unnecessary, but helps ensure the script functions properly
html_headers = {    
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }


# Download BMO branch entries
base_url = "https://www.yellowpages.ca/search/si/{}/Bmo+Banque+De+Montr%C3%A9al/Canada" # URL structure of entries
first_page_request = requests.get(base_url.format("1"), headers=html_headers)   # Format method used to insert page number into greater URL
first_page_content = BeautifulSoup(first_page_request.content, "html.parser")   # Grabs all page content using Beautiful Soup module and standard Python parser

# Get total HTML page count for BMO entries
page_count_class = first_page_content.find_all(class_="pageCount")[0] 
total_pages = re.search("([0-9]+)</span>", str(page_count_class)).group(1)  # Extract the last page number from the total page count at the bottom of the page

# Initial CSV handling

with open("bmo_data.csv", "w", encoding="latin-1", newline="") as f_output:   # Chose w mode to write to a file
    csv_output = csv.writer(f_output)   # Setting the writer
    csv_output.writerow(["name", "street_address", "city", "province", "postal_code"])

    # Running iterative loop through each page of the yellowpages listings for BMO
    for page in range(1, int(total_pages) + 1):
        print(f"Getting page {page} of {int(total_pages)}")
        page_request = requests.get(base_url.format(page), headers=html_headers)
        page_html_content = BeautifulSoup(page_request.content, "html.parser")

        for div in page_html_content.find_all('div', class_="listing_right_section"):
            entry_name = div.find(class_="jsListingName")    # Get store name
            entry_address = div.find("span", itemprop="streetAddress")   # Get street address HTML object
            entry_city = div.find("span", itemprop="addressLocality")   # Get city HTML object
            entry_province = div.find("span", itemprop="addressRegion")  # Get province HTML object
            entry_postal = div.find("span", itemprop="postalCode")    # Get postal code HTML object
            
            csv_output.writerow([
                get_text(entry_name), 
                get_text(entry_address), 
                get_text(entry_city), 
                get_text(entry_province), 
                get_text(entry_postal)])

If the field names are fixed, you don't need to use DictWriter(); that is up to you, though (a short DictWriter sketch follows the sample output below).

This would give output starting with:

name,street_address,city,province,postal_code
BMO Bank of Montreal,270 Dundas St,London,ON,N6A 1H3
BMO Banque De Montréal,51 Notre Dame E Victorvl,Victoriaville,QC,G6P 1R6
BMO Banque De Montréal,739 Rue Conseil Sher,,QC,
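For completeness, if you do want to keep DictWriter(), the fix to the original writerow() call is to pass it a single dict keyed by the field names. Here is a minimal, self-contained sketch: the literal values are stand-ins for the get_text(...) results from the scraping loop above (taken from the first sample output row), and the output filename is just for illustration.

import csv

field_names = ["name", "street_address", "city", "province", "postal_code"]

with open("bmo_data_dictwriter.csv", "w", encoding="latin-1", newline="") as f_output:
    writer = csv.DictWriter(f_output, fieldnames=field_names)
    writer.writeheader()

    # Inside the scraping loop, each listing becomes one dict per row;
    # DictWriter.writerow() accepts exactly one such dict per call.
    writer.writerow({
        "name": "BMO Bank of Montreal",     # get_text(entry_name)
        "street_address": "270 Dundas St",  # get_text(entry_address)
        "city": "London",                   # get_text(entry_city)
        "province": "ON",                   # get_text(entry_province)
        "postal_code": "N6A 1H3",           # get_text(entry_postal)
    })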