How to make a web scraper more human-like?

Asked 2025-02-04 10:13:21

I have a web scraping script written in Python, and when I use it on a website it blocks me and says, "You're getting the page very fast. You might be a bot."

I tried adding time.sleep() to delay the code, but it always gets blocked. Is there any way to make this code a little slower?

I'm not sure why it says that. Isn't my script doing the same thing as viewing the page on the website? What does the browser load that keeps it from being labelled a bot, while my script is?

from bs4 import BeautifulSoup
import re
import requests
import time
import sys
import csv

FIXED_WEB = "web.net"


def load_car_pages(seq, limit, i):
    time.sleep(10)
    html_web = requests.get(
        f"{FIXED_WEB}/homepage",
        headers={
            'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0',
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,/;q=0.8",
            'Accept-Language': "en-US,en;q=0.5",
            'Accept-Encoding': "gzip, deflate",
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Te': 'trailers'
        }).text
    time.sleep(10)
    sup_me_patate = BeautifulSoup(html_web, 'lxml')
    headers = sup_me_patate.find_all('div', class_='sui-AtomCard-info') # find headers
    print(f"{headers}")

    for a in headers:
        string = str(a)
        href_pos = [m.start() for m in re.finditer('href=', string)]
        for pos in href_pos:
            slicing = string[pos + 6: string.find('"', pos + 6)]

            print(f"For Link: {slicing}")
            web_link = FIXED_WEB + slicing
            print(f"LINK: {web_link}")
            # limit = 25
            # i = 0
            time.sleep(10)
            try:
                car_web = requests.get(web_link, headers={
                    'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0',
                    'Origin': FIXED_WEB,
                    "Access-Control-Request-Method": "GET",
                    'Accept-Language': "en-US,en;q=0.5",
                    'Accept-Encoding': "gzip, deflate",
                    'Request-Domain': 'web.net',
                    'Site': 'car',
                    'Referer': web_link,
                    "Sec-Fetch-Dest": "empty",
                    "Sec- Fetch-Mode": "cors",
                    "Sec-Fetch-Site": "same-origin",
                    "Te": "trailers",
                    'Connection': 'close'}).text

                soup = BeautifulSoup(web_link, "lxml")

                # with open(soup.title.string + ".html", 'w') as coolhtml:
                #     string = str(soup)
                #     coolhtml.write(string)
                #     sys.exit(0)
                name = soup.find_all('h2',
                                     class_="mt-TitleBasic-title mt-TitleBasic-title--xs mt-TitleBasic-title--black")
                address = soup.find('p', class_="mt-CardUser-location").text
                phone_number = soup.find('span', class_='mt-LeadPhoneCall-linkText mt-LeadPhoneCall-linkText--small')\
                    .text

                j = 0
                for b in name:
                    if j == 8:
                        real_name = b.text
                        print(b.text)
                    j += 1
   
                # some constants
                NAME = real_name
                ADDRESS = address
                PHONE_NUMBER = phone_number

                header = ['Name', 'Address', 'Phone Number']
                data = [NAME, ADDRESS, PHONE_NUMBER]  # keep the row in the same order as the header

                with open("info.csv", 'a', encoding='UTF8', newline='') as csv_numbers:
                    writer = csv.writer(csv_numbers)
                    writer.writerow(data)

                i += 1
                print(i)
                if i == limit:
                    print("it prints...")
                    limit += 35
                    seq += 1
                    load_car_pages(seq, limit, i)

            except Exception as ACX:
                print(f"Bro Exception occurred::{ACX}...")
            # continue


def main():
    # get_car_links()
    load_car_pages(0, 35, 0)


main()
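
A note on the "make this code a little slower" part of the question above: the script sleeps for exactly 10 seconds before every request, and a perfectly regular rhythm is itself easy to flag. Below is a minimal sketch of a jittered delay helper, assuming nothing beyond the placeholder web.net domain already used above; human_pause is a made-up name for illustration.

import random
import time


def human_pause(base=8.0, jitter=7.0):
    """Sleep between 8 and 15 seconds instead of a fixed 10, so requests are not evenly spaced."""
    time.sleep(base + random.uniform(0, jitter))

# Drop-in replacement for the fixed time.sleep(10) calls in load_car_pages().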


Comments (1)

你的往事 2025-02-11 10:13:21


You're asking too many overloaded questions all at once (even though they're somewhat related in your particular context). I'll only answer the one in your title: How to make a web scraper more human-like?

That question is too open-ended to be definitively answered. New methods of bot detection will continue to be developed, as well as ways to bypass them.

That being said: a couple highlights off the top of my head:

  • Browsers send & receive a lot of metadata, like user agent, headers, cookies, runtime JavaScript, etc. Bare HTTP requests look very different from that.
  • Browser automation systems behave very differently from humans by default: they don't really use the mouse, they click buttons instantly at their exact centers, etc
  • Browser automation detection and detection bypass is a rabbit hole: Can a website detect when you are using Selenium with chromedriver?