python请求:403禁止错误,同时“ www.researchgate.net”下调pdf文件。

发布于 2025-01-31 23:32:39 字数 2205 浏览 1 评论 0原文

尝试从researchgate.net使用Python的请求下载研究文章的PDF时获取403禁止错误,尽管不需要身份验证,但是使用以下代码:

import requests
import shutil
dlink = "https://www.researchgate.net/profile/Corina-Florescu/publication/318596725_PositionRank_An_Unsupervised_Approach_to_Keyphrase_Extraction_from_Scholarly_Documents/links/5972182c0f7e9b40168fe63d/PositionRank-An-Unsupervised-Approach-to-Keyphrase-Extraction-from-Scholarly-Documents.pdf"

r = requests.get(dlink, stream=True, 
headers={
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53',
    "sec-ch-ua-platform": '"Windows"',
    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
    "sec-ch-ua-mobile": '?0',
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Dest": "document",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Cookie": "did=zp8vP15CGPkm8uJhry60BLeiaOSKZW8p51UupMljAdqP7WMnERbTEfnOEdkUTTE6; ptc=RG1.81179579133230362.1653300988; __cf_bm=VPc7zYTygZZxFutIAXPHQpRxUBRoKo5DuX4.nIiFqY4-1653300988-0-AS9iIVhVUqqi7Se51WNYTtVymQf9Bcoz4j93uLiq8GlgX73WKpoRJDlUdo0r3fsgmlKXLudkfYdv3NWQ3R10So0=; sid=JwZkeNfoeKESm08f4EtenQVKDk2nKtHGd7oj2rLJlkcwYIZBWff4LHaT6fuoKhSCkGzqyR0Hw5NoUrpTSJnIQsBdhyT7H4gjE0OLNSdOG1q6SSiq5rFPqb1UCjy8nmQS",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",    
    }
)
print(r.request.headers)
print()
if r.status_code == 200:
    with open("x.pdf", 'wb') as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
else:
    print("Error downloading the file")
    print(r.status_code) #403
    print(r.reason)      #Forbidden 

我在此处使用的标头是从Edge Browser的请求标题部分中收集的,以隐身模式收集。

感谢您的注意。

Getting 403 Forbidden error when trying to download the Pdf of a research article from researchgate.net using python's requests, although there is no authentication required, using the following codes:

import requests
import shutil
dlink = "https://www.researchgate.net/profile/Corina-Florescu/publication/318596725_PositionRank_An_Unsupervised_Approach_to_Keyphrase_Extraction_from_Scholarly_Documents/links/5972182c0f7e9b40168fe63d/PositionRank-An-Unsupervised-Approach-to-Keyphrase-Extraction-from-Scholarly-Documents.pdf"

r = requests.get(dlink, stream=True, 
headers={
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53',
    "sec-ch-ua-platform": '"Windows"',
    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
    "sec-ch-ua-mobile": '?0',
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Dest": "document",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Cookie": "did=zp8vP15CGPkm8uJhry60BLeiaOSKZW8p51UupMljAdqP7WMnERbTEfnOEdkUTTE6; ptc=RG1.81179579133230362.1653300988; __cf_bm=VPc7zYTygZZxFutIAXPHQpRxUBRoKo5DuX4.nIiFqY4-1653300988-0-AS9iIVhVUqqi7Se51WNYTtVymQf9Bcoz4j93uLiq8GlgX73WKpoRJDlUdo0r3fsgmlKXLudkfYdv3NWQ3R10So0=; sid=JwZkeNfoeKESm08f4EtenQVKDk2nKtHGd7oj2rLJlkcwYIZBWff4LHaT6fuoKhSCkGzqyR0Hw5NoUrpTSJnIQsBdhyT7H4gjE0OLNSdOG1q6SSiq5rFPqb1UCjy8nmQS",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",    
    }
)
print(r.request.headers)
print()
if r.status_code == 200:
    with open("x.pdf", 'wb') as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
else:
    print("Error downloading the file")
    print(r.status_code) #403
    print(r.reason)      #Forbidden 

The header I'm using here is collected from the Edge browser's request header section in incognito mode.

Thanks for your attention.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

尤怨 2025-02-07 23:32:39

禁止是由于TLS验证失败而造成的。我必须更改TLS适配器才能使其正常工作。

import ssl
import requests
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
from urllib3.util import ssl_


CIPHERS = "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-SHA256:AES256-SHA"

class TLSAdapter(HTTPAdapter):
    
    def __init__(self, ssl_options=0, *args, **kwargs):
        self.ssl_options = ssl_options
        super().__init__(*args, **kwargs)
    
    def init_poolmanager(self, *args, **kwargs):
        context = ssl_.create_urllib3_context(ciphers=CIPHERS, cert_reqs=ssl.CERT_REQUIRED, options=self.ssl_options)
        self.poolmanager = PoolManager(*args, ssl_context=context, **kwargs)


url = "https://www.researchgate.net/profile/Corina-Florescu/publication/318596725_PositionRank_An_Unsupervised_Approach_to_Keyphrase_Extraction_from_Scholarly_Documents/links/5972182c0f7e9b40168fe63d/PositionRank-An-Unsupervised-Approach-to-Keyphrase-Extraction-from-Scholarly-Documents.pdf"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
}
adapter = TLSAdapter(ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)

with requests.session() as session:
    session.mount("https://www.researchgate.net/", adapter)
    response = session.get(url, headers=headers)
    print(response.status_code)  # 200

Forbidden was caused because of TLS verification failure. I had to change the TLS adapter to make it work.

import ssl
import requests
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
from urllib3.util import ssl_


CIPHERS = "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-SHA256:AES256-SHA"

class TLSAdapter(HTTPAdapter):
    
    def __init__(self, ssl_options=0, *args, **kwargs):
        self.ssl_options = ssl_options
        super().__init__(*args, **kwargs)
    
    def init_poolmanager(self, *args, **kwargs):
        context = ssl_.create_urllib3_context(ciphers=CIPHERS, cert_reqs=ssl.CERT_REQUIRED, options=self.ssl_options)
        self.poolmanager = PoolManager(*args, ssl_context=context, **kwargs)


url = "https://www.researchgate.net/profile/Corina-Florescu/publication/318596725_PositionRank_An_Unsupervised_Approach_to_Keyphrase_Extraction_from_Scholarly_Documents/links/5972182c0f7e9b40168fe63d/PositionRank-An-Unsupervised-Approach-to-Keyphrase-Extraction-from-Scholarly-Documents.pdf"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
}
adapter = TLSAdapter(ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)

with requests.session() as session:
    session.mount("https://www.researchgate.net/", adapter)
    response = session.get(url, headers=headers)
    print(response.status_code)  # 200
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文