如何处理 "'ascii' codec can't encode character '\xe9'" 错误?
我正在尝试从网站下载Excel文件。下面我的代码:
import os
import requests
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin
import urllib
# Scrape the Elections Ontario results page and download every linked
# .xlsx workbook into the current working directory.

# Browser-like User-Agent header.
# NOTE(review): defined here but never passed to requests.get() below.
headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get("https://www.elections.on.ca/en/resource-centre/elections-results.html#accordion2022ge")
soup = BeautifulSoup(resp.text,"html.parser")
for link in soup.find_all('a', href=True):
    # print(link)
    if 'xlsx' in link['href']:
        print(link['href'])
        # The href is appended to the site root as-is (no extra quoting).
        url="https://www.elections.on.ca/"+link['href']
        # print(url)
        # Local file name: last path segment up to its first ".", plus ".xlsx".
        file= url.split("/")[-1].split(".")[0]+".xlsx"
        # print(file)
        # NOTE: this call raises UnicodeEncodeError when the href contains
        # non-ASCII characters (e.g. "Orléans"); see the traceback below.
        urllib.request.urlretrieve(url, file)
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-9-e1694f5ee458> in <module>
8 file= url.split("/")[-1].split(".")[0]+".xlsx"
9 # print(file)
---> 10 urllib.request.urlretrieve(url, file)
...
UnicodeEncodeError: 'ascii' codec can't encode characters in position 101-102: ordinal not in range(128).
编辑:我尝试了 "UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128)" 这个问题中的 safeStr 解决方案,但行不通。请参阅下面:
def safeStr(obj):
    """Return ``str(obj)`` with every non-ASCII character removed.

    Characters that cannot be encoded as ASCII are dropped (not replaced or
    percent-encoded), so "Orléans" becomes "Orlans".

    Returns an empty string if converting ``obj`` raises an exception.
    """
    try:
        return str(obj).encode('ascii', 'ignore').decode('ascii')
    except Exception:
        # Best-effort helper: fall back to "" on failure, but let
        # KeyboardInterrupt / SystemExit propagate instead of swallowing them.
        return ""
# Reproduce the failure with one hard-coded link. The path is already
# percent-encoded (spaces are "%20") but still contains the raw "é".
url="https://www.elections.on.ca/"+'/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx'
# print(url)
print(url)
# Local file name: last path segment up to its first ".", plus ".xlsx".
file= url.split("/")[-1].split(".")[0]+".xlsx"
# safeStr drops non-ASCII characters, so "Orléans" becomes "Orlans" in the
# URL that is actually requested.
url = safeStr(url)
print(url)
# print(file)
urllib.request.urlretrieve(url, file)
我收到的错误是:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orlans%20076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-33-01070419a054> in <module>
6 print(url)
7 # print(file)
----> 8 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
我还尝试了另一个问题"urlretrieve 无法从包含 Unicode 字符串的 URL 获取图像"中的解决方案,但同样无效:
# quote() is applied to a path that is already percent-encoded, so every "%"
# is escaped again ("%20" -> "%2520") while "é" becomes "%C3%A9".
url = "https://www.elections.on.ca/"+urllib.parse.quote('/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx')
#url = safeStr(url)
print(url)
# NOTE(review): `file` is not assigned in this snippet; presumably it is still
# bound from the previous cell.
urllib.request.urlretrieve(url, file)
我遇到的错误是:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%2520Totals%2520From%2520Official%2520Tabulation%2520-%2520Orl%C3%A9ans%2520076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-56-cfce9d1344d0> in <module>
2 #url = safeStr(url)
3 print(url)
----> 4 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
data:image/s3,"s3://crabby-images/d5906/d59060df4059a6cc364216c4d63ceec29ef7fe66" alt="扫码二维码加入Web技术交流群"
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
我认为这是一个解决方案...
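问题在于您一开始使用的 URL:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
已经经过 URL 编码(例如,空格被替换为 `%20`),但其中仍然包含非 ASCII 字符,即 `Orléans` 中的 `é`。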
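因此,您提到的那个问题中的解决方案会对我们有帮助,但如果只是直接应用 `urllib.parse.quote(...)`,空格会被二次编码为 `%2520`。这就是请求处理后的 URL 时得到 404 的原因。因此,我们需要先对 URL 进行解码(即 `%20` -> `" "`),然后再重新编码——这一次带重音符号的字符也会被编码,应该就可以正常工作了。试试这个:
path = '/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx'
url = "https://www.elections.on.ca/" + urllib.parse.quote(urllib.parse.unquote(path))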
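我们得到的结果是:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orl%C3%A9ans%20076.xlsx
……现在应该可以正常工作了!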
I think this is a solution...
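The problem is that the url you start with:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
is already url-quoted (e.g. spaces replaced by `%20`), but still contains non-ascii chars here: `Orléans`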
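So the solution from this question will help us, but just applying `urllib.parse.quote(...)` results in twice-encoded spaces as `%2520`. That is why you get a 404 when requesting the processed url. So first we need to unquote the url (i.e. `%20` -> `" "`), then quote it again - this time the accented char will be quoted too and it should work. Try this:
path = '/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx'
url = "https://www.elections.on.ca/" + urllib.parse.quote(urllib.parse.unquote(path))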
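The result we get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orl%C3%A9ans%20076.xlsx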
...should work now!