Python:“ascii”编解码器无法对字符 u'\\u2026' 进行编码
我正在尝试通过以下代码在 python 中使用 Bing api:
#!/usr/bin/python
from bingapi import bingapi
import re
import json
import urllib
import cgi
import cgitb
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
def strip_tags2(data):
p = re.compile(r'<[^<]*?>')
q = re.compile(r'[&;!@#$%^*()]*')
data = p.sub('', data)
return q.sub('', data)
def getUrl(item):
return item['Url']
def getContent(item):
return item['Description']
def getTitle(item):
return item['Title']
def getInfo(qry, siteStr):
qryStr = qry + "+" + siteStr
#qryStr = u"%s" % qryStr.encode('UTF-8')
query = urllib.urlencode({'q' : qryStr})
url = 'http://api.bing.net/json.aspx?Appid=<myappid>&Version=2.2&Market=en-US&Query=%s&Sources=web&Web.Count=10&JsonType=raw' % (query)
search_results = urllib.urlopen(url)
j = json.loads(search_results.read())
results = j['SearchResponse']['Web']['Results']
return results
def updateRecent(qry):
f = open("recent.txt", "r")
lines = f.readlines()
f.close()
lines = lines[1:]
if len(qry) > 50: #truncate if string too long
qry = (qry[:50] + '...')
qry = strip_tags2(qry) #strip out the html if injection try
lines.append("\n%s" % qry)
f = open("recent.txt", "w")
f.writelines(lines)
f.close()
if __name__ == '__main__':
form = cgi.FieldStorage()
qry = form["qry"].value
qry = r'%s' % qry
updateRecent(qry)
siteStr = "(site:answers.yahoo.com OR site:chacha.com OR site:blurtit.com OR site:answers.com OR site:question.com OR site:answerbag.com OR site:stackexchange.com)"
print "Content-type: text/html"
print
header = open("header.html", "r")
contents = header.readlines()
header.close()
for item in contents:
print item
print """
<div id="results">
<center><h1>Results:</h1></center>
"""
for item in getInfo(siteStr, qry):
print "<h3>%s</h3>" % getTitle(item)
print "<br />"
print "%s" % getUrl(item)
print "<br />"
print "<p style=\"color:gray\">%s</p>" % getContent(item)
print "<br />"
print "</div>"
footer = open("footer.html", "r")
contents = footer.readlines()
footer.close()
for thing in contents:
print thing
我打印了一些结果,然后给出以下错误:
UnicodeEncodeError: 'ascii' codec can't encode character u'\\u2026' in position 72: ordinal not in range(128)
有人可以解释为什么会发生这种情况吗?它显然与 url 的编码方式有关,但到底哪里出了问题呢?提前致谢!
I am trying to use the Bing api in python with the following code:
#!/usr/bin/python
from bingapi import bingapi
import re
import json
import urllib
import cgi
import cgitb
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
def strip_tags2(data):
p = re.compile(r'<[^<]*?>')
q = re.compile(r'[&;!@#$%^*()]*')
data = p.sub('', data)
return q.sub('', data)
def getUrl(item):
return item['Url']
def getContent(item):
return item['Description']
def getTitle(item):
return item['Title']
def getInfo(qry, siteStr):
qryStr = qry + "+" + siteStr
#qryStr = u"%s" % qryStr.encode('UTF-8')
query = urllib.urlencode({'q' : qryStr})
url = 'http://api.bing.net/json.aspx?Appid=<myappid>&Version=2.2&Market=en-US&Query=%s&Sources=web&Web.Count=10&JsonType=raw' % (query)
search_results = urllib.urlopen(url)
j = json.loads(search_results.read())
results = j['SearchResponse']['Web']['Results']
return results
def updateRecent(qry):
f = open("recent.txt", "r")
lines = f.readlines()
f.close()
lines = lines[1:]
if len(qry) > 50: #truncate if string too long
qry = (qry[:50] + '...')
qry = strip_tags2(qry) #strip out the html if injection try
lines.append("\n%s" % qry)
f = open("recent.txt", "w")
f.writelines(lines)
f.close()
if __name__ == '__main__':
form = cgi.FieldStorage()
qry = form["qry"].value
qry = r'%s' % qry
updateRecent(qry)
siteStr = "(site:answers.yahoo.com OR site:chacha.com OR site:blurtit.com OR site:answers.com OR site:question.com OR site:answerbag.com OR site:stackexchange.com)"
print "Content-type: text/html"
print
header = open("header.html", "r")
contents = header.readlines()
header.close()
for item in contents:
print item
print """
<div id="results">
<center><h1>Results:</h1></center>
"""
for item in getInfo(siteStr, qry):
print "<h3>%s</h3>" % getTitle(item)
print "<br />"
print "%s" % getUrl(item)
print "<br />"
print "<p style=\"color:gray\">%s</p>" % getContent(item)
print "<br />"
print "</div>"
footer = open("footer.html", "r")
contents = footer.readlines()
footer.close()
for thing in contents:
print thing
I prints a few results, and then gives me the following error:
UnicodeEncodeError: 'ascii' codec can't encode character u'\\u2026' in position 72: ordinal not in range(128)
Can someone explain why this is happening? It clearly has something to do with how the url is getting encoded, but what is exactly is wrong? Thanks in advance!
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
该特定的 Unicode 字符是“水平省略号”。一个或多个 getXXXXX() 函数返回 Unicode 字符串,其中之一包含非 ASCII 字符。我建议声明输出的编码,例如:
并以该编码显式编码您的输出。
That particular Unicode character is "HORIZONTAL ELLIPSIS". One or more of your getXXXXX() functions are returning Unicode strings, one of which contains a non-ASCII character. I suggest declaring the encoding of your output, for example:
and explicitly encoding your output in that encoding.
我们需要知道抛出异常的行号,它将在回溯中。无论如何,问题是您正在从文件/URL 读取 unicode,然后将它们隐式转换为 US-ASCII,可能是在其中一个串联操作中。您应该为所有常量字符串添加前缀 u 以指示它们是 unicode 字符串,例如
We need to know the line number where the exception was thrown, it will be in the backtrace. Anyway, the problem is that you are reading unicode from the files/URLs and then implicitly converting them to US-ASCII, probably in one of the concatenation operations. You should prefix all constant strings with u to indicate that they are unicode strings, like in