Python：“ascii”编解码器无法对字符 u'\\u2026' 进行编码

发布于 2024-10-16 03:38:37 字数 2992 浏览 1 评论 0原文

我正在尝试通过以下代码在 python 中使用 Bing api：

#!/usr/bin/python
from bingapi import bingapi  
import re
import json
import urllib
import cgi
import cgitb
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
            self.reset()
            self.fed = []
    def handle_data(self, d):
            self.fed.append(d)
    def get_data(self):
            return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def strip_tags2(data):
    p = re.compile(r'<[^<]*?>')
    q = re.compile(r'[&;!@#$%^*()]*')
    data = p.sub('', data)
    return q.sub('', data)

def getUrl(item):
    return item['Url']

def getContent(item):
    return item['Description']

def getTitle(item):
    return item['Title']

def getInfo(qry, siteStr):
    qryStr = qry + "+" + siteStr
    #qryStr = u"%s" % qryStr.encode('UTF-8')
    query = urllib.urlencode({'q' : qryStr})
    url = 'http://api.bing.net/json.aspx?Appid=<myappid>&Version=2.2&Market=en-US&Query=%s&Sources=web&Web.Count=10&JsonType=raw' % (query)
    search_results = urllib.urlopen(url)
    j = json.loads(search_results.read())
    results = j['SearchResponse']['Web']['Results']
    return results

def updateRecent(qry):
    f = open("recent.txt", "r")
    lines = f.readlines()
    f.close()
    lines = lines[1:]

    if len(qry) > 50: #truncate if string too long
            qry = (qry[:50] + '...')
    qry = strip_tags2(qry) #strip out the html if injection try

    lines.append("\n%s" % qry)
    f = open("recent.txt", "w")
    f.writelines(lines)
    f.close()

if __name__ == '__main__':
    form = cgi.FieldStorage()
    qry = form["qry"].value
    qry = r'%s' % qry

    updateRecent(qry)

    siteStr = "(site:answers.yahoo.com OR site:chacha.com OR site:blurtit.com OR site:answers.com OR site:question.com OR site:answerbag.com OR site:stackexchange.com)"

    print "Content-type: text/html"
    print

    header = open("header.html", "r")
    contents = header.readlines()
    header.close()
    for item in contents:
            print item

    print """
    <div id="results">
    <center><h1>Results:</h1></center>
    """
    for item in getInfo(siteStr, qry):
            print "<h3>%s</h3>" % getTitle(item)
            print "<br />"
            print "%s" % getUrl(item)
            print "<br />"
            print "<p style=\"color:gray\">%s</p>" % getContent(item)
            print "<br />"
    print "</div>"

    footer = open("footer.html", "r")
    contents = footer.readlines()
    footer.close()
    for thing in contents:
            print thing

我打印了一些结果，然后给出以下错误：

UnicodeEncodeError: 'ascii' codec can't encode character u'\\u2026' in position 72:    ordinal not in range(128)

有人可以解释为什么会发生这种情况吗？它显然与 url 的编码方式有关，但到底哪里出了问题呢？提前致谢！

原文

I am trying to use the Bing api in python with the following code:

#!/usr/bin/python
from bingapi import bingapi  
import re
import json
import urllib
import cgi
import cgitb
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
            self.reset()
            self.fed = []
    def handle_data(self, d):
            self.fed.append(d)
    def get_data(self):
            return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def strip_tags2(data):
    p = re.compile(r'<[^<]*?>')
    q = re.compile(r'[&;!@#$%^*()]*')
    data = p.sub('', data)
    return q.sub('', data)

def getUrl(item):
    return item['Url']

def getContent(item):
    return item['Description']

def getTitle(item):
    return item['Title']

def getInfo(qry, siteStr):
    qryStr = qry + "+" + siteStr
    #qryStr = u"%s" % qryStr.encode('UTF-8')
    query = urllib.urlencode({'q' : qryStr})
    url = 'http://api.bing.net/json.aspx?Appid=<myappid>&Version=2.2&Market=en-US&Query=%s&Sources=web&Web.Count=10&JsonType=raw' % (query)
    search_results = urllib.urlopen(url)
    j = json.loads(search_results.read())
    results = j['SearchResponse']['Web']['Results']
    return results

def updateRecent(qry):
    f = open("recent.txt", "r")
    lines = f.readlines()
    f.close()
    lines = lines[1:]

    if len(qry) > 50: #truncate if string too long
            qry = (qry[:50] + '...')
    qry = strip_tags2(qry) #strip out the html if injection try

    lines.append("\n%s" % qry)
    f = open("recent.txt", "w")
    f.writelines(lines)
    f.close()

if __name__ == '__main__':
    form = cgi.FieldStorage()
    qry = form["qry"].value
    qry = r'%s' % qry

    updateRecent(qry)

    siteStr = "(site:answers.yahoo.com OR site:chacha.com OR site:blurtit.com OR site:answers.com OR site:question.com OR site:answerbag.com OR site:stackexchange.com)"

    print "Content-type: text/html"
    print

    header = open("header.html", "r")
    contents = header.readlines()
    header.close()
    for item in contents:
            print item

    print """
    <div id="results">
    <center><h1>Results:</h1></center>
    """
    for item in getInfo(siteStr, qry):
            print "<h3>%s</h3>" % getTitle(item)
            print "<br />"
            print "%s" % getUrl(item)
            print "<br />"
            print "<p style=\"color:gray\">%s</p>" % getContent(item)
            print "<br />"
    print "</div>"

    footer = open("footer.html", "r")
    contents = footer.readlines()
    footer.close()
    for thing in contents:
            print thing

I prints a few results, and then gives me the following error:

UnicodeEncodeError: 'ascii' codec can't encode character u'\\u2026' in position 72:    ordinal not in range(128)

Can someone explain why this is happening? It clearly has something to do with how the url is getting encoded, but what is exactly is wrong? Thanks in advance!

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

情归归情 2024-10-23 03:38:37

该特定的 Unicode 字符是“水平省略号”。一个或多个 getXXXXX() 函数返回 Unicode 字符串，其中之一包含非 ASCII 字符。我建议声明输出的编码，例如：

Content-Type: text/html; charset=utf-8

并以该编码显式编码您的输出。

That particular Unicode character is "HORIZONTAL ELLIPSIS". One or more of your getXXXXX() functions are returning Unicode strings, one of which contains a non-ASCII character. I suggest declaring the encoding of your output, for example:

Content-Type: text/html; charset=utf-8

and explicitly encoding your output in that encoding.

回复收藏 0 原文

や三分注定 2024-10-23 03:38:37

我们需要知道抛出异常的行号，它将在回溯中。无论如何，问题是您正在从文件/URL 读取 unicode，然后将它们隐式转换为 US-ASCII，可能是在其中一个串联操作中。您应该为所有常量字符串添加前缀 u 以指示它们是 unicode 字符串，例如

u"\n%s" % qry

We need to know the line number where the exception was thrown, it will be in the backtrace. Anyway, the problem is that you are reading unicode from the files/URLs and then implicitly converting them to US-ASCII, probably in one of the concatenation operations. You should prefix all constant strings with u to indicate that they are unicode strings, like in