python: 获取谷歌AdSense收益报告

发布于 2024-10-26 03:51:48 字数 4005 浏览 1 评论 0原文

我需要一个 python 脚本来获取 google adsense 收入，我发现了 adsense scraper： http://pypi.python.org/pypi/adsense_scraper/0.5 它使用 Twill 和 html5lib 来抓取 google adsense 收入数据。当我使用它时，我收到此错误消息：

Traceback (most recent call last):
  File "adsense_scraper.py", line 163, in <module>
    data = main()
  File "adsense_scraper.py", line 154, in main
    b = get_adsense(login, password)
  File "adsense_scraper.py", line 128, in get_adsense
    b.submit()
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
    self._journey('open', request)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
    r = func(*args, **kwargs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
    response = meth(req, response)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
   "http", request, response, code, msg, hdrs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
    result = apply(self._call_chain, args)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
    result = func(*args)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
    return self.parent.open(new)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
    response = meth(req, response)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
    "refresh", msg, hdrs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
    result = apply(self._call_chain, args)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
    result = func(*args)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
    return self.parent.open(new)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
    response = urlopen(self, req, data)
  File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
  File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>

所以重要的是：

urllib2.URLError: <urlopen error unknown url type: 'http>

有人可以告诉我错误在哪里吗？有没有更好的方法通过 python 获取数据？谢谢

原文

I need a python script that gets the google adsense earnings and I found adsense scraper:
http://pypi.python.org/pypi/adsense_scraper/0.5
It uses Twill and html5lib to scrape google adsense earnings data. When I use it I get this error message:

Traceback (most recent call last):
  File "adsense_scraper.py", line 163, in <module>
    data = main()
  File "adsense_scraper.py", line 154, in main
    b = get_adsense(login, password)
  File "adsense_scraper.py", line 128, in get_adsense
    b.submit()
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
    self._journey('open', request)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
    r = func(*args, **kwargs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
    response = meth(req, response)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
   "http", request, response, code, msg, hdrs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
    result = apply(self._call_chain, args)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
    result = func(*args)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
    return self.parent.open(new)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
    response = meth(req, response)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
    "refresh", msg, hdrs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
    result = apply(self._call_chain, args)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
    result = func(*args)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
    return self.parent.open(new)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
    response = urlopen(self, req, data)
  File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
  File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>

So the important thing is:

urllib2.URLError: <urlopen error unknown url type: 'http>

Can somebody tell me where the error is? Is there even a better way to get the data via python? Thanks

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

捂风挽笑 2024-11-02 03:51:48

该包有几个错误，您只提到了第一个

1）twill 包无法正确处理谷歌的重定向，将

    newurl = newurl.strip( "'" )

添加到 twill/other_packages/_mechanize_dist/_http.py:108 中以下语句之前

    newurl = _rfc3986.clean_url(newurl, "latin-1")

即可修复

2）您必须在 adsense 中设置正确的语言——英文

3）原来的adsense_scraper有几个问题

#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill

Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master


Usage::

    from adsense_scraper import get_adsense, get_time_period
    b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
    rows = get_time_period(b, 'yesterday')
    # The summary data is always the first row with channel == ''
    print 'I earned this much yesterday: $%(earnings)s' % rows[0]

"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree

try:
    from html5lib import HTMLParser
    import twill.commands
except ImportError:
    print >>sys.stderr, """\
adsense_scraper has dependencies::

    Twill 0.9 http://twill.idyll.org/
    html5lib 0.11 http://code.google.com/p/html5lib/

Try this::

    $ easy_install twill html5lib
"""
    raise SystemExit()

__version__ = '0.5'

# Google Accounts login page opened by get_adsense(). The query string selects
# the AdSense service and login template (ltmpl=adsense), forces the English
# UI (hl=en_US) and continues to the AdSense gaiaauth2 endpoint after login.
# NOTE: the parameter separator before "ltmpl" must be a literal "&"; an
# HTML-entity decode of "&lt" had turned it into "<" and broken the URL.
SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
# Report overview page; get_time_period() appends one TIME_PERIODS entry as
# the value of the timePeriod parameter.
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

# Time periods fetched by main(), in this order.
TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]


def parse_decimal(s):
    """Validate a human-readable number and return it without euro signs.

    Leading and trailing euro signs are stripped from *s*. A normalised copy
    (decimal comma turned into a point, trailing ``%`` and leading ``$``
    removed) is then checked for being numeric.

    Returns the euro-stripped text itself when the normalised copy parses as
    an int or a float, so the original formatting (decimal comma, ``%``,
    ``$``) is preserved. Otherwise the result of ``decimal.Decimal`` on the
    normalised copy is returned.

    Raises decimal.InvalidOperation when the text is not numeric;
    parse_summary_table relies on this to skip non-data rows.

    """
    light_stripped = s.strip(u'\u20ac')
    # Normalised copy used only for validation; it is never returned as-is.
    stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
    try:
        int(stripped)
        return light_stripped
    except ValueError:
        pass
    try:
        float(stripped)
        return light_stripped
    except ValueError:
        # Last resort: Decimal either accepts the text or raises
        # decimal.InvalidOperation, which signals "not a number" to callers.
        return decimal.Decimal(stripped)


def parse_summary_table(doc):
    """Extract the data rows of the AdSense ``summarytable`` from *doc*.

    *doc* is an ElementTree document; the first ``<table>`` element whose
    ``id`` attribute is ``summarytable`` is used.

    Returns a list with one dict per accepted row, keyed by
    ``'impressions'``, ``'clicks'``, ``'ctr'``, ``'ecpm'`` and
    ``'earnings'``. Each value is whatever parse_decimal returns for the
    corresponding cell text.

    A row is accepted only when, after empty cells are discarded, exactly
    five cells remain and parse_decimal accepts every one of them; all other
    rows (headers, separators, ...) are silently skipped.

    Raises ValueError if the document contains no ``summarytable``.

    """
    for t in doc.findall('.//table'):
        if t.attrib.get('id') == 'summarytable':
            break
    else:
        raise ValueError("summary table not found")

    res = []
    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    for row in t.findall('.//tr'):
        celltext = []
        for c in row.findall('td'):
            tail = ''
            link = c.find('a')
            # adsense inserts an empty span if a row has a period in it, so
            # take the first child of the link and append its tail text.
            # Explicit None/len checks replace Element truth-testing and
            # getchildren(), both of which are deprecated.
            if link is not None and len(link):
                tail = link[0].tail or ''
            celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))

        # Real lists (not lazy filter/map objects) so that len() works and
        # parse_decimal runs inside the try block below.
        celltext = [text for text in celltext if text != ""]
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = [parse_decimal(text) for text in celltext]
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))

    return res


def get_adsense(login, password):
    """Log in to AdSense and return the twill browser that holds the session.

    Opens SERVICE_LOGIN_BOX_URL, fills *login* and *password* into the first
    form that has both an ``Email`` and a ``Passwd`` field, and submits it.

    The browser is returned right after the submit, so it is not necessarily
    positioned on the page you want data from.

    Raises ValueError if no form on the page accepts both fields.

    """
    browser = twill.commands.get_browser()
    browser.go(SERVICE_LOGIN_BOX_URL)

    login_form = None
    for candidate in browser.get_all_forms():
        try:
            candidate['Email'] = login
            candidate['Passwd'] = password
        except ValueError:
            # This form lacks one of the credential fields; try the next one.
            continue
        login_form = candidate
        break

    if login_form is None:
        raise ValueError("Could not find login form on page")

    # Select by identity so that exactly the form filled in above is submitted.
    browser._browser.select_form(predicate=lambda f: f is login_form)
    browser.submit()
    return browser


def get_time_period(b, period):
    """Fetch the overview report for *period* and return its parsed rows.

    *b* should be the browser returned by get_adsense. *period* is appended
    to OVERVIEW_URL and must be a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``,
    ``'lastmonth'``, ``'sincelastpayment'``.

    Returns the result of parse_summary_table for the fetched page.

    """
    report_url = OVERVIEW_URL + period
    b.go(report_url)
    # Building straight into cElementTree is not reliable enough, so the page
    # is parsed by html5lib first, serialised to XML, and only then re-read
    # by cElementTree.
    parser = HTMLParser()
    html5_tree = parser.parse(b.get_html())
    summary_doc = cElementTree.fromstring(html5_tree.toxml())
    return parse_summary_table(summary_doc)


def main():
    """Command-line entry point: log in and pretty-print every report.

    Expects exactly two command-line arguments, LOGIN and PASSWORD. Fetches
    the summary table for each entry of TIME_PERIODS, pretty-prints the
    result and returns it as a dict mapping period name to parsed rows.

    Raises SystemExit with a usage message when the argument count is wrong.

    """
    try:
        login, password = sys.argv[1:]
    except ValueError:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    # Send twill's own output into a throwaway buffer while scraping.
    twill.set_output(StringIO())
    try:
        twill.commands.reset_browser()
        b = get_adsense(login, password)
        data = {}
        for period in TIME_PERIODS:
            data[period] = get_time_period(b, period)
        pprint.pprint(data)
    finally:
        # Undo the redirection even when login or scraping raises, so a
        # caller that imported main() is not left with silenced twill output.
        twill.set_output(None)
    return data

# Run the scraper only when this file is executed as a script; importing the
# module has no side effects beyond the definitions above.
if __name__ == '__main__':
    data = main()

there are several errors with the package, you mentioned only the first one

1) twill package does not handle google's redirects correctly, adding

    newurl = newurl.strip( "'" )

to twill/other_packages/_mechanize_dist/_http.py:108 before

    newurl = _rfc3986.clean_url(newurl, "latin-1")

fixes that

2) you have to have the correct language set in adsense - English

3) there are several problems in the original adsense_scraper

#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill

Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master


Usage::

    from adsense_scraper import get_adsense, get_time_period
    b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
    rows = get_time_period(b, 'yesterday')
    # The summary data is always the first row with channel == ''
    print 'I earned this much yesterday: $%(earnings)s' % rows[0]

"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree

try:
    from html5lib import HTMLParser
    import twill.commands
except ImportError:
    print >>sys.stderr, """\
adsense_scraper has dependencies::

    Twill 0.9 http://twill.idyll.org/
    html5lib 0.11 http://code.google.com/p/html5lib/

Try this::

    $ easy_install twill html5lib
"""
    raise SystemExit()

__version__ = '0.5'

# Google Accounts login page opened by get_adsense(). The query string selects
# the AdSense service and login template (ltmpl=adsense), forces the English
# UI (hl=en_US) and continues to the AdSense gaiaauth2 endpoint after login.
# NOTE: the parameter separator before "ltmpl" must be a literal "&"; an
# HTML-entity decode of "&lt" had turned it into "<" and broken the URL.
SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
# Report overview page; get_time_period() appends one TIME_PERIODS entry as
# the value of the timePeriod parameter.
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

# Time periods fetched by main(), in this order.
TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]


def parse_decimal(s):
    """Validate a human-readable number and return it without euro signs.

    Leading and trailing euro signs are stripped from *s*. A normalised copy
    (decimal comma turned into a point, trailing ``%`` and leading ``$``
    removed) is then checked for being numeric.

    Returns the euro-stripped text itself when the normalised copy parses as
    an int or a float, so the original formatting (decimal comma, ``%``,
    ``$``) is preserved. Otherwise the result of ``decimal.Decimal`` on the
    normalised copy is returned.

    Raises decimal.InvalidOperation when the text is not numeric;
    parse_summary_table relies on this to skip non-data rows.

    """
    light_stripped = s.strip(u'\u20ac')
    # Normalised copy used only for validation; it is never returned as-is.
    stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
    try:
        int(stripped)
        return light_stripped
    except ValueError:
        pass
    try:
        float(stripped)
        return light_stripped
    except ValueError:
        # Last resort: Decimal either accepts the text or raises
        # decimal.InvalidOperation, which signals "not a number" to callers.
        return decimal.Decimal(stripped)


def parse_summary_table(doc):
    """Extract the data rows of the AdSense ``summarytable`` from *doc*.

    *doc* is an ElementTree document; the first ``<table>`` element whose
    ``id`` attribute is ``summarytable`` is used.

    Returns a list with one dict per accepted row, keyed by
    ``'impressions'``, ``'clicks'``, ``'ctr'``, ``'ecpm'`` and
    ``'earnings'``. Each value is whatever parse_decimal returns for the
    corresponding cell text.

    A row is accepted only when, after empty cells are discarded, exactly
    five cells remain and parse_decimal accepts every one of them; all other
    rows (headers, separators, ...) are silently skipped.

    Raises ValueError if the document contains no ``summarytable``.

    """
    for t in doc.findall('.//table'):
        if t.attrib.get('id') == 'summarytable':
            break
    else:
        raise ValueError("summary table not found")

    res = []
    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    for row in t.findall('.//tr'):
        celltext = []
        for c in row.findall('td'):
            tail = ''
            link = c.find('a')
            # adsense inserts an empty span if a row has a period in it, so
            # take the first child of the link and append its tail text.
            # Explicit None/len checks replace Element truth-testing and
            # getchildren(), both of which are deprecated.
            if link is not None and len(link):
                tail = link[0].tail or ''
            celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))

        # Real lists (not lazy filter/map objects) so that len() works and
        # parse_decimal runs inside the try block below.
        celltext = [text for text in celltext if text != ""]
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = [parse_decimal(text) for text in celltext]
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))

    return res


def get_adsense(login, password):
    """Log in to AdSense and return the twill browser that holds the session.

    Opens SERVICE_LOGIN_BOX_URL, fills *login* and *password* into the first
    form that has both an ``Email`` and a ``Passwd`` field, and submits it.

    The browser is returned right after the submit, so it is not necessarily
    positioned on the page you want data from.

    Raises ValueError if no form on the page accepts both fields.

    """
    browser = twill.commands.get_browser()
    browser.go(SERVICE_LOGIN_BOX_URL)

    login_form = None
    for candidate in browser.get_all_forms():
        try:
            candidate['Email'] = login
            candidate['Passwd'] = password
        except ValueError:
            # This form lacks one of the credential fields; try the next one.
            continue
        login_form = candidate
        break

    if login_form is None:
        raise ValueError("Could not find login form on page")

    # Select by identity so that exactly the form filled in above is submitted.
    browser._browser.select_form(predicate=lambda f: f is login_form)
    browser.submit()
    return browser


def get_time_period(b, period):
    """Fetch the overview report for *period* and return its parsed rows.

    *b* should be the browser returned by get_adsense. *period* is appended
    to OVERVIEW_URL and must be a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``,
    ``'lastmonth'``, ``'sincelastpayment'``.

    Returns the result of parse_summary_table for the fetched page.

    """
    report_url = OVERVIEW_URL + period
    b.go(report_url)
    # Building straight into cElementTree is not reliable enough, so the page
    # is parsed by html5lib first, serialised to XML, and only then re-read
    # by cElementTree.
    parser = HTMLParser()
    html5_tree = parser.parse(b.get_html())
    summary_doc = cElementTree.fromstring(html5_tree.toxml())
    return parse_summary_table(summary_doc)


def main():
    """Command-line entry point: log in and pretty-print every report.

    Expects exactly two command-line arguments, LOGIN and PASSWORD. Fetches
    the summary table for each entry of TIME_PERIODS, pretty-prints the
    result and returns it as a dict mapping period name to parsed rows.

    Raises SystemExit with a usage message when the argument count is wrong.

    """
    try:
        login, password = sys.argv[1:]
    except ValueError:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    # Send twill's own output into a throwaway buffer while scraping.
    twill.set_output(StringIO())
    try:
        twill.commands.reset_browser()
        b = get_adsense(login, password)
        data = {}
        for period in TIME_PERIODS:
            data[period] = get_time_period(b, period)
        pprint.pprint(data)
    finally:
        # Undo the redirection even when login or scraping raises, so a
        # caller that imported main() is not left with silenced twill output.
        twill.set_output(None)
    return data

# Run the scraper only when this file is executed as a script; importing the
# module has no side effects beyond the definitions above.
if __name__ == '__main__':
    data = main()

回复收藏 0 原文

~没有更多了~