父类中的 __getattr__ 导致子类 __init__ 递归错误
按照“子类化 BeautifulSoup HTML 解析器,出现类型错误”这一问题的回答中的建议,我尝试使用类组合而不是子类化 BeautifulSoup。
基本的 Scraper 类本身就可以很好地工作(至少在我有限的测试中)。
Scraper 类:
from BeautifulSoup import BeautifulSoup
import urllib2
class Scrape():
    """Base class meant to be subclassed.

    A thin wrapper that provides basic URL fetching with urllib2 and basic
    HTML parsing with BeautifulSoup.
    Some useful methods are provided through class composition with
    BeautifulSoup: references to them are copied onto the instance in
    __init__.
    For direct access to the soup object use the _soup attribute."""
    # NOTE(review): indentation was reconstructed from a flattened listing.
    def __init__(self,file):
        """Fetch `file` with urllib2.urlopen and parse the response.

        file -- value handed unchanged to urllib2.urlopen.
        """
        self._file = file
        #very basic input validation
        #import re
        #import urllib2
        #from BeautifulSoup import BeautifulSoup
        try:
            self._page = urllib2.urlopen(self._file) #fetching the page
        except (urllib2.URLError):
            # NOTE(review): the error is only reported, not handled; when
            # urlopen raises, self._page is never assigned and execution
            # continues with the next statement.
            print ('please enter a valid url starting with http/https/ftp/file')
        # NOTE(review): if self._page was not assigned above, reading it here
        # falls through to __getattr__, which reads self._soup (also not yet
        # assigned) and therefore calls __getattr__ again without bound.
        self._soup = BeautifulSoup(self._page) #calling the html parser
        #BeautifulSoup.__init__(self,self._page)
        # the next part is the class composition part - attribute and method
        # calls are redirected to the BeautifulSoup object
        #search functions:
        self.find = self._soup.find
        self.findAll = self._soup.findAll
        self.__iter__ = self._soup.__iter__ #enables iterating, looping over the object
        self.__len__ = self._soup.__len__
        self.__contains__ = self._soup.__contains__
        #attribute fetching and setting - __getattr__ is implemented by this class
        self.__setattr__ = self._soup.__setattr__
        self.__getattribute__ = self._soup.__getattribute__
        #Called to implement evaluation of self[key]
        self.__getitem__ = self._soup.__getitem__
        self.__setitem__ = self._soup.__setitem__
        self.__delitem__ = self._soup.__delitem__
        self.__call__ = self._soup.__call__#Called when the instance is "called" as a function
        self._getAttrMap = self._soup._getAttrMap
        self.has_key = self._soup.has_key
        #methods for walking the html document
        self.contents = self._soup.contents
        self.text = self._soup.text
        self.extract = self._soup.extract
        self.next = self._soup.next
        self.parent = self._soup.parent
        self.fetch = self._soup.fetch
        self.fetchText = self._soup.fetchText
        self.findAllNext = self._soup.findAllNext
        self.findChild = self._soup.findChild
        self.findChildren = self._soup.findChildren
        self.findNext = self._soup.findNext
        self.findNextSibling = self._soup.findNextSibling
        self.first = self._soup.first
        self.name = self._soup.name
        self.get = self._soup.get
        self.getString = self._soup.getString
        # comparison operators or similar boolean checks
        self.__eq__ = self._soup.__eq__
        self.__ne__ = self._soup.__ne__
        self.__hash__ = self._soup.__hash__
        # NOTE(review): the target is spelled __nonezero__ while the source
        # attribute is __nonzero__ - looks like a typo; confirm.
        self.__nonezero__ = self._soup.__nonzero__ #not sure
        # the class representation magic methods:
        self.__str__ = self._soup.__str__
        self.__repr__ =self._soup.__repr__
        #self.__dict__ = self._soup.__dict__
    def __getattr__(self,method):
        """Redirect lookups of unknown attributes to self._soup.find.

        This enables traversing the html document with dot notation.
        For example, instancename.div returns self._soup.find('div').
        Explanation: Python calls __getattr__ only when it did not find any
        method or attribute corresponding to the name.
        The original author was not sure this is a good or the right use of
        the method."""
        # NOTE(review): self._soup is itself an attribute lookup; if _soup has
        # not been assigned yet, this line re-enters __getattr__ and recurses.
        return self._soup.find(method)
    def clean(self,work=False,element=False):
        """Remove unwanted elements (head, style, script by default).

        work    -- soup object to clean; stored on self._work, falling back
                   to self._soup when falsy.
        element -- optional tuple of element names that overrides which
                   elements are removed."""
        self._work = work or self._soup
        self._cleanelements=element or ("head","style","script")
        # NOTE(review): the search below goes through self.findAll, not
        # self._work; the self._work variant is the commented-out line.
        #for elem in self._work.findAll(self._cleanelements):
        for elem in self.findAll(self._cleanelements):
            elem.extract()
但是当我对它进行子类化时,我遇到了某种递归循环,我就是想不明白原因。
这是子类(相关部分):
class MainTraffic(Scrape):
    """Traffic class - subclasses the Scrape class.

    Takes a page url and a category."""
    def __init__(self, file, cat, caller = False):
        """Fetch and parse `file` through Scrape.__init__, then post-process.

        file   -- url handed on to Scrape.__init__.
        cat    -- category, stored as self.pagecat.
        caller -- when truthy, the initialisation below is skipped.
        """
        # NOTE(review): indentation was reconstructed from a flattened
        # listing; how far the `if not caller:` body extends is assumed -
        # TODO confirm.
        if not caller:
            self._file = file
            #import urllib2
            #self._request = urllib2.Request(self._file)# request to post the show all questions
            Scrape.__init__(self,self._file)
            self.pagecat = cat
            self.clean(self)
            # NOTE(review): cleantotable is shown below taking only `self`,
            # yet it is called here with an extra argument; the listing is
            # abbreviated, so the real signature may differ - confirm.
            self.cleansoup = self.cleantotable(self)
            self.fetchlinks(self.cleansoup)
            #self.populatequestiondic()
            #del (self.cleansoup)
    def cleantotable(self):
        # body omitted in this listing
        pass
    def fetchlinks(self,fetch):
        # body omitted in this listing
        pass
    def length(self):
        """Return the result of a count query over Question.id as an int."""
        from sqlalchemy import func
        # `session` and `Question` are not defined in this listing -
        # presumably module-level SQLAlchemy objects; TODO confirm.
        self.len = session.query(func.count(Question.id)).scalar()
        return int(self.len)
    def __len__(self):
        """Return self.length()."""
        return self.length()
    def __repr__(self):
        """Return a description string that includes self.length()."""
        self.repr = "traffic theory question, current number of questions:{0}".format(self.length())
        return self.repr
    def __getitem__(self,key):
        """Return the first Question whose question_num equals `key`.

        If IndexError or KeyError is raised, a message is printed and the
        method falls through without an explicit return value."""
        try:
            self._item = session.query(Question).filter_by(question_num=key).first()
            return self._item
        except (IndexError, KeyError):
            print "no such key:{0}".format(key)
这是错误消息:
File "C:\Python27\learn\traffic.py", line 117, in __init__
Scrape.__init__(self,self._file)
File "C:\Python27\learn\traffic.py", line 26, in __init__
self._soup = BeautifulSoup(self._page) #calling the html parser
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
RuntimeError: maximum recursion depth exceeded
我怀疑问题出在我滥用了 __getattr__ 上,但我不知道应该更改什么。
Following the advice in the answer to "Subclassing BeautifulSoup HTML parser, getting type error", I'm trying to use class composition instead of subclassing BeautifulSoup.
The basic Scraper class works fine on its own (at least in my limited testing).
The Scraper class:
from BeautifulSoup import BeautifulSoup
import urllib2
class Scrape():
    """Base class meant to be subclassed.

    A thin wrapper that provides basic URL fetching with urllib2 and basic
    HTML parsing with BeautifulSoup.
    Some useful methods are provided through class composition with
    BeautifulSoup: references to them are copied onto the instance in
    __init__.
    For direct access to the soup object use the _soup attribute."""
    # NOTE(review): indentation was reconstructed from a flattened listing.
    def __init__(self,file):
        """Fetch `file` with urllib2.urlopen and parse the response.

        file -- value handed unchanged to urllib2.urlopen.
        """
        self._file = file
        #very basic input validation
        #import re
        #import urllib2
        #from BeautifulSoup import BeautifulSoup
        try:
            self._page = urllib2.urlopen(self._file) #fetching the page
        except (urllib2.URLError):
            # NOTE(review): the error is only reported, not handled; when
            # urlopen raises, self._page is never assigned and execution
            # continues with the next statement.
            print ('please enter a valid url starting with http/https/ftp/file')
        # NOTE(review): if self._page was not assigned above, reading it here
        # falls through to __getattr__, which reads self._soup (also not yet
        # assigned) and therefore calls __getattr__ again without bound.
        self._soup = BeautifulSoup(self._page) #calling the html parser
        #BeautifulSoup.__init__(self,self._page)
        # the next part is the class composition part - attribute and method
        # calls are redirected to the BeautifulSoup object
        #search functions:
        self.find = self._soup.find
        self.findAll = self._soup.findAll
        self.__iter__ = self._soup.__iter__ #enables iterating, looping over the object
        self.__len__ = self._soup.__len__
        self.__contains__ = self._soup.__contains__
        #attribute fetching and setting - __getattr__ is implemented by this class
        self.__setattr__ = self._soup.__setattr__
        self.__getattribute__ = self._soup.__getattribute__
        #Called to implement evaluation of self[key]
        self.__getitem__ = self._soup.__getitem__
        self.__setitem__ = self._soup.__setitem__
        self.__delitem__ = self._soup.__delitem__
        self.__call__ = self._soup.__call__#Called when the instance is "called" as a function
        self._getAttrMap = self._soup._getAttrMap
        self.has_key = self._soup.has_key
        #methods for walking the html document
        self.contents = self._soup.contents
        self.text = self._soup.text
        self.extract = self._soup.extract
        self.next = self._soup.next
        self.parent = self._soup.parent
        self.fetch = self._soup.fetch
        self.fetchText = self._soup.fetchText
        self.findAllNext = self._soup.findAllNext
        self.findChild = self._soup.findChild
        self.findChildren = self._soup.findChildren
        self.findNext = self._soup.findNext
        self.findNextSibling = self._soup.findNextSibling
        self.first = self._soup.first
        self.name = self._soup.name
        self.get = self._soup.get
        self.getString = self._soup.getString
        # comparison operators or similar boolean checks
        self.__eq__ = self._soup.__eq__
        self.__ne__ = self._soup.__ne__
        self.__hash__ = self._soup.__hash__
        # NOTE(review): the target is spelled __nonezero__ while the source
        # attribute is __nonzero__ - looks like a typo; confirm.
        self.__nonezero__ = self._soup.__nonzero__ #not sure
        # the class representation magic methods:
        self.__str__ = self._soup.__str__
        self.__repr__ =self._soup.__repr__
        #self.__dict__ = self._soup.__dict__
    def __getattr__(self,method):
        """Redirect lookups of unknown attributes to self._soup.find.

        This enables traversing the html document with dot notation.
        For example, instancename.div returns self._soup.find('div').
        Explanation: Python calls __getattr__ only when it did not find any
        method or attribute corresponding to the name.
        The original author was not sure this is a good or the right use of
        the method."""
        # NOTE(review): self._soup is itself an attribute lookup; if _soup has
        # not been assigned yet, this line re-enters __getattr__ and recurses.
        return self._soup.find(method)
    def clean(self,work=False,element=False):
        """Remove unwanted elements (head, style, script by default).

        work    -- soup object to clean; stored on self._work, falling back
                   to self._soup when falsy.
        element -- optional tuple of element names that overrides which
                   elements are removed."""
        self._work = work or self._soup
        self._cleanelements=element or ("head","style","script")
        # NOTE(review): the search below goes through self.findAll, not
        # self._work; the self._work variant is the commented-out line.
        #for elem in self._work.findAll(self._cleanelements):
        for elem in self.findAll(self._cleanelements):
            elem.extract()
But when I subclass it I get some sort of recursion loop, and I just can't figure it out.
Here is the subclass (the relevant parts):
class MainTraffic(Scrape):
    """Traffic class - subclasses the Scrape class.

    Takes a page url and a category."""
    def __init__(self, file, cat, caller = False):
        """Fetch and parse `file` through Scrape.__init__, then post-process.

        file   -- url handed on to Scrape.__init__.
        cat    -- category, stored as self.pagecat.
        caller -- when truthy, the initialisation below is skipped.
        """
        # NOTE(review): indentation was reconstructed from a flattened
        # listing; how far the `if not caller:` body extends is assumed -
        # TODO confirm.
        if not caller:
            self._file = file
            #import urllib2
            #self._request = urllib2.Request(self._file)# request to post the show all questions
            Scrape.__init__(self,self._file)
            self.pagecat = cat
            self.clean(self)
            # NOTE(review): cleantotable is shown below taking only `self`,
            # yet it is called here with an extra argument; the listing is
            # abbreviated, so the real signature may differ - confirm.
            self.cleansoup = self.cleantotable(self)
            self.fetchlinks(self.cleansoup)
            #self.populatequestiondic()
            #del (self.cleansoup)
    def cleantotable(self):
        # body omitted in this listing
        pass
    def fetchlinks(self,fetch):
        # body omitted in this listing
        pass
    def length(self):
        """Return the result of a count query over Question.id as an int."""
        from sqlalchemy import func
        # `session` and `Question` are not defined in this listing -
        # presumably module-level SQLAlchemy objects; TODO confirm.
        self.len = session.query(func.count(Question.id)).scalar()
        return int(self.len)
    def __len__(self):
        """Return self.length()."""
        return self.length()
    def __repr__(self):
        """Return a description string that includes self.length()."""
        self.repr = "traffic theory question, current number of questions:{0}".format(self.length())
        return self.repr
    def __getitem__(self,key):
        """Return the first Question whose question_num equals `key`.

        If IndexError or KeyError is raised, a message is printed and the
        method falls through without an explicit return value."""
        try:
            self._item = session.query(Question).filter_by(question_num=key).first()
            return self._item
        except (IndexError, KeyError):
            print "no such key:{0}".format(key)
and here is the error message:
File "C:\Python27\learn\traffic.py", line 117, in __init__
Scrape.__init__(self,self._file)
File "C:\Python27\learn\traffic.py", line 26, in __init__
self._soup = BeautifulSoup(self._page) #calling the html parser
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
File "C:\Python27\learn\traffic.py", line 92, in __getattr__
return self._soup.find(method)
RuntimeError: maximum recursion depth exceeded
I suspect the problem is that I'm misusing __getattr__, but I couldn't figure out what I should change.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
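第 1 部分
您的代码不起作用,因为 __getattr__() 在 self._soup 初始化之前就访问了它。这是由四行看似无害的代码造成的:

    try:
        self._page = urllib2.urlopen(self._file)
    except (urllib2.URLError):
        print ('please enter a valid url starting with http/https/ftp/file')

为什么捕获了异常却不真正处理它?
下一行访问 self._page,而如果 urlopen() 抛出了异常,该属性尚未被设置:

    self._soup = BeautifulSoup(self._page)

由于它尚未设置,访问它会调用 __getattr__(),后者又访问 self._soup,而 self._soup 同样尚未设置,于是再次调用 __getattr__(),如此无限递归下去。
最简单的“修复”是对 _soup 进行特殊处理以防止无限递归。此外,让 __getattr__ 直接在 soup 上做普通的属性查找似乎更合理:

    def __getattr__(self, attr):
        if attr == "_soup":
            raise AttributeError(attr)
        return getattr(self._soup, attr)

第 2 部分
把所有方法逐个复制过来不太可能很好地工作,而且似乎完全偏离了类组合的初衷。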
Part 1
Your code doesn't work because __getattr__() accesses self._soup before it has been initialized. This happens due to four innocuous-looking lines:

    try:
        self._page = urllib2.urlopen(self._file)
    except (urllib2.URLError):
        print ('please enter a valid url starting with http/https/ftp/file')

Why do you catch the exception and not actually handle it?
The next line accesses self._page, which has not been set yet if urlopen() threw an exception:

    self._soup = BeautifulSoup(self._page)

Since it hasn't been set, accessing it calls __getattr__(), which accesses self._soup, which has not been set yet either, so it calls __getattr__() again, and so on until the recursion limit is reached.
The easiest "fix" is to special-case _soup to prevent infinite recursion. Additionally, it seems to make more sense for __getattr__ to simply do normal attribute lookup on the soup:

    def __getattr__(self, attr):
        if attr == "_soup":
            raise AttributeError(attr)
        return getattr(self._soup, attr)

Part 2
Copying all the methods over is unlikely to work very well, and seems to miss the point of class composition entirely.