Python Scrapy 框架发布错误图像 - 为什么/如何解决这个问题?
我正在使用 Python 的 Scrapy 框架从一个网站抓取多个条目(包括文本和图像),并将它们逐条发布到另一个网站。一切正常,只是发布的图像与文本对应错了——图像被发布到了错误的条目下。我怎么也想不明白应该做哪些改动。
这是代码,如果有人能帮我解决这个问题,我将不胜感激:
from flexmls.items import FlexmlsItem
class Epropertysites(BaseSpider):
    """Log in to epropertysites.com and post one listing per CSV row.

    Callback chain, one chain per row of ``results/flexmls.csv``:

    ``parse`` (login) -> ``after_login`` (fan out over rows) -> ``post_ad``
    (submit the listing text) -> ``post_images`` (upload the row's images)
    -> ``get_change_page`` (open the edit page) -> ``post_rest_info``.

    ``post_rest_info`` is referenced as a callback but is not part of this
    excerpt.

    NOTE(review): every chain runs inside the same logged-in session and
    Scrapy may interleave the chains of different rows.  If the site tracks
    "the property currently being created" in that session rather than in
    the form action URL, images can end up on another row's listing.  This
    cannot be confirmed from the code here -- check whether the upload form
    action identifies the listing, and consider running with
    ``CONCURRENT_REQUESTS = 1`` (or otherwise serialising the chains).
    """

    name = 'epropertysites'
    start_urls = ['http://www.epropertysites.com/']
    URL = 'http://www.epropertysites.com'

    def parse(self, response):
        """Submit the login form found on the start page."""
        return FormRequest.from_response(
            response,
            formdata={'i_login': settings.get('EPROP_USER', u''),
                      'i_password': settings.get('EPROP_PASSW', u'')},
            callback=self.after_login)

    def after_login(self, response):
        """Yield one "add property" page request per row of the CSV file.

        If the response body contains 'is incorrect' the login is treated
        as failed: the error is logged, the process waits for Enter and no
        requests are produced.
        """
        if 'is incorrect' in response.body:
            # Single-argument print(...) emits the same text under Python 2.
            print('Failed to login with\r\n press enter')
            self.log('Login failes', log.ERROR)
            raw_input()
            return
        # The Python 2 csv module expects a binary-mode file; the ``with``
        # block closes the handle once every row has been consumed.
        with open(os.path.join("results", 'flexmls.csv'), 'rb') as csv_file:
            for row in csv.DictReader(csv_file):
                yield Request('http://www.epropertysites.com/myprop_add.htm',
                              meta={'item': row},
                              # Same URL for every row: bypass the dupe filter.
                              dont_filter=True,
                              callback=self.post_ad)

    def post_ad(self, response):
        """Build the listing form from the CSV row and submit it.

        The row and the submitted form data travel on in ``meta`` so the
        later callbacks can reach them.
        """
        item = response.request.meta['item']
        try:
            print('posting %s' % item['address'].encode())
        except Exception:
            # Progress output is best-effort only; unlike a bare ``except``
            # this no longer swallows KeyboardInterrupt / SystemExit.
            pass
        formdata = {
            'i_address': item['address'],
            'i_city': item['city'],
            'i_price': item['price'] if item['price'] else u'0',
            'i_state': item['state'].strip(),
            'i_zip': item['zip'].strip(),
            'i_county': item['county'],
            'i_mls': item['id'].strip(),
            'i_type': '1',
            'i_br': item['beds'] if item['beds'] else u'1',
            'i_ba': item['baths'] if item['baths'] else u'1',
            'i_sqft': item['sqft'],
            'i_year_blt': item['year_built'],
            'i_tagline': item['address'],
            'i_desc': item['description'].replace("\n", '\r\n'),
            # Site key: the address with spaces -> dashes and dots removed.
            'i_site_key': item['address'].replace(u" ", u'-').replace(u".", u'').strip(),
            'i_domain': 'ePropertySites.com',
            # Layout and colour are picked at random, zero-padded to 2 digits.
            'i_layout': '%.2d' % random.randint(2, 5),
            'i_color02': '%.2d' % random.randint(1, 12),
        }
        return FormRequest('http://www.epropertysites.com/myprop_add.htm?&f=3',
                           formdata=formdata,
                           meta={'item': item, 'form': formdata},
                           callback=self.post_images)

    def encode_multipart_formdata(self, fields, files):
        """Encode fields and files as a multipart/form-data body.

        ``fields`` is an iterable of ``(name, value)`` pairs and ``files``
        an iterable of ``(filename, content)`` pairs.  Each file part uses
        the file name as both the part name and the filename, and is
        labelled ``image/jpeg``.

        Returns ``(content_type, body)``.
        """
        BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
        CRLF = '\r\n'
        L = []
        for (key, value) in fields:
            L.append('--' + BOUNDARY)
            L.append('Content-Disposition: form-data; name="%s"' % key)
            L.append('')
            L.append(value)
        for (filename, value) in files:
            L.append('--' + BOUNDARY)
            L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (filename, filename))
            L.append('Content-Type: image/jpeg')
            L.append('')
            L.append(value)
        L.append('--' + BOUNDARY + '--')
        L.append('')
        body = CRLF.join(L)
        content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
        return content_type, body

    def post_images(self, response):
        """Upload the row's images to the form found on the response page.

        Returns nothing (ending this row's chain) when the page reports
        that the website key is already in use.
        """
        import ast

        if 'That Website Key is already being used' in response.body:
            return
        page = HtmlXPathSelector(response)
        item = response.request.meta['item']
        # The CSV stores the image list as a Python literal.  literal_eval
        # parses it without executing arbitrary code, which eval() would do.
        images = ast.literal_eval(item['images'])
        # NOTE(review): the third field name embeds a whole header line and
        # looks like it was meant to be ('mode', 'send').  Left unchanged
        # because the target site's behaviour cannot be verified from here.
        fields = [('i_caption_1', ''), ('v_max', '1'),
                  ('Content-Disposition: form-data; name="mode"', 'send')]
        image_root = settings.get("IMAGES_STORE")
        files = []
        for image in images:
            # Binary mode keeps the image bytes intact on every platform;
            # the ``with`` block closes each handle straight away.
            with open(os.path.join(image_root, image), 'rb') as handle:
                files.append((os.path.basename(image), handle.read()))
        content_type, body = self.encode_multipart_formdata(fields, files)
        # Raises IndexError when the page has no <form action=...>.
        return FormRequest(self.URL + page.select("//form/@action").extract()[0],
                           body=body,
                           method='POST',
                           headers={'Content-Type': content_type,
                                    'content-length': len(body)},
                           meta={"item": item, 'form': response.request.meta['form']},
                           callback=self.get_change_page)

    def get_change_page(self, response):
        """Read the new listing's key and request its modification page.

        The key is the run of digits after ``&key=`` in the first matching
        form action; IndexError is raised when nothing matches.
        """
        page = HtmlXPathSelector(response)
        ad_id = page.select("//form/@action").re(r"&key=(\d+)&")[0].strip()
        return Request("http://www.epropertysites.com/myproperties.htm?&f=mod&key=%s" % ad_id,
                       meta={'item': response.request.meta['item'],
                             'form': response.request.meta['form'],
                             'id': ad_id},
                       callback=self.post_rest_info)
I am working with the Scrapy framework for Python to scrape several entries including text and images from one site and post them to another, one by one. It all works well, except that the images are posting with the wrong corresponding text. I can't, for the life of me, figure out what to do differently.
Here is the code, if anyone could please help me figure this out, I would greatly appreciate it:
from flexmls.items import FlexmlsItem
class Epropertysites(BaseSpider):
    """Log in to epropertysites.com and post one listing per CSV row.

    Callback chain, one chain per row of ``results/flexmls.csv``:

    ``parse`` (login) -> ``after_login`` (fan out over rows) -> ``post_ad``
    (submit the listing text) -> ``post_images`` (upload the row's images)
    -> ``get_change_page`` (open the edit page) -> ``post_rest_info``.

    ``post_rest_info`` is referenced as a callback but is not part of this
    excerpt.

    NOTE(review): every chain runs inside the same logged-in session and
    Scrapy may interleave the chains of different rows.  If the site tracks
    "the property currently being created" in that session rather than in
    the form action URL, images can end up on another row's listing.  This
    cannot be confirmed from the code here -- check whether the upload form
    action identifies the listing, and consider running with
    ``CONCURRENT_REQUESTS = 1`` (or otherwise serialising the chains).
    """

    name = 'epropertysites'
    start_urls = ['http://www.epropertysites.com/']
    URL = 'http://www.epropertysites.com'

    def parse(self, response):
        """Submit the login form found on the start page."""
        return FormRequest.from_response(
            response,
            formdata={'i_login': settings.get('EPROP_USER', u''),
                      'i_password': settings.get('EPROP_PASSW', u'')},
            callback=self.after_login)

    def after_login(self, response):
        """Yield one "add property" page request per row of the CSV file.

        If the response body contains 'is incorrect' the login is treated
        as failed: the error is logged, the process waits for Enter and no
        requests are produced.
        """
        if 'is incorrect' in response.body:
            # Single-argument print(...) emits the same text under Python 2.
            print('Failed to login with\r\n press enter')
            self.log('Login failes', log.ERROR)
            raw_input()
            return
        # The Python 2 csv module expects a binary-mode file; the ``with``
        # block closes the handle once every row has been consumed.
        with open(os.path.join("results", 'flexmls.csv'), 'rb') as csv_file:
            for row in csv.DictReader(csv_file):
                yield Request('http://www.epropertysites.com/myprop_add.htm',
                              meta={'item': row},
                              # Same URL for every row: bypass the dupe filter.
                              dont_filter=True,
                              callback=self.post_ad)

    def post_ad(self, response):
        """Build the listing form from the CSV row and submit it.

        The row and the submitted form data travel on in ``meta`` so the
        later callbacks can reach them.
        """
        item = response.request.meta['item']
        try:
            print('posting %s' % item['address'].encode())
        except Exception:
            # Progress output is best-effort only; unlike a bare ``except``
            # this no longer swallows KeyboardInterrupt / SystemExit.
            pass
        formdata = {
            'i_address': item['address'],
            'i_city': item['city'],
            'i_price': item['price'] if item['price'] else u'0',
            'i_state': item['state'].strip(),
            'i_zip': item['zip'].strip(),
            'i_county': item['county'],
            'i_mls': item['id'].strip(),
            'i_type': '1',
            'i_br': item['beds'] if item['beds'] else u'1',
            'i_ba': item['baths'] if item['baths'] else u'1',
            'i_sqft': item['sqft'],
            'i_year_blt': item['year_built'],
            'i_tagline': item['address'],
            'i_desc': item['description'].replace("\n", '\r\n'),
            # Site key: the address with spaces -> dashes and dots removed.
            'i_site_key': item['address'].replace(u" ", u'-').replace(u".", u'').strip(),
            'i_domain': 'ePropertySites.com',
            # Layout and colour are picked at random, zero-padded to 2 digits.
            'i_layout': '%.2d' % random.randint(2, 5),
            'i_color02': '%.2d' % random.randint(1, 12),
        }
        return FormRequest('http://www.epropertysites.com/myprop_add.htm?&f=3',
                           formdata=formdata,
                           meta={'item': item, 'form': formdata},
                           callback=self.post_images)

    def encode_multipart_formdata(self, fields, files):
        """Encode fields and files as a multipart/form-data body.

        ``fields`` is an iterable of ``(name, value)`` pairs and ``files``
        an iterable of ``(filename, content)`` pairs.  Each file part uses
        the file name as both the part name and the filename, and is
        labelled ``image/jpeg``.

        Returns ``(content_type, body)``.
        """
        # The pasted literal had lost its closing "$'"; restored here.
        BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
        CRLF = '\r\n'
        L = []
        for (key, value) in fields:
            L.append('--' + BOUNDARY)
            L.append('Content-Disposition: form-data; name="%s"' % key)
            L.append('')
            L.append(value)
        for (filename, value) in files:
            L.append('--' + BOUNDARY)
            L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (filename, filename))
            L.append('Content-Type: image/jpeg')
            L.append('')
            L.append(value)
        L.append('--' + BOUNDARY + '--')
        L.append('')
        body = CRLF.join(L)
        content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
        return content_type, body

    def post_images(self, response):
        """Upload the row's images to the form found on the response page.

        Returns nothing (ending this row's chain) when the page reports
        that the website key is already in use.
        """
        import ast

        if 'That Website Key is already being used' in response.body:
            return
        page = HtmlXPathSelector(response)
        item = response.request.meta['item']
        # The CSV stores the image list as a Python literal.  literal_eval
        # parses it without executing arbitrary code, which eval() would do.
        images = ast.literal_eval(item['images'])
        # NOTE(review): the third field name embeds a whole header line and
        # looks like it was meant to be ('mode', 'send').  Left unchanged
        # because the target site's behaviour cannot be verified from here.
        fields = [('i_caption_1', ''), ('v_max', '1'),
                  ('Content-Disposition: form-data; name="mode"', 'send')]
        image_root = settings.get("IMAGES_STORE")
        files = []
        for image in images:
            # Binary mode keeps the image bytes intact on every platform;
            # the ``with`` block closes each handle straight away.
            with open(os.path.join(image_root, image), 'rb') as handle:
                files.append((os.path.basename(image), handle.read()))
        content_type, body = self.encode_multipart_formdata(fields, files)
        # Raises IndexError when the page has no <form action=...>.
        return FormRequest(self.URL + page.select("//form/@action").extract()[0],
                           body=body,
                           method='POST',
                           headers={'Content-Type': content_type,
                                    'content-length': len(body)},
                           meta={"item": item, 'form': response.request.meta['form']},
                           callback=self.get_change_page)

    def get_change_page(self, response):
        """Read the new listing's key and request its modification page.

        The key is the run of digits after ``&key=`` in the first matching
        form action; IndexError is raised when nothing matches.
        """
        page = HtmlXPathSelector(response)
        ad_id = page.select("//form/@action").re(r"&key=(\d+)&")[0].strip()
        return Request("http://www.epropertysites.com/myproperties.htm?&f=mod&key=%s" % ad_id,
                       meta={'item': response.request.meta['item'],
                             'form': response.request.meta['form'],
                             'id': ad_id},
                       callback=self.post_rest_info)
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论