Python Scrapy 框架发布错误图像 - 为什么/如何解决这个问题？

发布于 2024-11-19 21:13:20 字数 4580 浏览 2 评论 0原文

我正在使用 Python 的 Scrapy 框架从一个网站抓取多个条目（包括文本和图像），并将它们逐一发布到另一个网站。一切正常，只是发布的图像与对应的文本不匹配（图像被配到了错误的条目上）。我怎么也想不出应该如何改动。

这是代码，如果有人能帮我解决这个问题，我将不胜感激：

from flexmls.items import FlexmlsItem

# Spider that logs in to epropertysites.com and posts the listings stored in
# results/flexmls.csv there, one multi-step form flow per CSV row.
# NOTE(review): the class body has lost its indentation in this paste; the
# attributes and the functions that follow are presumably meant to be members
# of this class (they take `self`).
class Epropertysites(BaseSpider):

name = 'epropertysites'

# The crawl starts at the site's front page, whose form parse() fills in.
start_urls = ['http://www.epropertysites.com/']
# Site root; post_images() prefixes it to the form action found on the page.
URL  = 'http://www.epropertysites.com'

def parse(self, response):
    """Fill in and submit the login form found in the start page.

    The credentials come from the EPROP_USER / EPROP_PASSW settings (empty
    string when unset); the login result is handled by after_login.
    """
    credentials = {
        'i_login': settings.get('EPROP_USER', u''),
        'i_password': settings.get('EPROP_PASSW', u''),
    }
    return FormRequest.from_response(
        response,
        formdata=credentials,
        callback=self.after_login,
    )

def after_login(self, response):
    if 'is incorrect' in response.body:
        print 'Failed to login with\r\n press enter'
        self.log('Login failes', log.ERROR)
        raw_input()
        return
    for row in csv.DictReader(open(os.path.join("results", 'flexmls.csv'))):
        yield Request('http://www.epropertysites.com/myprop_add.htm',
                      meta={'item':row},
                      dont_filter=True,
                   callback=self.post_ad)

def post_ad(self, response):
    item = response.request.meta['item']
    try:
        print 'posting', item['address'].encode()
    except:pass
    formdata={'i_address':item['address'],
        'i_city':item['city'],
        'i_price':item['price'] if item['price'] else u'0',
        'i_state':item['state'].strip(),
        'i_zip':item['zip'].strip(),
        'i_county':item['county'],
       'i_mls':item['id'].strip(),
        'i_type':'1',
        'i_br':item['beds'] if item['beds'] else u'1',
        'i_ba':item['baths'] if item['baths'] else u'1',
        'i_sqft':item['sqft'],
        'i_year_blt':item['year_built'],
        'i_tagline':item['address'],
        'i_desc':item['description'].replace("\n", '\r\n'),
        'i_site_key':item['address'].replace(u" ", u'-').replace(u".", u'').strip(),
        'i_domain':'ePropertySites.com',
        'i_layout':'%.2d' %random.randint(2,5),
        'i_color02':'%.2d' %random.randint(1,12)
        }

    return FormRequest('http://www.epropertysites.com/myprop_add.htm?&f=3',
                       formdata=formdata,
                              meta={'item':item, 'form':formdata},
                       callback=self.post_images)


def encode_multipart_formdata(self, fields, files):
    """Assemble a multipart/form-data body from text fields and file contents.

    fields -- iterable of (name, value) pairs, emitted as plain form fields.
    files  -- iterable of (filename, content) pairs; the filename is used as
              both the field name and the filename, and every part is
              labelled image/jpeg.

    Returns a (content_type, body) tuple; body parts are joined with CRLF.
    """
    boundary = '----------ThIs_Is_tHe_bouNdaRY_$'
    delimiter = '--' + boundary
    parts = []
    for name, text in fields:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"' % name,
            '',
            text,
        ])
    for upload_name, payload in files:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (upload_name, upload_name),
            'Content-Type: image/jpeg',
            '',
            payload,
        ])
    # Closing delimiter, plus an empty entry so the body ends with CRLF.
    parts.extend([delimiter + '--', ''])
    return 'multipart/form-data; boundary=%s' % boundary, '\r\n'.join(parts)

def post_images(self, response):
    """Upload the listing's image files to the form on the returned page.

    Returns None (ending this listing's flow) when the response says the
    website key is already in use.  Otherwise the files named in
    item['images'] are read from the IMAGES_STORE directory, encoded with
    encode_multipart_formdata and POSTed to the first form action on the
    page; the result is handled by get_change_page.
    """
    if 'That Website Key is already being used' in response.body:
        return
    page = HtmlXPathSelector(response)

    item = response.request.meta['item']
    # SECURITY NOTE(review): eval() executes whatever text is stored in the
    # CSV's images column.  If that column only ever holds a list literal,
    # ast.literal_eval would be the safe equivalent -- confirm before changing.
    images = eval(item['images'])
    # NOTE(review): the third field name is a whole header line rather than a
    # bare "mode"; kept as is -- confirm what the server expects.
    fields = [('i_caption_1', ''), ('v_max', '1'),
              ('Content-Disposition: form-data; name="mode"', 'send')]
    images_root = settings.get("IMAGES_STORE")
    files = []
    for image in images:
        # Binary mode keeps the image bytes intact on every platform, and the
        # `with` block closes each handle (the original leaked them).
        with open(os.path.join(images_root, image), 'rb') as image_file:
            files.append((os.path.basename(image), image_file.read()))
    content_type, body = self.encode_multipart_formdata(fields, files)
    return FormRequest(self.URL + page.select("//form/@action").extract()[0],
                       body=body,
                       method='POST',
                       headers={'Content-Type': content_type,
                                'content-length': len(body)},
                       meta={"item": item, 'form': response.request.meta['form']},
                       callback=self.get_change_page)

def get_change_page(self, response):
    """Request the edit page of the listing that was just created.

    The numeric key is pulled out of the form action on the response page and
    forwarded, together with the item and form data, to post_rest_info.
    """
    form_actions = HtmlXPathSelector(response).select("//form/@action")
    ad_id = form_actions.re(r"&key=(\d+)&")[0].strip()
    edit_url = "http://www.epropertysites.com/myproperties.htm?&f=mod&key=%s" % ad_id
    carried = {
        'item': response.request.meta['item'],
        'form': response.request.meta['form'],
        'id': ad_id,
    }
    return Request(edit_url, meta=carried, callback=self.post_rest_info)

原文

I am working with the Scrapy framework for Python to scrape several entries including text and images from one site and post them to another, one by one. It all works well, except that the images are posting with the wrong corresponding text. I can't, for the life of me, figure out what to do differently.

Here is the code, if anyone could please help me figure this out, I would greatly appreciate it:

from flexmls.items import FlexmlsItem

# Spider that logs in to epropertysites.com and posts the listings stored in
# results/flexmls.csv there, one multi-step form flow per CSV row.
# NOTE(review): the class body has lost its indentation in this paste; the
# attributes and the functions that follow are presumably meant to be members
# of this class (they take `self`).
class Epropertysites(BaseSpider):

name = 'epropertysites'

# The crawl starts at the site's front page, whose form parse() fills in.
start_urls = ['http://www.epropertysites.com/']
# Site root; post_images() prefixes it to the form action found on the page.
URL  = 'http://www.epropertysites.com'

def parse(self, response):
    """Fill in and submit the login form found in the start page.

    The credentials come from the EPROP_USER / EPROP_PASSW settings (empty
    string when unset); the login result is handled by after_login.
    """
    credentials = {
        'i_login': settings.get('EPROP_USER', u''),
        'i_password': settings.get('EPROP_PASSW', u''),
    }
    return FormRequest.from_response(
        response,
        formdata=credentials,
        callback=self.after_login,
    )

def after_login(self, response):
    if 'is incorrect' in response.body:
        print 'Failed to login with\r\n press enter'
        self.log('Login failes', log.ERROR)
        raw_input()
        return
    for row in csv.DictReader(open(os.path.join("results", 'flexmls.csv'))):
        yield Request('http://www.epropertysites.com/myprop_add.htm',
                      meta={'item':row},
                      dont_filter=True,
                   callback=self.post_ad)

def post_ad(self, response):
    item = response.request.meta['item']
    try:
        print 'posting', item['address'].encode()
    except:pass
    formdata={'i_address':item['address'],
        'i_city':item['city'],
        'i_price':item['price'] if item['price'] else u'0',
        'i_state':item['state'].strip(),
        'i_zip':item['zip'].strip(),
        'i_county':item['county'],
       'i_mls':item['id'].strip(),
        'i_type':'1',
        'i_br':item['beds'] if item['beds'] else u'1',
        'i_ba':item['baths'] if item['baths'] else u'1',
        'i_sqft':item['sqft'],
        'i_year_blt':item['year_built'],
        'i_tagline':item['address'],
        'i_desc':item['description'].replace("\n", '\r\n'),
        'i_site_key':item['address'].replace(u" ", u'-').replace(u".", u'').strip(),
        'i_domain':'ePropertySites.com',
        'i_layout':'%.2d' %random.randint(2,5),
        'i_color02':'%.2d' %random.randint(1,12)
        }

    return FormRequest('http://www.epropertysites.com/myprop_add.htm?&f=3',
                       formdata=formdata,
                              meta={'item':item, 'form':formdata},
                       callback=self.post_images)


def encode_multipart_formdata(self, fields, files):
    """Assemble a multipart/form-data body from text fields and file contents.

    fields -- iterable of (name, value) pairs, emitted as plain form fields.
    files  -- iterable of (filename, content) pairs; the filename is used as
              both the field name and the filename, and every part is
              labelled image/jpeg.

    Returns a (content_type, body) tuple; body parts are joined with CRLF.
    """
    # Fixed part delimiter; it is not checked against the payload.
    BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
    CRLF = '\r\n'
    L = []
    for (key, value) in fields:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"' % key)
        L.append('')
        L.append(value)
    for (filename, value) in files:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (filename, filename))
        L.append('Content-Type: image/jpeg')
        L.append('')
        L.append(value)
    # Closing delimiter, plus an empty entry so the body ends with CRLF.
    L.append('--' + BOUNDARY + '--')
    L.append('')
    body = CRLF.join(L)
    content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
    return content_type, body

def post_images(self, response):
    """Upload the listing's image files to the form on the returned page.

    Returns None (ending this listing's flow) when the response says the
    website key is already in use.  Otherwise the files named in
    item['images'] are read from the IMAGES_STORE directory, encoded with
    encode_multipart_formdata and POSTed to the first form action on the
    page; the result is handled by get_change_page.
    """
    if 'That Website Key is already being used' in response.body:
        return
    page = HtmlXPathSelector(response)

    item = response.request.meta['item']
    # SECURITY NOTE(review): eval() executes whatever text is stored in the
    # CSV's images column.  If that column only ever holds a list literal,
    # ast.literal_eval would be the safe equivalent -- confirm before changing.
    images = eval(item['images'])
    # NOTE(review): the third field name is a whole header line rather than a
    # bare "mode"; kept as is -- confirm what the server expects.
    fields = [('i_caption_1', ''), ('v_max', '1'),
              ('Content-Disposition: form-data; name="mode"', 'send')]
    images_root = settings.get("IMAGES_STORE")
    files = []
    for image in images:
        # Binary mode keeps the image bytes intact on every platform, and the
        # `with` block closes each handle (the original leaked them).
        with open(os.path.join(images_root, image), 'rb') as image_file:
            files.append((os.path.basename(image), image_file.read()))
    content_type, body = self.encode_multipart_formdata(fields, files)
    return FormRequest(self.URL + page.select("//form/@action").extract()[0],
                       body=body,
                       method='POST',
                       headers={'Content-Type': content_type,
                                'content-length': len(body)},
                       meta={"item": item, 'form': response.request.meta['form']},
                       callback=self.get_change_page)

def get_change_page(self, response):
    """Request the edit page of the listing that was just created.

    The numeric key is pulled out of the form action on the response page and
    forwarded, together with the item and form data, to post_rest_info.
    """
    form_actions = HtmlXPathSelector(response).select("//form/@action")
    ad_id = form_actions.re(r"&key=(\d+)&")[0].strip()
    edit_url = "http://www.epropertysites.com/myproperties.htm?&f=mod&key=%s" % ad_id
    carried = {
        'item': response.request.meta['item'],
        'form': response.request.meta['form'],
        'id': ad_id,
    }
    return Request(edit_url, meta=carried, callback=self.post_rest_info)

分享到QQ

分享到微博