使用 Python Scrapy 框架发布内容时图像与文本对应错误——原因是什么,如何解决?

发布于 2024-11-19 21:13:20 字数 4580 浏览 2 评论 0原文

我正在使用 Python 的 Scrapy 框架从一个网站抓取多个条目(包括文本和图像),并将它们逐条发布到另一个网站。一切运行正常,只是发布出去的图像与对应的文本配错了。我怎么也想不出应该如何修改。

这是代码,如果有人能帮我解决这个问题,我将不胜感激:

from flexmls.items import FlexmlsItem

# Spider that logs in to epropertysites.com and re-posts listings read from a
# local CSV file (see after_login / post_ad / post_images below).
# NOTE(review): the class body has lost its indentation in this paste; the
# attributes and methods that follow belong inside the class.
class Epropertysites(BaseSpider):

# Identifier string of this spider.
name = 'epropertysites'

# Presumably the first page fetched and handed to parse(), which submits the
# login form found on it -- confirm against the Scrapy version in use.
start_urls = ['http://www.epropertysites.com/']
# Site root; post_images() prefixes it to the upload form's action path.
URL  = 'http://www.epropertysites.com'

def parse(self, response):
    """Fill in and submit the login form found in ``response``.

    The credentials come from the ``EPROP_USER`` / ``EPROP_PASSW`` settings
    (empty string when unset). The login result is handled by
    ``after_login``.
    """
    credentials = {
        'i_login': settings.get('EPROP_USER', u''),
        'i_password': settings.get('EPROP_PASSW', u''),
    }
    return FormRequest.from_response(
        response,
        formdata=credentials,
        callback=self.after_login,
    )

def after_login(self, response):
    if 'is incorrect' in response.body:
        print 'Failed to login with\r\n press enter'
        self.log('Login failes', log.ERROR)
        raw_input()
        return
    for row in csv.DictReader(open(os.path.join("results", 'flexmls.csv'))):
        yield Request('http://www.epropertysites.com/myprop_add.htm',
                      meta={'item':row},
                      dont_filter=True,
                   callback=self.post_ad)

def post_ad(self, response):
    item = response.request.meta['item']
    try:
        print 'posting', item['address'].encode()
    except:pass
    formdata={'i_address':item['address'],
        'i_city':item['city'],
        'i_price':item['price'] if item['price'] else u'0',
        'i_state':item['state'].strip(),
        'i_zip':item['zip'].strip(),
        'i_county':item['county'],
       'i_mls':item['id'].strip(),
        'i_type':'1',
        'i_br':item['beds'] if item['beds'] else u'1',
        'i_ba':item['baths'] if item['baths'] else u'1',
        'i_sqft':item['sqft'],
        'i_year_blt':item['year_built'],
        'i_tagline':item['address'],
        'i_desc':item['description'].replace("\n", '\r\n'),
        'i_site_key':item['address'].replace(u" ", u'-').replace(u".", u'').strip(),
        'i_domain':'ePropertySites.com',
        'i_layout':'%.2d' %random.randint(2,5),
        'i_color02':'%.2d' %random.randint(1,12)
        }

    return FormRequest('http://www.epropertysites.com/myprop_add.htm?&f=3',
                       formdata=formdata,
                              meta={'item':item, 'form':formdata},
                       callback=self.post_images)


def encode_multipart_formdata(self, fields, files):
    """Build a multipart/form-data payload by hand.

    ``fields`` is an iterable of ``(name, value)`` pairs for plain form
    fields. ``files`` is an iterable of ``(filename, content)`` pairs; each
    file part uses the filename as both its ``name`` and ``filename`` and is
    labelled ``image/jpeg``.

    Returns a ``(content_type, body)`` tuple, where ``content_type`` is the
    header value announcing the boundary and ``body`` is the CRLF-joined
    payload.
    """
    boundary = '----------ThIs_Is_tHe_bouNdaRY_$'
    delimiter = '--' + boundary
    parts = []
    for field_name, field_value in fields:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"' % field_name,
            '',
            field_value,
        ])
    for file_name, payload in files:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (file_name, file_name),
            'Content-Type: image/jpeg',
            '',
            payload,
        ])
    # Closing delimiter, followed by an empty element so the body ends in CRLF.
    parts.extend([delimiter + '--', ''])
    body = '\r\n'.join(parts)
    return 'multipart/form-data; boundary=%s' % boundary, body

def post_images(self, response):
    """Upload the listing's images as a hand-built multipart POST.

    Returns nothing when the site reports that the website key is already
    taken. Otherwise the image paths listed in the CSV row's ``images``
    column are read from ``IMAGES_STORE`` and posted to the action URL of
    the form found in ``response``; the result is handled by
    ``get_change_page``.
    """
    if 'That Website Key is already being used' in response.body:
        return
    page = HtmlXPathSelector(response)

    item = response.request.meta['item']
    # SECURITY: eval() executes whatever the CSV 'images' column contains.
    # If that file can hold untrusted data, parse the value with
    # ast.literal_eval instead.
    images = eval(item['images'])
    # NOTE(review): encode_multipart_formdata uses the first tuple element as
    # the field name, so the third entry nests a second Content-Disposition
    # header inside name="..."; ('mode', 'send') may be what was intended --
    # confirm against the site before changing it.
    fields = [('i_caption_1', ''), ('v_max', '1'),
              ('Content-Disposition: form-data; name="mode"', 'send')]
    image_root = settings.get("IMAGES_STORE")
    files = []
    for image in images:
        # Binary mode keeps the image bytes intact on every platform, and
        # the context manager closes each handle right after reading.
        with open(os.path.join(image_root, image), 'rb') as image_file:
            files.append((os.path.basename(image), image_file.read()))
    content_type, body = self.encode_multipart_formdata(fields, files)
    return FormRequest(self.URL + page.select("//form/@action").extract()[0],
                       body=body,
                       method='POST',
                       headers={'Content-Type': content_type,
                                'content-length': len(body)},
                       meta={"item": item, 'form': response.request.meta['form']},
                       callback=self.get_change_page)

def get_change_page(self, response):
    """Request the "modify" page of the listing that was just created.

    The listing key is the first ``&key=<digits>&`` match in the form action
    of ``response``. The item, the form data and the key are passed on in the
    request meta; the result is handled by ``post_rest_info``.
    """
    selector = HtmlXPathSelector(response)
    key_matches = selector.select("//form/@action").re(r"&key=(\d+)&")
    ad_id = key_matches[0].strip()
    modify_url = "http://www.epropertysites.com/myproperties.htm?&f=mod&key=%s" % ad_id
    carried_meta = {
        'item': response.request.meta['item'],
        'form': response.request.meta['form'],
        'id': ad_id,
    }
    return Request(modify_url, meta=carried_meta, callback=self.post_rest_info)

I am working with the Scrapy framework for Python to scrape several entries including text and images from one site and post them to another, one by one. It all works well, except that the images are posting with the wrong corresponding text. I can't, for the life of me, figure out what to do differently.

Here is the code, if anyone could please help me figure this out, I would greatly appreciate it:

from flexmls.items import FlexmlsItem

# Spider that logs in to epropertysites.com and re-posts listings read from a
# local CSV file (see after_login / post_ad / post_images below).
# NOTE(review): the class body has lost its indentation in this paste; the
# attributes and methods that follow belong inside the class.
class Epropertysites(BaseSpider):

# Identifier string of this spider.
name = 'epropertysites'

# Presumably the first page fetched and handed to parse(), which submits the
# login form found on it -- confirm against the Scrapy version in use.
start_urls = ['http://www.epropertysites.com/']
# Site root; post_images() prefixes it to the upload form's action path.
URL  = 'http://www.epropertysites.com'

def parse(self, response):
    """Fill in and submit the login form found in ``response``.

    The credentials come from the ``EPROP_USER`` / ``EPROP_PASSW`` settings
    (empty string when unset). The login result is handled by
    ``after_login``.
    """
    credentials = {
        'i_login': settings.get('EPROP_USER', u''),
        'i_password': settings.get('EPROP_PASSW', u''),
    }
    return FormRequest.from_response(
        response,
        formdata=credentials,
        callback=self.after_login,
    )

def after_login(self, response):
    if 'is incorrect' in response.body:
        print 'Failed to login with\r\n press enter'
        self.log('Login failes', log.ERROR)
        raw_input()
        return
    for row in csv.DictReader(open(os.path.join("results", 'flexmls.csv'))):
        yield Request('http://www.epropertysites.com/myprop_add.htm',
                      meta={'item':row},
                      dont_filter=True,
                   callback=self.post_ad)

def post_ad(self, response):
    item = response.request.meta['item']
    try:
        print 'posting', item['address'].encode()
    except:pass
    formdata={'i_address':item['address'],
        'i_city':item['city'],
        'i_price':item['price'] if item['price'] else u'0',
        'i_state':item['state'].strip(),
        'i_zip':item['zip'].strip(),
        'i_county':item['county'],
       'i_mls':item['id'].strip(),
        'i_type':'1',
        'i_br':item['beds'] if item['beds'] else u'1',
        'i_ba':item['baths'] if item['baths'] else u'1',
        'i_sqft':item['sqft'],
        'i_year_blt':item['year_built'],
        'i_tagline':item['address'],
        'i_desc':item['description'].replace("\n", '\r\n'),
        'i_site_key':item['address'].replace(u" ", u'-').replace(u".", u'').strip(),
        'i_domain':'ePropertySites.com',
        'i_layout':'%.2d' %random.randint(2,5),
        'i_color02':'%.2d' %random.randint(1,12)
        }

    return FormRequest('http://www.epropertysites.com/myprop_add.htm?&f=3',
                       formdata=formdata,
                              meta={'item':item, 'form':formdata},
                       callback=self.post_images)


def encode_multipart_formdata(self, fields, files):
    """Build a multipart/form-data payload by hand.

    ``fields`` is an iterable of ``(name, value)`` pairs for plain form
    fields. ``files`` is an iterable of ``(filename, content)`` pairs; each
    file part uses the filename as both its ``name`` and ``filename`` and is
    labelled ``image/jpeg``.

    Returns a ``(content_type, body)`` tuple, where ``content_type`` is the
    header value announcing the boundary and ``body`` is the CRLF-joined
    payload.
    """
    # The closing "$'" of this literal was lost in the paste, which left an
    # unterminated string; it is restored here.
    BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
    CRLF = '\r\n'
    L = []
    for (key, value) in fields:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"' % key)
        L.append('')  # blank line separates the part headers from the value
        L.append(value)
    for (filename, value) in files:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (filename, filename))
        L.append('Content-Type: image/jpeg')
        L.append('')
        L.append(value)
    # Closing delimiter, followed by an empty element so the body ends in CRLF.
    L.append('--' + BOUNDARY + '--')
    L.append('')
    body = CRLF.join(L)
    content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
    return content_type, body

def post_images(self, response):
    """Upload the listing's images as a hand-built multipart POST.

    Returns nothing when the site reports that the website key is already
    taken. Otherwise the image paths listed in the CSV row's ``images``
    column are read from ``IMAGES_STORE`` and posted to the action URL of
    the form found in ``response``; the result is handled by
    ``get_change_page``.
    """
    if 'That Website Key is already being used' in response.body:
        return
    page = HtmlXPathSelector(response)

    item = response.request.meta['item']
    # SECURITY: eval() executes whatever the CSV 'images' column contains.
    # If that file can hold untrusted data, parse the value with
    # ast.literal_eval instead.
    images = eval(item['images'])
    # NOTE(review): encode_multipart_formdata uses the first tuple element as
    # the field name, so the third entry nests a second Content-Disposition
    # header inside name="..."; ('mode', 'send') may be what was intended --
    # confirm against the site before changing it.
    fields = [('i_caption_1', ''), ('v_max', '1'),
              ('Content-Disposition: form-data; name="mode"', 'send')]
    image_root = settings.get("IMAGES_STORE")
    files = []
    for image in images:
        # Binary mode keeps the image bytes intact on every platform, and
        # the context manager closes each handle right after reading.
        with open(os.path.join(image_root, image), 'rb') as image_file:
            files.append((os.path.basename(image), image_file.read()))
    content_type, body = self.encode_multipart_formdata(fields, files)
    return FormRequest(self.URL + page.select("//form/@action").extract()[0],
                       body=body,
                       method='POST',
                       headers={'Content-Type': content_type,
                                'content-length': len(body)},
                       meta={"item": item, 'form': response.request.meta['form']},
                       callback=self.get_change_page)

def get_change_page(self, response):
    """Request the "modify" page of the listing that was just created.

    The listing key is the first ``&key=<digits>&`` match in the form action
    of ``response``. The item, the form data and the key are passed on in the
    request meta; the result is handled by ``post_rest_info``.
    """
    selector = HtmlXPathSelector(response)
    key_matches = selector.select("//form/@action").re(r"&key=(\d+)&")
    ad_id = key_matches[0].strip()
    modify_url = "http://www.epropertysites.com/myproperties.htm?&f=mod&key=%s" % ad_id
    carried_meta = {
        'item': response.request.meta['item'],
        'form': response.request.meta['form'],
        'id': ad_id,
    }
    return Request(modify_url, meta=carried_meta, callback=self.post_rest_info)

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文