当前位置：文江博客话题详情

Python 网页爬虫 Scrapy Django yield

使用 Scrapy 爬取汽车信息数据

发布于 2022-09-02 09:24:17 字数 6217 浏览 19 评论 0

# -*- coding: utf-8 -*-
import scrapy
class CarinfosSpider(scrapy.Spider):
    """Crawl car brands, series and model specifications from xgo.com.cn.

    Crawl flow (one callback per page level):

    1. ``parse``          -- brand index page -> one request per brand.
    2. ``parse_brand``    -- brand page -> one request per series
       (``<series url>items.html``).
    3. ``parse_type``     -- series page -> one request per model link.
    4. ``parse_car_link`` -- model page -> request for the page behind its
       4th navigation link.
    5. ``parse_car``      -- specification page -> one dict item per model.

    ``parse_cars`` remains available as a stand-alone callback that performs
    the model-link extraction of step 3 using IDs taken from
    ``response.meta``.

    Brand, series and car IDs are sequential integers handed out by the
    spider instance in the order the corresponding pages are parsed.
    (Counters used to be copied by value through ``Request.meta``, which
    made every brand receive ID 0.)
    """

    name = 'carinfos'
    start_urls = (
        'http://www.xgo.com.cn/brand.html',
    )

    # Text of every value cell in the specification table.
    _SPEC_CELLS_XPATH = '//div[@id="peizhi"]//td[@class="bor-l"]/text()'

    # (item key, position of the cell in the specification table).
    # NOTE(review): the positions are tied to the page layout at the time
    # the spider was written; verify against the live page.
    _SPEC_FIELDS = (
        ('where', 0),
        ('level', 1),
        ('year', 2),
        ('displacement', 3),
        ('maximumSpeed', 4),
        ('officialAcceleration', 5),
        ('ministryOfIntegratedFuelConsumption', 6),
        ('vehicleQuality', 7),
        ('longHighWith', 9),
        ('bodyStructure1', 15),
        ('doorNum', 17),
        ('seatNum', 18),
        ('mailVolume', 19),
        ('model', 22),
        ('intakeForm', 24),
        ('fuelForm', 38),
        ('fuel', 39),
        ('fuleWay', 40),
        ('environmentalProtection', 43),
        ('powerType', 44),
        ('gearbox', 50),
        ('drivingMethod', 53),
        ('bodyStructure2', 59),
        ('frontBrakeType', 61),
        ('brakeType', 62),
        ('parkingBrakeType', 63),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and its per-run ID counters."""
        super(CarinfosSpider, self).__init__(*args, **kwargs)
        # Next free ID for each entity kind; see _allocate_id().
        self._next_ids = {'brand': 0, 'type': 0, 'car': 0}

    def _allocate_id(self, kind):
        """Return the next sequential ID for *kind* ('brand'/'type'/'car')."""
        allocated = self._next_ids[kind]
        self._next_ids[kind] = allocated + 1
        return allocated

    def _model_requests(self, response, brand_id, brand_name, type_id, type_name):
        """Yield one request per model link found on a series page."""
        meta = {
            'brand_id': brand_id,
            'brand_name': brand_name,
            'type_id': type_id,
            'type_name': type_name,
        }
        links = response.css(
            '#theanchor .car_banner_r ul li p a::attr(href)').extract()
        for link in links:
            # urljoin() leaves absolute URLs untouched and resolves relative
            # ones, which Request() would otherwise reject.
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse_car_link,
                meta=dict(meta),
            )

    def parse(self, response):
        """Parse the brand index and request every brand page.

        Follows links such as http://www.xgo.com.cn/brand/abt/ and hands
        them to ``parse_brand``.
        """
        links = response.xpath('//div[@class="l"]/a[1]/@href').extract()
        for link in links:
            yield scrapy.Request(
                response.urljoin(link), callback=self.parse_brand)

    def parse_brand(self, response):
        """Parse a brand page and request every series below it.

        Series links such as http://www.xgo.com.cn/4990/ are turned into
        http://www.xgo.com.cn/4990/items.html before being requested.
        """
        brand_id = self._allocate_id('brand')
        brand_name = response.css('.brand_logo+h1::text').extract_first()
        # TODO: brand_img is extracted but not persisted anywhere yet.
        brand_img = response.css('.brand_logo img::attr(src)').extract()

        links = response.css('.car-list p a::attr(href)').extract()
        self.logger.debug('series links for brand %s: %s', brand_id, links)
        for link in links:
            # Assumes the series href ends with '/'; verify against the page.
            full_url = response.urljoin(link) + 'items.html'
            yield scrapy.Request(
                full_url,
                callback=self.parse_type,
                meta={'brand_id': brand_id, 'brand_name': brand_name},
            )

    def parse_type(self, response):
        """Parse a series page and request every model listed on it.

        Model links look like
        http://product.xgo.com.cn/other/index190852.shtml.
        """
        brand_id = response.meta['brand_id']
        brand_name = response.meta['brand_name']
        type_id = self._allocate_id('type')
        type_name = response.css('.car_banner_l .num::text').extract_first()

        # NOTE(review): the previous code requested an undefined `full_url`
        # here and forgot to forward `type_name`. The model links are assumed
        # to live on this same page, so they are extracted directly instead
        # of issuing another request -- confirm against the live site.
        for request in self._model_requests(
                response, brand_id, brand_name, type_id, type_name):
            yield request

    def parse_cars(self, response):
        """Request every model on a series page using IDs from ``meta``.

        Expects ``brand_id``, ``brand_name``, ``type_id`` and ``type_name``
        in ``response.meta``.
        """
        meta = response.meta
        for request in self._model_requests(
                response,
                meta['brand_id'],
                meta['brand_name'],
                meta['type_id'],
                meta['type_name']):
            yield request

    def parse_car_link(self, response):
        """Follow a model page to the page behind its 4th navigation link.

        The target is handled by ``parse_car``; pages such as
        http://product.xgo.com.cn/191/190852/param.shtml are presumably
        what that link points to -- verify against the live site.
        """
        nav_links = response.css('.cxk-navbox ul li a::attr(href)').extract()
        if len(nav_links) < 4:
            # Layout differs from what the spider expects; skip the model
            # instead of aborting the callback with an IndexError.
            self.logger.warning(
                'expected at least 4 navigation links on %s, found %d',
                response.url, len(nav_links))
            return

        meta = response.meta
        yield scrapy.Request(
            response.urljoin(nav_links[3]),
            callback=self.parse_car,
            meta={
                'brand_id': meta['brand_id'],
                'brand_name': meta['brand_name'],
                'type_id': meta['type_id'],
                'type_name': meta['type_name'],
            },
        )

    def parse_car(self, response):
        """Parse a model specification page and yield it as a dict item.

        The specification table is read once; every entry of
        ``_SPEC_FIELDS`` becomes an item key whose value is the cell text at
        that position, or ``None`` when the table is shorter than expected.
        """
        meta = response.meta
        cells = response.xpath(self._SPEC_CELLS_XPATH).extract()

        item = {
            'manufacturers1_id': meta['brand_id'],
            'manufacturers1': meta['brand_name'],
            'manufacturers2_id': meta['type_id'],
            'manufacturers2': meta['type_name'],
            'car_id': self._allocate_id('car'),
            'name': response.css('.offer_topnav h3 a::text').extract_first(),
            'price': response.css('.cxkmoneys .cxk-jg::text').extract(),
        }
        for field, index in self._SPEC_FIELDS:
            item[field] = cells[index] if index < len(cells) else None

        yield item

收藏 0

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问、参与讨论，获取更多帮助，或者扫描二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

评论（1）

一萌ing 2022-09-09 09:24:17

写的脚本有误

~没有更多了~

关于作者

简单气质女生网名

暂无简介

文章

评论

907 人气

关注发私信

相关话题

热门标签

操作系统程序设计 IT运维 Linux系统管理 JavaScript 服务器应用 solaris C/C++ PHP Shell BSD Vue.js aix Oracle Python HTML 系统管理 HTML5 CSS 前端

推荐作者

佚名

文章 0 评论 0

羁客

文章 0 评论 0

天天爱笑的徐老师

文章 0 评论 0

星

文章 0 评论 0

夏日落

文章 0 评论 0

隐诗

文章 0 评论 0

友情链接

我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的隐私政策了解更多相关信息。单击 接受 或继续使用网站，即表示您同意使用 Cookies 和您的相关数据。

原文