Scrapy 爬取汽车信息数据
# -*- coding: utf-8 -*-
import scrapy
class CarinfosSpider(scrapy.Spider):
    """Crawl xgo.com.cn from brands down to car models and emit one item per model.

    Callback chain:
        parse          -> brand links on the start page
        parse_brand    -> series links of one brand (``.../items.html``)
        parse_type     -> series name, then the model links on the series page
        parse_car_link -> link to the model's parameter page
        parse_car      -> configuration values, yielded as a dict item

    Brand / series / model IDs are plain running counters kept on the spider
    and handed out in the order the corresponding callbacks run.
    """

    name = 'carinfos'
    start_urls = (
        'http://www.xgo.com.cn/brand.html',
    )

    # Running ID counters (next value to hand out).
    brands_id = 0  # brand ID
    types_id = 0   # series ID
    cars_id = 0    # car-model ID

    # meta keys handed from one callback to the next.
    BRAND_KEYS = ('brand_id', 'brand_name', 'brand_img')
    SERIES_KEYS = BRAND_KEYS + ('type_id', 'type_name')

    # Position of the parameter-page link inside the model page's nav box.
    PARAM_NAV_INDEX = 3

    # Value cells of the configuration table on the parameter page.
    SPEC_CELLS_XPATH = '//div[@id="peizhi"]//td[@class="bor-l"]/text()'

    # Item field -> position of its value among the SPEC_CELLS_XPATH matches.
    # Field names and positions are carried over unchanged from the original
    # hard-coded subscripts; they depend on the site's table layout.
    SPEC_FIELDS = (
        ('where', 0),
        ('level', 1),
        ('year', 2),
        ('displacement', 3),
        ('maximumSpeed', 4),
        ('officialAcceleration', 5),
        ('ministryOfIntegratedFuelConsumption', 6),
        ('vehicleQuality', 7),
        ('longHighWith', 9),
        ('bodyStructure1', 15),
        ('doorNum', 17),
        ('seatNum', 18),
        ('mailVolume', 19),
        ('model', 22),
        ('intakeForm', 24),
        ('fuelForm', 38),
        ('fuel', 39),
        ('fuleWay', 40),
        ('environmentalProtection', 43),
        ('powerType', 44),
        ('gearbox', 50),
        ('drivingMethod', 53),
        ('bodyStructure2', 59),
        ('frontBrakeType', 61),
        ('brakeType', 62),
        ('parkingBrakeType', 63),
    )

    def _next_id(self, counter):
        """Return the current value of the named counter and advance it by one."""
        value = getattr(self, counter)
        setattr(self, counter, value + 1)
        return value

    @staticmethod
    def _carry(response, keys):
        """Copy the given keys out of ``response.meta`` into a fresh dict.

        Raises KeyError if a key is missing, so a broken hand-off between
        callbacks is reported instead of silently producing partial items.
        """
        return dict((key, response.meta[key]) for key in keys)

    def _model_requests(self, response, context):
        """Yield one request per car-model link found on a series page."""
        links = response.css(
            '#theanchor .car_banner_r ul li p a::attr(href)').extract()
        for link in links:
            # Each request gets its own copy so callbacks cannot affect each other.
            yield scrapy.Request(response.urljoin(link),
                                 callback=self.parse_car_link,
                                 meta=dict(context))

    def parse(self, response):
        """Parse the start page and follow every brand link.

        Follows links such as http://www.xgo.com.cn/brand/abt/ into
        ``parse_brand``.
        """
        for link in response.xpath('//div[@class="l"]/a[1]/@href').extract():
            yield scrapy.Request(response.urljoin(link),
                                 callback=self.parse_brand)

    def parse_brand(self, response):
        """Record the brand and follow each of its series.

        Series links such as http://www.xgo.com.cn/4990/ are turned into
        http://www.xgo.com.cn/4990/items.html before being requested.
        """
        context = {
            'brand_id': self._next_id('brands_id'),
            'brand_name': response.css('.brand_logo+h1::text').extract_first(),
            'brand_img': response.css(
                '.brand_logo img::attr(src)').extract_first(),
        }
        links = response.css('.car-list p a::attr(href)').extract()
        self.logger.debug('series links on %s: %s', response.url, links)
        for link in links:
            # The suffix is appended verbatim, so the series link is expected
            # to end with a slash (as in the example above).
            full_url = response.urljoin(link) + 'items.html'
            yield scrapy.Request(full_url, callback=self.parse_type,
                                 meta=dict(context))

    def parse_type(self, response):
        """Record the series and follow the car models listed under it.

        NOTE(review): the original requested an undefined ``full_url`` with
        ``parse_cars`` as callback. The model links are presumably on this same
        series page (both selectors target the ``car_banner`` block), so they
        are extracted from this response directly instead of re-fetching it.
        Confirm against the live page.
        """
        context = self._carry(response, self.BRAND_KEYS)
        context['type_id'] = self._next_id('types_id')
        context['type_name'] = response.css(
            '.car_banner_l .num::text').extract_first()
        for request in self._model_requests(response, context):
            yield request

    def parse_cars(self, response):
        """Follow the car-model links of a series page.

        Expects the brand and series keys (``SERIES_KEYS``) in
        ``response.meta``. Kept as a standalone callback; ``parse_type`` does
        the same extraction on its own response.
        """
        context = self._carry(response, self.SERIES_KEYS)
        for request in self._model_requests(response, context):
            yield request

    def parse_car_link(self, response):
        """Follow the parameter-page link of one car model.

        Links look like http://product.xgo.com.cn/191/190852/param.shtml.
        """
        context = self._carry(response, self.SERIES_KEYS)
        nav_links = response.css('.cxk-navbox ul li a::attr(href)').extract()
        if len(nav_links) <= self.PARAM_NAV_INDEX:
            # Page layout differs from what the selector expects; skip the
            # model rather than abort the callback with an IndexError.
            self.logger.warning('no parameter link found on %s', response.url)
            return
        link = response.urljoin(nav_links[self.PARAM_NAV_INDEX])
        yield scrapy.Request(link, callback=self.parse_car, meta=context)

    def parse_car(self, response):
        """Parse a model's configuration page and yield it as a dict item.

        Configuration fields whose cell is missing from the table are set to
        ``None``. ``price`` holds every matched text node as a list.
        """
        context = self._carry(response, self.SERIES_KEYS)
        # Evaluate the table XPath once; every field is a position in it.
        cells = response.xpath(self.SPEC_CELLS_XPATH).extract()

        item = {
            'manufacturers1_id': context['brand_id'],
            'manufacturers1': context['brand_name'],
            'brand_img': context['brand_img'],
            'manufacturers2_id': context['type_id'],
            'manufacturers2': context['type_name'],
            'car_id': self._next_id('cars_id'),
            'name': response.css('.offer_topnav h3 a::text').extract_first(),
            'price': response.css('.cxkmoneys .cxk-jg::text').extract(),
        }
        for field, index in self.SPEC_FIELDS:
            item[field] = cells[index] if index < len(cells) else None
        yield item
如果你对这篇内容有疑问,欢迎到本站社区发帖提问、参与讨论,获取更多帮助,或者扫描二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定您的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
写的脚本有误