Python程序中变量作用范围应该如何理解?
初学Python,自己尝试着写了一个爬虫,主要代码如下
import json
import scrapy
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
class BidsSpider(scrapy.Spider):
name = 'bids_spider'
allowed_domains = [
'deal.ggzy.gov.cn',
'www.ggzy.gov.cn',
]
start_urls = [
'http://deal.ggzy.gov.cn',
'http://www.ggzy.gov.cn',
]
custom_settings = {
"DOWNLOAD_DELAY": 1,
"RETRY_ENABLED": True,
}
page = 1
url = 'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp?TIMEBEGIN_SHOW=2020-09-01&TIMEEND_SHOW=2020-10-20&TIMEBEGIN=2020-09-01&TIMEEND=2020-10-20&SOURCE_TYPE=1&DEAL_TIME=06&DEAL_CLASSIFY=01&DEAL_STAGE=0101&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&FINDTXT=风电&PAGENUMBER='
doc = Document()
def start_requests(self):
yield scrapy.Request(
url=self.url + str(self.page),
callback=self.parse,
method='GET',
)
def parse(self, response):
json_data = json.loads(response.text)
bid_list = json_data['data']
if not len(bid_list):
self.doc.save('D:/projects/test.docx')
return
for index, value in enumerate(bid_list):
item = dict()
item['title'] = value['title']
item['platformName'] = value['platformName']
item['districtShow'] = value['districtShow']
item['tradeShow'] = value['tradeShow']
item['timeShow'] = value['timeShow']
yield scrapy.Request(
url=value['url'],
callback=self.parse_detail,
meta={'item': item},
method='GET',
)
self.page += 1
yield scrapy.Request(
url=self.url + str(self.page),
callback=self.parse,
method='GET',
)
def parse_detail(self, response):
item = response.meta['item']
url = response.css('li.li_hover a::attr(onclick)').extract_first()
yield scrapy.Request(
url='http://www.ggzy.gov.cn/information' + url[25: -2],
callback=self.parse_text,
meta={'item': item},
method='GET',
)
def parse_text(self, response):
item = response.meta['item']
html = response.xpath('//div[@class="detail_content"]').extract_first()
soup = BeautifulSoup(str(html), 'html.parser')
item['detail'] = soup.get_text()
p = self.doc.add_paragraph()
r = p.add_run(item['title'])
r.font.name = '黑体'
r.bold = True
p = self.doc.add_paragraph()
r = p.add_run('来源平台:')
r.bold = True
p.add_run(item['platformName'])
p = self.doc.add_paragraph()
r = p.add_run('省份:')
r.bold = True
p.add_run(item['districtShow'])
p = self.doc.add_paragraph()
r = p.add_run('行业:')
r.bold = True
p.add_run(item['tradeShow'])
p = self.doc.add_paragraph()
r = p.add_run('发布日期:')
r.bold = True
p.add_run(item['timeShow'])
p = self.doc.add_paragraph()
r = p.add_run(item['detail'])
r.font.size = Pt(8)
self.doc.add_page_break()
self.logger.info('#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#')
self.logger.info(item['title'])
self.logger.info('#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#@#')
出现的问题是self.doc.save行生成的文件是空的,但是最后的日志输出语句是有内容的,的确爬到了数据。我觉得是我对Python的变量作用范围不理解造成代码有问题,但是尝试了各种方法,还是没法解决。希望哪位大佬可以指导我一下,万分感谢。
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
数据的存储去
Item Pipeline
里处理,不要在这里。