Python crawler fails when run with multiple threads, but the same function succeeds when called on its own
Problem description
1. I use threads to run my own `craw` function. Inside `craw`, `get_info` extracts the page information, but it reports an error (the message printed by the `except` branch inside `get_info`). Yet when I take one of the failing URLs and run it through `get_info` by itself, the crawl succeeds.
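Since a URL that fails inside the threads parses fine when `get_info` is called on it alone, one quick check is whether the server answers differently when several requests arrive at the same time. Below is a minimal sketch of such a check; it is an illustration, not part of the original script: the thread count and User-Agent string are arbitrary, and the test URL is the one from the commented-out single-URL test at the bottom of the posted code.

```python
import threading
import requests

# Test URL taken from the commented-out test at the bottom of the script
TEST_URL = 'https://www.chunyuyisheng.com/pc/doctor/e410cfe36962a9a992ab/'

def probe(i):
    try:
        resp = requests.get(TEST_URL, headers={'User-Agent': 'Mozilla/5.0'})
        # If the site throttles or blocks concurrent requests, some threads will
        # see a non-200 status or a much shorter body than a standalone request.
        print(i, resp.status_code, len(resp.text))
    except Exception as exc:
        print(i, 'request failed:', exc)

threads = [threading.Thread(target=probe, args=(i,)) for i in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

The full script: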
```python
import requests
from lxml import etree
import math
import threading
import random
import time
import datetime
import csv
import gc
import os
# Write a string to a file
def write_file(path_file, mode, write_str):
    with open(path_file, mode) as file:
        file.write(write_str)

# Append one row to a CSV file
def write_csv(path_file, mode, list_row):
    with open(path_file, mode, newline='\n') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(list_row)

# Read the saved URL file
def read_file(path_file):
    with open(path_file, 'r') as file:
        lines = file.readlines()
    return lines

# Split all URLs into n roughly equal chunks
def chunks(list_url, n):
    chunks_list = []
    len_list = len(list_url)
    step = math.ceil(len_list / n)
    for i in range(0, n):
        chunks_list.append(list_url[i * step:(i + 1) * step])
    return chunks_list

# Fetch a page, retrying while the server returns 403
def get_page(url):
    User_Agent = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)'
    ]
    # Pick a random User-Agent for this request
    len_user_agent = len(User_Agent)
    random_num = random.randint(0, len_user_agent - 1)
    user_agent = User_Agent[random_num]
    count = 0
    status_code = 403
    while status_code == 403:
        try:
            response = requests.get(
                url=url,
                proxies={
                    # 'http': 'http://c4b10796877647f297db63ecf2f92428:@proxy.crawlera.com:8010/',
                },
                headers={
                    'User-Agent': user_agent
                }
            )
            response.encoding = 'utf-8'  # cp936
            html = response.text
            # print(html)
            status_code = response.status_code
            count += 1
            time.sleep(count * 3)
            # print(response.status_code)
            # print(str(count) + ' ' + url)
        except:
            html = ''
        if count > 30:
            break
    return html, url, status_code

# Extract the information from a doctor page
def get_info(html, url):
    info = list()
    info.append(url)
    selector = etree.HTML(html)
    try:
        # Department
        hos_rank = selector.xpath("//a[@class='clinic']/text()")[0].strip()
        info.append(hos_rank)
        # Doctor's title
        doc_rank = selector.xpath("//span[@class='grade']/text()")[0].strip()
        info.append(doc_rank)
        # Hospital grade, location, years in practice
        place = selector.xpath("//div[@class='doctor-hospital']/span/text()")
        info.append(place)
        # Number of patients served (peer recognition is unpacked alongside it)
        service_num, accept = selector.xpath('//li[@class="item odd-item"]/span[@class="number"]/text()')
        info.append(service_num)
        # Positive-review rate
        rating = selector.xpath('//li[@class="item"]/span[@class="number"]/text()')[0]
        info.append(rating)
        # Peer recognition
        info.append(accept)
        # Patient thank-you gifts
        heart = selector.xpath('//li[@class="item last"]/span[@class="number"]/text()')[0]
        info.append(heart)
        # Consultation fee
        cost = selector.xpath('//div[@class="doctor-pay-consult"]/span[@class="price"]/text()')[0] + '¥'
        info.append(cost)
    except:
        print('craw failed, try again')
    return info

# Collect doctor profile URLs from a listing page
def find_page_url(html):
    selector = etree.HTML(html)
    doc_url = []
    try:
        # Doctor profile links
        url = selector.xpath("//div[@class='detail']/div[@class='des-item']/a[@class='name-wrap']/@href")
        doc_url.extend(url)
    except:
        pass
    return doc_url

# Save the collected URLs to a CSV file
def save_url(tar_url, path_url_save, page):
    try:
        for i in range(1, 23):
            url_list = tar_url % i
            for ii in range(1, page + 1):
                single_url = url_list + '?page=%d' % ii
                html, url, status_code = get_page(single_url)
                single_url_list = find_page_url(html)
                print(single_url_list)
                write_csv(path_url_save, mode='a', list_row=single_url_list)
    except:
        print('save_url failed, try again')

# index is the ordinal of the thread running this function
def craw(index, chunks_list, path_log_file, path_data):
    url_list = chunks_list[index]
    for url in url_list:
        res_url = 'https://www.chunyuyisheng.com' + url
        html, url, status_code = get_page(res_url)
        info = get_info(html, url)
        if len(info) == 1:
            # Parsing failed: log the URL
            write_file(path_log_file, 'a', url + '\n')
        else:
            # Write the record, prefixed with the current system time
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info.insert(1, now_time)
            write_csv(path_file=path_data, mode='a', list_row=info)
            print(info)
        # Release memory
        del info
    del url_list
    gc.collect()

def main(path_url_save, n, path_log_file, path_data):
    # Load the URLs extracted from the CSV into one flat list
    page_doc_url_list = read_file(path_url_save)
    res_url = list()
    for i in page_doc_url_list:
        res_url.extend(i.split(','))
    chunks_list = chunks(res_url, n)
    # print(chunks_list[0])
    thread_list = []
    # Split the URLs into n chunks, i.e. one thread per chunk
    for index in range(0, n):
        thread = threading.Thread(target=craw, args=(index, chunks_list, path_log_file, path_data))
        thread_list.append(thread)
    for t in thread_list:
        # t.setDaemon(True)
        t.start()
    for t in thread_list:
        t.join()

if __name__ == '__main__':
    path_url_save = './files/URL.csv'
    # If the target URL file does not exist yet, crawl the listing pages first
    if os.path.exists(path_url_save):
        pass
    else:
        tar_url = 'https://www.chunyuyisheng.com/pc/doctors/0-0-%d/'
        save_url(tar_url, path_url_save, page=10)
    path_data = './files/dataset/data.csv'
    path_log_temp = './files/log_temp.txt'
    path_log = './files/log.txt'
    main(path_url_save=path_url_save, n=200, path_log_file=path_log_temp, path_data=path_data)
    # html, url, status_code = get_page('https://www.chunyuyisheng.com/pc/doctor/e410cfe36962a9a992ab/')
    # info = get_info(html, url)
    # print(info)
```
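The bare `except` in `get_info` only prints a generic message, so the actual exception raised inside each thread is lost. Below is a sketch of a more talkative failure path; it is an illustration of the idea, not the original function: the traceback printing and the `repr` of the URL are additions.

```python
import traceback
from lxml import etree

def get_info(html, url):
    info = [url]
    selector = etree.HTML(html)
    try:
        # ... the same xpath extraction as in the original get_info ...
        pass
    except Exception:
        # repr() makes stray characters visible, e.g. the trailing '\n' that
        # readlines() keeps when the URLs are read back from URL.csv
        print('craw failed for %r' % url)
        # Print which line actually raised (IndexError, ValueError from the
        # two-value unpack, etc.) instead of a one-line generic message
        traceback.print_exc()
    return info
```

With the traceback visible, it becomes clear whether the threads are failing on the xpath extraction itself, on an unexpected response (for example a 403 block page), or on a malformed URL.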
Background of the problem and what I have tried
Relevant code
What result are you expecting? What is the actual error message you see?
Comments (1)
This part is wrong: `for ii in range(1, n+1)`. `n` is not defined, so it raises an error, and because you also wrapped everything in a `try...except`, the console only shows `save_url failed, try again`.
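The blanket `try...except` around the whole of `save_url` is what turns that `NameError` into the generic `save_url failed, try again` message. Below is a sketch of the same function with the failure surfaced; it reuses `get_page`, `find_page_url` and `write_csv` from the question, and the narrowed `try` and the traceback printing are suggestions rather than the original code.

```python
import traceback

def save_url(tar_url, path_url_save, page):
    for i in range(1, 23):
        url_list = tar_url % i
        for ii in range(1, page + 1):  # loop bound comes from the `page` parameter
            single_url = url_list + '?page=%d' % ii
            try:
                html, url, status_code = get_page(single_url)
                single_url_list = find_page_url(html)
                print(single_url_list)
                write_csv(path_url_save, mode='a', list_row=single_url_list)
            except Exception:
                # Show the real error (e.g. the NameError) and move on to the
                # next listing page instead of silently aborting the whole crawl
                print('save_url failed for', single_url)
                traceback.print_exc()
```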