Python crawler errors out when run with multiple threads, but the same function succeeds on its own

Posted 2022-09-12 02:45:53 · 7,074 characters · 16 views · 0 comments

Problem description

1. I use threads to run my `craw` function, which calls `get_info` to extract the information, but the run keeps printing failures (the `except` branch inside `get_info` prints its message).
However, if I take one of the failing URLs and run it through `get_info` on its own, it scrapes successfully.

```python
import requests
from lxml import etree
import math
import threading
import random
import time
import datetime
import csv
import gc
import os


# Write a string to a file
def write_file(path_file, mode, write_str):
    with open(path_file, mode) as file:
        file.write(write_str)


# Write a row of data to a CSV file
def write_csv(path_file, mode, list_row):
    with open(path_file, mode, newline='\n') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(list_row)


# Read the URL file
def read_file(path_file):
    with open(path_file, 'r') as file:
        lines = file.readlines()
    return lines


# Split all the URLs into n roughly equal chunks
def chunks(list_url, n):
    chunks_list = []
    len_list = len(list_url)
    step = math.ceil(len_list / n)

    for i in range(0, n):
        chunks_list.append(list_url[i * step:(i + 1) * step])
    return chunks_list


# Fetch a page
def get_page(url):
    User_Agent = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)'
    ]
    len_user_agent = len(User_Agent)
    random_num = random.randint(0, len_user_agent - 1)
    user_agent = User_Agent[random_num]

    count = 0
    status_code = 403
    while status_code == 403:
        try:
            response = requests.get(
                url=url,
                proxies={
                    # 'http': 'http://c4b10796877647f297db63ecf2f92428:@proxy.crawlera.com:8010/',
                },
                headers={
                    'User-Agent': user_agent
                }
            )
            response.encoding = 'utf-8'#cp936
            html = response.text
            #print(html)
            status_code = response.status_code
            count += 1
            time.sleep(count * 3)
            # print(response.status_code)
            # print(str(count)+' ' + url)
        except:
            html = ''
        if count > 30:
            break

    return html, url, status_code


# Extract information from a doctor page
def get_info(html, url):
    info = list()
    info.append(url)
    selector = etree.HTML(html)
    try:
        # Department
        hos_rank = selector.xpath("//a[@class='clinic']/text()")[0].strip()
        info.append(hos_rank)
        # Doctor grade
        doc_rank = selector.xpath("//span[@class='grade']/text()")[0].strip()
        info.append(doc_rank)
        # Hospital level, location, years in practice
        place = selector.xpath("//div[@class='doctor-hospital']/span/text()")
        info.append(place)
        # Number of patients served
        service_num, accept = selector.xpath('//li[@class="item odd-item"]/span[@class="number"]/text()')
        info.append(service_num)
        # Positive review rate
        rating = selector.xpath('//li[@class="item"]/span[@class="number"]/text()')[0]
        info.append(rating)
        # Peer endorsements
        info.append(accept)
        # Patient gifts
        heart = selector.xpath('//li[@class="item last"]/span[@class="number"]/text()')[0]
        info.append(heart)
        # Consultation fee
        cost = selector.xpath('//div[@class="doctor-pay-consult"]/span[@class="price"]/text()')[0] + '¥'
        info.append(cost)
    except:
        print('craw failed, try again')

    return info


def find_page_url(html):
    selector = etree.HTML(html)
    doc_url = []
    try:
        # Find the doctor URLs on the listing page
        url = selector.xpath("//div[@class='detail']/div[@class='des-item']/a[@class='name-wrap']/@href")
        doc_url.extend(url)
    except:
        pass

    return doc_url


# Save URLs to a CSV file
def save_url(tar_url, path_url_save, page):

    try:
        for i in range(1, 23):
            url_list = tar_url % i
            for ii in range(1, page+1):
                single_url = url_list + '?page=%d' % ii
                html, url, status_code = get_page(single_url)
                single_url_list = find_page_url(html)
                print(single_url_list)
                write_csv(path_url_save, mode='a', list_row=single_url_list)

    except:
        print('save_url failed, try again')


# index is the thread's index into chunks_list
def craw(index, chunks_list, path_log_file, path_data):
    url_list = chunks_list[index]
    for url in url_list:
        res_url = 'https://www.chunyuyisheng.com' + url
        html, url, status_code = get_page(res_url)
        info = get_info(html, url)

        if len(info) == 1:
            # Log the URL if extraction failed
            write_file(path_log_file, 'a', url + '\n')
        else:
            # Write the data row
            # current system time
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info.insert(1, now_time)
            write_csv(path_file=path_data, mode='a', list_row=info)
        print(info)
        # free memory
        del info
    del url_list
    gc.collect()


def main(path_url_save, n, path_log_file, path_data):

    # Put the URLs read from the CSV file into a list
    page_doc_url_list = read_file(path_url_save)
    res_url = list()
    for i in page_doc_url_list:
        res_url.extend(i.split(','))

    chunks_list = chunks(res_url, n)
    # print(chunks_list[0])
    thread_list = []

    # Split the URLs into n chunks, i.e. n threads
    for index in range(0, n):
        thread = threading.Thread(target=craw, args=(index, chunks_list, path_log_file, path_data))
        thread_list.append(thread)

    for t in thread_list:
        # t.setDaemon(True)
        t.start()

    for t in thread_list:
        t.join()


if __name__ == '__main__':
    path_url_save = './files/URL.csv'

    # Check whether the target URL file already exists
    if os.path.exists(path_url_save):
        pass
    else:
        tar_url = 'https://www.chunyuyisheng.com/pc/doctors/0-0-%d/'
        save_url(tar_url, path_url_save, page=10)

    path_data = './files/dataset/data.csv'
    path_log_temp = './files/log_temp.txt'
    path_log = './files/log.txt'

    main(path_url_save=path_url_save, n=200, path_log_file=path_log_temp, path_data=path_data)


# html, url, status_code = get_page('https://www.chunyuyisheng.com/pc/doctor/e410cfe36962a9a992ab/')
# info = get_info(html, url)
# print(info)
```
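One way to narrow down why the threaded run fails while a single call succeeds is to log the actual exception and the length of the fetched HTML instead of swallowing everything with a bare `except`. A minimal sketch, reusing the first XPath from `get_info` above (`get_info_debug` is a hypothetical debugging helper, not part of the original script):

```python
import traceback

from lxml import etree


def get_info_debug(html, url):
    # Same first field as get_info; the remaining fields would follow the same pattern.
    info = [url]
    try:
        selector = etree.HTML(html)
        info.append(selector.xpath("//a[@class='clinic']/text()")[0].strip())
    except Exception as exc:
        # Shows whether the HTML is empty or different under concurrent load,
        # or whether the XPath simply matched nothing for this URL.
        print('craw failed for %s: %r (html length: %d)' % (url, exc, len(html)))
        traceback.print_exc()
    return info
```

Calling this on a URL recorded in the log file, once directly and once from inside `craw`, makes it easier to compare what the two runs actually receive from the server.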

Environment background and what I have already tried

(attached screenshot: image.png)

Related code

Paste the code as text (please do not use screenshots)

What result did you expect? What error message did you actually see?


Comments (1)

烟雨凡馨 2022-09-19 02:45:53
```python
# Save URLs to a CSV file
def save_url(tar_url, path_url_save, page):

    try:
        for i in range(1, 23):
            url_list = tar_url % i
            for ii in range(1, n+1):
                single_url = url_list + '?page=%d' % ii
                html, url, status_code = get_page(single_url)
                single_url_list = find_page_url(html)
                print(single_url_list)
                write_csv(path_url_save, mode='a', list_row=single_url_list)

    except:
        print('save_url failed, try again')
```

This line is the problem: `for ii in range(1, n+1)`. `n` is not defined, so it raises a NameError, and because you wrapped the whole thing in a `try...except`, the console just prints `save_url failed, try again`.
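The version of `save_url` in the question body already iterates over the `page` parameter; for reference, a corrected sketch of the quoted function (which also prints the real exception so failures are no longer silent) could look like this:

```python
# Save URLs to a CSV file: iterate over the 'page' parameter instead of the
# undefined 'n', and report the actual exception instead of a fixed message.
def save_url(tar_url, path_url_save, page):
    try:
        for i in range(1, 23):
            url_list = tar_url % i
            for ii in range(1, page + 1):
                single_url = url_list + '?page=%d' % ii
                html, url, status_code = get_page(single_url)
                single_url_list = find_page_url(html)
                print(single_url_list)
                write_csv(path_url_save, mode='a', list_row=single_url_list)
    except Exception as exc:
        print('save_url failed:', exc)
```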
