tensorflow.python.framework.errors_impl.InternalError: Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8

I am trying to train a CNN model on the GPU with tensorflow-gpu 2.0.0, and the error mentioned in the title occurred; the same code runs fine on the CPU with tensorflow 2.0.0. I am using CUDA 10.0, cuDNN 7.6.5, and an RTX 3080 (10GB).
I couldn't figure out which part is wrong. I saw some similar issues saying the error was caused by other processes occupying GPU memory, so I tried adding

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])

right after import tensorflow as tf, but it didn't help, and reducing batch_size didn't work either.
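For reference, the memory-growth variant that is often suggested for this error looks roughly like the sketch below (set_memory_growth is the standard TF 2.0 API; I show it only for completeness, placed right after the import):

import tensorflow as tf

# Allocate GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

Here is my full script: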

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import layers
import time
import datetime
import os
import tensorflow_probability as tfp
import pickle
import numpy as np
from Fusion_layer import perception_layer_topologyV1_template
from loss_2 import Custom_loss
import math



class am_lfs():

    def __init__(self, M=6, sigma=0.2, epochs=1, batch_size=128, B=4, num_class=2, Dropout_rate=0.3):
        self.M = M
        self.sigma = sigma
        self.batch_size = batch_size
        self.Dropout_rate = Dropout_rate
        self.B = B
        self.epochs = epochs
        self.num_class = num_class
        # NOTE: train_dataset / test_dataset are module-level globals defined in __main__
        self.train_set = train_dataset
        self.test_set = test_dataset

    def train_model(self, model, early_stopping, csv_logger, train_set, test_set, sear, i, train_dataset, test_dataset):
        print("training model...")
        history = model.fit(train_dataset,
                            steps_per_epoch=max(1, math.ceil(len(train_set) / self.batch_size)),
                            epochs=self.epochs,            # was the global `epochs`
                            validation_data=test_dataset,  # remember to change it
                            validation_steps=max(1, math.ceil(len(test_set) / self.batch_size)),
                            callbacks=[early_stopping, csv_logger])
        path1 = 'model{}_{}.h5'.format(i, sear)
        model.save(os.path.join(result_path,path1))
        return history


    def Mymodel(self, theta):
        inputs = layers.Input(shape=(112, 112, 1), name='input')  # shape of input
        x = inputs
        # normal convolution layers
        output = layers.Conv2D(137, (7, 7), strides=2, padding='same', activation='relu')(x)

        output = layers.MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(output)

        output = layers.Conv2D(137, (1, 1))(output)

        output = layers.Conv2D(144, (3, 3), padding='same')(output)

        output = layers.MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(output)

        # Inception module (fusion 3)
        output = perception_layer_topologyV1_template(output, output_size=245, convMM_size=2, convMM_num=24,
                                             num_of_c11_1=308, conv1_size=6, conv1_num=77,
                                             num_of_c11_2=24, conv2_size=7, conv2_num=24,
                                             pool1_size=2, num_of_pool1c1_max=40, num_of_pool1c1_min=40,
                                             pool2_size=5, num_of_pool2c1_max=20, num_of_pool2c1_min=20,
                                             name="fusion_3")

        # Inception module (fusion 4)
        output = perception_layer_topologyV1_template(output, output_size=228, convMM_size=2, convMM_num=35,
                                             num_of_c11_1=354, conv1_size=6, conv1_num=15,
                                             num_of_c11_2=281, conv2_size=7, conv2_num=13,
                                             pool1_size=2, num_of_pool1c1_max=24, num_of_pool1c1_min=23,
                                             pool2_size=5, num_of_pool2c1_max=59, num_of_pool2c1_min=59,
                                             name="fusion_4")

        # max_pool
        output = layers.MaxPooling2D(pool_size=[3, 3], strides=2, padding='same', name='pool2')(output)

        # Inception module (fusion 5)
        output = perception_layer_topologyV1_template(output, output_size=459, convMM_size=2, convMM_num=14,
                                             num_of_c11_1=46, conv1_size=3, conv1_num=25,
                                             num_of_c11_2=170, conv2_size=5, conv2_num=116,
                                             pool1_size=4, num_of_pool1c1_max=107, num_of_pool1c1_min=107,
                                             pool2_size=5, num_of_pool2c1_max=45, num_of_pool2c1_min=45,
                                             name="fusion_5")
        # AveragePooling: 7x7 global pooling reduces the feature map to 1x1
        output = layers.AveragePooling2D(pool_size=[7, 7], strides=1, name='aver_pool_1')(output)
        # Flatten
        # output = layers.Flatten()(output)
        # Dropout
        output = layers.Dropout(self.Dropout_rate)(output)
        # squeeze the two singleton spatial dimensions -> (batch, channels)
        output = tf.squeeze(output, axis=2)
        output1 = tf.squeeze(output, axis=1, name='output1')
        # Dense output layer (2 classes; note the activation is elu, not softmax)
        # logits = MyLayer(2)           # original FC
        logits = layers.Dense(2, activation='elu', name='output')(output1)

        # Build model
        model = keras.Model(inputs, logits)
        model.summary()
        w = model.layers[-1].get_weights()
        w_1 = tf.cast(w[0], dtype=tf.float32, name='w_1')   # kernel of the final Dense layer
        model.compile(optimizer=keras.optimizers.Adam(),
                      loss=Custom_loss(theta, w_1, output1),
                      metrics=['accuracy'],
                      experimental_run_tf_function=False)
        return model

class idk():
    def __init__(self, mus, B, M, sigma, dist):
        self.mus = mus
        self.B = B
        self.optim_mus = tf.keras.optimizers.Adam(learning_rate=0.05)
        self.dist = dist

    def sample(self):
        self.thetas = self.dist.sample((self.B,))
        return self.thetas

    def run(self, dict_m, thetas):
        accs = []
        loss_mu = 0
        with tf.GradientTape() as tape:
            tape.watch(self.mus)
            for i in range(self.B):
                max_acc = dict_m['Max_acc{}'.format(i)]
                acc = dict_m['acc{}'.format(i)]
                accs += acc
                # REINFORCE-style update:
                # loss_mu = -(1/B) * sum_i log p(theta_i) * (max_acc_i - mean(accs)) / (std(accs) + eps)
                loss_mu -= self.dist.log_prob(thetas[i]) * (max_acc - np.mean(accs)) / (np.std(accs) + np.finfo(np.float32).eps.item())
            loss_mu = loss_mu / self.B
        grad = tape.gradient(loss_mu, [self.mus])
        self.optim_mus.apply_gradients(zip(grad, [self.mus]))

if __name__ == '__main__':
    result_path = r'.\logs\search\{}'.format(datetime.date.today().strftime("%Y%m%d"))
    if not os.path.exists(result_path):
        os.makedirs(result_path)   # makedirs also creates missing parent directories

    start_time = time.time()
    batch_size = 128
    epochs = 1
    last_acc = 0.995
    B=4
    M=6
    sigma=0.25
    sear = 0
    dict_m = {}  
    
    train_dataset = ...   # dataset loading omitted; train_set, test_set and test_dataset are presumably built here as well
    print("train data loaded...")

    # callbacks
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.003, patience=3, verbose=1)        

    for i in range(B):
        dict_m['csv_logger{}'.format(i)] = tf.keras.callbacks.CSVLogger(
            r".\logs\csv\training_model{}_{}.log.csv".format(i, datetime.datetime.now().strftime("%Y%m%d%H%M%S")),
            append=True)
      

    # start training
    Am_Lfs = am_lfs(M=6, sigma=0.2, epochs=1, batch_size=128, B=4, num_class=2, Dropout_rate=0.3)
    mus = tf.Variable(tf.convert_to_tensor(np.concatenate([np.ones([M]), np.zeros([M])])), dtype=tf.float64, name='mus')
    dist = tfp.distributions.MultivariateNormalDiag(mus, tf.cast(np.ones(2 * M) * sigma, dtype=tf.float64))
    searcher = idk(mus, B, M, sigma, dist)   # one instance, so the Adam state for mus persists across epochs
    thetas = searcher.sample()

    for i in range(B):
        dict_m["m{}".format(i)] = Am_Lfs.Mymodel(thetas[i])

    for epo in range(200):
        sear += 1
        max_acc_list = []
        for i in range(B):
            history = Am_Lfs.train_model(dict_m["m{}".format(i)], early_stopping,
                                         dict_m["csv_logger{}".format(i)],
                                         train_set, test_set, sear, i,
                                         train_dataset, test_dataset)
            val_acc = history.history['val_accuracy']
            dict_m['acc{}'.format(i)] = val_acc
            dict_m['Max_acc{}'.format(i)] = max(val_acc)
            max_acc_list.append(dict_m['Max_acc{}'.format(i)])
          
        searcher.run(dict_m, thetas)
        # rebuild the sampling distribution around the updated mus, then resample
        searcher.dist = tfp.distributions.MultivariateNormalDiag(mus, tf.cast(np.ones(2 * M) * sigma, dtype=tf.float64))
        thetas = searcher.sample()

        # broadcast the weights of the best model of this round to all B models
        index, value = np.argmax(max_acc_list), np.max(max_acc_list)
        model_path = os.path.join(result_path, 'model{}_{}.h5'.format(index, sear))

        for i in range(B):
            dict_m["m{}".format(i)].load_weights(model_path)

    end_time = time.time()
    print('time_cost(s):', end_time - start_time)

Here is the error, if it helps:

2022-02-25 10:05:38.104299: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_100.dll
2022-02-25 10:06:35.446853: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2022-02-25 10:17:34.713995: W tensorflow/stream_executor/cuda/redzone_allocator.cc:312] Internal: Invoking ptxas not supported on Windows
Relying on driver to perform ptx compilation. This message will be only logged once.
2022-02-25 10:17:34.956997: E tensorflow/stream_executor/cuda/cuda_blas.cc:428] failed to run cuBLAS routine: CUBLAS_STATUS_EXECUTION_FAILED
2022-02-25 10:17:34.957108: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Internal: Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
  [[{{node training/Adam/gradients/gradients/output/MatMul_grad/MatMul_1}}]]
Traceback (most recent call last):
  File "D:\anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\eager\function.py", line 511, in call
    ctx=ctx)
  File "D:\anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\eager\execute.py", line 67, in quick_execute
    six.raise_from(core._status_to_exception(e.code, message), None)
  File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InternalError:  Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
  [[node training/Adam/gradients/gradients/output/MatMul_grad/MatMul_1 (defined at \anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]] [Op:__inference_keras_scratch_graph_5416]

Function call stack:
keras_scratch_graph
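Since the failing op is a plain matmul in the gradient of the Dense layer, a minimal sanity check (my own sketch, not part of the training script) would be to run a bare GEMM on the GPU and see whether cuBLAS works at all outside the model:

import tensorflow as tf

# If this also raises "Blas GEMM launch failed", the problem is in the
# CUDA/cuBLAS setup itself rather than in the model code.
with tf.device('/GPU:0'):
    a = tf.random.normal([8, 459])
    b = tf.random.normal([459, 2])
    print(tf.matmul(a, b).shape)   # expected: (8, 2)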


