tensorflow.python.framework.errors_impl.InternalError: Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
I am trying to train a CNN model on the GPU with tensorflow-gpu 2.0.0, and the error in the title occurs. The same code runs fine on the CPU with tensorflow 2.0.0. My environment is CUDA 10.0, cuDNN 7.6.5, and an RTX 3080 (10 GB).
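(One minimal way to reproduce the CPU-only run — an assumption for illustration, since the original CPU test setup isn't shown — is to hide the GPU before TensorFlow initializes:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # TensorFlow then sees no GPU and falls back to CPU
import tensorflow as tf

The environment variable must be set before the first TensorFlow import.)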
I couldn't figure out which part is wrong. I saw some similar issues where people said the error was caused by the process's GPU memory usage, so I tried adding
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
right after import tensorflow as tf, but it didn't help, and reducing batch_size didn't work either.
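A related workaround often suggested for this error is enabling memory growth instead of a hard memory limit; a minimal sketch of that variant (same placement, immediately after the import, and untested here):

import tensorflow as tf

# Allocate GPU memory on demand instead of reserving a fixed block at startup.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)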
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import layers
import time
import datetime
import os
import tensorflow_probability as tfp
import pickle
import numpy as np
from Fusion_layer import perception_layer_topologyV1_template
from loss_2 import Custom_loss
import math


class am_lfs():
    def __init__(self, M=6, sigma=0.2, epochs=1, batch_size=128, B=4, num_class=2, Dropout_rate=0.3):
        self.M = M
        self.sigma = sigma
        self.batch_size = batch_size
        self.Dropout_rate = Dropout_rate
        self.B = B
        self.epochs = epochs
        self.num_class = num_class
        self.train_set = train_dataset
        self.test_set = test_dataset

    def train_model(self, model, early_stopping, csv_logger, train_set, test_set, sear, i, train_dataset, test_dataset):
        print("training model...")
        history = model.fit(train_dataset,
                            steps_per_epoch=max(1, math.ceil(len(train_set) / self.batch_size)),
                            epochs=self.epochs,
                            validation_data=test_dataset,  # remember to change it
                            validation_steps=max(1, math.ceil(len(test_set) / self.batch_size)),
                            callbacks=[early_stopping, csv_logger])
        path1 = 'model{}_{}.h5'.format(i, sear)
        model.save(os.path.join(result_path, path1))
        return history

    def Mymodel(self, theta):
        inputs = layers.Input(shape=(112, 112, 1), name='input')  # shape of input
        x = inputs
        # normal convolution layers
        output = layers.Conv2D(137, (7, 7), strides=2, padding='same', activation='relu')(x)
        output = layers.MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(output)
        output = layers.Conv2D(137, (1, 1))(output)
        output = layers.Conv2D(144, (3, 3), padding='same')(output)
        output = layers.MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(output)
        # Inception module (fusion 3)
        output = perception_layer_topologyV1_template(output, output_size=245, convMM_size=2, convMM_num=24,
                                                      num_of_c11_1=308, conv1_size=6, conv1_num=77,
                                                      num_of_c11_2=24, conv2_size=7, conv2_num=24,
                                                      pool1_size=2, num_of_pool1c1_max=40, num_of_pool1c1_min=40,
                                                      pool2_size=5, num_of_pool2c1_max=20, num_of_pool2c1_min=20,
                                                      name="fusion_3")
        # Inception module (fusion 4)
        output = perception_layer_topologyV1_template(output, output_size=228, convMM_size=2, convMM_num=35,
                                                      num_of_c11_1=354, conv1_size=6, conv1_num=15,
                                                      num_of_c11_2=281, conv2_size=7, conv2_num=13,
                                                      pool1_size=2, num_of_pool1c1_max=24, num_of_pool1c1_min=23,
                                                      pool2_size=5, num_of_pool2c1_max=59, num_of_pool2c1_min=59,
                                                      name="fusion_4")
        # max_pool
        output = layers.MaxPooling2D(pool_size=[3, 3], strides=2, padding='same', name='pool2')(output)
        # Inception module (fusion 5)
        output = perception_layer_topologyV1_template(output, output_size=459, convMM_size=2, convMM_num=14,
                                                      num_of_c11_1=46, conv1_size=3, conv1_num=25,
                                                      num_of_c11_2=170, conv2_size=5, conv2_num=116,
                                                      pool1_size=4, num_of_pool1c1_max=107, num_of_pool1c1_min=107,
                                                      pool2_size=5, num_of_pool2c1_max=45, num_of_pool2c1_min=45,
                                                      name="fusion_5")
        # AveragePooling
        output = layers.AveragePooling2D(pool_size=[7, 7], strides=1, name='aver_pool_1')(output)
        # Flatten
        # output = layers.Flatten()(output)
        # Dropout
        output = layers.Dropout(self.Dropout_rate)(output)
        # drop the two 1x1 spatial dimensions so the Dense layer sees (batch, channels)
        output = tf.squeeze(output, axis=2)
        output1 = tf.squeeze(output, axis=1, name='output1')
        # Dense output layer (note: activation here is 'elu', not softmax)
        # logits = MyLayer(2)  # original FC
        logits = layers.Dense(2, activation='elu', name='output')(output1)
        # Build model
        model = keras.Model(inputs, logits)
        model.summary()
        w = model.layers[-1].get_weights()
        w_1 = tf.cast(w[0], dtype=tf.float32, name='w_1')
        model.compile(optimizer=keras.optimizers.Adam(), loss=Custom_loss(theta, w_1, output1), metrics=['accuracy'],
                      experimental_run_tf_function=False)
        return model


class idk():
    def __init__(self, mus, B, M, sigma, dist):
        self.mus = mus
        self.B = B
        self.optim_mus = tf.keras.optimizers.Adam(lr=0.05)
        self.dist = dist

    def sample(self):
        self.thetas = self.dist.sample((self.B,))
        return self.thetas

    def run(self, dict_m, thetas):
        accs = []
        loss_mu = 0
        with tf.GradientTape() as Tape:
            Tape.watch(self.mus)
            # REINFORCE-style update: reward thetas whose models beat the mean accuracy
            for i in range(self.B):
                max_acc = dict_m['Max_acc{}'.format(i)]
                acc = dict_m['acc{}'.format(i)]
                accs += acc
                loss_mu -= self.dist.log_prob(thetas[i]) * (max_acc - np.mean(accs)) / (np.std(accs) + np.finfo(np.float32).eps.item())
            loss_mu = loss_mu / self.B
        grad = Tape.gradient(loss_mu, [self.mus])
        self.optim_mus.apply_gradients(zip(grad, [self.mus]))


if __name__ == '__main__':
    if not os.path.exists(r'.\logs\search\{}'.format(datetime.date.today().strftime("%Y%m%d"))):
        os.mkdir(r'.\logs\search\{}'.format(datetime.date.today().strftime("%Y%m%d")))
    result_path = r'.\logs\search\{}'.format(datetime.date.today().strftime("%Y%m%d"))
    start_time = time.time()
    batch_size = 128
    epochs = 1
    last_acc = 0.995
    B = 4
    M = 6
    sigma = 0.25
    sear = 0
    dict_m = {}
    train_dataset = ...  # dataset loading elided from the post
    print("train data loaded...")
    # callbacks
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.003, patience=3, verbose=1)
    for i in range(B):
        dict_m['csv_logger{}'.format(str(i))] = tf.keras.callbacks.CSVLogger(
            r".\logs\csv\training_model{}_{}.log.csv".format(str(i), datetime.datetime.now().strftime("%Y%m%d%H%M%S")),
            append=True)
    # start training
    Am_Lfs = am_lfs(M=6, sigma=0.2, epochs=1, batch_size=128, B=4, num_class=2, Dropout_rate=0.3)
    mus = tf.Variable(tf.convert_to_tensor(np.concatenate([np.ones([6, ]), np.zeros([6, ])])), dtype=tf.float64, name='mus')
    dist = tfp.distributions.MultivariateNormalDiag(mus, tf.cast(np.ones(2 * M) * sigma, dtype=tf.float64))
    thetas = idk(mus, B, M, sigma, dist).sample()
    for i in range(B):
        model = Am_Lfs.Mymodel(thetas[i])
        dict_m["m{}".format(str(i))] = model
    for epo in range(200):
        sear = sear + 1
        max_acc_list = []
        for i in range(B):
            history = Am_Lfs.train_model(dict_m["m{}".format(str(i))], early_stopping,
                                         dict_m["csv_logger{}".format(str(i))], train_set, test_set, sear, i,
                                         train_dataset, test_dataset)
            val_acc = history.history['val_accuracy']
            dict_m['acc{}'.format(i)] = val_acc
            dict_m['Max_acc{}'.format(i)] = max(val_acc)
            max_acc_list.append(dict_m['Max_acc{}'.format(i)])
        idk(mus, B, M, sigma, dist).run(dict_m, thetas)
        dist = tfp.distributions.MultivariateNormalDiag(mus, tf.cast(np.ones(2 * M) * sigma, dtype=tf.float64))
        thetas = idk(mus, B, M, sigma, dist).sample()
        index, value = np.argmax(max_acc_list), np.max(max_acc_list)
        model_path = os.path.join(result_path, 'model{}_{}.h5'.format(index, sear))
        for i in range(B):
            dict_m["m{}".format(str(i))].load_weights(model_path)
    end_time = time.time()
    print('time_cost(s):', end_time - start_time)
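A minimal check that exercises cuBLAS outside the model may help narrow this down (a diagnostic sketch, not part of the script above): if this small matmul also fails on the GPU, the problem is the CUDA/cuBLAS setup rather than the model code.

import tensorflow as tf

# A single small matmul on the GPU triggers a cuBLAS GEMM launch,
# the same routine that fails during training.
with tf.device('/GPU:0'):
    a = tf.random.normal([8, 459])
    b = tf.random.normal([459, 2])
    print(tf.matmul(a, b))  # prints an (8, 2) tensor if cuBLAS works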
Here is the error, in case it helps:
2022-02-25 10:05:38.104299: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_100.dll
2022-02-25 10:06:35.446853: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2022-02-25 10:17:34.713995: W tensorflow/stream_executor/cuda/redzone_allocator.cc:312] Internal: Invoking ptxas not supported on Windows
Relying on driver to perform ptx compilation. This message will be only logged once.
2022-02-25 10:17:34.956997: E tensorflow/stream_executor/cuda/cuda_blas.cc:428] failed to run cuBLAS routine: CUBLAS_STATUS_EXECUTION_FAILED
2022-02-25 10:17:34.957108: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Internal: Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
[[{{node training/Adam/gradients/gradients/output/MatMul_grad/MatMul_1}}]]
Traceback (most recent call last):
  File "D:\anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\eager\function.py", line 511, in __call__
    ctx=ctx)
  File "D:\anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\eager\execute.py", line 67, in quick_execute
    six.raise_from(core._status_to_exception(e.code, message), None)
  File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InternalError: Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
  [[node training/Adam/gradients/gradients/output/MatMul_grad/MatMul_1 (defined at \anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]] [Op:__inference_keras_scratch_graph_5416]
Function call stack:
keras_scratch_graph