TensorFlow 2.8: GPU memory maxed out while training a CNN

Posted on 2025-01-22 06:26:22


I am not able to train the model on the GPU: during model.fit, GPU memory usage climbs to 100% and the process crashes with the error below.

Process finished with exit code -1073740791 (0xC0000409)


import tensorflow as tf
import tensorflow_datasets as tfds

import numpy as np
import os
import PIL
import PIL.Image


import pathlib
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_dir = tf.keras.utils.get_file(origin=dataset_url,
                                   fname='flower_photos',
                                   untar=True)
data_dir = pathlib.Path(data_dir)


image_count = len(list(data_dir.glob('*/*.jpg')))
print(image_count)

batch_size = 32
img_height = 180
img_width = 180


train_ds = tf.keras.utils.image_dataset_from_directory(
  data_dir,
  validation_split=0.2,
  subset="training",
  seed=123,
  image_size=(img_height, img_width),
  batch_size=batch_size)


val_ds = tf.keras.utils.image_dataset_from_directory(
  data_dir,
  validation_split=0.2,
  subset="validation",
  seed=123,
  image_size=(img_height, img_width),
  batch_size=batch_size)


class_names = train_ds.class_names
print(class_names)



for image_batch, labels_batch in train_ds:
  print(image_batch.shape)
  print(labels_batch.shape)
  break




normalization_layer = tf.keras.layers.Rescaling(1./255)



AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)





num_classes = 5

model = tf.keras.Sequential([
  tf.keras.layers.Rescaling(1./255),
  tf.keras.layers.Conv2D(32, 3, activation='relu'),
  tf.keras.layers.MaxPooling2D(),
  tf.keras.layers.Conv2D(32, 3, activation='relu'),
  tf.keras.layers.MaxPooling2D(),
  tf.keras.layers.Conv2D(32, 3, activation='relu'),
  tf.keras.layers.MaxPooling2D(),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_classes)
])




model.compile(
  optimizer='adam',
  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=['accuracy'])



model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=30
)
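As a sanity check before fitting (not part of the original post), it can help to confirm that TensorFlow actually sees the GPU and, optionally, to log where each op is placed; both calls below are standard TF 2.x APIs:

```python
import tensorflow as tf

# List the physical GPUs TensorFlow can see; an empty list means
# model.fit will silently fall back to the CPU.
gpus = tf.config.list_physical_devices('GPU')
print("GPUs visible to TensorFlow:", gpus)

# Log which device each op lands on -- useful when debugging
# device-related crashes that occur at the start of training.
tf.debugging.set_log_device_placement(True)
```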

Error:

3670
Found 3670 files belonging to 5 classes.
Using 2936 files for training.
2022-04-19 15:54:08.693362: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-19 15:54:09.479056: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3635 MB memory: -> device: 0, name: Quadro P2000, pci bus id: 0000:21:00.0, compute capability: 6.1
Found 3670 files belonging to 5 classes.
Using 734 files for validation.
['daisy', 'dandelion', 'roses', 'sunflowers', 'tulips']
(32, 180, 180, 3)
(32,)
Epoch 1/30

Process finished with exit code -1073740791 (0xC0000409)
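The crash lands right at `Epoch 1/30`, when the first GPU kernels are launched. One common mitigation to try (a hedged suggestion, not from the original post) is enabling memory growth, so the TF allocator claims GPU memory incrementally instead of reserving nearly all of the Quadro P2000's 4 GB up front; `set_memory_growth` is a real TF 2.x API, but whether it resolves this specific exit code is an assumption:

```python
import tensorflow as tf

# Must run before any GPU op executes (i.e. at the top of the script).
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    # Allocate GPU memory on demand rather than all at once.
    tf.config.experimental.set_memory_growth(gpu, True)
print("Memory growth enabled on", len(gpus), "GPU(s)")
```

If the crash persists with memory growth enabled, reducing `batch_size` (e.g. from 32 to 16) is another low-risk way to shrink the per-step memory footprint.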
