如何在 GCP 上通过 Jupyter 使用多个 GPU 训练 Keras 模型?

发布于 2025-01-10 08:15:02 字数 3274 浏览 0 评论 0原文

我在 GCP 上有 8 个 A100 GPU,想使用多个 GPU 来训练模型。我的代码如下:

strategy = tf.distribute.MirroredStrategy()
#### Open a strategy scope.
with strategy.scope():       
# design network
  model = Sequential()
  model.add(LSTM(50, input_shape=(window, len(sel_cols)),
          return_sequences=True))
  model.add(LSTM(100))
  # model.add(LSTM(100))
  # model.add(LSTM(100))
  model.add(Dense(1))
  model.compile(loss=custom_loss, optimizer='adam')

然后当我运行
history = model.fit(train_X, train_y, epochs=epochs, batch_size=288)

我收到错误:

NotImplementedError                       Traceback (most recent call last)
 /tmp/ipykernel_65517/380346326.py in <module>
  17 #                 validation_data=(val_X, val_y),verbose=1)
  18 
   ---> 19 history = model.fit(train_X, train_y, epochs=epochs, batch_size=288)
  20 
  21 # history = model.fit_generator(train_X,steps_per_epoch=1000, epochs=100)

    /opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, 
  batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, 
 class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, 
 validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
 1211         else:
  1212             fit_inputs = x + y + sample_weights
    -> 1213         self._make_train_function()
 1214         fit_function = self.train_function
 1215 

 /opt/conda/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
   314                     training_updates = self.optimizer.get_updates(
   315                         params=self._collected_trainable_weights,
     --> 316                         loss=self.total_loss)
   317                 updates = self.updates + training_updates
   318 

     /opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
       89                 warnings.warn('Update your `' + object_name + '` call to the ' +
       90                               'Keras 2 API: ' + signature, stacklevel=2)
  ---> 91             return func(*args, **kwargs)
       92         wrapper._original_function = func
       93         return wrapper

  /opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
       73         if _SYMBOLIC_SCOPE.value:
       74             with get_graph().as_default():
  ---> 75                 return func(*args, **kwargs)
       76         else:
       77             return func(*args, **kwargs)

  /opt/conda/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
      548 
      549             # Apply constraints.
  --> 550             if getattr(p, 'constraint', None) is not None:
      551                 new_p = p.constraint(new_p)
      552 

  /opt/conda/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py in constraint(self)
      569       Can be `None` if no constraint was passed.
      570     """
  --> 571     raise NotImplementedError
      572 
      573   def assign(self, value, use_locking=False, name=None, read_value=True):

  NotImplementedError: 

如何解决此问题?多谢。

I have 8 A100 GPUs on GCP, and I want to train the model using multiple GPUs. My code:

strategy = tf.distribute.MirroredStrategy()
#### Open a strategy scope.
with strategy.scope():       
# design network
  model = Sequential()
  model.add(LSTM(50, input_shape=(window, len(sel_cols)),
          return_sequences=True))
  model.add(LSTM(100))
  # model.add(LSTM(100))
  # model.add(LSTM(100))
  model.add(Dense(1))
  model.compile(loss=custom_loss, optimizer='adam')

Then when I run
history = model.fit(train_X, train_y, epochs=epochs, batch_size=288)

I got the following error:

NotImplementedError                       Traceback (most recent call last)
 /tmp/ipykernel_65517/380346326.py in <module>
  17 #                 validation_data=(val_X, val_y),verbose=1)
  18 
   ---> 19 history = model.fit(train_X, train_y, epochs=epochs, batch_size=288)
  20 
  21 # history = model.fit_generator(train_X,steps_per_epoch=1000, epochs=100)

    /opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, 
  batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, 
 class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, 
 validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
 1211         else:
  1212             fit_inputs = x + y + sample_weights
    -> 1213         self._make_train_function()
 1214         fit_function = self.train_function
 1215 

 /opt/conda/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
   314                     training_updates = self.optimizer.get_updates(
   315                         params=self._collected_trainable_weights,
     --> 316                         loss=self.total_loss)
   317                 updates = self.updates + training_updates
   318 

     /opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
       89                 warnings.warn('Update your `' + object_name + '` call to the ' +
       90                               'Keras 2 API: ' + signature, stacklevel=2)
  ---> 91             return func(*args, **kwargs)
       92         wrapper._original_function = func
       93         return wrapper

  /opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
       73         if _SYMBOLIC_SCOPE.value:
       74             with get_graph().as_default():
  ---> 75                 return func(*args, **kwargs)
       76         else:
       77             return func(*args, **kwargs)

  /opt/conda/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
      548 
      549             # Apply constraints.
  --> 550             if getattr(p, 'constraint', None) is not None:
      551                 new_p = p.constraint(new_p)
      552 

  /opt/conda/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py in constraint(self)
      569       Can be `None` if no constraint was passed.
      570     """
  --> 571     raise NotImplementedError
      572 
      573   def assign(self, value, use_locking=False, name=None, read_value=True):

  NotImplementedError: 

How could I fix this issue? Thanks a lot.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问、参与讨论,获取更多帮助,或者扫描二维码加入 Web 技术交流群。

扫描二维码加入 Web 技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
我们使用 Cookies 和其他技术来定制您的体验,包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文