How can I train a Keras model with multiple GPUs on GCP from Jupyter?
I have 8 A100 GPUs on GCP, so I want to train the model on multiple GPUs. My code:
strategy = tf.distribute.MirroredStrategy()
#### Open a strategy scope.
with strategy.scope():
    # design network
    model = Sequential()
    model.add(LSTM(50, input_shape=(window, len(sel_cols)),
                   return_sequences=True))
    model.add(LSTM(100))
    # model.add(LSTM(100))
    # model.add(LSTM(100))
    model.add(Dense(1))
    model.compile(loss=custom_loss, optimizer='adam')
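(For reference, a self-contained version of this setup looks roughly like the sketch below; window, sel_cols, and custom_loss are defined earlier in my notebook, so placeholder values stand in for them here.)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Placeholder values standing in for what the notebook defines elsewhere.
window = 24        # timesteps per sample (placeholder)
n_features = 10    # stands in for len(sel_cols) (placeholder)

strategy = tf.distribute.MirroredStrategy()
print("Replicas in sync:", strategy.num_replicas_in_sync)

# Build and compile inside the scope so the model variables are mirrored on every GPU.
with strategy.scope():
    model = Sequential()
    model.add(LSTM(50, input_shape=(window, n_features), return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(1))
    model.compile(loss="mse", optimizer="adam")   # "mse" stands in for custom_loss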
Then when I run history = model.fit(train_X, train_y, epochs=epochs, batch_size=288)
I get this error:
NotImplementedError Traceback (most recent call last)
/tmp/ipykernel_65517/380346326.py in <module>
17 # validation_data=(val_X, val_y),verbose=1)
18
---> 19 history = model.fit(train_X, train_y, epochs=epochs, batch_size=288)
20
21 # history = model.fit_generator(train_X,steps_per_epoch=1000, epochs=100)
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y,
batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle,
class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps,
validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
1211 else:
1212 fit_inputs = x + y + sample_weights
-> 1213 self._make_train_function()
1214 fit_function = self.train_function
1215
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
314 training_updates = self.optimizer.get_updates(
315 params=self._collected_trainable_weights,
--> 316 loss=self.total_loss)
317 updates = self.updates + training_updates
318
/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
/opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
73 if _SYMBOLIC_SCOPE.value:
74 with get_graph().as_default():
---> 75 return func(*args, **kwargs)
76 else:
77 return func(*args, **kwargs)
/opt/conda/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
548
549 # Apply constraints.
--> 550 if getattr(p, 'constraint', None) is not None:
551 new_p = p.constraint(new_p)
552
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py in constraint(self)
569 Can be `None` if no constraint was passed.
570 """
--> 571 raise NotImplementedError
572
573 def assign(self, value, use_locking=False, name=None, read_value=True):
NotImplementedError:
How could I fix this issue? Thanks a lot.
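Looking at the traceback paths (/opt/conda/.../site-packages/keras/... and tensorflow_backend.py), it seems the standalone keras package is handling the training step rather than tf.keras, and I am not sure whether that is what triggers the NotImplementedError. Purely as a sketch of what I am trying to do, an end-to-end run using only tf.keras with dummy data would look like this (all shapes, the loss, and the data are placeholders; batch_size is the global batch that MirroredStrategy splits across the 8 replicas, i.e. 288 / 8 = 36 samples per GPU per step):

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

window, n_features = 24, 10                      # placeholders for window and len(sel_cols)

# Dummy data purely for illustration.
train_X = np.random.rand(1000, window, n_features).astype("float32")
train_y = np.random.rand(1000, 1).astype("float32")

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    model = Sequential([
        LSTM(50, input_shape=(window, n_features), return_sequences=True),
        LSTM(100),
        Dense(1),
    ])
    model.compile(loss="mse", optimizer="adam")  # "mse" stands in for custom_loss

# batch_size here is the global batch; MirroredStrategy splits it across the GPUs.
history = model.fit(train_X, train_y, epochs=2, batch_size=288, verbose=1)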