I am trying to convert a Keras Transformer model to PyTorch, but the two models end up with a different number of parameters.

Why does the PyTorch model only have ~3.7 million parameters while the Keras model has over 30 million? Did I do something wrong, or is there really such a big difference between the Keras and PyTorch implementations of the multi-head attention layer?

Is there a difference in how Keras and PyTorch handle the sequential (feed-forward) layers after the multi-head attention?
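Since the gap seems to come from the attention block (the summaries below attribute ~31.5M parameters to the Keras TransformerBlock versus ~3.77M for the whole PyTorch model), here is a minimal sketch that builds just the two attention layers side by side and counts their weights. embed_dim=768 matches the summaries; num_heads=12 is an assumption on my part, since the post does not show the value the models were built with:

import tensorflow as tf
from tensorflow.keras import layers
import torch.nn as nn

embed_dim, num_heads, seq_len = 768, 12, 75

# Keras: key_dim is the per-head size of the query/key projections.
keras_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
x = tf.random.normal((1, seq_len, embed_dim))
_ = keras_att(x, x)  # call once so the layer builds its weights
print("Keras MultiHeadAttention params:", keras_att.count_params())

# PyTorch: embed_dim is the total model dimension, split across the heads.
torch_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True)
print("PyTorch MultiheadAttention params:", sum(p.numel() for p in torch_att.parameters()))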
TensorFlow
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def get_angles(pos, i, d_model):
angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
return pos * angle_rates
def positional_encoding(position, d_model):
angle_rads = get_angles(np.arange(position)[:, np.newaxis],
np.arange(d_model)[np.newaxis, :],
d_model)
# apply sin to even indices in the array; 2i
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
# apply cos to odd indices in the array; 2i+1
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
pos_encoding = angle_rads[np.newaxis, ...]
return tf.cast(pos_encoding, dtype=tf.float32)
class PositionEmbedding(layers.Layer):
def __init__(self, max_len, embed_dim):
super(PositionEmbedding, self).__init__()
self.pos_encoding = positional_encoding(max_len,
embed_dim)
def call(self, x):
seq_len = tf.shape(x)[1]
x += self.pos_encoding[:, :seq_len, :]
return x
class TransformerBlock(tf.keras.Model):
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
super(TransformerBlock, self).__init__()
self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
self.ffn = keras.Sequential(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim), ]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(rate)
self.dropout2 = layers.Dropout(rate)
def call(self, inputs, training):
attn_output = self.att(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
def transformer_classifer(embed_dim, ff_dim, max_len, num_heads, dropout=0.1):
inputs = layers.Input(shape=(max_len, embed_dim))
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
embedding_layer = PositionEmbedding(1024, embed_dim)
x = embedding_layer(inputs)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(dropout)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(dropout)(x)
outputs = layers.Dense(2, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
return model
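For reference, the summary below can presumably be reproduced along these lines. embed_dim=768 and max_len=75 follow from the (None, 75, 768) input shape in the table; ff_dim=2048 and num_heads=12 are assumptions, since those values are not stated in the post:

model = transformer_classifer(embed_dim=768, ff_dim=2048, max_len=75, num_heads=12)
model.summary()
print(model.count_params())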
Layer (type)                                       Output Shape        Param #
=================================================================
input_1 (InputLayer)                               [(None, 75, 768)]   0
position_embedding (PositionEmbedding)             (None, 75, 768)     0
transformer_block_1 (TransformerBlock)             (None, 75, 768)     31491584
global_average_pooling1d (GlobalAveragePooling1D)  (None, 768)         0
dropout_41 (Dropout)                               (None, 768)         0
dense_4 (Dense)                                    (None, 32)          24608
dropout_42 (Dropout)                               (None, 32)          0
dense_5 (Dense)                                    (None, 2)           66
=================================================================
Total params: 31,516,258
Trainable params: 31,516,258
Non-trainable params: 0
PyTorch
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
def __init__(self, embed_dim, num_heads, ff_dim, max_len, rate=0.1):
super(TransformerBlock, self).__init__()
self.embed_dim = embed_dim
self.max_len = max_len
self.att = nn.MultiheadAttention(num_heads=num_heads, embed_dim=embed_dim, batch_first=True)
self.sequential = nn.Sequential(
nn.Linear(in_features=embed_dim, out_features=ff_dim),
nn.ReLU(inplace=True),
nn.Linear(in_features=ff_dim, out_features=embed_dim)
)
self.layernorm1 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
self.layernorm2 = nn.LayerNorm(normalized_shape=embed_dim, eps=1e-6)
self.dropout1 = nn.Dropout(rate)
self.dropout2 = nn.Dropout(rate)
def forward(self, inputs, training=True):
#print("inputs shape", inputs.shape)
attn_output, _ = self.att(inputs, inputs, inputs)
attn_output = self.dropout1(attn_output)
out1 = self.layernorm1(inputs + attn_output)
seq_output = self.sequential(out1)
seq_output = self.dropout2(seq_output)
return self.layernorm2(out1 + seq_output)
class transformer_classifer(nn.Module):
def __init__(self, embed_dim, ff_dim, max_len, num_heads, dropout=0.1):
super(transformer_classifer, self).__init__()
self.max_len = max_len
self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, max_len)
self.embedding_layer = PositionEmbedding(max_len, embed_dim)
self.dropout = nn.Dropout(dropout)
self.linear1 = nn.Linear(in_features=embed_dim, out_features=32)
self.relu = nn.ReLU()
self.linear2 = nn.Linear(in_features=32, out_features=2)
self.softmax = nn.Softmax(dim=-1)
def forward(self, inputs, training=True):
x = self.embedding_layer(inputs, self.max_len, training=training)
x = self.transformer_block(x, training=training)
x = torch.mean(x, dim=1)
x = self.dropout(x)
x = self.linear1(x)
x = self.relu(x)
x = self.dropout(x)
x = self.linear2(x)
#x = self.softmax(x)
return x
================================================================================
Layer (type:depth-idx) Param #
================================================================================
├─TransformerBlock: 1-1 --
| └─MultiheadAttention: 2-1 --
| | └─NonDynamicallyQuantizableLinear: 3-1 590,592
| └─Sequential: 2-2 --
| | └─Linear: 3-2 1,574,912
| | └─ReLU: 3-3 --
| | └─Linear: 3-4 1,573,632
| └─LayerNorm: 2-3 1,536
| └─LayerNorm: 2-4 1,536
| └─Dropout: 2-5 --
| └─Dropout: 2-6 --
├─PositionEmbedding: 1-2 --
├─Dropout: 1-3 --
├─Linear: 1-4 24,608
├─ReLU: 1-5 --
├─Linear: 1-6 66
├─Softmax: 1-7 --
================================================================================
Total params: 3,766,882
Trainable params: 3,766,882
Non-trainable params: 0
================================================================================
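The table above can be cross-checked with a plain parameter sum. Note that nn.MultiheadAttention keeps its packed input-projection weights (in_proj_weight / in_proj_bias) directly on the attention module rather than in a child Linear, which is easy to miss in a per-submodule listing like the one above. Since the PyTorch PositionEmbedding class is not shown in the post, this sketch builds TransformerBlock on its own; ff_dim=2048 matches the Linear sizes in the table, and num_heads=12 is an assumption:

# ff_dim=2048 and num_heads=12 are not stated in the post; they are guesses here.
block = TransformerBlock(embed_dim=768, num_heads=12, ff_dim=2048, max_len=75)
print("TransformerBlock params:", sum(p.numel() for p in block.parameters()))
print("MultiheadAttention params:", sum(p.numel() for p in block.att.parameters()))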