Object localization MNIST TensorFlow to PyTorch: loss not decreasing

Posted on 2025-02-03 21:15:28


I am trying to convert a TensorFlow object localization code into PyTorch. In the original code, the author uses model.compile / model.fit to train the model, so I don't understand how the classification loss on the MNIST digits and the box regression loss work together. Still, I'm trying to implement my own training loop in PyTorch.
The goal here is, after some preprocessing, to paste the MNIST digits at random positions onto a black square image and then classify and localize (bounding box) the digit.
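For reference, here is a minimal sketch of that preprocessing step, assuming 28x28 MNIST digits pasted onto the 75x75 canvas the TensorFlow code uses (the function name and the normalized-corners bounding-box convention are my own assumptions, not the original author's):

import torch

def paste_digit(digit, canvas_size=75):
    # Paste a 28x28 digit at a random position on a black canvas and
    # return the canvas plus the (xmin, ymin, xmax, ymax) box,
    # normalized to [0, 1] (an assumed convention).
    h, w = digit.shape
    canvas = torch.zeros(canvas_size, canvas_size)
    y0 = torch.randint(0, canvas_size - h + 1, (1,)).item()
    x0 = torch.randint(0, canvas_size - w + 1, (1,)).item()
    canvas[y0:y0 + h, x0:x0 + w] = digit
    bbox = torch.tensor([x0, y0, x0 + w, y0 + h], dtype=torch.float32) / canvas_size
    return canvas.unsqueeze(0), bbox  # add the channel dim nn.Conv2d expects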

I set two losses: nn.CrossEntropyLoss and nn.MSELoss, and I do (loss_1 + loss_2).backward() to compute the gradients. I know it's the right way to compute gradients with two losses, e.g. from here (https://blog.paperspace.com/object-localization-pytorch-2/) and elsewhere.
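As a quick self-contained check of that claim (variable names are my own), a single backward on the summed loss is equivalent to backpropagating each loss separately, since gradients accumulate in .grad:

import torch

w = torch.randn(3, requires_grad=True)
x = torch.randn(3)

# Single backward on the summed loss...
(w @ x).pow(2).add((w.sum() - 1.0).pow(2)).backward()
grad_summed = w.grad.clone()

# ...matches two separate backward passes, whose gradients accumulate.
w.grad = None
(w @ x).pow(2).backward()
(w.sum() - 1.0).pow(2).backward()
assert torch.allclose(grad_summed, w.grad)  # same gradients either way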

But still, my loss doesn't decrease, whereas it collapses quasi-immediately with the TensorFlow code. I checked the model with torchinfo.summary and it seems to behave the same way as the TensorFlow implementation.

EDIT:
I looked at the predicted labels of my model and they don't seem to change at all.
This line of code label_preds, bbox_coords_preds = model(digits) always returns the same values

label_preds[0] = tensor([[0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156]], device='cuda:0', grad_fn=<SliceBackward0>)

Here are my questions:

  • Is my custom network set up correctly?
  • Are my losses set correctly?
  • Why don't my label predictions change?
  • Does my training loop work as well as the .compile and .fit TensorFlow methods?

Thanks a lot!

PYTORCH CODE

class ConvNetwork(nn.Module):
    def __init__(self):
        super(ConvNetwork, self).__init__()
        self.conv2d_1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3)
        self.conv2d_2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
        self.conv2d_3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.avgPooling2D = nn.AvgPool2d((2,2))
        self.dense_1 = nn.Linear(in_features=3136, out_features=128)
        
        self.dense_classifier = nn.Linear(in_features=128, out_features=10)
        self.softmax = nn.Softmax(dim=0)
        self.dense_regression = nn.Linear(in_features=128, out_features=4)


    def forward(self, input):
        x = self.avgPooling2D(F.relu(self.conv2d_1(input)))
        x = self.avgPooling2D(F.relu(self.conv2d_2(x)))
        x = self.avgPooling2D(F.relu(self.conv2d_3(x)))
        x = nn.Flatten()(x)
        x = F.relu(self.dense_1(x))

        output_classifier = self.softmax(self.dense_classifier(x))
        output_regression = self.dense_regression(x)
        return [output_classifier, output_regression]

######################################################

learning_rate = 0.1
EPOCHS = 1
BATCH_SIZE = 64

model = ConvNetwork()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
classification_loss = nn.CrossEntropyLoss()
regression_loss = nn.MSELoss()

######################################################

begin_time = time.time()
for epoch in range(EPOCHS) : 
    tot_loss = 0
    train_start = time.time()
    training_losses = []
    
    print("-"*20)
    print(" "*5 + f"EPOCH {epoch+1}/{EPOCHS}")
    print("-"*20)

    model.train()
    for batch, (digits, labels, bbox_coords) in enumerate(training_dataset):
        digits, labels, bbox_coords = digits.to(device), labels.to(device), bbox_coords.to(device)
        optimizer.zero_grad()
        
        [label_preds, bbox_coords_preds] = model(digits)
        
        class_loss = classification_loss(label_preds, labels)
        box_loss = regression_loss(bbox_coords_preds, bbox_coords)

        training_loss = class_loss + box_loss
        training_loss.backward()
        
        optimizer.step()
        
        ######### print part #######################
        training_losses.append(training_loss.item())
        if batch+1 <= len_training_ds//BATCH_SIZE:
            current_training_sample = (batch+1)*BATCH_SIZE
        else:
            current_training_sample = (batch)*BATCH_SIZE + len_training_ds%BATCH_SIZE
        
        if (batch+1) == 1 or (batch+1)%100 == 0 or (batch+1) == len_training_ds//BATCH_SIZE +1:
            print(f"Elapsed time : {(time.time()-train_start)/60:.3f}",\
                  f" --- Digit : {current_training_sample}/{len_training_ds}",\
                  f" : loss = {training_loss:.5f}")
            if batch+1 == (len_training_ds//BATCH_SIZE)+1:
                print(f"Total elapsed time for training : {(time.time()-begin_time)/60:.3f}")

ORIGINAL TENSORFLOW CODE

def feature_extractor(inputs):
    x = tf.keras.layers.Conv2D(16, activation='relu', kernel_size=3, input_shape=(75, 75, 1))(inputs)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    x = tf.keras.layers.Conv2D(32,kernel_size=3,activation='relu')(x)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    x = tf.keras.layers.Conv2D(64,kernel_size=3,activation='relu')(x)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    return x

def dense_layers(inputs):
  x = tf.keras.layers.Flatten()(inputs)
  x = tf.keras.layers.Dense(128, activation='relu')(x)
  return x

def classifier(inputs):

  classification_output = tf.keras.layers.Dense(10, activation='softmax', name = 'classification')(inputs)
  return classification_output


def bounding_box_regression(inputs):
    bounding_box_regression_output = tf.keras.layers.Dense(units = '4', name = 'bounding_box')(inputs)
    return bounding_box_regression_output


def final_model(inputs):
    feature_cnn = feature_extractor(inputs)
    dense_output = dense_layers(feature_cnn)

    classification_output = classifier(dense_output)
    bounding_box_output = bounding_box_regression(dense_output)

    model = tf.keras.Model(inputs = inputs, outputs = [classification_output,bounding_box_output])
    return model
  
def define_and_compile_model(inputs):
  model = final_model(inputs)
  model.compile(optimizer='adam', 
              loss = {'classification' : 'categorical_crossentropy',
                      'bounding_box' : 'mse'
                     },
              metrics = {'classification' : 'accuracy',
                         'bounding_box' : 'mse'
                        })
  return model

    

inputs = tf.keras.layers.Input(shape=(75, 75, 1,))
model = define_and_compile_model(inputs)


EPOCHS = 10 # 45
steps_per_epoch = 60000//BATCH_SIZE  # 60,000 items in this dataset
validation_steps = 1

history = model.fit(training_dataset,
                    steps_per_epoch=steps_per_epoch, 
                    validation_data=validation_dataset, 
                    validation_steps=validation_steps, epochs=EPOCHS)

loss, classification_loss, bounding_box_loss, classification_accuracy, bounding_box_mse = model.evaluate(validation_dataset, steps=1)
print("Validation accuracy: ", classification_accuracy)


Comments (1)

撑一把青伞 2025-02-10 21:15:28


Answering my own question about this bug:

What I found:
I figured out that I was applying a Softmax layer in my model while also using nn.CrossEntropyLoss() as the loss.

What this problem was causing:

  • This loss already applies a softmax internally (doc)
  • Applying a softmax twice squashes the outputs toward a uniform distribution, which shrinks the gradients and prevents convergence (see the sketch after this list)
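A quick self-contained check of both points (names are my own, nothing from the original code):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)              # raw linear-layer outputs
targets = torch.randint(0, 10, (4,))

# nn.CrossEntropyLoss is LogSoftmax + NLLLoss applied to raw logits...
ce = F.cross_entropy(logits, targets)
nll = F.nll_loss(F.log_softmax(logits, dim=1), targets)
assert torch.allclose(ce, nll)

# ...so feeding already-softmaxed outputs applies softmax twice,
# which squashes the values toward uniform and shrinks the gradients.
double_softmax_ce = F.cross_entropy(logits.softmax(dim=1), targets)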

What I did:
One should leave a plain linear layer (raw logits) as the output of the classification head.
Another way is to use NLLLoss (doc) instead and keep a LogSoftmax layer in the model class, since NLLLoss expects log-probabilities rather than softmax probabilities.
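Here is a sketch of both options as drop-in classification heads for the model above (the class names are my own; note also that the original nn.Softmax(dim=0) normalizes over the batch dimension instead of the classes, which is why every prediction collapsed to 1/64 ≈ 0.0156 with a batch size of 64):

import torch.nn as nn

class LogitsHead(nn.Module):
    # Option 1: emit raw logits and pair with nn.CrossEntropyLoss.
    def __init__(self):
        super().__init__()
        self.dense_classifier = nn.Linear(128, 10)

    def forward(self, x):
        return self.dense_classifier(x)  # no softmax here

class LogSoftmaxHead(nn.Module):
    # Option 2: emit log-probabilities and pair with nn.NLLLoss.
    def __init__(self):
        super().__init__()
        self.dense_classifier = nn.Linear(128, 10)
        self.log_softmax = nn.LogSoftmax(dim=1)  # dim=1: over classes, not the batch

    def forward(self, x):
        return self.log_softmax(self.dense_classifier(x))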

Also:
I don't fully understand how the .compile() and .fit() TensorFlow methods work, but I had to decrease the learning rate to 0.001 in PyTorch to "unstick" the loss and make it decrease. This is consistent with Keras's defaults: its Adam optimizer uses a learning rate of 0.001 out of the box, whereas my PyTorch code used 0.1.
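The corresponding one-line change in the training setup above:

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)  # matches Keras's Adam default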
