RuntimeError: only batches of spatial targets supported (3D tensors) but got targets of dimension: 4

Posted 2025-01-17 19:13:23


I am having a hard time understanding image segmentation. I have implemented a UNet model for image segmentation. I am using the PASCAL VOC dataset and I am trying to train my model. However, I got stuck when calculating the loss. I am unsure of what the expected shapes of the output and the target classes should be. Can someone please educate me on what I am doing wrong? My only guess is that I am missing something when it comes to the ground-truth images, since I don't know how the model will learn which class is which. Thanks!

Here is my UNet class:

import torch
import torch.nn as nn
from torchvision import transforms


def x2conv(in_channels, out_channels):
    double_conv = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True))
    return double_conv


class Encoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.enc_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        ftrs = []
        for block in self.enc_blocks:
            x = block(x)
            ftrs.append(x)
            x = self.pool(x)
        return ftrs


class Decoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.chs = chs
        self.upconvs = nn.ModuleList(
            [nn.ConvTranspose2d(chs[i], chs[i+1], kernel_size=2, stride=2) for i in range(len(chs)-1)])
        self.dec_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])

    def forward(self, x, encoder_features):
        for i in range(len(self.chs)-1):
            x = self.upconvs[i](x)
            enc_ftrs = self.crop(encoder_features[i], x)
            x = torch.cat([x, enc_ftrs], dim=1)
            x = self.dec_blocks[i](x)
        return x

    def crop(self, enc_ftrs, x):
        _, _, H, W = x.shape
        enc_ftrs = transforms.CenterCrop([H, W])(enc_ftrs)
        return enc_ftrs


class UNet(nn.Module):
    def __init__(self, enc_chs, dec_chs, num_class):
        super(UNet, self).__init__()

        self.encoder = Encoder(enc_chs)
        self.decoder = Decoder(dec_chs)
        self.softmax = nn.Conv2d(dec_chs[-1], num_class, kernel_size=1)

    def forward(self, x):
        enc_ftrs = self.encoder(x)
        out = self.decoder(enc_ftrs[::-1][0], enc_ftrs[::-1][1:])
        out = self.softmax(out)
        return out

And here is my dataset class:

from PIL import Image
import torchvision
import torchvision.transforms as T  # needed for the T.Compose used in __getitem__ below
VOC_CLASSES = [   # How to use?
    "background",
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]


VOC_COLORMAP = [  # How to use?
    [0, 0, 0],  # Background
    [128, 0, 0],  # Aeroplane
    [0, 128, 0],  # Bicycle
    [128, 128, 0],  # Bird
    [0, 0, 128],  # Boat
    [128, 0, 128],  # Bottle
    [0, 128, 128],  # Bus
    [128, 128, 128],  # Car
    [64, 0, 0],  # Cat
    [192, 0, 0],  # Chair
    [64, 128, 0],  # Cow
    [192, 128, 0],  # Diningtable
    [64, 0, 128],  # Dog
    [192, 0, 128],  # Horse
    [64, 128, 128],  # Motorbike
    [192, 128, 128],  # Person
    [0, 64, 0],  # Pottedplant
    [128, 64, 0],  # Sheep
    [0, 192, 0],  # Sofa
    [128, 192, 0],  # Train
    [0, 64, 128],  # tvmonitor
]


class VocDataset(torchvision.datasets.VOCSegmentation):
    def __init__(self, image_set, transform, root="../data/VOCtrainval_11-May-2012/", download=False, year="2012"):
        self.transform = transform
        self.year = year
        super().__init__(root=root, image_set=image_set,
                         download=download, transform=transform, year=year)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        # open images and do transformation  img = jpg, mask = png
        img = Image.open(self.images[index]).convert("RGB")
        target = Image.open(self.masks[index]).convert("RGB")

        if self.transform:
            img = self.transform(img)
            trfm = T.Compose([T.ToTensor(), T.Resize((388, 388))])
            target = trfm(target)
        return img, target


And lastly, here is my train function:

import torch
import torch.nn as nn
import torch.optim as optim
from unet import UNet
from torch.utils.data import DataLoader
from dataset import VocDataset
import torchvision.transforms as T
import torch.nn.functional as F

# Hyperparameters etc.
STD = [0.2686, 0.2652, 0.2812]  # Std for dataset
MEAN = [0.4568, 0.4431, 0.4083]  # Mean for dataset
MOMENTUM = 0.9
LEARNING_RATE = 1e-4
BATCH_SIZE = 32
NUM_EPOCHS = 1
NUM_WORKERS = 2
NUM_CLASSES = 20
TRAIN_SET = "train"
VAL_SET = "val"
ENC_CHANNELS = (3, 64, 128, 256, 512, 1024)  # Encoder channels
DEC_CHANNELS = (1024, 512, 256, 128, 64)  # Decoder channels
SIZE = (572, 572)  # not defined in the original post; (572, 572) is assumed here so the unpadded convolutions yield the 388x388 outputs mentioned below
TRANSFORM = T.Compose(
    [T.ToTensor(), T.Resize(SIZE), T.Normalize(MEAN, STD)]
)


def main():
    training_data = VocDataset(TRAIN_SET, TRANSFORM)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True)


    # Create instance of unet
    unet = UNet(ENC_CHANNELS, DEC_CHANNELS, NUM_CLASSES)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        unet.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data  # Shape for labels and inputs are: [32,3,388,388] 

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = unet(inputs) # output shape is [32, 32, 388, 388] 

            loss = criterion(outputs, labels)  # Error here

            loss.backward()
            optimizer.step()


    # print('Finished Training')


if __name__ == "__main__":
    main()


Comments (2)

风和你 2025-01-24 19:13:23


For starters, your labels and outputs have different dimensions (32 vs. 3 channels). CrossEntropyLoss expects them either to have the same number of channels, or the target to have only one channel with integer values indicating the relevant class.

Let's work with the latter case. Here, we need to reduce the target to a single channel, [32 x 388 x 388] for your input and batch size. (Secondly, the UNet should ideally have one output channel for each class; it looks like there are 22 classes, so you should change the final output layer of the UNet decoder to have 22 outputs.)

To convert the label of size [32 x 3 x 388 x 388] to [32 x 388 x 388], you need to use the colormap for the conversion. That is, create a new tensor target of size [32 x 388 x 388], and for each value target[i,j,k], assign the index into VOC_COLORMAP that matches the value stored in the pixels at label[i,:,j,k].
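A minimal sketch of that conversion (not part of the original answer), assuming the VOC_COLORMAP list from the dataset file above and a target batch of shape [N, 3, H, W] with values in [0, 1] as produced by ToTensor; the helper name rgb_mask_to_class_indices is made up for illustration:

import torch

def rgb_mask_to_class_indices(target, colormap=VOC_COLORMAP):
    # target: float tensor [N, 3, H, W] in [0, 1]; recover 0-255 integer RGB values
    rgb = (target * 255).round().long()                 # [N, 3, H, W]
    n, _, h, w = rgb.shape
    indices = torch.zeros((n, h, w), dtype=torch.long)  # [N, H, W], background stays 0
    for cls, color in enumerate(colormap):
        color = torch.tensor(color).view(1, 3, 1, 1)    # broadcast against [N, 3, H, W]
        match = (rgb == color).all(dim=1)               # [N, H, W] boolean mask
        indices[match] = cls
    return indices

# usage in the training loop:
# labels = rgb_mask_to_class_indices(labels)  # [32, 3, 388, 388] -> [32, 388, 388]
# loss = criterion(outputs, labels)

Note that resizing the RGB mask with the default bilinear interpolation blends colors, so some pixels may not match any colormap entry exactly; using nearest-neighbor interpolation for the mask (or converting to class indices before resizing) is the usual way around that.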

疾风者 2025-01-24 19:13:23


It happens while calculating the loss. Reading the PyTorch documentation for CrossEntropyLoss, I think you just need to change the shape of your model's target/output. The docs give the expected shapes as:

Input: shape (C), (N, C) or (N, C, d_1, d_2, ..., d_K) with K >= 1 in the case of K-dimensional loss.

Target: if containing class indices, shape (), (N) or (N, d_1, d_2, ..., d_K) with K >= 1 in the case of K-dimensional loss, where each value should be in [0, C). If containing class probabilities, same shape as the input, with each value in [0, 1].

Output: if reduction is 'none', shape (), (N) or (N, d_1, d_2, ..., d_K) with K >= 1 in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar.

where:
C = number of classes,
N = batch size
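
A small shape check (not part of the original answer) showing what this means for a segmentation batch; the sizes below are chosen only for illustration:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

N, C, H, W = 2, 21, 388, 388              # batch size, number of classes, spatial dims
outputs = torch.randn(N, C, H, W)         # logits: (N, C, d_1, d_2)
targets = torch.randint(0, C, (N, H, W))  # class indices: (N, d_1, d_2), values in [0, C)

loss = criterion(outputs, targets)        # scalar, since reduction defaults to 'mean'
print(loss.shape)                         # torch.Size([])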
