RuntimeError: only batches of spatial targets supported (3D tensors) but got targets of dimension: 4

I'm having a hard time understanding image segmentation. I have implemented a UNet model for image segmentation, I'm using the PASCAL VOC dataset, and I'm trying to train my model. However, I'm stuck computing the loss. I'm not sure what the expected shapes of the output and the target classes should be. Can someone tell me what I'm doing wrong? My only guess is that I'm missing something when it comes to the ground-truth images, since I don't see how the model would learn which class is which. Thanks!

Here is my UNet class:

import torch
import torch.nn as nn
from torchvision import transforms


def x2conv(in_channels, out_channels):
    # Two 3x3 convs with no padding (valid convs), so each block shrinks
    # the spatial size by 4 pixels; this is why the decoder has to crop.
    double_conv = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True))
    return double_conv


class Encoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.enc_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        ftrs = []
        for block in self.enc_blocks:
            x = block(x)
            ftrs.append(x)
            x = self.pool(x)
        return ftrs


class Decoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.chs = chs
        self.upconvs = nn.ModuleList(
            [nn.ConvTranspose2d(chs[i], chs[i+1], kernel_size=2, stride=2) for i in range(len(chs)-1)])
        self.dec_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])

    def forward(self, x, encoder_features):
        for i in range(len(self.chs)-1):
            x = self.upconvs[i](x)
            enc_ftrs = self.crop(encoder_features[i], x)
            x = torch.cat([x, enc_ftrs], dim=1)
            x = self.dec_blocks[i](x)
        return x

    def crop(self, enc_ftrs, x):
        _, _, H, W = x.shape
        enc_ftrs = transforms.CenterCrop([H, W])(enc_ftrs)
        return enc_ftrs


class UNet(nn.Module):
    def __init__(self, enc_chs, dec_chs, num_class):
        super(UNet, self).__init__()

        self.encoder = Encoder(enc_chs)
        self.decoder = Decoder(dec_chs)
        # 1x1 conv producing per-class logits; despite the name, no softmax
        # is applied here (nn.CrossEntropyLoss expects raw logits anyway)
        self.softmax = nn.Conv2d(dec_chs[-1], num_class, kernel_size=1)

    def forward(self, x):
        enc_ftrs = self.encoder(x)
        out = self.decoder(enc_ftrs[::-1][0], enc_ftrs[::-1][1:])
        out = self.softmax(out)
        return out

Here is my dataset class:

from PIL import Image
import torchvision
import torchvision.transforms as T  # needed for the T.Compose call in __getitem__
VOC_CLASSES = [   # How to use?
    "background",
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]


VOC_COLORMAP = [  # How to use?
    [0, 0, 0],  # Background
    [128, 0, 0],  # Aeroplane
    [0, 128, 0],  # Bicycle
    [128, 128, 0],  # Bird
    [0, 0, 128],  # Boat
    [128, 0, 128],  # Bottle
    [0, 128, 128],  # Bus
    [128, 128, 128],  # Car
    [64, 0, 0],  # Cat
    [192, 0, 0],  # Chair
    [64, 128, 0],  # Cow
    [192, 128, 0],  # Diningtable
    [64, 0, 128],  # Dog
    [192, 0, 128],  # Horse
    [64, 128, 128],  # Motorbike
    [192, 128, 128],  # Person
    [0, 64, 0],  # Pottedplant
    [128, 64, 0],  # Sheep
    [0, 192, 0],  # Sofa
    [128, 192, 0],  # Train
    [0, 64, 128],  # tvmonitor
]


class VocDataset(torchvision.datasets.VOCSegmentation):
    def __init__(self, image_set, transform, root="../data/VOCtrainval_11-May-2012/", download=False, year="2012"):
        self.transform = transform
        self.year = year
        super().__init__(root=root, image_set=image_set,
                         download=download, transform=transform, year=year)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        # open images and do transformation  img = jpg, mask = png
        img = Image.open(self.images[index]).convert("RGB")
        target = Image.open(self.masks[index]).convert("RGB")

        if self.transform:
            img = self.transform(img)
            trfm = T.Compose([T.ToTensor(), T.Resize((388, 388))])
            target = trfm(target)
        return img, target


And finally, here is my training function:

import torch
import torch.nn as nn
import torch.optim as optim
from unet import UNet
from torch.utils.data import DataLoader
from dataset import VocDataset
import torchvision.transforms as T
import torch.nn.functional as F

# Hyperparameters etc.
STD = [0.2686, 0.2652, 0.2812]  # Std for dataset
MEAN = [0.4568, 0.4431, 0.4083]  # Mean for dataset
MOMENTUM = 0.9
LEARNING_RATE = 1e-4
BATCH_SIZE = 32
NUM_EPOCHS = 1
NUM_WORKERS = 2
NUM_CLASSES = 20
TRAIN_SET = "train"
VAL_SET = "val"
ENC_CHANNELS = (3, 64, 128, 256, 512, 1024)  # Encoder channels
DEC_CHANNELS = (1024, 512, 256, 128, 64)  # Decoder channels
SIZE = (388, 388)  # Input resize; value missing from the original post, chosen to match the shapes reported below
TRANSFORM = T.Compose(
    [T.ToTensor(), T.Resize(SIZE), T.Normalize(MEAN, STD)]
)


def main():
    training_data = VocDataset(TRAIN_SET, TRANSFORM)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True)


    # Create instance of unet
    unet = UNet(ENC_CHANNELS, DEC_CHANNELS, NUM_CLASSES)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        unet.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data  # Shape for labels and inputs are: [32,3,388,388] 

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = unet(inputs) # output shape is [32, 32, 388, 388] 

            loss = criterion(outputs, labels)  # Error here

            loss.backward()
            optimizer.step()


    # print('Finished Training')


if __name__ == "__main__":
    main()

Original post: https://stackoverflow.com//questions/71674595/runtimeerror-only-batches-of-spatial-targets-supported-3d-tensors-but-got-tar

Replies

  • DerekG replied:

    For starters, your labels and outputs have different dimensions (32 vs. 3 channels). Cross-entropy loss expects them either to have the same number of channels, or for the target to have a single channel whose integer values indicate the relevant class.
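
    For reference, a minimal sketch of the shapes nn.CrossEntropyLoss accepts for segmentation (the 21 channels here just match the length of VOC_COLORMAP above; adjust to your class count):

    import torch
    import torch.nn as nn

    criterion = nn.CrossEntropyLoss()

    # Logits: one channel per class (VOC_COLORMAP above has 21 entries).
    logits = torch.randn(32, 21, 388, 388)

    # Target: integer class indices, no channel dimension, dtype long.
    target = torch.randint(0, 21, (32, 388, 388), dtype=torch.long)

    loss = criterion(logits, target)  # 4D input + 3D target: accepted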

    Let's work with the latter case. Here we need to reduce the target to a single channel, [32 x 388 x 388], to fit your input and batch size. (Secondarily, the UNet should ideally have one output channel per class; it looks like there are 22 classes, so you should change the final output layer of the UNet decoder to have 22 outputs, as sketched below.)
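
    In the code above that just means passing a larger class count when constructing the model, e.g. (21 counts the 20 VOC classes plus background; the 22 above presumably also counts the void/border label):

    unet = UNet(ENC_CHANNELS, DEC_CHANNELS, num_class=21)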

    To convert labels of size [32 x 3 x 388 x 388] into [32 x 388 x 388], you need to convert them using the colormap. That is, create a new tensor target of size [32 x 1 x 388 x 388], and for each value target[i,j,k], assign the index into VOC_COLORMAP that matches the value stored in the pixels of label[i,:,j,k].
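
    A rough sketch of that conversion, assuming the mask arrives as the [3, H, W] float tensor in [0, 1] that ToTensor produces (the function name is illustrative, not from the original post):

    import torch

    def mask_to_class_indices(mask, colormap=VOC_COLORMAP):
        # Undo ToTensor's 1/255 scaling so values line up with the colormap.
        rgb = (mask * 255).round().long()                       # [3, H, W]
        target = torch.zeros(rgb.shape[1:], dtype=torch.long)   # [H, W]
        for idx, color in enumerate(colormap):
            color = torch.tensor(color).view(3, 1, 1)
            target[(rgb == color).all(dim=0)] = idx             # exact color match
        return target

    Applied per sample in __getitem__, the DataLoader then stacks these into the [32, 388, 388] target the loss expects. One caveat: T.Resize defaults to bilinear interpolation, which blends colors at object boundaries so they no longer match any colormap entry; resizing the mask with interpolation=T.InterpolationMode.NEAREST avoids that.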
