ViVIT PyTorch: RuntimeError: многоцелевой режим не поддерживается в /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15

avatar
Vaibhav Sah
9 августа 2021 в 06:35
131
0
0

Я пытаюсь запустить код Video Vision Transformer (ViViT) с моим набором данных, но получаю сообщение об ошибке, используя CrossEntropyLoss из Pytorch в качестве функции потери.

У меня есть 6 классов:

['Run', 'Sit', 'Walk', 'Wave', 'Sit', 'Stand']

Оптимизатор

optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, weight_decay=1e-9, momentum=0.9)

Веса классов

tensor([0.0045, 0.0042, 0.0048, 0.0038, 0.0070, 0.0065])

Функция потерь

loss_func = nn.CrossEntropyLoss(weight=class_weights.to(device))

Ошибка создания кода

train_epoch(model, optimizer, train_loader, train_loss_history, loss_func)

Ошибка

Error Stack

RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15

Код вызова преобразователя

model = ViViT(224, 16, 100, 16).cuda()

Получение кадров видео

def get_frames(filename, n_frames=1):
    frames = []
    v_cap = cv2.VideoCapture(filename)
    v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_list = np.linspace(0, v_len - 1, n_frames + 1, dtype=np.int16)
    frame_dims = np.array([224, 224, 3])
    for fn in range(v_len):
        success, frame = v_cap.read()
        if success is False:
            continue
        if (fn in frame_list):
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (frame_dims[0], frame_dims[1]))
            frames.append(frame)
    v_cap.release()
    return frames, v_len

Предварительная обработка набора данных

class DatasetProcessing(data.Dataset):
    def __init__(self, df, root_dir):
        super(DatasetProcessing, self).__init__()
        # List of all videos path
        video_list = df["Video"].apply(lambda x: root_dir + '/' + x)
        self.video_list = np.asarray(video_list)
        self.df = df    
    def __getitem__(self, index):
        # Ensure that the raw videos are in respective folders and folder name matches the output class label
        video_label = self.video_list[index].split('/')[-2]
        video_name = self.video_list[index].split('/')[-1]
        
        video_frames, len_ = get_frames(self.video_list[index], n_frames = 15)
        video_frames = np.asarray(video_frames)
        video_frames = video_frames/255
        class_list = ['Run', 'Walk', 'Wave', 'Sit', 'Turn', 'Stand']
        class_id_loc = np.where(class_list == video_label)
        label = class_id_loc
        d = torch.as_tensor(np.array(video_frames).astype('float'))
        l = torch.as_tensor(np.array(label).astype('float'))
        return (d, l)

    def __len__(self):
        return self.video_list.shape[0]

Эпохи обучения

def train_epoch(model, optimizer, data_loader, loss_history, loss_func):
    total_samples = len(data_loader.dataset)
    model.train()

    for i, (data, target) in enumerate(data_loader):
        optimizer.zero_grad()
        x = data.cuda()
        data = rearrange(x, 'b p h w c -> b p c h w').cuda()
        target = target.type(torch.LongTensor).cuda()
        pred = model(data.float())
        output = F.log_softmax(pred, dim=1)
        loss = loss_func(output, target.squeeze(1))
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print('[' +  '{:5}'.format(i * len(data)) + '/' + '{:5}'.format(total_samples) +
                  ' (' + '{:3.0f}'.format(100 * i / len(data_loader)) + '%)]  Loss: ' +
                  '{:6.4f}'.format(loss.item()))
            loss_history.append(loss.item())

Вычислить модель

def evaluate(model, data_loader, loss_history, loss_func):
    model.eval()

    total_samples = len(data_loader.dataset)
    correct_samples = 0
    total_loss = 0

    with torch.no_grad():
        for data, target in data_loader:
            x = data.cuda()
            data = rearrange(x, 'b p h w c -> b p c h w').cuda()
            target = target.type(torch.LongTensor).cuda()
            output = F.log_softmax(model(data.float()), dim=1)
            loss = loss_func(output, target)
            _, pred = torch.max(output, dim=1)
            
            total_loss += loss.item()
            correct_samples += pred.eq(target).sum()

    avg_loss = total_loss / total_samples
    loss_history.append(avg_loss)
    print('\nAverage test loss: ' + '{:.4f}'.format(avg_loss) +
          '  Accuracy:' + '{:5}'.format(correct_samples) + '/' +
          '{:5}'.format(total_samples) + ' (' +
          '{:4.2f}'.format(100.0 * correct_samples / total_samples) + '%)\n')

Трансформатор

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.norm = nn.LayerNorm(dim)
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return self.norm(x)

Код ViViT

class ViViT(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, num_frames, dim = 192, depth = 4, heads = 3, pool = 'cls', in_channels = 3, dim_head = 64, dropout = 0. 
                 emb_dropout = 0.  scale_dim = 4, ):
        super().__init__()
        
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = in_channels * patch_size ** 2
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b t c (h p1) (w p2) -> b t (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_frames, num_patches + 1, dim))
        self.space_token = nn.Parameter(torch.randn(1, 1, dim))
        self.space_transformer = Transformer(dim, depth, heads, dim_head, dim*scale_dim, dropout)

        self.temporal_token = nn.Parameter(torch.randn(1, 1, dim))
        self.temporal_transformer = Transformer(dim, depth, heads, dim_head, dim*scale_dim, dropout)

        self.dropout = nn.Dropout(emb_dropout)
        self.pool = pool

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        x = self.to_patch_embedding(x)
        b, t, n, _ = x.shape

        cls_space_tokens = repeat(self.space_token, '() n d -> b t n d', b = b, t=t)
        x = torch.cat((cls_space_tokens, x), dim=2)
        x += self.pos_embedding[:, :, :(n + 1)]
        x = self.dropout(x)

        x = rearrange(x, 'b t n d -> (b t) n d')
        x = self.space_transformer(x)
        x = rearrange(x[:, 0], '(b t) ... -> b t ...', b=b)

        cls_temporal_tokens = repeat(self.temporal_token, '() n d -> b n d', b=b)
        x = torch.cat((cls_temporal_tokens, x), dim=1)
        x = self.temporal_transformer(x)
        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        return self.mlp_head(x)
Источник

Ответы (0)