Рабочий процесс DataLoader (pid(s) 5852, 3332, 1108, 5760) неожиданно завершился

avatar
MinJae
9 августа 2021 в 04:45
376
0
1

Я столкнулся с этой ошибкой при обучении классификатора на наборе данных о преступлениях. Если установить num_workers = 0, как советуют в нескольких сообществах, возникает другая ошибка: «'NoneType' object is not subscriptable» (объект NoneType не поддерживает индексацию). Я не знаю, в чём проблема. Версия PyTorch = 1.9.0, Python = 3.8.

Набор данных использует UCF-crime.

Вот текст ошибки:

Empty                                     Traceback (most recent call last)
D:\anaconda\lib\site-packages\torch\utils\data\dataloader.py in _try_get_data(self, timeout)
    989         try:
--> 990             data = self._data_queue.get(timeout=timeout)
    991             return (True, data)

D:\anaconda\lib\multiprocessing\queues.py in get(self, block, timeout)
    107                     if not self._poll(timeout):
--> 108                         raise Empty
    109                 elif not self._poll():

Empty: 

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
<ipython-input-114-36bae707b2e8> in <module>
     13         print(f"--- Phase {phase} ---")
     14         epoch_metrics = {"loss": [], "acc": []}
---> 15         for batch_i, (X, y) in enumerate(dataloaders[phase]):
     16             #iteration = iteration+1
     17             image_sequences = Variable(X.to(device), requires_grad=True)

D:\anaconda\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
    519             if self._sampler_iter is None:
    520                 self._reset()
--> 521             data = self._next_data()
    522             self._num_yielded += 1
    523             if self._dataset_kind == _DatasetKind.Iterable and \

D:\anaconda\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
   1184 
   1185             assert not self._shutdown and self._tasks_outstanding > 0
-> 1186             idx, data = self._get_data()
   1187             self._tasks_outstanding -= 1
   1188             if self._dataset_kind == _DatasetKind.Iterable:

D:\anaconda\lib\site-packages\torch\utils\data\dataloader.py in _get_data(self)
   1150         else:
   1151             while True:
-> 1152                 success, data = self._try_get_data()
   1153                 if success:
   1154                     return data

D:\anaconda\lib\site-packages\torch\utils\data\dataloader.py in _try_get_data(self, timeout)
   1001             if len(failed_workers) > 0:
   1002                 pids_str = ', '.join(str(w.pid) for w in failed_workers)
-> 1003                 raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
   1004             if isinstance(e, queue.Empty):
   1005                 return (False, None)

RuntimeError: DataLoader worker (pid(s) 8424, 14232, 8856, 1204) exited unexpectedly

А вот часть кода с обучением (train):

import torchvision
import torch
from torch import nn
import torch.nn.functional as F
import torchvision.models as models
import torch.optim as optim
import copy
import os
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler
import cv2
import sys
# Label file: build the class <-> index lookup tables and the list of samples.
data_path = 'Dataset'
# os.listdir() returns entries in arbitrary order, so sort them to make the
# class indices reproducible between runs. Only sub-directories are treated
# as classes; stray files in the dataset root are ignored.
classes = sorted(
    name
    for name in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, name))
)
# decoder maps class name -> integer index, encoder maps index -> class name.
decoder = {name: index for index, name in enumerate(classes)}
encoder = {index: name for index, name in enumerate(classes)}
path = data_path
# (class name, file path) pairs, one per sample file.
# NOTE: the name `id` shadows the builtin, but it is kept because later code
# in this file refers to the sample list by this name.
id = [
    (class_name, os.path.join(path, class_name, file_name))
    for class_name in classes
    for file_name in sorted(os.listdir(os.path.join(path, class_name)))
    # Skip nested directories (e.g. notebook checkpoint folders): they are
    # not images and cannot be loaded as samples.
    if os.path.isfile(os.path.join(path, class_name, file_name))
]
class video_dataset(Dataset):
    """Dataset in which every sample is one image holding a strip of frames.

    The image is read with ``cv2.imread`` and cut along its width into
    ``sequence_length`` consecutive slices of ``FRAME_WIDTH`` pixels each.

    Args:
        frame_list: Sequence of ``(label, image_path)`` pairs.
        sequence_length: Number of frames cut out of every image.
        transform: Optional callable applied to each frame. It has to return
            a tensor, because the frames are combined with ``torch.stack``.
    """

    # Width in pixels of one frame inside the strip image.
    FRAME_WIDTH = 128

    def __init__(self,frame_list,sequence_length = 16,transform = None):
        self.frame_list = frame_list
        self.transform = transform
        self.sequence_length = sequence_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.frame_list)

    def _build_clip(self, img):
        """Cut the strip image *img* into frames and stack them into one tensor.

        Raises:
            ValueError: If *img* is too narrow to hold ``sequence_length``
                frames of ``FRAME_WIDTH`` pixels.
        """
        required_width = self.FRAME_WIDTH * self.sequence_length
        if img.shape[1] < required_width:
            raise ValueError(
                "image is {} px wide, but {} frames of {} px need {} px".format(
                    img.shape[1], self.sequence_length,
                    self.FRAME_WIDTH, required_width,
                )
            )
        frames = []
        for i in range(self.sequence_length):
            frame = img[:, self.FRAME_WIDTH * i:self.FRAME_WIDTH * (i + 1), :]
            if self.transform:
                frame = self.transform(frame)
            frames.append(frame)
        # Stack along a new axis 1 so the frame axis comes second. Assuming
        # the transform yields (channels, H, W) frames, the result is
        # (channels, frames, H, W). Reshaping a (frames, channels, H, W)
        # stack to that shape would only reinterpret memory and mix frames
        # with channels.
        return torch.stack(frames, dim=1)

    def __getitem__(self,idx):
        """Return ``(clip, class_index)`` for the sample at position *idx*.

        Raises:
            OSError: If the image file cannot be read.
            ValueError: If the image is too narrow for the requested frames.
        """
        label,path = self.frame_list[idx]
        img = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than with an opaque
        # "'NoneType' object is not subscriptable" later on.
        if img is None:
            raise OSError("cv2.imread could not read image: {!r}".format(path))
        return self._build_clip(img),decoder[label]
# Side length in pixels that every frame is resized to.
im_size = 128
# Per-channel statistics handed to transforms.Normalize below.
mean = [0.4889, 0.4887, 0.4891]
std = [0.2074, 0.2074, 0.2074]


# Per-frame pipeline: array -> PIL image -> resize -> random flip/rotation
# -> tensor -> normalisation.
train_transforms = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((im_size, im_size)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=10),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
)

train_data = video_dataset(
    frame_list=id,
    sequence_length=16,
    transform=train_transforms,
)
# num_workers=0 keeps data loading in the main process.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=8,
    shuffle=True,
    num_workers=0,
)
dataloaders = dict(train=train_loader)
from model import resnet50
model = resnet50(class_num=8)
model = model.to('cuda')
from clr import *
device = 'cuda'
cls_criterion = nn.CrossEntropyLoss()
cls_criterion = cls_criterion.to(device)
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=1e-3,
    momentum=0.9,
    weight_decay=1e-4,
)
num_epochs = 10
# OneCycle receives the total number of training batches over all epochs;
# the meaning of its second argument is defined in the project's clr module.
onecyc = OneCycle(num_epochs * len(train_loader), 1e-3)

Ошибка возникает в этой части:

# Training script: runs every phase in `dataloaders` for `num_epochs` epochs,
# prints running loss/accuracy and saves the model weights after each phase.
os.makedirs('weights_crime', exist_ok=True)
# Import kept only so that later code referring to `Variable` keeps working;
# the loop below uses plain tensors because the Variable wrapper is deprecated.
from torch.autograd import Variable
iteration = 0
# Mean accuracy / loss of the 'train' phase, one entry per epoch.
acc_all = list()
loss_all = list()

for epoch in range(num_epochs):
    print('')
    print(f"--- Epoch {epoch} ---")
    phase1 = dataloaders.keys()
    for phase in phase1:
        print('')
        print(f"--- Phase {phase} ---")
        epoch_metrics = {"loss": [], "acc": []}
        loader = dataloaders[phase]
        num_batches = len(loader)
        for batch_i, (X, y) in enumerate(loader):
            # Inputs do not need gradients: only the model parameters are
            # optimised, so tracking gradients for the batch is wasted work.
            image_sequences = X.to(device)
            labels = y.to(device)
            optimizer.zero_grad()
            predictions = model(image_sequences)
            loss = cls_criterion(predictions, labels)
            # Batch accuracy in percent.
            acc = 100 * (predictions.detach().argmax(1) == labels).cpu().numpy().mean()
            loss.backward()
            optimizer.step()
            loss_value = loss.item()
            epoch_metrics["loss"].append(loss_value)
            epoch_metrics["acc"].append(acc)
            if phase == 'train':
                # Ask the schedule for the next learning rate / momentum and
                # write them into the optimizer.
                lr, mom = onecyc.calc()
                update_lr(optimizer, lr)
                update_mom(optimizer, mom)
            # "\r" rewrites the same console line to act as a progress display.
            sys.stdout.write(
                "\r[Epoch %d/%d] [Batch %d/%d] [Loss: %f (%f), Acc: %.2f%% (%.2f%%)]"
                % (
                    epoch,
                    num_epochs,
                    batch_i,
                    num_batches,
                    loss_value,
                    np.mean(epoch_metrics["loss"]),
                    acc,
                    np.mean(epoch_metrics["acc"]),
                )
            )

            # Release cached GPU memory blocks after every batch.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        mean_acc = np.mean(epoch_metrics["acc"])
        print('')
        print('{} , acc: {}'.format(phase, mean_acc))
        torch.save(model.state_dict(), 'weights_crime/c3d_{}.h5'.format(epoch))
        if phase == 'train':
            acc_all.append(mean_acc)
            loss_all.append(np.mean(epoch_metrics["loss"]))
Источник
seraph
9 августа 2021 в 05:56
0

Не могли бы вы поделиться информацией о вашей системе? Это может быть вызвано нехваткой памяти: попробуйте уменьшить размер батча (batch size) или изменить скорость обучения (learning rate).

MinJae
14 августа 2021 в 10:33
0

Процессор — i7-7700K, GPU — 1060, оперативная память — 32 ГБ.

Ответы (0)