2017-08-24 160 views
1

我对代码做了一些修改,使其不再使用 DataParallel 和 DistributedDataParallel,结果 PyTorch 报出了 CUDA 运行时错误。代码如下:

import argparse 
import os 
import shutil 
import time 

import torch 
import torch.nn as nn 
import torch.nn.parallel 
import torch.backends.cudnn as cudnn 
import torch.distributed as dist 
import torch.optim 
import torch.utils.data 
import torch.utils.data.distributed 
import torchvision.transforms as transforms 
import torchvision.datasets as datasets 
import torchvision.models as models 

# Every name that can be passed to --arch: the lowercase, non-dunder,
# callable attributes of the torchvision ``models`` module, in sorted order.
model_names = sorted(
    name
    for name, member in models.__dict__.items()
    if name.islower() and not name.startswith("__") and callable(member)
)

# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# --- dataset and model selection ---
parser.add_argument(
    'data',
    metavar='DIR',
    help='path to dataset',
)
parser.add_argument(
    '--arch', '-a',
    metavar='ARCH',
    default='resnet18',
    choices=model_names,
    help='model architecture: ' + ' | '.join(model_names) + ' (default: resnet18)',
)

# --- data loading and optimisation hyper-parameters ---
parser.add_argument(
    '-j', '--workers',
    default=4,
    type=int,
    metavar='N',
    help='number of data loading workers (default: 4)',
)
parser.add_argument(
    '--epochs',
    default=90,
    type=int,
    metavar='N',
    help='number of total epochs to run',
)
parser.add_argument(
    '--start-epoch',
    default=0,
    type=int,
    metavar='N',
    help='manual epoch number (useful on restarts)',
)
parser.add_argument(
    '-b', '--batch-size',
    default=256,
    type=int,
    metavar='N',
    help='mini-batch size (default: 256)',
)
parser.add_argument(
    '--lr', '--learning-rate',
    default=0.1,
    type=float,
    metavar='LR',
    help='initial learning rate',
)
parser.add_argument(
    '--momentum',
    default=0.9,
    type=float,
    metavar='M',
    help='momentum',
)
parser.add_argument(
    '--weight-decay', '--wd',
    default=1e-4,
    type=float,
    metavar='W',
    help='weight decay (default: 1e-4)',
)

# --- logging, checkpointing and run mode ---
parser.add_argument(
    '--print-freq', '-p',
    default=10,
    type=int,
    metavar='N',
    help='print frequency (default: 10)',
)
parser.add_argument(
    '--resume',
    default='',
    type=str,
    metavar='PATH',
    help='path to latest checkpoint (default: none)',
)
parser.add_argument(
    '-e', '--evaluate',
    dest='evaluate',
    action='store_true',
    help='evaluate model on validation set',
)
parser.add_argument(
    '--pretrained',
    dest='pretrained',
    action='store_true',
    help='use pre-trained model',
)

# --- distributed training ---
parser.add_argument(
    '--world-size',
    default=1,
    type=int,
    help='number of distributed processes',
)
parser.add_argument(
    '--dist-url',
    default='tcp://224.66.41.62:23456',
    type=str,
    help='url used to set up distributed training',
)
parser.add_argument(
    '--dist-backend',
    default='gloo',
    type=str,
    help='distributed backend',
)

# Best top-1 validation precision seen so far (starts at zero).
best_prec1 = 0


def main():
    """Parse the command line, build model, optimizer and loaders, then train.

    Sets the module-level ``args`` and ``best_prec1``. With ``--evaluate``
    a single validation pass is run and the function returns. Otherwise the
    model is trained from ``args.start_epoch`` up to ``args.epochs``; after
    every epoch it is validated and a checkpoint is written.
    """
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # No DataParallel / DistributedDataParallel wrapper is used, so the model
    # is placed on the GPU directly. This has to happen for every
    # architecture and in both modes: the criterion below always lives on
    # the GPU, so a model left on the CPU cannot be trained against it.
    # NOTE(review): without DistributedDataParallel nothing in this script
    # synchronises gradients between processes when --world-size > 1 --
    # confirm that this is intended.
    model.cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # NOTE: torch.load unpickles the file; only resume from
            # checkpoints that come from a trusted source.
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomSizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # A sampler and shuffle=True are mutually exclusive, so shuffling is
    # enabled only when no sampler is used.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Scale(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            # stored as the next epoch to run, so --resume continues after it
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)


def train(train_loader, model, criterion, optimizer, epoch):
    """Train ``model`` for one epoch over ``train_loader``.

    For every mini-batch the loss and the top-1 / top-5 precision are
    recorded and one optimizer step is taken. A progress line is printed
    every ``args.print_freq`` batches.

    Args:
        train_loader: iterable yielding ``(input, target)`` batches.
        model: network to train; it is switched to train mode.
        criterion: loss callable applied to ``(output, target)``.
        optimizer: optimizer whose ``zero_grad`` / ``step`` are called.
        epoch: epoch index, used only in the progress line.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # Move the whole batch to the GPU. Without a DataParallel wrapper
        # nobody else copies the input, and input and target have to sit on
        # the same device. The second positional argument requests an
        # asynchronous copy; it is passed positionally because its keyword
        # name, `async`, is a reserved word from Python 3.7 on (later
        # PyTorch releases call it `non_blocking`).
        images = images.cuda(None, True)
        target = target.cuda(None, True)
        input_var = torch.autograd.Variable(images)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], images.size(0))
        top1.update(prec1[0], images.size(0))
        top5.update(prec5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))


def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the mean top-1 precision.

    The model is switched to eval mode. A progress line is printed every
    ``args.print_freq`` batches and a summary line at the end.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches.
        model: network to evaluate.
        criterion: loss callable applied to ``(output, target)``.

    Returns:
        The running average of the top-1 precision over all batches.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (images, target) in enumerate(val_loader):
        # Move the whole batch to the GPU. Without a DataParallel wrapper
        # nobody else copies the input, and input and target have to sit on
        # the same device. The second positional argument requests an
        # asynchronous copy; it is passed positionally because its keyword
        # name, `async`, is a reserved word from Python 3.7 on (later
        # PyTorch releases call it `non_blocking`).
        images = images.cuda(None, True)
        target = target.cuda(None, True)
        # volatile=True: no graph is needed because nothing is backpropagated
        input_var = torch.autograd.Variable(images, volatile=True)
        target_var = torch.autograd.Variable(target, volatile=True)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], images.size(0))
        top1.update(prec1[0], images.size(0))
        top5.update(prec5[0], images.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      i, len(val_loader), batch_time=batch_time, loss=losses,
                      top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write ``state`` to ``filename`` and keep a copy of the best one.

    Args:
        state: object handed to ``torch.save``.
        is_best: when true, the saved file is also copied to
            'model_best.pth.tar' in the current working directory.
        filename: destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Keeps the most recent value and a running weighted average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, average, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted as ``n`` samples.

        The weighted sum grows by ``val * n`` and the count by ``n``; the
        average is then recomputed as sum divided by count.
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Apply the step schedule: the initial LR divided by 10 every 30 epochs.

    The initial rate is read from the module-level ``args.lr``; the result is
    written into the ``'lr'`` entry of every group in
    ``optimizer.param_groups``.
    """
    completed_steps = epoch // 30
    lr = args.lr * (0.1 ** completed_steps)
    for group in optimizer.param_groups:
        group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: 2-D tensor of scores; the top-k is taken along dimension 1.
        target: tensor holding one class index per row of ``output``.
        topk: iterable of the k values to evaluate.

    Returns:
        A list with one single-element tensor per entry of ``topk``: the
        percentage of rows whose target is among the k highest scores.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the maxk largest scores per row, sorted best first, then
    # transposed so that row j holds every sample's (j+1)-th best guess.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # contiguous() first: the comparison result can inherit the
        # transposed memory layout of `pred`, and view(-1) refuses to
        # flatten a non-contiguous slice.
        correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


# Script entry point: start training only when the file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__': 
    main() 

而且,当我用 AlexNet 网络结构在一组图像上运行这段代码时,它给出了一个奇怪的 CUDA 错误,主要内容如下:

=> creating model 'alexnet' 
THCudaCheck FAIL file=/pytorch/torch/lib/THC/THCGeneral.c line=70 error=30 : unknown error 
Traceback (most recent call last): 
    File "imagenet2.py", line 319, in <module> 
    main() 
    File "imagenet2.py", line 87, in main 
    model.cuda() 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in cuda 
    return self._apply(lambda t: t.cuda(device_id)) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 124, in _apply 
    param.data = fn(param.data) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in <lambda> 
    return self._apply(lambda t: t.cuda(device_id)) 
    File "/usr/local/lib/python2.7/dist-packages/torch/_utils.py", line 66, in _cuda 
    return new_type(self.size()).copy_(self, async) 
    File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 266, in _lazy_new 
    _lazy_init() 
    File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 85, in _lazy_init 
    torch._C._cuda_init() 
RuntimeError: cuda runtime error (30) : unknown error at /pytorch/torch/lib/THC/THCGeneral.c:70 

用于运行代码的命令:python imagenet.py --world-size 1 --arch 'alexnet' <image_folder>

我哪里出错了?

PS:代码运行在 AWS 的 g2.2xlarge Ubuntu 实例上。

CUDA 版本如下:

nvcc: NVIDIA (R) Cuda compiler driver 
Copyright (c) 2005-2016 NVIDIA Corporation 
Built on Tue_Jan_10_13:22:03_CST_2017 
Cuda compilation tools, release 8.0, V8.0.61 

回答

1
  1. cuDNN 给出的错误信息没有什么帮助。要进行调试,请使用 net.cpu() 在 CPU 上测试您的网络,或者直接删除 net.cuda() 调用。对训练、验证和输出变量也必须做同样的处理。

  2. 问题似乎在于,您输入给预训练 AlexNet 的图像尺寸不是 224x224。根据文档,只要图像尺寸至少为 224x224,它就应该可以工作。

  3. 这可能是 PyTorch 的 AlexNet 实现中硬编码的参数导致的张量形状变换(reshape)问题。在 vision/torchvision/models/alexnet.py 的第 44 行,代码是:

x = x.view(x.size(0), 256 * 6 * 6) 

将其更改为

x = x.view(x.size(0), -1) 

这样它就应该可以处理不同尺寸的图像了。

  4. 我已将此修改提交到 GitHub 仓库,但我想仓库尚未更新。