From b436bd488e04934371bf65b5bcb9fafb3587a115 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 20 Jan 2025 16:24:13 -0800 Subject: [PATCH] Update references to torchvision (#949) Signed-off-by: zhangsmallshark --- training/cifar/cifar10_deepspeed.py | 2 +- training/cifar/cifar10_tutorial.py | 2 +- .../vit_finetuning/main_imagenet.py | 46 +++++++++---------- .../vit_finetuning/utils/get_data.py | 22 ++++----- training/gan/gan_baseline_train.py | 2 +- training/gan/gan_deepspeed_train.py | 2 +- training/imagenet/main.py | 16 +++---- training/pipeline_parallelism/train.py | 2 +- 8 files changed, 47 insertions(+), 47 deletions(-) diff --git a/training/cifar/cifar10_deepspeed.py b/training/cifar/cifar10_deepspeed.py index 9888544d5..e92c43937 100755 --- a/training/cifar/cifar10_deepspeed.py +++ b/training/cifar/cifar10_deepspeed.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F import torchvision -import torchvision.transforms as transforms +from torchvision import transforms from deepspeed.accelerator import get_accelerator from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer diff --git a/training/cifar/cifar10_tutorial.py b/training/cifar/cifar10_tutorial.py index 114e8c5fa..b7c7e01bd 100644 --- a/training/cifar/cifar10_tutorial.py +++ b/training/cifar/cifar10_tutorial.py @@ -57,7 +57,7 @@ """ import torch import torchvision -import torchvision.transforms as transforms +from torchvision import transforms ######################################################################## # The output of torchvision datasets are PILImage images of range [0, 1]. diff --git a/training/data_efficiency/vit_finetuning/main_imagenet.py b/training/data_efficiency/vit_finetuning/main_imagenet.py index 4d39ac9af..0042b49b4 100644 --- a/training/data_efficiency/vit_finetuning/main_imagenet.py +++ b/training/data_efficiency/vit_finetuning/main_imagenet.py @@ -19,8 +19,8 @@ import torch.multiprocessing as mp import torch.utils.data import torch.utils.data.distributed -import torchvision.transforms as transforms -import torchvision.datasets as datasets +from torchvision import transforms +from torchvision import datasets import torchvision.models as models from torch.utils.data import Subset import models @@ -105,7 +105,7 @@ def _get_model(args): nchannels = 3 model = models.__dict__[args.arch](num_classes=nclasses, nchannels=nchannels) return model - + def _get_dist_model(gpu, args): ngpus_per_node = torch.cuida.device_count() if args.distributed: @@ -149,9 +149,9 @@ def _get_dist_model(gpu, args): else: model = torch.nn.DataParallel(model).cuda() return model - + def main(): - + args = parser.parse_args() if args.seed is not None: @@ -190,7 +190,7 @@ def main(): def main_worker(gpu, ngpus_per_node, args): global best_acc1 global history - + if args.deepspeed: gpu = args.local_rank args.gpu = gpu @@ -205,7 +205,7 @@ def main_worker(gpu, ngpus_per_node, args): deepspeed.init_distributed() print(f'created model on gpu {gpu}') # exit () - + # define loss function (criterion), optimizer, and learning rate scheduler criterion = nn.CrossEntropyLoss().cuda(args.gpu) @@ -284,14 +284,14 @@ def main_worker(gpu, ngpus_per_node, args): validate(val_loader, model, criterion, args) # return args.completed_step = 0 - + optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) - + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" scheduler = StepLR(optimizer, step_size=int(len(train_loader)*args.epochs//3), gamma=0.1)# None # - + model, optimizer, _, scheduler = deepspeed.initialize( model=model, @@ -311,17 +311,17 @@ def main_worker(gpu, ngpus_per_node, args): time_epoch = time.time() - start_time # evaluate on validation set top5_val, top1_val, losses_val = validate(val_loader, model, criterion, args) - if args.gpu==0: + if args.gpu==0: history["epoch"].append(epoch) history["val_loss"].append(losses_val) - history["val_acc1"].append(top1_val) - history["val_acc5"].append(top5_val) + history["val_acc1"].append(top1_val) + history["val_acc5"].append(top5_val) history["train_loss"].append(losses_train) - history["train_acc1"].append(top1_train) + history["train_acc1"].append(top1_train) history["train_acc5"].append(top5_train) - torch.save(history,f"{args.out_dir}/stat.pt") + torch.save(history,f"{args.out_dir}/stat.pt") try: - print (f'{epoch} epoch at time {time_epoch}s and learning rate {scheduler.get_last_lr()}') + print (f'{epoch} epoch at time {time_epoch}s and learning rate {scheduler.get_last_lr()}') except: print (f'{epoch} epoch at time {time_epoch}s and learning rate {args.lr}') print (f"finish epoch {epoch} or iteration {args.completed_step}, train_accuracy is {top1_train}, val_accuracy {top1_val}") @@ -393,14 +393,14 @@ def train(scheduler, train_loader, model, criterion, optimizer, epoch, args): loss.backward() optimizer.step() scheduler.step() - + # measure elapsed time batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0 and args.gpu==0: + if i % args.print_freq == 0 and args.gpu==0: progress.display(i + 1) - + if args.distributed: losses.all_reduce() top1.all_reduce() @@ -432,7 +432,7 @@ def run_validate(loader, base_progress=0): batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0 and args.gpu==0: + if i % args.print_freq == 0 and args.gpu==0: progress.display(i + 1) batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) @@ -509,7 +509,7 @@ def all_reduce(self): def __str__(self): fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' return fmtstr.format(**self.__dict__) - + def summary(self): fmtstr = '' if self.summary_type is Summary.NONE: @@ -522,7 +522,7 @@ def summary(self): fmtstr = '{name} {count:.3f}' else: raise ValueError('invalid summary type %r' % self.summary_type) - + return fmtstr.format(**self.__dict__) @@ -536,7 +536,7 @@ def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] print ('\t'.join(entries)) - + def display_summary(self): entries = [" *"] entries += [meter.summary() for meter in self.meters] diff --git a/training/data_efficiency/vit_finetuning/utils/get_data.py b/training/data_efficiency/vit_finetuning/utils/get_data.py index dfad5f3ba..c2505fd17 100644 --- a/training/data_efficiency/vit_finetuning/utils/get_data.py +++ b/training/data_efficiency/vit_finetuning/utils/get_data.py @@ -13,18 +13,18 @@ # limitations under the License. import torch import os -import torchvision.transforms as transforms -import torchvision.datasets as datasets +from torchvision import transforms +from torchvision import datasets def get_dataset(dataset_name, data_dir, split, rand_fraction=None,clean=False, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if dataset_name in [ 'cifar10', 'cifar100']: - dataset = globals()[f'get_{dataset_name}'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) + dataset = globals()[f'get_{dataset_name}'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) elif dataset_name in [ 'cifar10vit224', 'cifar100vit224','cifar10vit384', 'cifar100vit384',]: imsize = int(dataset_name.split('vit')[-1]) dataset_name = dataset_name.split('vit')[0] #print ('here') - dataset = globals()['get_cifar_vit'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) + dataset = globals()['get_cifar_vit'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) else: assert 'cifar' in dataset_name print (dataset_name) @@ -59,10 +59,10 @@ def get_transform(split, normalize=None, transform=None, imsize=None, aug='large if transform is None: if normalize is None: if aug == 'large': - + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) else: - normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) + normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) transform = transforms.Compose(get_aug(split, imsize=imsize, aug=aug) + [transforms.ToTensor(), normalize]) return transform @@ -71,7 +71,7 @@ def get_transform(split, normalize=None, transform=None, imsize=None, aug='large def get_cifar10(dataset_name, data_dir, split, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if imsize==224: transform = get_transform(split, transform=transform, imsize=imsize, aug='large') - else: + else: transform = get_transform(split, transform=transform, imsize=imsize, aug='small') return datasets.CIFAR10(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) @@ -88,7 +88,7 @@ def get_cifar100N(dataset_name, data_dir, split, rand_fraction=None,transform=No if split=='train': return CIFAR100N(root=data_dir, train=(split=='train'), transform=transform, download=True, rand_fraction=rand_fraction) else: - return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) + return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) def get_cifar_vit(dataset_name, data_dir, split, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if imsize==224: @@ -111,12 +111,12 @@ def get_cifar_vit(dataset_name, data_dir, split, transform=None, imsize=None, bu if dataset_name =='cifar10': return datasets.CIFAR10(data_dir, train=(split=='train'), transform=transform_data, download=True, **kwargs) elif dataset_name =='cifar100': - + return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform_data, download=True, **kwargs) else: assert dataset_name in ['cifar10', 'cifar100'] else: - + if split=='train': transform_data = transforms.Compose([# transforms.ColorJitter(brightness= 0.4, contrast= 0.4, saturation= 0.4, hue= 0.1), transforms.Resize(imsize), @@ -164,4 +164,4 @@ def get_imagenet_vit(dataset_name, data_dir, split, transform=None, imsize=None, #return torch.utils.data.distributed.DistributedSampler(train_dataset) else: return datasets.ImageFolder(valdir, transform_data) - #Ereturn torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) \ No newline at end of file + #Ereturn torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) diff --git a/training/gan/gan_baseline_train.py b/training/gan/gan_baseline_train.py index 3d223542f..ab2d67740 100755 --- a/training/gan/gan_baseline_train.py +++ b/training/gan/gan_baseline_train.py @@ -3,7 +3,7 @@ import torch.nn as nn import torch.utils.data import torchvision.datasets as dset -import torchvision.transforms as transforms +from torchvision import transforms import torchvision.utils as vutils from torch.utils.tensorboard import SummaryWriter from time import time diff --git a/training/gan/gan_deepspeed_train.py b/training/gan/gan_deepspeed_train.py index f209a4273..ffaf59375 100755 --- a/training/gan/gan_deepspeed_train.py +++ b/training/gan/gan_deepspeed_train.py @@ -3,7 +3,7 @@ import torch.nn as nn import torch.utils.data import torchvision.datasets as dset -import torchvision.transforms as transforms +from torchvision import transforms import torchvision.utils as vutils from torch.utils.tensorboard import SummaryWriter from time import time diff --git a/training/imagenet/main.py b/training/imagenet/main.py index 1558e6ae0..414d152f1 100644 --- a/training/imagenet/main.py +++ b/training/imagenet/main.py @@ -18,9 +18,9 @@ import torch.optim import torch.utils.data import torch.utils.data.distributed -import torchvision.datasets as datasets import torchvision.models as models -import torchvision.transforms as transforms +from torchvision import transforms +from torchvision import datasets from torch.optim.lr_scheduler import StepLR from torch.utils.data import Subset @@ -94,7 +94,7 @@ def main(): 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') - + if args.gpu is not None: warnings.warn('You have chosen a specific GPU. This will completely ' 'disable data parallelism.') @@ -112,7 +112,7 @@ def main(): args.world_size = ngpus_per_node * args.world_size t_losses, t_acc1s = main_worker(args.gpu, ngpus_per_node, args) #dist.barrier() - + # Write the losses to an excel file if dist.get_rank() ==0: all_losses = [torch.empty_like(t_losses) for _ in range(ngpus_per_node)] @@ -278,7 +278,7 @@ def print_rank_0(msg): acc1s[epoch] = acc1 scheduler.step() - + # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) @@ -449,7 +449,7 @@ def all_reduce(self): def __str__(self): fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' return fmtstr.format(**self.__dict__) - + def summary(self): fmtstr = '' if self.summary_type is Summary.NONE: @@ -462,7 +462,7 @@ def summary(self): fmtstr = '{name} {count:.3f}' else: raise ValueError('invalid summary type %r' % self.summary_type) - + return fmtstr.format(**self.__dict__) @@ -476,7 +476,7 @@ def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] print('\t'.join(entries)) - + def display_summary(self): entries = [" *"] entries += [meter.summary() for meter in self.meters] diff --git a/training/pipeline_parallelism/train.py b/training/pipeline_parallelism/train.py index 1a418b427..b4bc49bf6 100755 --- a/training/pipeline_parallelism/train.py +++ b/training/pipeline_parallelism/train.py @@ -7,7 +7,7 @@ import torch.distributed as dist import torchvision -import torchvision.transforms as transforms +from torchvision import transforms from torchvision.models import AlexNet from torchvision.models import vgg19