|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +import os |
| 4 | +import numpy as np |
| 5 | +import torch |
| 6 | +from torch import nn |
| 7 | +import torch.multiprocessing as mp |
| 8 | + |
| 9 | +import config |
| 10 | +from core import Agent, Trainer, train_local_mp |
| 11 | +from model import CifarModel |
| 12 | +from data import CifarData |
| 13 | + |
| 14 | + |
| 15 | +class CIFARAgent(Agent): |
| 16 | + """CIFARAgent for CIFAR10 and CIFAR100.""" |
| 17 | + def __init__(self, global_args, subset=tuple(range(10)), fine='CIFAR10'): |
| 18 | + super().__init__(global_args, subset, fine) |
| 19 | + self.distr_type = global_args.distr_type |
| 20 | + if self.distr_type == 'uniform': |
| 21 | + self.distribution = np.array([0.1] * 10) |
| 22 | + elif self.distr_type == 'dirichlet': |
| 23 | + self.distribution = np.random.dirichlet([global_args.alpha] * 10) |
| 24 | + else: |
| 25 | + raise ValueError(f'Invalid distribution type: {self.distr_type}.') |
| 26 | + |
| 27 | + def load_data(self, data_alloc, center=False): |
| 28 | + print("=> loading data") |
| 29 | + if center: |
| 30 | + self.train_loader, self.test_loader, self.num_train = \ |
| 31 | + data_alloc.create_dataset_for_center(self.batch_size, self.num_workers) |
| 32 | + else: |
| 33 | + self.train_loader, self.test_loader, self.num_train = \ |
| 34 | + data_alloc.create_dataset_for_client(self.distribution, self.batch_size, |
| 35 | + self.num_workers, self.subset) |
| 36 | + |
| 37 | + def build_model(self): |
| 38 | + print("=> building model") |
| 39 | + if self.fine == 'CIFAR10': |
| 40 | + num_class = 10 |
| 41 | + elif self.fine == 'CIFAR100': |
| 42 | + num_class = 100 |
| 43 | + else: |
| 44 | + raise ValueError('Invalid dataset choice.') |
| 45 | + self.model = CifarModel(num_class).to(self.device) |
| 46 | + self.criterion = nn.CrossEntropyLoss().to(self.device) |
| 47 | + self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr, |
| 48 | + momentum=0.9, weight_decay=1e-4) |
| 49 | + |
| 50 | + |
| 51 | +class CIFARTrainer(Trainer): |
| 52 | + """CIFAR Trainer.""" |
| 53 | + def __init__(self, global_args): |
| 54 | + super().__init__(global_args) |
| 55 | + self.data_alloc = CifarData(self.num_locals, self.sample_rate, fine=self.fine) |
| 56 | + |
| 57 | + # init the global model |
| 58 | + self.global_agent = CIFARAgent(global_args, fine=self.fine) |
| 59 | + self.global_agent.load_data(self.data_alloc, center=True) |
| 60 | + self.global_agent.build_model() |
| 61 | + self.global_agent.resume_model(self.resume) |
| 62 | + |
| 63 | + def build_local_models(self, global_args): |
| 64 | + self.nets_pool = list() |
| 65 | + for _ in range(self.num_locals): |
| 66 | + self.nets_pool.append(CIFARAgent(global_args, fine=self.fine)) |
| 67 | + self.init_local_models() |
| 68 | + |
| 69 | + def train(self): |
| 70 | + for rnd in range(self.rounds): |
| 71 | + np.random.shuffle(self.nets_pool) |
| 72 | + pool = mp.Pool(self.num_per_rnd) |
| 73 | + self.q = mp.Manager().Queue() |
| 74 | + dict_new = self.global_agent.model.state_dict() |
| 75 | + if self.estimate_weights_in_center and rnd % self.interval == 0: |
| 76 | + w_d = self.global_agent.estimate_weights(self.policy) |
| 77 | + else: |
| 78 | + w_d = None |
| 79 | + for net in self.nets_pool[:self.num_per_rnd]: |
| 80 | + net.model.load_state_dict(dict_new) |
| 81 | + net.set_lr(self.global_agent.lr) |
| 82 | + pool.apply_async(train_local_mp, (net, self.local_epochs, rnd, self.q, self.policy, w_d)) |
| 83 | + pool.close() |
| 84 | + pool.join() |
| 85 | + self.update_global(rnd) |
| 86 | + |
| 87 | +def main(): |
| 88 | + os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu |
| 89 | + torch.manual_seed(args.seed) |
| 90 | + torch.cuda.manual_seed(args.seed) |
| 91 | + np.random.seed(args.seed) |
| 92 | + mp.set_start_method('forkserver') |
| 93 | + |
| 94 | + cifar_trainer = CIFARTrainer(args) |
| 95 | + |
| 96 | + # test |
| 97 | + if args.mode == 'test': |
| 98 | + cifar_trainer.test() |
| 99 | + return |
| 100 | + |
| 101 | + cifar_trainer.build_local_models(args) |
| 102 | + cifar_trainer.train() |
| 103 | + |
| 104 | +if __name__ == '__main__': |
| 105 | + args = config.get_args() |
| 106 | + args.fine = 'CIFAR10' |
| 107 | + main() |
0 commit comments