imagenet example - add logic to broadcast most recent checkpoint from max_rank (#93)

Kiuk Chung · facebook-github-bot · commit 1465dfca76a4 · 2020-04-09T18:59:53.000-07:00
Summary: Pull Request resolved: #93. Rationale for adding checkpoint broadcasting: - In our example we don't have access to globally visible storage. - Each worker with local rank 0 writes the checkpoint to its node's local disk. - When a container/node dies, the replacement container has no checkpoints (since they were lost with the node). - New nodes would start from scratch while surviving nodes are ahead. - The logic is to find the checkpoint with the max epoch and broadcast it to all workers. Rationale for removing the nnodes==1 assertion for the launcher's `--with_etcd` option: - You can run two agents on the same node to simulate a multi-node run. - You can first start agent #1 by giving the `--with_etcd` option. - You can then start agent #2 by copy-pasting the rdzv info (from the logs) and passing `--rdzv_id`, `--rdzv_backend`, `--rdzv_endpoint` from the first launch. Reviewed By: tierex, drdarshan Differential Revision: D20956704 fbshipit-source-id: 3170e1bcbedf1a7522f3aeee23f0fc67cd038253
diff --git a/examples/imagenet/main.py b/examples/imagenet/main.py
@@ -45,8 +45,10 @@
 
 import argparse
 import os
+import pickle
 import shutil
 import time
+from contextlib import contextmanager
 from typing import List, Tuple
 
 import torch
@@ -145,7 +147,6 @@
 
 def main():
     args = parser.parse_args()
-
     device_id = int(os.environ["LOCAL_RANK"])
     torch.cuda.set_device(device_id)
     print(f"=> set cuda device = {device_id}")
@@ -161,18 +162,21 @@ def main():
         args.arch, args.lr, args.momentum, args.weight_decay, device_id
     )
 
-    # resume from checkpoint if one exists
-    start_epoch, best_acc1 = load_checkpoint(
-        args.checkpoint_file, device_id, model, optimizer
-    )
-    print(f"=> start_epoch: {start_epoch}, best_acc1: {best_acc1}")
-
     train_loader, val_loader = initialize_data_loader(
         args.data, args.batch_size, args.workers
     )
 
+    # resume from checkpoint if one exists;
+    state = load_checkpoint(
+        args.checkpoint_file, device_id, args.arch, model, optimizer
+    )
+
+    start_epoch = state.epoch + 1
+    print(f"=> start_epoch: {start_epoch}, best_acc1: {state.best_acc1}")
+
     print_freq = args.print_freq
     for epoch in range(start_epoch, args.epochs):
+        state.epoch = epoch
         train_loader.batch_sampler.sampler.set_epoch(epoch)
         adjust_learning_rate(optimizer, epoch, args.lr)
 
@@ -183,21 +187,64 @@ def main():
         acc1 = validate(val_loader, model, criterion, device_id, print_freq)
 
         # remember best acc@1 and save checkpoint
-        is_best = acc1 > best_acc1
-        best_acc1 = max(acc1, best_acc1)
+        is_best = acc1 > state.best_acc1
+        state.best_acc1 = max(acc1, state.best_acc1)
 
         if device_id == 0:
-            save_checkpoint(
-                {
-                    "epoch": epoch + 1,
-                    "best_acc1": best_acc1,
-                    "arch": args.arch,
-                    "state_dict": model.state_dict(),
-                    "optimizer": optimizer.state_dict(),
-                },
-                is_best,
-                args.checkpoint_file,
-            )
+            save_checkpoint(state, is_best, args.checkpoint_file)
+
+
+class State:
+    """
+    Container for objects that we want to checkpoint. Represents the
+    current "state" of the worker. This object is mutable.
+
+    Holds the epoch counter, the best top-1 accuracy seen so far, the
+    architecture name, and references to the model and optimizer whose
+    ``state_dict()`` values are written into each snapshot.
+    """
+
+    def __init__(self, arch, model, optimizer):
+        # -1 means "nothing checkpointed yet"; the training loop resumes
+        # from ``epoch + 1``, i.e. from epoch 0 for a fresh state
+        self.epoch = -1
+        self.best_acc1 = 0
+        self.arch = arch
+        self.model = model
+        self.optimizer = optimizer
+
+    def capture_snapshot(self):
+        """
+        Essentially a ``serialize()`` function, returns the state as a
+        plain dict compatible with ``torch.save()``. The following
+        round trip is the intended usage
+        ::
+
+        snapshot = state_0.capture_snapshot()
+        state_1.apply_snapshot(snapshot, device_id)
+        # state_1 now carries state_0's epoch, best_acc1, model weights
+        # and optimizer state (this class defines no ``__eq__``, so
+        # ``state_0 == state_1`` itself would compare identities)
+        """
+        return {
+            "epoch": self.epoch,
+            "best_acc1": self.best_acc1,
+            "arch": self.arch,
+            "state_dict": self.model.state_dict(),
+            "optimizer": self.optimizer.state_dict(),
+        }
+
+    def apply_snapshot(self, obj, device_id):
+        """
+        The complementary function of ``capture_snapshot()``. Applies the
+        snapshot object that was returned by ``capture_snapshot()``.
+        This function mutates this state object.
+
+        ``device_id`` is accepted but not used in the body. The snapshot's
+        ``"arch"`` entry is not read, so ``self.arch`` keeps the value
+        given to the constructor.
+        """
+
+        self.epoch = obj["epoch"]
+        self.best_acc1 = obj["best_acc1"]
+        # NOTE(review): keeps a reference to the snapshot's weights on the
+        # instance; nothing visible here reads it back -- confirm it is needed
+        self.state_dict = obj["state_dict"]
+        self.model.load_state_dict(obj["state_dict"])
+        self.optimizer.load_state_dict(obj["optimizer"])
+
+    def save(self, f):
+        # serialize the current snapshot to ``f`` with torch.save()
+        torch.save(self.capture_snapshot(), f)
+
+    def load(self, f, device_id):
+        # Map model to be loaded to specified single gpu.
+        snapshot = torch.load(f, map_location=f"cuda:{device_id}")
+        self.apply_snapshot(snapshot, device_id)
 
 
 def initialize_model(
@@ -273,21 +320,100 @@ def initialize_data_loader(
 def load_checkpoint(
     checkpoint_file: str,
     device_id: int,
+    arch: str,
     model: DistributedDataParallel,
     optimizer,  # SGD
-) -> Tuple[int, float]:
-    start_epoch = 0
-    best_acc1 = 0
+) -> State:
+    """
+    Loads a local checkpoint (if any), then always compares epochs across
+    all ranks. If any rank has checkpointed, the state of the rank with
+    the largest epoch is broadcast and applied on every other rank.
+    If no rank has a checkpoint, the fresh base state is returned.
+
+    Returns the resulting ``State``; a fresh state has ``epoch == -1``.
+
+    .. note:: when your job has access to a globally visible persistent storage
+              (e.g. nfs mount, S3) you can simply have all workers load
+              from the most recent checkpoint from such storage. Since this
+              example is expected to run on vanilla hosts (with no shared
+              storage) the checkpoints are written to local disk, hence
+              we have the extra logic to broadcast the checkpoint from a
+              surviving node.
+    """
+
+    state = State(arch, model, optimizer)
+
     if os.path.isfile(checkpoint_file):
-        print(f"=> loading checkpoint: {checkpoint_file}")
-        # Map model to be loaded to specified single gpu.
-        checkpoint = torch.load(checkpoint_file, map_location=f"cuda:{device_id}")
-        start_epoch = checkpoint["epoch"]
-        best_acc1 = checkpoint["best_acc1"]
-        model.load_state_dict(checkpoint["state_dict"])
-        optimizer.load_state_dict(checkpoint["optimizer"])
-        print(f"=> loaded checkpoint: {checkpoint_file}")
-    return start_epoch, best_acc1
+        print(f"=> loading checkpoint file: {checkpoint_file}")
+        state.load(checkpoint_file, device_id)
+        print(f"=> loaded checkpoint file: {checkpoint_file}")
+
+    # logic below is unnecessary when the checkpoint is visible on all nodes!
+    # create a temporary cpu pg to broadcast most up-to-date checkpoint
+    with tmp_process_group(backend="gloo") as pg:
+        rank = dist.get_rank(group=pg)
+
+        # get rank that has the largest state.epoch
+        # each rank fills only its own slot of a zero tensor, so the SUM
+        # all-reduce leaves every rank with the full per-rank epoch list
+        epochs = torch.zeros(dist.get_world_size(), dtype=torch.int32)
+        epochs[rank] = state.epoch
+        dist.all_reduce(epochs, op=dist.ReduceOp.SUM, group=pg)
+        t_max_epoch, t_max_rank = torch.max(epochs, dim=0)
+        max_epoch = t_max_epoch.item()
+        max_rank = t_max_rank.item()
+
+        # max_epoch == -1 means no one has checkpointed; return the base state
+        # (returning from inside the ``with`` still tears down the temp group)
+        if max_epoch == -1:
+            print(f"=> no workers have checkpoints, starting from epoch 0")
+            return state
+
+        # broadcast the state from max_rank (which has the most up-to-date state)
+        # pickle the snapshot, convert it into a byte-blob tensor
+        # then broadcast it, unpickle it and apply the snapshot
+        print(f"=> using checkpoint from rank: {max_rank}, max_epoch: {max_epoch}")
+        # every rank pickles its own snapshot, but only max_rank's bytes are
+        # sent; on the other ranks blob_len is replaced by the broadcast value
+        raw_blob = bytearray(pickle.dumps(state.capture_snapshot()))
+        blob_len = torch.tensor(len(raw_blob))
+        dist.broadcast(blob_len, src=max_rank, group=pg)
+        print(f"=> checkpoint broadcast size is: {blob_len}")
+
+        # receivers allocate a buffer of the announced size; the sender
+        # wraps its pickled bytes in a uint8 tensor
+        if rank != max_rank:
+            blob = torch.zeros(blob_len.item(), dtype=torch.uint8)
+        else:
+            blob = torch.tensor(raw_blob, dtype=torch.uint8)
+
+        dist.broadcast(blob, src=max_rank, group=pg)
+        print(f"=> done broadcasting checkpoint")
+
+        # max_rank already holds this state, so only the receivers apply it
+        # NOTE(review): pickle.loads() runs on bytes received from a peer;
+        # this assumes every rank in the job is trusted -- confirm
+        if rank != max_rank:
+            snapshot = pickle.loads(blob.numpy())
+            state.apply_snapshot(snapshot, device_id)
+
+        # wait till everyone has loaded the checkpoint
+        dist.barrier(group=pg)
+
+    print(f"=> done restoring from previous checkpoint")
+    return state
+
+
+@contextmanager
+def tmp_process_group(backend):
+    """
+    Context manager that creates a process group for ``backend`` with
+    ``dist.new_group()``, yields it, and destroys it with
+    ``dist.destroy_process_group()`` on exit -- including when the body
+    raises or returns early.
+    """
+    cpu_pg = dist.new_group(backend=backend)
+    try:
+        yield cpu_pg
+    finally:
+        dist.destroy_process_group(cpu_pg)
+
+
+def save_checkpoint(state: State, is_best: bool, filename: str):
+    """
+    Writes a snapshot of ``state`` to ``filename`` and, when ``is_best``
+    is true, copies the result to ``model_best.pth.tar`` in the same
+    directory.
+
+    The checkpoint directory (including missing parents) is created when
+    needed. An already existing directory is fine, which matters because
+    this function is called once per epoch.
+    """
+    checkpoint_dir = os.path.dirname(filename)
+    # os.mkdir() would raise FileExistsError from the second call on and
+    # cannot create nested paths. A bare filename has an empty dirname,
+    # meaning the current directory, which needs no creation.
+    if checkpoint_dir:
+        os.makedirs(checkpoint_dir, exist_ok=True)
+
+    # save to tmp, then commit by moving the file in case the job
+    # gets interrupted while writing the checkpoint
+    tmp_filename = filename + ".tmp"
+    torch.save(state.capture_snapshot(), tmp_filename)
+    # os.replace() overwrites the previous checkpoint on every platform;
+    # os.rename() refuses to overwrite an existing file on Windows
+    os.replace(tmp_filename, filename)
+    print(f"=> saved checkpoint for epoch {state.epoch} at {filename}")
+    if is_best:
+        best = os.path.join(checkpoint_dir, "model_best.pth.tar")
+        print(f"=> best model found at epoch {state.epoch} saving to {best}")
+        shutil.copyfile(filename, best)
 
 
 def train(
@@ -394,16 +520,6 @@ def validate(
     return top1.avg
 
 
-def save_checkpoint(state, is_best: bool, filename: str):
-    # save to tmp, then commit by moving the file in case the job
-    # gets interrupted while writing the checkpoint
-    tmp_filename = filename + ".tmp"
-    torch.save(state, tmp_filename)
-    os.rename(tmp_filename, filename)
-    if is_best:
-        shutil.copyfile(filename, "model_best.pth.tar")
-
-
 class AverageMeter(object):
     """Computes and stores the average and current value"""
 
diff --git a/torchelastic/distributed/launch.py b/torchelastic/distributed/launch.py
@@ -431,15 +431,19 @@ def main(args=None):
     assert args.max_restarts > 0
 
     if args.with_etcd:
-        assert (
-            min_nodes == max_nodes == 1
-        ), "--with_etcd can only be used with --nodes=1"
-
         etcd_server = EtcdServer()
         etcd_server.start()
         args.rdzv_backend = "etcd"
         args.rdzv_endpoint = etcd_server.get_endpoint()
         args.rdzv_id = str(uuid.uuid4())
+        log.info(
+            f"\n**************************************\n"
+            f"Rendezvous info:\n"
+            f"--rdzv_backend={args.rdzv_backend} "
+            f"--rdzv_endpoint={args.rdzv_endpoint} "
+            f"--rdzv_id={args.rdzv_id}\n"
+            f"**************************************\n"
+        )
 
     rdzv_parameters = parameters.RendezvousParameters(
         args.rdzv_backend,