From 0a2dd4e5135b746b7e1910c39b62fcff93e7815c Mon Sep 17 00:00:00 2001
From: Reza Yazdani <reyazda@microsoft.com>
Date: Wed, 7 Oct 2020 02:07:41 +0000
Subject: [PATCH] fixing the run and model scripts for running the
 BingBertSquad

---
 .../convert_bert_ckpt_to_deepspeed.py         | 80 +++++++++++++------
 BingBertSquad/nvidia_run_squad_deepspeed.py   | 43 ++++++----
 BingBertSquad/run_squad_deepspeed.sh          |  2 +-
 BingBertSquad/turing/nvidia_modeling.py       |  2 +
 BingBertSquad/turing/nvidia_modelingpreln.py  |  2 +
 BingBertSquad/utils.py                        |  7 +-
 6 files changed, 94 insertions(+), 42 deletions(-)

diff --git a/BingBertSquad/convert_bert_ckpt_to_deepspeed.py b/BingBertSquad/convert_bert_ckpt_to_deepspeed.py
index 3f11fab7f..e37bea90d 100755
--- a/BingBertSquad/convert_bert_ckpt_to_deepspeed.py
+++ b/BingBertSquad/convert_bert_ckpt_to_deepspeed.py
@@ -14,6 +14,7 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 def set_data(param, array):
     try:
         assert param.shape == array.shape
@@ -22,6 +23,7 @@ def set_data(param, array):
         raise
     param.data = torch.from_numpy(array)
 
+
 def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
     """ Load tf checkpoints in DeepSpeed model.
     """
@@ -52,10 +54,10 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
         name = name_str.split("/")
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if any(
-            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
-            for n in name
-        ):
+        if any(n in [
+                "adam_v", "adam_m", "AdamWeightDecayOptimizer",
+                "AdamWeightDecayOptimizer_1", "global_step"
+        ] for n in name):
             logger.info("Skipping {}".format("/".join(name)))
             continue
         pointer = model
@@ -76,11 +78,14 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
             elif scope_names[0] == "squad":
                 pointer = getattr(pointer, "classifier")
             # Special in deepspeed.
-            elif name_str.find("bert/pooler/dense") >= 0 and scope_names[0] == "dense":
+            elif name_str.find(
+                    "bert/pooler/dense") >= 0 and scope_names[0] == "dense":
                 pointer = getattr(pointer, "dense_act")
-            elif name_str.find("bert/embeddings/LayerNorm/gamma") >= 0 and scope_names[0] == "gamma":
+            elif name_str.find("bert/embeddings/LayerNorm/gamma"
+                               ) >= 0 and scope_names[0] == "gamma":
                 pointer = getattr(pointer, "weight")
-            elif name_str.find("bert/embeddings/LayerNorm/beta") >= 0 and scope_names[0] == "beta":
+            elif name_str.find("bert/embeddings/LayerNorm/beta"
+                               ) >= 0 and scope_names[0] == "beta":
                 pointer = getattr(pointer, "bias")
             else:
                 try:
@@ -121,16 +126,26 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
                         pointer = getattr(pointer, "inter_w")
                     elif name_str.find("intermediate/dense/bias") > 0:
                         pointer = getattr(pointer, "inter_b")
-                    elif name_str.find("output/dense/kernel") > 0 and name_str.find("attention") < 0:
+                    elif name_str.find(
+                            "output/dense/kernel") > 0 and name_str.find(
+                                "attention") < 0:
                         pointer = getattr(pointer, "output_w")
-                    elif name_str.find("output/dense/bias") > 0 and name_str.find("attention") < 0:
+                    elif name_str.find(
+                            "output/dense/bias") > 0 and name_str.find(
+                                "attention") < 0:
                         pointer = getattr(pointer, "output_b")
-                    elif name_str.find("output/LayerNorm/gamma") > 0 and name_str.find("attention") < 0:
+                    elif name_str.find(
+                            "output/LayerNorm/gamma") > 0 and name_str.find(
+                                "attention") < 0:
                         pointer = getattr(pointer, "norm_w")
-                    elif name_str.find("output/LayerNorm/beta") > 0 and name_str.find("attention") < 0:
+                    elif name_str.find(
+                            "output/LayerNorm/beta") > 0 and name_str.find(
+                                "attention") < 0:
                         pointer = getattr(pointer, "norm_b")
                     else:
-                        raise ValueError(f"unexpect scope name {name_str} in transformer layer.")
+                        raise ValueError(
+                            f"unexpect scope name {name_str} in transformer layer."
+                        )
                     break
 
         if skipping:
@@ -161,7 +176,8 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
             continue
 
         # DeepSpeed BERT model has voc_size 8 aligned.
-        if voc_size_diff > 0 and name_str.find("embeddings/word_embeddings") >= 0:
+        if voc_size_diff > 0 and name_str.find(
+                "embeddings/word_embeddings") >= 0:
             z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
             array = np.concatenate((array, z), axis=0)
 
@@ -170,6 +186,7 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
 
     return model
 
+
 def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
     """ Load huggingface checkpoints and convert to a deepspeed model.
     """
@@ -181,7 +198,8 @@ def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
     qkv = {}
     for name_str in ckpt.keys():
         array = ckpt[name_str].numpy()
-        logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
+        logger.info("Loading Huggingface weight {} with shape {}".format(
+            name_str, array.shape))
         name = name_str.split(".")
         pointer = model
         key = None
@@ -235,16 +253,22 @@ def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
                     pointer = getattr(pointer, "inter_w")
                 elif name_str.find("intermediate.dense.bias") > 0:
                     pointer = getattr(pointer, "inter_b")
-                elif name_str.find("output.dense.weight") > 0 and name_str.find("attention") < 0:
+                elif name_str.find("output.dense.weight"
+                                   ) > 0 and name_str.find("attention") < 0:
                     pointer = getattr(pointer, "output_w")
-                elif name_str.find("output.dense.bias") > 0 and name_str.find("attention") < 0:
+                elif name_str.find("output.dense.bias") > 0 and name_str.find(
+                        "attention") < 0:
                     pointer = getattr(pointer, "output_b")
-                elif name_str.find("output.LayerNorm.weight") > 0 and name_str.find("attention") < 0:
+                elif name_str.find("output.LayerNorm.weight"
+                                   ) > 0 and name_str.find("attention") < 0:
                     pointer = getattr(pointer, "norm_w")
-                elif name_str.find("output.LayerNorm.bias") > 0 and name_str.find("attention") < 0:
+                elif name_str.find("output.LayerNorm.bias"
+                                   ) > 0 and name_str.find("attention") < 0:
                     pointer = getattr(pointer, "norm_b")
                 else:
-                    raise ValueError(f"unexpect scope name {name_str} in transformer layer.")
+                    raise ValueError(
+                        f"unexpect scope name {name_str} in transformer layer."
+                    )
                 break
 
         if skipping:
@@ -270,7 +294,8 @@ def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
             continue
 
         # DeepSpeed BERT model has voc_size 8 aligned.
-        if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
+        if voc_size_diff > 0 and name_str.find(
+                "embeddings.word_embeddings") >= 0:
             z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
             array = np.concatenate((array, z), axis=0)
 
@@ -279,6 +304,7 @@ def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff):
 
     return model
 
+
 def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
     """ Load huggingface checkpoints and convert to a deepspeed model.
     """
@@ -290,7 +316,8 @@ def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
     qkv = {}
     for name_str in ckpt.keys():
         array = ckpt[name_str].numpy()
-        logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape))
+        logger.info("Loading Huggingface weight {} with shape {}".format(
+            name_str, array.shape))
         name = name_str.split(".")
         pointer = model
         key = None
@@ -314,7 +341,8 @@ def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
             continue
 
         # DeepSpeed BERT model has voc_size 8 aligned.
-        if voc_size_diff > 0 and name_str.find("embeddings.word_embeddings") >= 0:
+        if voc_size_diff > 0 and name_str.find(
+                "embeddings.word_embeddings") >= 0:
             z = np.zeros((voc_size_diff, array.shape[1]), dtype=array.dtype)
             array = np.concatenate((array, z), axis=0)
 
@@ -323,7 +351,9 @@ def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff):
 
     return model
 
-def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff, kernel_enabled):
+
+def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff,
+                              kernel_enabled):
 
     # Load weights from checkpoint
     if ckpt_type == "HF":
@@ -335,6 +365,8 @@ def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff, kernel_en
         if kernel_enabled:
             load_tf_weights_in_bert_kernel(model, ckpt_path, vocab_diff)
         else:
-            raise ValueError("--deepspeed_transformer_kernel is required for loading TF checkpoint.")
+            raise ValueError(
+                "--deepspeed_transformer_kernel is required for loading TF checkpoint."
+            )
     else:
         raise ValueError(f"Invalid ckpt_type.")
diff --git a/BingBertSquad/nvidia_run_squad_deepspeed.py b/BingBertSquad/nvidia_run_squad_deepspeed.py
index 7c112b725..1f88e2b89 100755
--- a/BingBertSquad/nvidia_run_squad_deepspeed.py
+++ b/BingBertSquad/nvidia_run_squad_deepspeed.py
@@ -795,11 +795,15 @@ def main():
     else:
         # Models from Tensorflow and Huggingface are post-LN.
         if args.preln:
-            raise ValueError("Should NOT use --preln if the loading checkpoint doesn't use pre-layer-norm.")
+            raise ValueError(
+                "Should NOT use --preln if the loading checkpoint doesn't use pre-layer-norm."
+            )
 
         # Use the original bert config if want to load from non-DeepSpeed checkpoint.
         if args.origin_bert_config_file is None:
-            raise ValueError("--origin_bert_config_file is required for loading non-DeepSpeed checkpoint.")
+            raise ValueError(
+                "--origin_bert_config_file is required for loading non-DeepSpeed checkpoint."
+            )
 
         bert_config = BertConfig.from_json_file(args.origin_bert_config_file)
 
@@ -812,6 +816,7 @@ def main():
         vocab_diff = 8 - (bert_config.vocab_size % 8)
         bert_config.vocab_size += vocab_diff
 
+    torch.distributed.init_process_group(backend='nccl')
     if args.preln:
         model = BertForQuestionAnsweringPreLN(bert_config, args)
     else:
@@ -822,20 +827,22 @@ def main():
         logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}")
 
         if args.ckpt_type == "DS":
-            checkpoint_state_dict = torch.load(args.model_file,
-                                               map_location=torch.device("cpu"))
+            checkpoint_state_dict = torch.load(
+                args.model_file, map_location=torch.device("cpu"))
             if 'module' in checkpoint_state_dict:
                 logger.info('Loading DeepSpeed v2.0 style checkpoint')
                 model.load_state_dict(checkpoint_state_dict['module'],
                                       strict=False)
             elif 'model_state_dict' in checkpoint_state_dict:
-                model.load_state_dict(checkpoint_state_dict['model_state_dict'],
-                                      strict=False)
+                model.load_state_dict(
+                    checkpoint_state_dict['model_state_dict'], strict=False)
             else:
                 raise ValueError("Unable to find model state in checkpoint")
         else:
             from convert_bert_ckpt_to_deepspeed import convert_ckpt_to_deepspeed
-            convert_ckpt_to_deepspeed(model, args.ckpt_type, args.model_file, vocab_diff, args.deepspeed_transformer_kernel)
+            convert_ckpt_to_deepspeed(model, args.ckpt_type, args.model_file,
+                                      vocab_diff,
+                                      args.deepspeed_transformer_kernel)
 
         logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}")
 
@@ -852,7 +859,7 @@ def main():
         [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay':
         0.01
-    },{
+    }, {
         'params':
         [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay':
@@ -864,7 +871,7 @@ def main():
         model=model,
         model_parameters=optimizer_grouped_parameters,
         dist_init_required=True)
-    
+
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available()
                               and not args.no_cuda else "cpu")
@@ -911,8 +918,6 @@ def main():
     else:
         args.summary_writer = None
 
-
-
     logger.info("propagate deepspeed-config settings to client settings")
     args.train_batch_size = model.train_micro_batch_size_per_gpu()
     args.gradient_accumulation_steps = model.gradient_accumulation_steps()
@@ -1056,12 +1061,20 @@ def main():
                         f'Warning: Early epoch termination due to max steps limit, epoch step ={epoch_step}, global step = {global_step}, epoch = {num_epoch}'
                     )
                     break
-                one_step_time = time.time() -start_time
+                one_step_time = time.time() - start_time
                 all_step_time += one_step_time
-                if (step + 1)%(ave_rounds) == 0 and torch.distributed.get_rank() == 0:
-                    print('At Step {}, Averaged Throughput for {} rounds is: {} Samples/s'.format(step, ave_rounds, bs_size * ave_rounds * torch.distributed.get_world_size() / all_step_time ), flush=True )
+                if (step + 1) % (
+                        ave_rounds) == 0 and torch.distributed.get_rank() == 0:
+                    print(
+                        'At Step {}, Averaged Throughput for {} rounds is: {} Samples/s'
+                        .format(
+                            step, ave_rounds,
+                            bs_size * ave_rounds *
+                            torch.distributed.get_world_size() /
+                            all_step_time),
+                        flush=True)
                     all_step_time = 0.0
-      
+
     # Save a trained model
     # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
     #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
diff --git a/BingBertSquad/run_squad_deepspeed.sh b/BingBertSquad/run_squad_deepspeed.sh
index 3c5c15b72..6095c76ca 100755
--- a/BingBertSquad/run_squad_deepspeed.sh
+++ b/BingBertSquad/run_squad_deepspeed.sh
@@ -32,7 +32,7 @@ else
        GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
 fi
 JOB_NAME="deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
-config_json=onebit_deepspeed_bsz24_config.json
+config_json=deepspeed_bsz24_config.json
 run_cmd="deepspeed --num_nodes ${NUM_NODES} --num_gpus ${NGPU_PER_NODE} \
        --master_port=${MASTER_PORT} \
        --hostfile ${HOSTFILE} \
diff --git a/BingBertSquad/turing/nvidia_modeling.py b/BingBertSquad/turing/nvidia_modeling.py
index caf8a3a3c..c09e699ee 100755
--- a/BingBertSquad/turing/nvidia_modeling.py
+++ b/BingBertSquad/turing/nvidia_modeling.py
@@ -517,6 +517,8 @@ def __init__(self, config, args):
                 hidden_dropout_ratio=config.hidden_dropout_prob,
                 num_hidden_layers=config.num_hidden_layers,
                 initializer_range=config.initializer_range,
+                local_rank=args.local_rank
+                if hasattr(args, 'local_rank') else -1,
                 seed=args.seed,
                 fp16=ds_config.fp16_enabled,
                 pre_layer_norm=False)
diff --git a/BingBertSquad/turing/nvidia_modelingpreln.py b/BingBertSquad/turing/nvidia_modelingpreln.py
index 91e9b3c4f..2ecfb2577 100755
--- a/BingBertSquad/turing/nvidia_modelingpreln.py
+++ b/BingBertSquad/turing/nvidia_modelingpreln.py
@@ -537,6 +537,8 @@ def __init__(self, config, args):
                 hidden_dropout_ratio=config.hidden_dropout_prob,
                 num_hidden_layers=config.num_hidden_layers,
                 initializer_range=config.initializer_range,
+                local_rank=args.local_rank
+                if hasattr(args, 'local_rank') else -1,
                 seed=args.seed,
                 fp16=ds_config.fp16_enabled,
                 pre_layer_norm=True)
diff --git a/BingBertSquad/utils.py b/BingBertSquad/utils.py
index ce9ee235c..c2a8e5a07 100755
--- a/BingBertSquad/utils.py
+++ b/BingBertSquad/utils.py
@@ -213,13 +213,16 @@ def get_argument_parser():
         '--ckpt_type',
         type=str,
         default="DS",
-        help="Checkpoint's type, DS - DeepSpeed, TF - Tensorflow, HF - Huggingface.")
+        help=
+        "Checkpoint's type, DS - DeepSpeed, TF - Tensorflow, HF - Huggingface."
+    )
 
     parser.add_argument(
         "--origin_bert_config_file",
         type=str,
         default=None,
-        help="The config json file corresponding to the non-DeepSpeed pre-trained BERT model."
+        help=
+        "The config json file corresponding to the non-DeepSpeed pre-trained BERT model."
     )
 
     return parser