From 18450b022c936f2ce48ad2624b82b73a01aabe2b Mon Sep 17 00:00:00 2001 From: Konstantinos Fertakis Date: Tue, 29 Oct 2024 22:57:55 +0000 Subject: [PATCH 01/13] =?UTF-8?q?=CE=95nable=20reward=20model=20offloading?= =?UTF-8?q?=20option=20(#930)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * enable reward model offloading option * fixed code formatting * more formatting fixes * Pre-commit formatting fix --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Logan Adams Signed-off-by: zhangsmallshark --- .../DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py | 17 ++++------------- .../training/step3_rlhf_finetuning/main.py | 3 +++ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py index 5b6778cc2..0e67efcf9 100755 --- a/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py +++ b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py @@ -268,23 +268,14 @@ def _init_reward(self, critic_model_name_or_path): # If critic is ZeRO-3 then we use it for everything, otherwise assume we have enough memory zero_stage = 0 - ds_config = get_eval_ds_config(offload=self.args.offload, + ds_config = get_eval_ds_config(offload=self.args.offload_reward_model, dtype=self.args.dtype, stage=zero_stage) - ds_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size - ds_config[ - 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( - ) * self.args.gradient_accumulation_steps - - ds_eval_config = get_eval_ds_config(offload=False, - dtype=self.args.dtype, - stage=zero_stage) # We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. - ds_eval_config[ + ds_config[ 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size - ds_eval_config[ + ds_config[ 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps @@ -292,7 +283,7 @@ def _init_reward(self, critic_model_name_or_path): reward_model = create_critic_model( model_name_or_path=critic_model_name_or_path, tokenizer=self.tokenizer, - ds_config=ds_eval_config, + ds_config=ds_config, num_padding_at_beginning=self.args.num_padding_at_beginning, rlhf_training=True, dropout=self.args.critic_dropout, diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py index a5be5671b..1378dc4e6 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py @@ -246,6 +246,9 @@ def parse_args(): '--offload_reference_model', action='store_true', help='Enable ZeRO Offload techniques for reference model') + parser.add_argument('--offload_reward_model', + action='store_true', + help='Enable ZeRO Offload techniques for reward model') parser.add_argument( '--actor_zero_stage', type=int, From 216e147ba9b18b265fe4a7d4e4b95fb4dd3448a0 Mon Sep 17 00:00:00 2001 From: Xingfu Yi <54938415+Xingfu-Yi@users.noreply.github.com> Date: Thu, 31 Oct 2024 00:13:05 +0800 Subject: [PATCH 02/13] Remove the fixed `eot_token` mechanism for SFT (#927) Not all pretrained LLMs use `<|endoftext|>` as the `eot_token`, therefore it's inappropriate to fix it. 
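For example, a model whose end-of-conversation token is `</s>` rather than `<|endoftext|>` can now pass it explicitly. A hypothetical step-1 invocation (model path and all other step-1 arguments are placeholders for whatever you already pass):

```bash
deepspeed applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py \
    --model_name_or_path /path/to/llama-style-model \
    --add_eot_token \
    --eot_token "</s>"
```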
Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Signed-off-by: zhangsmallshark --- .../training/step1_supervised_finetuning/main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index d9527af54..aa505a25d 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -191,7 +191,13 @@ def parse_args(): parser.add_argument( "--add_eot_token", action='store_true', - help="Add <|endoftext|> as additional special token to tokenizer") + help="Add `eot_token` as additional special token to tokenizer") + parser.add_argument( + "--eot_token", + type=str, + default="<|endoftext|>", + help="Specify the format of the `eot_token`", + ) ## Print loss parser.add_argument('--print_loss', action='store_true', @@ -234,8 +240,7 @@ def main(): torch.distributed.barrier() # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family - args.end_of_conversation_token = "<|endoftext|>" - additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None + additional_special_tokens = args.eot_token if args.add_eot_token else None tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True, add_special_tokens=additional_special_tokens) From 8d143c904ba4f9076cced3f7f3998dda6eb3415c Mon Sep 17 00:00:00 2001 From: Chengming Zhang Date: Thu, 7 Nov 2024 15:27:38 -0600 Subject: [PATCH 03/13] DeepSpeed-Domino (#929) * add domino * use transformer from deepspeed * clean args * mega opt * add opt & timer * add opt * fix loss * folder name * Change arguent in pretrain script * Add readme for domino * Update readme for domino * Fixing usage issues * update dataset * megatron dependencies * path * Update README.md * remove imports * update import * Update README.md * Minor example script changes * train bash * require * Update README.md --------- Co-authored-by: chengming-zhang Co-authored-by: Zheyu SHEN Co-authored-by: root Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Signed-off-by: zhangsmallshark --- training/DeepSpeed-Domino/README.md | 86 + training/DeepSpeed-Domino/domino/__init__.py | 0 training/DeepSpeed-Domino/domino/arguments.py | 400 ++++ .../DeepSpeed-Domino/domino/data/.DS_Store | Bin 0 -> 6148 bytes .../DeepSpeed-Domino/domino/data/__init__.py | 1 + .../domino/data/data_samplers.py | 183 ++ .../domino/data/gpt_dataset.py | 469 +++++ .../domino/data/indexed_dataset.py | 613 ++++++ training/DeepSpeed-Domino/domino/gpt_model.py | 64 + .../DeepSpeed-Domino/domino/initialize.py | 220 +++ .../DeepSpeed-Domino/domino/language_model.py | 193 ++ .../DeepSpeed-Domino/domino/llama_model.py | 91 + .../domino/modules/__init__.py | 0 .../domino/modules/distributed.py | 117 ++ .../DeepSpeed-Domino/domino/modules/enums.py | 20 + .../domino/modules/fused_bias_gelu.py | 44 + .../domino/modules/fused_func.py | 96 + .../domino/modules/fused_layer_norm.py | 59 + .../DeepSpeed-Domino/domino/modules/module.py | 93 + .../DeepSpeed-Domino/domino/modules/utils.py | 29 + .../domino/optimizer/__init__.py | 144 ++ .../domino/optimizer/clip_grads.py | 134 ++ .../domino/optimizer/distrib_optimizer.py | 1022 ++++++++++ .../domino/optimizer/grad_scaler.py | 120 ++ 
.../domino/optimizer/optimizer.py | 774 ++++++++ .../domino/optimizer_param_scheduler.py | 234 +++ .../DeepSpeed-Domino/domino/parallel_state.py | 260 +++ training/DeepSpeed-Domino/domino/schedules.py | 354 ++++ .../domino/tensor_parallel/__init__.py | 0 .../domino/tensor_parallel/comm.py | 83 + .../domino/tensor_parallel/cross_entropy.py | 65 + .../domino/tensor_parallel/data.py | 105 + .../domino/tensor_parallel/partition.py | 756 ++++++++ .../domino/tensor_parallel/random.py | 241 +++ .../domino/tensor_parallel/utils.py | 112 ++ training/DeepSpeed-Domino/domino/timer.py | 304 +++ training/DeepSpeed-Domino/domino/training.py | 431 +++++ training/DeepSpeed-Domino/domino/utils.py | 242 +++ .../DeepSpeed-Domino/megatron/__init__.py | 18 + .../DeepSpeed-Domino/megatron/arguments.py | 1305 +++++++++++++ .../megatron/checkpointing.py | 686 +++++++ .../DeepSpeed-Domino/megatron/core/README.md | 1 + .../megatron/core/__init__.py | 11 + .../core/dist_checkpointing/__init__.py | 5 + .../megatron/core/dist_checkpointing/core.py | 41 + .../core/dist_checkpointing/dict_utils.py | 200 ++ .../core/dist_checkpointing/mapping.py | 289 +++ .../core/dist_checkpointing/optimizer.py | 86 + .../core/dist_checkpointing/serialization.py | 334 ++++ .../dist_checkpointing/strategies/__init__.py | 16 + .../dist_checkpointing/strategies/base.py | 68 + .../strategies/tensorstore.py | 110 ++ .../strategies/two_stage.py | 249 +++ .../dist_checkpointing/strategies/zarr.py | 230 +++ .../megatron/core/dist_checkpointing/utils.py | 44 + .../DeepSpeed-Domino/megatron/core/enums.py | 10 + .../megatron/core/fusions/__init__.py | 0 .../core/fusions/fused_bias_dropout.py | 60 + .../megatron/core/fusions/fused_bias_gelu.py | 48 + .../megatron/core/fusions/fused_layer_norm.py | 119 ++ .../megatron/core/fusions/fused_softmax.py | 204 ++ .../megatron/core/inference_params.py | 27 + .../megatron/core/model_parallel_config.py | 167 ++ .../megatron/core/models/__init__.py | 0 .../megatron/core/models/common/__init__.py | 0 .../models/common/rotary_pos_embedding.py | 56 + .../megatron/core/models/gpt/__init__.py | 1 + .../megatron/core/models/gpt/gpt_embedding.py | 123 ++ .../megatron/core/models/gpt/gpt_model.py | 308 +++ .../megatron/core/package_info.py | 29 + .../megatron/core/parallel_state.py | 651 +++++++ .../core/pipeline_parallel/__init__.py | 1 + .../pipeline_parallel/p2p_communication.py | 571 ++++++ .../core/pipeline_parallel/schedules.py | 1254 ++++++++++++ .../megatron/core/requirements.txt | 1 + .../megatron/core/tensor_parallel/__init__.py | 56 + .../core/tensor_parallel/cross_entropy.py | 142 ++ .../megatron/core/tensor_parallel/data.py | 104 + .../megatron/core/tensor_parallel/layers.py | 899 +++++++++ .../megatron/core/tensor_parallel/mappings.py | 283 +++ .../megatron/core/tensor_parallel/random.py | 249 +++ .../megatron/core/tensor_parallel/utils.py | 113 ++ .../megatron/core/transformer/__init__.py | 3 + .../megatron/core/transformer/attention.py | 368 ++++ .../transformer/custom_layers/__init__.py | 0 .../custom_layers/transformer_engine.py | 249 +++ .../core/transformer/dot_product_attention.py | 165 ++ .../megatron/core/transformer/enums.py | 25 + .../megatron/core/transformer/identity_op.py | 14 + .../megatron/core/transformer/mlp.py | 87 + .../megatron/core/transformer/module.py | 132 ++ .../core/transformer/transformer_block.py | 286 +++ .../core/transformer/transformer_config.py | 273 +++ .../core/transformer/transformer_layer.py | 270 +++ .../megatron/core/transformer/utils.py | 40 + 
.../DeepSpeed-Domino/megatron/core/utils.py | 207 ++ .../DeepSpeed-Domino/megatron/data/Makefile | 9 + .../megatron/data/__init__.py | 1 + .../megatron/data/autoaugment.py | 320 +++ .../megatron/data/bert_dataset.py | 183 ++ .../megatron/data/biencoder_dataset_utils.py | 209 ++ .../megatron/data/blendable_dataset.py | 124 ++ .../megatron/data/data_samplers.py | 186 ++ .../megatron/data/dataset_utils.py | 806 ++++++++ .../megatron/data/gpt_dataset.py | 590 ++++++ .../megatron/data/helpers.cpp | 701 +++++++ .../megatron/data/ict_dataset.py | 156 ++ .../megatron/data/image_folder.py | 302 +++ .../megatron/data/indexed_dataset.py | 623 ++++++ .../megatron/data/multimodal_dataset.py | 54 + .../megatron/data/orqa_wiki_dataset.py | 193 ++ .../megatron/data/realm_dataset_utils.py | 199 ++ .../megatron/data/realm_index.py | 224 +++ .../megatron/data/t5_dataset.py | 257 +++ .../data/test/test_indexed_dataset.py | 125 ++ .../data/test/test_preprocess_data.sh | 10 + .../megatron/data/vit_dataset.py | 249 +++ .../megatron/dist_signal_handler.py | 81 + .../megatron/fused_kernels/__init__.py | 75 + .../megatron/fused_kernels/compat.h | 17 + .../megatron/fused_kernels/tests/__init__.py | 0 .../fused_kernels/tests/test_fused_kernels.py | 388 ++++ .../megatron/fused_kernels/type_shim.h | 103 + .../DeepSpeed-Domino/megatron/global_vars.py | 191 ++ training/DeepSpeed-Domino/megatron/indexer.py | 129 ++ .../DeepSpeed-Domino/megatron/initialize.py | 349 ++++ training/DeepSpeed-Domino/megatron/memory.py | 132 ++ .../DeepSpeed-Domino/megatron/microbatches.py | 144 ++ .../megatron/model/__init__.py | 10 + .../megatron/model/bert_model.py | 250 +++ .../megatron/model/biencoder_model.py | 328 ++++ .../megatron/model/classification.py | 101 + .../megatron/model/distributed.py | 232 +++ .../DeepSpeed-Domino/megatron/model/enums.py | 21 + .../megatron/model/fused_bias_gelu.py | 43 + .../megatron/model/fused_layer_norm.py | 92 + .../megatron/model/fused_softmax.py | 213 ++ .../megatron/model/gpt_model.py | 122 ++ .../megatron/model/language_model.py | 634 ++++++ .../DeepSpeed-Domino/megatron/model/module.py | 197 ++ .../megatron/model/multiple_choice.py | 112 ++ .../megatron/model/realm_model.py | 204 ++ .../megatron/model/t5_model.py | 186 ++ .../megatron/model/transformer.py | 1710 +++++++++++++++++ .../DeepSpeed-Domino/megatron/model/utils.py | 54 + .../megatron/model/vision/classification.py | 85 + .../megatron/model/vision/dino.py | 290 +++ .../model/vision/esvit_swin_backbone.py | 849 ++++++++ .../megatron/model/vision/inpainting.py | 151 ++ .../megatron/model/vision/knn_monitor.py | 129 ++ .../megatron/model/vision/mit_backbone.py | 415 ++++ .../megatron/model/vision/swin_backbone.py | 625 ++++++ .../megatron/model/vision/utils.py | 27 + .../megatron/model/vision/vit_backbone.py | 245 +++ .../megatron/text_generation_server.py | 241 +++ training/DeepSpeed-Domino/megatron/timers.py | 304 +++ .../megatron/tokenizer/__init__.py | 4 + .../megatron/tokenizer/bert_tokenization.py | 431 +++++ .../megatron/tokenizer/gpt2_tokenization.py | 321 ++++ .../megatron/tokenizer/tokenizer.py | 536 ++++++ training/DeepSpeed-Domino/megatron/utils.py | 213 ++ training/DeepSpeed-Domino/pretrain_base.py | 59 + training/DeepSpeed-Domino/pretrain_gpt.py | 139 ++ .../DeepSpeed-Domino/pretrain_gpt3_2.7b.sh | 72 + .../DeepSpeed-Domino/pretrain_gpt3_6.7b.sh | 71 + training/DeepSpeed-Domino/pretrain_llama.py | 160 ++ .../DeepSpeed-Domino/pretrain_llama_13b.sh | 84 + .../DeepSpeed-Domino/pretrain_llama_7b.sh | 80 + 
training/DeepSpeed-Domino/requirements.txt | 6 + 169 files changed, 36623 insertions(+) create mode 100644 training/DeepSpeed-Domino/README.md create mode 100644 training/DeepSpeed-Domino/domino/__init__.py create mode 100644 training/DeepSpeed-Domino/domino/arguments.py create mode 100644 training/DeepSpeed-Domino/domino/data/.DS_Store create mode 100644 training/DeepSpeed-Domino/domino/data/__init__.py create mode 100644 training/DeepSpeed-Domino/domino/data/data_samplers.py create mode 100644 training/DeepSpeed-Domino/domino/data/gpt_dataset.py create mode 100644 training/DeepSpeed-Domino/domino/data/indexed_dataset.py create mode 100644 training/DeepSpeed-Domino/domino/gpt_model.py create mode 100644 training/DeepSpeed-Domino/domino/initialize.py create mode 100644 training/DeepSpeed-Domino/domino/language_model.py create mode 100644 training/DeepSpeed-Domino/domino/llama_model.py create mode 100644 training/DeepSpeed-Domino/domino/modules/__init__.py create mode 100644 training/DeepSpeed-Domino/domino/modules/distributed.py create mode 100644 training/DeepSpeed-Domino/domino/modules/enums.py create mode 100644 training/DeepSpeed-Domino/domino/modules/fused_bias_gelu.py create mode 100644 training/DeepSpeed-Domino/domino/modules/fused_func.py create mode 100644 training/DeepSpeed-Domino/domino/modules/fused_layer_norm.py create mode 100644 training/DeepSpeed-Domino/domino/modules/module.py create mode 100644 training/DeepSpeed-Domino/domino/modules/utils.py create mode 100644 training/DeepSpeed-Domino/domino/optimizer/__init__.py create mode 100644 training/DeepSpeed-Domino/domino/optimizer/clip_grads.py create mode 100644 training/DeepSpeed-Domino/domino/optimizer/distrib_optimizer.py create mode 100644 training/DeepSpeed-Domino/domino/optimizer/grad_scaler.py create mode 100644 training/DeepSpeed-Domino/domino/optimizer/optimizer.py create mode 100644 training/DeepSpeed-Domino/domino/optimizer_param_scheduler.py create mode 100644 training/DeepSpeed-Domino/domino/parallel_state.py create mode 100644 training/DeepSpeed-Domino/domino/schedules.py create mode 100644 training/DeepSpeed-Domino/domino/tensor_parallel/__init__.py create mode 100644 training/DeepSpeed-Domino/domino/tensor_parallel/comm.py create mode 100644 training/DeepSpeed-Domino/domino/tensor_parallel/cross_entropy.py create mode 100644 training/DeepSpeed-Domino/domino/tensor_parallel/data.py create mode 100644 training/DeepSpeed-Domino/domino/tensor_parallel/partition.py create mode 100644 training/DeepSpeed-Domino/domino/tensor_parallel/random.py create mode 100644 training/DeepSpeed-Domino/domino/tensor_parallel/utils.py create mode 100644 training/DeepSpeed-Domino/domino/timer.py create mode 100644 training/DeepSpeed-Domino/domino/training.py create mode 100644 training/DeepSpeed-Domino/domino/utils.py create mode 100644 training/DeepSpeed-Domino/megatron/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/arguments.py create mode 100644 training/DeepSpeed-Domino/megatron/checkpointing.py create mode 100644 training/DeepSpeed-Domino/megatron/core/README.md create mode 100644 training/DeepSpeed-Domino/megatron/core/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/core.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/dict_utils.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/mapping.py create mode 100644 
training/DeepSpeed-Domino/megatron/core/dist_checkpointing/optimizer.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/serialization.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/base.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/tensorstore.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/two_stage.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/zarr.py create mode 100644 training/DeepSpeed-Domino/megatron/core/dist_checkpointing/utils.py create mode 100644 training/DeepSpeed-Domino/megatron/core/enums.py create mode 100644 training/DeepSpeed-Domino/megatron/core/fusions/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/fusions/fused_bias_dropout.py create mode 100644 training/DeepSpeed-Domino/megatron/core/fusions/fused_bias_gelu.py create mode 100644 training/DeepSpeed-Domino/megatron/core/fusions/fused_layer_norm.py create mode 100644 training/DeepSpeed-Domino/megatron/core/fusions/fused_softmax.py create mode 100644 training/DeepSpeed-Domino/megatron/core/inference_params.py create mode 100644 training/DeepSpeed-Domino/megatron/core/model_parallel_config.py create mode 100644 training/DeepSpeed-Domino/megatron/core/models/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/models/common/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/models/common/rotary_pos_embedding.py create mode 100644 training/DeepSpeed-Domino/megatron/core/models/gpt/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/models/gpt/gpt_embedding.py create mode 100644 training/DeepSpeed-Domino/megatron/core/models/gpt/gpt_model.py create mode 100644 training/DeepSpeed-Domino/megatron/core/package_info.py create mode 100644 training/DeepSpeed-Domino/megatron/core/parallel_state.py create mode 100644 training/DeepSpeed-Domino/megatron/core/pipeline_parallel/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/pipeline_parallel/p2p_communication.py create mode 100644 training/DeepSpeed-Domino/megatron/core/pipeline_parallel/schedules.py create mode 100644 training/DeepSpeed-Domino/megatron/core/requirements.txt create mode 100644 training/DeepSpeed-Domino/megatron/core/tensor_parallel/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/tensor_parallel/cross_entropy.py create mode 100644 training/DeepSpeed-Domino/megatron/core/tensor_parallel/data.py create mode 100644 training/DeepSpeed-Domino/megatron/core/tensor_parallel/layers.py create mode 100644 training/DeepSpeed-Domino/megatron/core/tensor_parallel/mappings.py create mode 100644 training/DeepSpeed-Domino/megatron/core/tensor_parallel/random.py create mode 100644 training/DeepSpeed-Domino/megatron/core/tensor_parallel/utils.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/attention.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/custom_layers/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/custom_layers/transformer_engine.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/dot_product_attention.py create mode 100644 
training/DeepSpeed-Domino/megatron/core/transformer/enums.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/identity_op.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/mlp.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/module.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/transformer_block.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/transformer_config.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/transformer_layer.py create mode 100644 training/DeepSpeed-Domino/megatron/core/transformer/utils.py create mode 100644 training/DeepSpeed-Domino/megatron/core/utils.py create mode 100644 training/DeepSpeed-Domino/megatron/data/Makefile create mode 100644 training/DeepSpeed-Domino/megatron/data/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/data/autoaugment.py create mode 100644 training/DeepSpeed-Domino/megatron/data/bert_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/data/biencoder_dataset_utils.py create mode 100644 training/DeepSpeed-Domino/megatron/data/blendable_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/data/data_samplers.py create mode 100644 training/DeepSpeed-Domino/megatron/data/dataset_utils.py create mode 100644 training/DeepSpeed-Domino/megatron/data/gpt_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/data/helpers.cpp create mode 100644 training/DeepSpeed-Domino/megatron/data/ict_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/data/image_folder.py create mode 100644 training/DeepSpeed-Domino/megatron/data/indexed_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/data/multimodal_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/data/orqa_wiki_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/data/realm_dataset_utils.py create mode 100644 training/DeepSpeed-Domino/megatron/data/realm_index.py create mode 100644 training/DeepSpeed-Domino/megatron/data/t5_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/data/test/test_indexed_dataset.py create mode 100755 training/DeepSpeed-Domino/megatron/data/test/test_preprocess_data.sh create mode 100644 training/DeepSpeed-Domino/megatron/data/vit_dataset.py create mode 100644 training/DeepSpeed-Domino/megatron/dist_signal_handler.py create mode 100644 training/DeepSpeed-Domino/megatron/fused_kernels/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/fused_kernels/compat.h create mode 100644 training/DeepSpeed-Domino/megatron/fused_kernels/tests/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/fused_kernels/tests/test_fused_kernels.py create mode 100644 training/DeepSpeed-Domino/megatron/fused_kernels/type_shim.h create mode 100644 training/DeepSpeed-Domino/megatron/global_vars.py create mode 100644 training/DeepSpeed-Domino/megatron/indexer.py create mode 100644 training/DeepSpeed-Domino/megatron/initialize.py create mode 100644 training/DeepSpeed-Domino/megatron/memory.py create mode 100644 training/DeepSpeed-Domino/megatron/microbatches.py create mode 100644 training/DeepSpeed-Domino/megatron/model/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/model/bert_model.py create mode 100644 training/DeepSpeed-Domino/megatron/model/biencoder_model.py create mode 100644 training/DeepSpeed-Domino/megatron/model/classification.py create mode 100644 
training/DeepSpeed-Domino/megatron/model/distributed.py create mode 100644 training/DeepSpeed-Domino/megatron/model/enums.py create mode 100644 training/DeepSpeed-Domino/megatron/model/fused_bias_gelu.py create mode 100644 training/DeepSpeed-Domino/megatron/model/fused_layer_norm.py create mode 100644 training/DeepSpeed-Domino/megatron/model/fused_softmax.py create mode 100644 training/DeepSpeed-Domino/megatron/model/gpt_model.py create mode 100644 training/DeepSpeed-Domino/megatron/model/language_model.py create mode 100644 training/DeepSpeed-Domino/megatron/model/module.py create mode 100644 training/DeepSpeed-Domino/megatron/model/multiple_choice.py create mode 100644 training/DeepSpeed-Domino/megatron/model/realm_model.py create mode 100644 training/DeepSpeed-Domino/megatron/model/t5_model.py create mode 100644 training/DeepSpeed-Domino/megatron/model/transformer.py create mode 100644 training/DeepSpeed-Domino/megatron/model/utils.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/classification.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/dino.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/esvit_swin_backbone.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/inpainting.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/knn_monitor.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/mit_backbone.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/swin_backbone.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/utils.py create mode 100644 training/DeepSpeed-Domino/megatron/model/vision/vit_backbone.py create mode 100644 training/DeepSpeed-Domino/megatron/text_generation_server.py create mode 100644 training/DeepSpeed-Domino/megatron/timers.py create mode 100644 training/DeepSpeed-Domino/megatron/tokenizer/__init__.py create mode 100644 training/DeepSpeed-Domino/megatron/tokenizer/bert_tokenization.py create mode 100644 training/DeepSpeed-Domino/megatron/tokenizer/gpt2_tokenization.py create mode 100644 training/DeepSpeed-Domino/megatron/tokenizer/tokenizer.py create mode 100644 training/DeepSpeed-Domino/megatron/utils.py create mode 100644 training/DeepSpeed-Domino/pretrain_base.py create mode 100644 training/DeepSpeed-Domino/pretrain_gpt.py create mode 100644 training/DeepSpeed-Domino/pretrain_gpt3_2.7b.sh create mode 100644 training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh create mode 100644 training/DeepSpeed-Domino/pretrain_llama.py create mode 100644 training/DeepSpeed-Domino/pretrain_llama_13b.sh create mode 100644 training/DeepSpeed-Domino/pretrain_llama_7b.sh create mode 100644 training/DeepSpeed-Domino/requirements.txt diff --git a/training/DeepSpeed-Domino/README.md b/training/DeepSpeed-Domino/README.md new file mode 100644 index 000000000..3c1f4040b --- /dev/null +++ b/training/DeepSpeed-Domino/README.md @@ -0,0 +1,86 @@ +# Domino Example + +## Install Dependency Libraries +``` +pip install -r requirements.txt +``` + +## Prepare the Dataset +Follow the instructions from [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/universal_checkpointing#download-and-pre-process-training-dataset) to prepare the training dataset. + +## Execute Domino Training + +To start training, adjust the following parameters in the script as needed: + +- **GPUS_PER_NODE**: Number of GPUs per node. +- **CHECKPOINT_PATH**: Path to the checkpoint, if applicable. 
+- **VOCAB_FILE**, **MERGE_FILE**, **DATA_PATH**: Paths to the dataset files.
+- **--micro-batch-size**: Batch size per GPU.
+
+### Available Models and Scripts
+
+| Model | Script |
+|------------|--------------------------|
+| GPT-3 2.7B | `pretrain_gpt3_2.7b.sh` |
+| GPT-3 6.7B | `pretrain_gpt3_6.7b.sh` |
+| LLaMA 7B | `pretrain_llama_7b.sh` |
+| LLaMA 13B | `pretrain_llama_13b.sh` |
+
+### Example
+
+To train the GPT-3 2.7B model, run the following command:
+
+```bash
+bash pretrain_gpt3_2.7b.sh
+```
+
+The output should look like this:
+
+```
+training ...
+iteration: 1 | loss: 11.318 | iteration time (ms): 2174.0469932556152
+iteration: 2 | loss: 11.307 | iteration time (ms): 1414.4024848937988
+iteration: 3 | loss: 11.323 | iteration time (ms): 1385.9455585479736
+iteration: 4 | loss: 11.310 | iteration time (ms): 1475.5175113677979
+iteration: 5 | loss: 11.306 | iteration time (ms): 1395.7207202911377
+iteration: 6 | loss: 11.315 | iteration time (ms): 1392.2104835510254
+iteration: 7 | loss: 11.314 | iteration time (ms): 1402.6703834533691
+iteration: 8 | loss: 11.309 | iteration time (ms): 1450.613260269165
+iteration: 9 | loss: 11.305 | iteration time (ms): 1473.1688499450684
+iteration: 10 | loss: 11.320 | iteration time (ms): 1398.4534740447998
+[2024-11-04 15:32:30,918] [INFO] [launch.py:351:main] Process 73015 exits successfully.
+[2024-11-04 15:32:30,918] [INFO] [launch.py:351:main] Process 73017 exits successfully.
+[2024-11-04 15:32:30,919] [INFO] [launch.py:351:main] Process 73014 exits successfully.
+[2024-11-04 15:32:30,919] [INFO] [launch.py:351:main] Process 73016 exits successfully.
+```
+
+## Advanced Usage
+You can compile PyTorch and Apex from source for better performance.
+
+### Compile PyTorch from Source
+Compiling PyTorch from source can enable JIT scripting.
+```
+git clone -b v2.1.0 https://github.com/pytorch/pytorch.git
+git submodule sync
+git submodule update --init --recursive
+conda install cmake ninja
+pip install -r requirements.txt
+conda install intel::mkl-static intel::mkl-include
+conda install -c pytorch magma-cuda121 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo
+export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
+python setup.py develop
+
+# Build torchvision
+git clone https://github.com/pytorch/vision.git
+python setup.py develop
+```
+
+### Build Apex
+```
+git clone https://github.com/NVIDIA/apex
+cd apex
+# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
+pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --config-settings "--build-option=--fast_layer_norm" ./
+# otherwise
+pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" --config-settings "--build-option=--fast_layer_norm" ./
+```
\ No newline at end of file
diff --git a/training/DeepSpeed-Domino/domino/__init__.py b/training/DeepSpeed-Domino/domino/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/training/DeepSpeed-Domino/domino/arguments.py b/training/DeepSpeed-Domino/domino/arguments.py
new file mode 100644
index 000000000..8bc59223a
--- /dev/null
+++ b/training/DeepSpeed-Domino/domino/arguments.py
@@ -0,0 +1,400 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# This file is adapted from arguments.py in Megatron-LM + +"""Domino arguments.""" + +import argparse +import os +import types +import math +import torch +import torch.nn.functional as F + +import dataclasses +from dataclasses import dataclass +from typing import Callable +from domino.timer import Timers +from megatron.tokenizer import build_tokenizer + + +_GLOBAL_ARGS = None +_GLOBAL_TOKENIZER = None +_GLOBAL_TIMERS = None + + +def get_args(): + """Return arguments.""" + return _GLOBAL_ARGS + + +def set_args(args): + global _GLOBAL_ARGS + _GLOBAL_ARGS = args + + +def build_tokenizer_g(args): + """Initialize tokenizer.""" + global _GLOBAL_TOKENIZER + _GLOBAL_TOKENIZER = build_tokenizer(args) + return _GLOBAL_TOKENIZER + + +def get_tokenizer(): + """Return tokenizer.""" + return _GLOBAL_TOKENIZER + + +def get_num_microbatches(): + return 1 + + +def init_method_normal(std_dev): + def initialize(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std_dev) + return initialize + + +def scaled_init_method_normal(std_dev, layer_count): + scaled_std_dev = std_dev / math.sqrt(2.0 * layer_count) + def initialize(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=scaled_std_dev) + return initialize + + +def get_timers(): + """Return timers.""" + return _GLOBAL_TIMERS + + +def set_timers(): + """Initialize timers.""" + global _GLOBAL_TIMERS + _GLOBAL_TIMERS = Timers(0, "maxmin") + + +def parse_args(): + """Parse all arguments.""" + parser = argparse.ArgumentParser(description='Domino Arguments', allow_abbrev=False) + parser.add_argument('--num-layers', type=int, default=None, + help='Number of transformer layers.') + parser.add_argument('--hidden-size', type=int, default=None, + help='Tansformer hidden size.') + parser.add_argument('--num-attention-heads', type=int, default=None, + help='Number of transformer attention heads.') + parser.add_argument('--ffn-hidden-size', type=int, default=None, + help='Transformer Feed-Forward Network hidden size. ' + 'This is set to 4*hidden-size if not provided') + parser.add_argument('--seq-length', type=int, default=None, + help='Maximum sequence length to process.') + parser.add_argument('--max-position-embeddings', type=int, default=None, + help='Maximum number of position embeddings to use. 
' + 'This is the size of position embedding.') + parser.add_argument('--position-embedding-type', type=str, default='learned_absolute', + choices=['learned_absolute', 'rope'], + help='Position embedding type.') + parser.add_argument('--rotary-percent', type=float, default=1.0, + help='Percent of rotary dimension to use, default 100%') + parser.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None, + help='Sequence length interpolation factor for rotary embeddings.') + parser.add_argument('--hidden-dropout', type=float, default=0.1, + help='Dropout probability for hidden state transformer.') + parser.add_argument('--attention-dropout', type=float, default=0.1, + help='Post attention dropout probability.') + parser.add_argument('--no-masked-softmax-fusion', + action='store_false', + help='Disable fusion of query_key_value scaling, ' + 'masking, and softmax.', + dest='masked_softmax_fusion') + parser.add_argument('--tensor-model-parallel-size', type=int, default=1, + help='Degree of tensor model parallelism.') + parser.add_argument('--local_rank', type=int, default=None, + help='local rank passed from distributed launcher.') + parser.add_argument('--distributed-backend', default='nccl', + choices=['nccl', 'gloo'], + help='Which backend to use for distributed training.') + parser.add_argument('--seed', type=int, default=1234, + help='Random seed used for python, numpy, pytorch, and cuda.') + parser.add_argument('--train-iters', type=int, default=None, + help='Total number of iterations to train over all ' + 'training runs. Note that either train-iters or ' + 'train-samples should be provided.') + parser.add_argument('--micro-batch-size', type=int, default=None, + help='Batch size per model instance (local batch size). ' + 'Global batch size is local batch size times data ' + 'parallel size times number of micro batches.') + parser.add_argument('--global-batch-size', type=int, default=None, + help='Training batch size. If set, it should be a ' + 'multiple of micro-batch-size times data-parallel-size. ' + 'If this value is None, then ' + 'use micro-batch-size * data-parallel-size as the ' + 'global batch size. This choice will result in 1 for ' + 'number of micro-batches.') + parser.add_argument('--lr', type=float, default=None, + help='Initial learning rate. Depending on decay style ' + 'and initial warmup, the learing rate at each ' + 'iteration would be different.') + parser.add_argument('--min-lr', type=float, default=0.0, + help='Minumum value for learning rate. The scheduler' + 'clip values below this threshold.') + parser.add_argument('--lr-warmup-fraction', type=float, default=None, + help='fraction of lr-warmup-(iters/samples) to use ' + 'for warmup (as a float)') + parser.add_argument('--lr-decay-style', type=str, default='linear', + choices=['constant', 'linear', 'cosine', 'inverse-square-root'], + help='Learning rate decay function.') + parser.add_argument('--lr-decay-iters', type=int, default=None, + help='number of iterations to decay learning rate over,' + ' If None defaults to `--train-iters`') + parser.add_argument('--weight-decay', type=float, default=0.01, + help='Weight decay coefficient for L2 regularization.') + parser.add_argument('--clip-grad', type=float, default=1.0, + help='Gradient clipping based on global L2 norm.') + parser.add_argument('--data-path', nargs='*', default=None, + help='Path to the training dataset. 
Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-weight dataset1-path dataset2-weight ' + 'dataset2-path ... It is used with --split when a ' + 'single dataset used for all three: train, valid ' + 'and test. It is exclusive to the other ' + '--*-data-path args') + parser.add_argument('--split', type=str, default='969, 30, 1', + help='Comma-separated list of proportions for training,' + ' validation, and test split. For example the split ' + '`90,5,5` will use 90%% of data for training, 5%% for ' + 'validation and 5%% for test.') + parser.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file.') + parser.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file.') + parser.add_argument('--data-impl', type=str, default='infer', + choices=['mmap', 'infer'], + help='Implementation of indexed datasets.') + parser.add_argument('--fp16', action='store_true', + help='Run model in fp16 mode.') + parser.add_argument('--bf16', action='store_true', + help='Run model in bfloat16 mode.') + parser.add_argument('--tokenizer-type', type=str, + default='GPT2BPETokenizer', + choices=['BertWordPieceLowerCase', + 'BertWordPieceCase', + 'GPT2BPETokenizer', + 'SentencePieceTokenizer', + 'GPTSentencePieceTokenizer', + 'NullTokenizer'], + help='What type of tokenizer to use.') + parser.add_argument('--make-vocab-size-divisible-by', type=int, default=128, + help='Pad the vocab size to be divisible by this value.' + 'This is added for computational efficieny reasons.') + parser.add_argument('--llama-model', action='store_true', help='Use LLaMA model.') + parser.add_argument('--swiglu', action='store_true', + help='Use gated linear units and SiLU activation instead of default gelu') + parser.add_argument('--add-bias-linear', action='store_true', + help='Enable bias in the linear layers') + parser.add_argument('--normalization', default='LayerNorm', + choices=['LayerNorm', 'RMSNorm'], + help='Which normalization technique to use.', + dest='normalization') + parser.add_argument('--layernorm-epsilon', type=float, default=1e-5, + help='Layer norm epsilon.') + parser.add_argument('--eval-iters', type=int, default=100, + help='Number of iterations to run for evaluation' + 'validation/test for.') + parser.add_argument('--eval-interval', type=int, default=1000, + help='Interval between running evaluation on ' + 'validation set.') + parser.add_argument('--log-interval', type=int, default=100, + help='Report loss and timing interval.') + parser.add_argument('--save-interval', type=int, default=None, + help='Number of iterations between checkpoint saves.') + + args = parser.parse_args() + + args.rank = int(os.getenv('RANK', '0')) + args.world_size = int(os.getenv("WORLD_SIZE", '1')) + + if args.ffn_hidden_size is None: + args.ffn_hidden_size = 4 * args.hidden_size + if args.swiglu: + args.ffn_hidden_size = int((4 * args.hidden_size * 2 / 3) / 64) * 64 + + args.kv_channels = args.hidden_size // args.num_attention_heads + + args.perform_initialization = True + args.apply_residual_connection_post_layernorm = False + args.no_persist_layer_norm = False + + args.activation_func = F.gelu + args.add_bias_linear = True + args.gated_linear_unit = False + if args.swiglu: + args.activation_func = F.silu + args.gated_linear_unit = True + args.bias_gelu_fusion = False + + init_method_std = 0.02 + args.init_method = init_method_normal(init_method_std) + args.output_layer_init_method = scaled_init_method_normal( + init_method_std, 
args.num_layers) + + args.optimizer = 'adam' + args.adam_beta1 = 0.9 + args.adam_beta2 = 0.999 + args.adam_eps = 1e-8 + args.weight_decay = 0.01 + args.lr_warmup_init = 0.0 + args.lr_decay_style = 'cosine' + args.start_weight_decay = 0.1 + args.end_weight_decay = 0.1 + args.weight_decay_incr_style ='constant' + args.start_weight_decay = args.weight_decay + args.end_weight_decay = args.weight_decay + args.use_checkpoint_opt_param_scheduler = False + args.override_opt_param_scheduler = False + + args.mmap_warmup = False + + args.num_workers = 1 + args.dataloader_type = 'single' + args.train_data_path = None + args.valid_data_path = None + args.test_data_path = None + args.data_cache_path = None + args.train_samples = None + args.consumed_train_samples = 0 + args.consumed_valid_samples = 0 + args.decoder_seq_length = None + args.reset_position_ids = False + args.reset_attention_mask = False + args.eod_mask_loss = False + args.empty_unused_memory_level = 1 + args.tokenizer_type = 'GPT2BPETokenizer' + + args.loss_scale = 1024 + args.initial_loss_scale = 2**32 + args.min_loss_scale = 1.0 + args.loss_scale_window = 1000 + args.hysteresis = 2 + args.use_distributed_optimizer = False + args.log_num_zeros_in_grad = False + + args.rampup_batch_size = None + # Parameters dtype. + args.accumulate_allreduce_grads_in_fp32 = False + args.params_dtype = torch.float + if args.fp16: + args.params_dtype = torch.half + if args.bf16: + args.params_dtype = torch.bfloat16 + # bfloat16 requires gradient accumulation and all-reduce to + # be done in fp32. + if not args.accumulate_allreduce_grads_in_fp32: + args.accumulate_allreduce_grads_in_fp32 = True + if args.rank == 0: + print('accumulate and all-reduce gradients in fp32 for ' + 'bfloat16 data type.', flush=True) + + args.async_tensor_model_parallel_allreduce = True + args.gradient_accumulation_fusion = True + args.padded_vocab_size = 0 # tokenizer.py + args.model_type = 1 + args.data_parallel_size = 1 + args.DDP_impl = 'local' + args.use_contiguous_buffers_in_local_ddp = True + args.data_parallel_random_init = False + + return args + + +@dataclass +class TransformerConfig(): + """Configuration object for transformers. 
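+    Fields not set explicitly keep the defaults below; core_transformer_config_from_args() copies matching fields from the parsed arguments and overrides hidden_size, init_method, output_layer_init_method and params_dtype explicitly.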
+ """ + sequence_parallel: bool = False + llama_model: bool = False + apply_residual_connection_post_layernorm = False + no_persist_layer_norm = False + + # Initialization + perform_initialization: bool = True + use_cpu_initialization: bool = False + + # Training + fp16: bool = False + bf16: bool = False + params_dtype: torch.dtype = torch.float32 + timers: Callable = None + + # Optimizations + gradient_accumulation_fusion: bool = True + async_tensor_model_parallel_allreduce: bool = True + + # model architecture + num_layers: int = 0 + hidden_size: int = 0 + num_attention_heads: int = 0 + ffn_hidden_size: int = None + kv_channels: int = None + hidden_dropout: float = 0.1 + attention_dropout: float = 0.1 + layernorm_epsilon: float = 1e-5 + layernorm_zero_centered_gamma: bool = False + add_bias_linear: bool = True + swiglu = False + gated_linear_unit: bool = False + activation_func: Callable = F.gelu + bias_gelu_fusion = False + + # initialization + init_method: Callable = None + output_layer_init_method: Callable = None + init_method_std: float = 0.02 + + enable_autocast: bool = False + # autocast_dtype: torch.dtype = None + deallocate_pipeline_outputs: bool = False + no_sync_func: Callable = None + # grad_sync_func: Callable = None + # param_sync_func: Callable = None + + def __post_init__(self): + """ Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + """ + if self.fp16 and self.bf16: + raise ValueError( + f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.' + ) + + # if self.num_attention_heads % self.tensor_model_parallel_size != 0: + # raise ValueError( + # f"num_attention_heads ({self.num_attention_heads}) must be a multiple of " + # f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + # ) + + if self.ffn_hidden_size is None: + self.ffn_hidden_size = 4 * self.hidden_size + + if self.kv_channels is None: + self.kv_channels = self.hidden_size // self.num_attention_heads + + if self.init_method is None: + self.init_method = init_method_normal(self.init_method_std) + + if self.output_layer_init_method is None: + self.output_layer_init_method = scaled_init_method_normal( + self.init_method_std, self.num_layers + ) + +def core_transformer_config_from_args(args): + # Translate args to core transformer configuration + kw_args = {} + for f in dataclasses.fields(TransformerConfig): + if hasattr(args, f.name): + kw_args[f.name] = getattr(args, f.name) + + kw_args['hidden_size'] = args.hidden_size + kw_args['init_method'] = args.init_method + kw_args['output_layer_init_method'] = args.init_method + kw_args['params_dtype'] = args.params_dtype + + return TransformerConfig(**kw_args) diff --git a/training/DeepSpeed-Domino/domino/data/.DS_Store b/training/DeepSpeed-Domino/domino/data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d02a1acaaed83168d65321c3f703c83ad2b2c9a0 GIT binary patch literal 6148 zcmeHKJxc>Y5S>X<1OgJQEVs50EbNyM5o@v5#0w#CCSoFJcYn$sFBQO$Xc*q`V?zX&8jymAG8K!JCW Ca#9Td literal 0 HcmV?d00001 diff --git a/training/DeepSpeed-Domino/domino/data/__init__.py b/training/DeepSpeed-Domino/domino/data/__init__.py new file mode 100644 index 000000000..cd5f898c6 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/data/__init__.py @@ -0,0 +1 @@ +from . 
import indexed_dataset diff --git a/training/DeepSpeed-Domino/domino/data/data_samplers.py b/training/DeepSpeed-Domino/domino/data/data_samplers.py new file mode 100644 index 000000000..d3df72f66 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/data/data_samplers.py @@ -0,0 +1,183 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import random +import torch +import numpy as np +from torch.utils.data import Dataset + +from domino.arguments import get_args +import domino.parallel_state as mpu + +def build_pretraining_data_loader(dataset, consumed_samples): + """Buld dataloader given an input dataset.""" + + if dataset is None: + return None + args = get_args() + + # Megatron sampler + if args.dataloader_type == 'single': + batch_sampler = MegatronPretrainingSampler( + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size()) + elif args.dataloader_type == 'cyclic': + batch_sampler = MegatronPretrainingRandomSampler( + dataset, + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size(), + data_sharding=args.data_sharding) + else: + raise Exception('{} dataloader type is not supported.'.format( + args.dataloader_type)) + + # Torch dataloader. + return torch.utils.data.DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True) + +class MegatronPretrainingSampler: + + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, drop_last=True): + # Keep a copy of input params for later use. + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.drop_last = drop_last + + # Sanity checks. 
+ assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.consumed_samples < self.total_samples, \ + 'no samples left to consume: {}, {}'.format(self.consumed_samples, + self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + + def get_start_end_idx(self): + start_idx = self.data_parallel_rank * self.micro_batch_size + end_idx = start_idx + self.micro_batch_size + return start_idx, end_idx + + def __iter__(self): + batch = [] + # Last batch will be dropped if drop_last is not set False + for idx in range(self.consumed_samples, self.total_samples): + batch.append(idx) + if len(batch) == self.micro_batch_times_data_parallel_size: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + batch = [] + + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + + +class RandomSeedDataset(Dataset): + + def __init__(self, dataset): + args = get_args() + self.base_seed = args.seed + self.curr_seed = args.seed + self.dataset = dataset + + def __len__(self): + return len(self.dataset) + + def set_epoch(self, epoch): + self.curr_seed = self.base_seed + epoch + + def __getitem__(self, idx): + seed = idx + self.curr_seed + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + return self.dataset[idx] + + +class MegatronPretrainingRandomSampler: + + def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, data_sharding): + # Keep a copy of input params for later use. + self.dataset = dataset + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.data_parallel_size = data_parallel_size + self.data_sharding = data_sharding + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.last_batch_size = \ + self.total_samples % self.micro_batch_times_data_parallel_size + + # Sanity checks. 
+ assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + + def __iter__(self): + active_total_samples = self.total_samples - self.last_batch_size + self.epoch = self.consumed_samples // active_total_samples + current_epoch_samples = self.consumed_samples % active_total_samples + assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 + + if isinstance(self.dataset, RandomSeedDataset): + self.dataset.set_epoch(self.epoch) + + # data sharding and random sampling + if self.data_sharding: + bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ + * self.micro_batch_size + bucket_offset = current_epoch_samples // self.data_parallel_size + start_idx = self.data_parallel_rank * bucket_size + + g = torch.Generator() + g.manual_seed(self.epoch) + random_idx = torch.randperm(bucket_size, generator=g).tolist() + idx_range = [start_idx + x for x in random_idx[bucket_offset:]] + else: + full_bucket_size = (self.total_samples // self.micro_batch_size) \ + * self.micro_batch_size + full_bucket_offset = current_epoch_samples + g = torch.Generator() + g.manual_seed(self.epoch) + idx_range_total = \ + torch.randperm(full_bucket_size, generator=g).tolist() + idx_range_active = idx_range_total[full_bucket_offset:] + idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] + + batch = [] + # Last batch if not complete will be dropped. + for idx in idx_range: + batch.append(idx) + if len(batch) == self.micro_batch_size: + self.consumed_samples += self.micro_batch_times_data_parallel_size + yield batch + batch = [] diff --git a/training/DeepSpeed-Domino/domino/data/gpt_dataset.py b/training/DeepSpeed-Domino/domino/data/gpt_dataset.py new file mode 100644 index 000000000..da75d77cf --- /dev/null +++ b/training/DeepSpeed-Domino/domino/data/gpt_dataset.py @@ -0,0 +1,469 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import hashlib +import os +import time +import numpy as np +import torch + +from domino.utils import print_rank_0 +import domino.parallel_state as mpu +from domino.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def get_train_valid_test_split_(splits_string, size): + """ Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(',') != -1: + splits = [float(s) for s in splits_string.split(',')] + elif splits_string.find('/') != -1: + splits = [float(s) for s in splits_string.split('/')] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.) 
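+    # For example (illustrative): the default splits_string "969, 30, 1" with
+    # size=1000 normalizes to [0.969, 0.03, 0.001] and yields
+    # splits_index == [0, 969, 999, 1000], i.e. documents [0, 969) for train,
+    # [969, 999) for validation and [999, 1000) for test.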
+ splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + train_data_prefix=None, + valid_data_prefix=None, + test_data_prefix=None, + return_doc_ids=False, *, + data_cache_path=None): + """Build train, valid, and test datasets.""" + + # Single data path provided for train, valid & test + return _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + data_cache_path=data_cache_path) + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + return_doc_ids=False, *, + data_cache_path=None): + """Build train, valid, and test datasets.""" + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], + step=1, dtype=np.int32) + dataset = GPTDataset(name, data_prefix, documents, indexed_dataset, + splits_string, + train_valid_test_num_samples[index], + seq_length, seed, + return_doc_ids, + data_cache_path=data_cache_path) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + + start_time = time.time() + indexed_dataset = make_indexed_dataset(data_prefix, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +class GPTDataset(torch.utils.data.Dataset): + + def __init__(self, name, data_prefix, documents, indexed_dataset, + splits_string, num_samples, seq_length, seed, + return_doc_ids=False, *, + data_cache_path=None): + + self.name = name + self.indexed_dataset = indexed_dataset + self.return_doc_ids = return_doc_ids + + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + # Build index mappings. 
+ self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = \ + _build_index_mappings(self.name, data_prefix, + documents, self.indexed_dataset.sizes, + splits_string, num_samples, seq_length, seed, + data_cache_path=data_cache_path) + + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def __getitem__(self, idx): + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + doc_ids = [] + if doc_index_f == doc_index_l: + doc_ids.append(self.doc_idx[doc_index_f]) + sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1) + else: + # Otherwise, get the rest of the initial document. + doc_ids.append(self.doc_idx[doc_index_f]) + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f)] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + doc_ids.append(self.doc_idx[i]) + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + doc_ids.append(self.doc_idx[doc_index_l]) + sample_list.append(self.indexed_dataset.get( + self.doc_idx[doc_index_l], + length=offset_l + 1)) + sample = np.concatenate(sample_list) + + if self.return_doc_ids: # for retro preprocessing + return {'text': np.array(sample, dtype=np.int64), + 'doc_ids': np.array(doc_ids, dtype=np.int64)} + else: + return {'text': np.array(sample, dtype=np.int64)} + + +def _build_index_mappings(name, data_prefix, documents, sizes, + splits_string, num_samples, seq_length, seed, + *, + data_cache_path): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. 
+ desc = "GPT Dataset\n\n" + desc += f"Data prefix {data_prefix}\n" + desc += f"Dataset name {name}\n" + desc += f"Number of samples {num_samples}\n" + desc += f"Sequence length {seq_length}\n" + desc += f"Random seed {seed}\n" + desc += f"Split {splits_string}\n" + desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest() + desc_filename = desc_hash + ".dsc" + doc_idx_filename = desc_hash + '_doc_idx.npy' + sample_idx_filename = desc_hash + '_sample_idx.npy' + shuffle_idx_filename = desc_hash + '_shuffle_idx.npy' + + # Look for cache in main data dir first to avoid unnecessary + # duplication, then look in data-cache-path if specified, + # If nothing is found, use the last path looked in + build_indices = True + prefixes = [os.path.join(os.path.dirname(data_prefix), 'index-cache')] + if data_cache_path is not None: + prefixes.append(data_cache_path) + for prefix in prefixes: + idx_path = { + 'desc': os.path.join(prefix, desc_filename), + 'doc': os.path.join(prefix, doc_idx_filename), + 'sample': os.path.join(prefix, sample_idx_filename), + 'shuffle': os.path.join(prefix, shuffle_idx_filename) + } + for f in idx_path.values(): + if not os.path.isfile(f): + break + else: + # Found our files! + build_indices = False + break + data_cache_dir = os.path.dirname(idx_path['desc']) + data_cache_success = True + + # Build the indexed mapping if not exist. + if build_indices and torch.distributed.get_rank() == 0: + print_rank_0(' > WARNING: could not find index map files, building ' + 'the indices on rank 0 ...') + + # For the last epoch, decide whether include the entire epoch + # in the global shuffle or not. + + # If we need only one epoch, then separating last epoch does + # not mean anything. + if num_epochs == 1: + separate_last_epoch = False + print(' > only one epoch required, setting ' + 'separate_last_epoch to False', flush=True) + + else: + # Get the number of samples for the last epoch + num_samples_from_epochs_minus_one = ( + (num_epochs - 1) * tokens_per_epoch - 1) // seq_length + last_epoch_num_samples = num_samples - \ + num_samples_from_epochs_minus_one + assert last_epoch_num_samples >= 0, \ + 'last epoch number of samples should be non-negative.' + num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length + assert last_epoch_num_samples <= (num_samples_per_epoch + 1), \ + 'last epoch number of samples exceeded max value.' + # If we have less than 80% of the samples for the last epoch, + # seperate out the epoch and treat it differently. + # Note: the 80% number is just based on common sense and can + # be adjusted if needed. + separate_last_epoch = (last_epoch_num_samples < + int(0.80 * num_samples_per_epoch)) + if separate_last_epoch: + string = ' > last epoch number of samples ({}) is smaller '\ + 'than 80% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to True' + else: + string = ' > last epoch number of samples ({}) is larger '\ + 'than 80% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to False' + print(string.format(last_epoch_num_samples, + num_samples_per_epoch), flush=True) + + + try: + os.makedirs(data_cache_dir, exist_ok=True) + + # description + with open(idx_path['desc'], 'wt') as fd: + fd.write(desc) + + # doc-idx. + start_time = time.time() + doc_idx = _build_doc_idx(documents, num_epochs, np_rng, + separate_last_epoch) + np.save(idx_path['doc'], doc_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save doc-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # sample-idx. 
+ start_time = time.time() + # Use C++ implementation for speed. + # First compile and then import. + from megatron.data import helpers + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, + num_epochs, tokens_per_epoch) + np.save(idx_path['sample'], sample_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save sample-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + if separate_last_epoch: + num_samples_ = num_samples_from_epochs_minus_one + else: + num_samples_ = sample_idx.shape[0] - 1 + shuffle_idx = _build_shuffle_idx(num_samples_, + sample_idx.shape[0] - 1, np_rng) + np.save(idx_path['shuffle'], shuffle_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save shuffle-idx mapping' + ' (seconds): {:4f}'.format(time.time() - start_time)) + except OSError: + print(f'There was an error trying to create the data cache directory ({data_cache_dir})') + print('or a file in it. This defaults to a directory "index-cache" within the directory') + print('the data files are in and can be set with the --data-cache-path argument. Please') + print('ensure you have write access to this directory or specify one that you do have') + print('write access to.') + data_cache_success = False + + counts = torch.cuda.LongTensor([data_cache_success]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + if counts[0].item() != ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())): + print_rank_0("Data index creation unsuccessful, exiting.") + exit() + + # Load mappings. + start_time = time.time() + print_rank_0(f" > loading doc-idx mapping from {idx_path['doc']}") + doc_idx = np.load(idx_path['doc'], allow_pickle=True, mmap_mode='r') + + print_rank_0(f" > loading sample-idx mapping from {idx_path['sample']}") + sample_idx = np.load(idx_path['sample'], allow_pickle=True, mmap_mode='r') + + print_rank_0(f" > loading shuffle-idx mapping from {idx_path['shuffle']}") + shuffle_idx = np.load(idx_path['shuffle'], allow_pickle=True, mmap_mode='r') + + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + sample_idx.shape[0])) + print_rank_0(' total number of epochs: {}'.format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx, desc, desc_hash + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence lenght, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): + """Build an array with length = number-of-epochs * number-of-dcuments. 
+ Each index is mapped to a corresponding document.""" + if not separate_last_epoch or num_epochs == 1: + doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) + doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) + return np.concatenate((doc_idx_first, doc_idx_last)) + + +# def _build_sample_idx(sizes, doc_idx, seq_length, +# num_epochs, tokens_per_epoch): +# """Sample index mapping is a 2D array with sizes +# [number-of-samples + 1, 2] where [..., 0] contains +# the index into `doc_idx` and [..., 1] is the +# starting offset in that document.""" + +# # Total number of samples. For -1 see comments in `_num_epochs`. +# num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length +# sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) + +# # Index into sample_idx. +# sample_index = 0 +# # Index into doc_idx. +# doc_idx_index = 0 +# # Begining offset for each document. +# doc_offset = 0 +# # Start with first document and no offset. +# sample_idx[sample_index][0] = doc_idx_index +# sample_idx[sample_index][1] = doc_offset +# sample_index += 1 +# while sample_index <= num_samples: +# # Start with a fresh sequence. +# remaining_seq_length = seq_length + 1 +# while remaining_seq_length != 0: +# # Get the document length. +# doc_id = doc_idx[doc_idx_index] +# doc_length = sizes[doc_id] - doc_offset +# # And add it to the current sequence. +# remaining_seq_length -= doc_length +# # If we have more than a full sequence, adjust offset and set +# # remaining length to zero so we return from the while loop. +# # Note that -1 here is for the same reason we have -1 in +# # `_num_epochs` calculations. +# if remaining_seq_length <= 0: +# doc_offset += (remaining_seq_length + doc_length - 1) +# remaining_seq_length = 0 +# else: +# # Otherwise, start from the begining of the next document. +# doc_idx_index += 1 +# doc_offset = 0 +# # Record the sequence. +# sample_idx[sample_index][0] = doc_idx_index +# sample_idx[sample_index][1] = doc_offset +# sample_index += 1 + +# return sample_idx + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + print(' > building shuffle index with split [0, {}) and [{}, {}) ' + '...'.format(num_samples, num_samples, total_size), flush=True) + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + diff --git a/training/DeepSpeed-Domino/domino/data/indexed_dataset.py b/training/DeepSpeed-Domino/domino/data/indexed_dataset.py new file mode 100644 index 000000000..e513e7f73 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/data/indexed_dataset.py @@ -0,0 +1,613 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
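As an editorial aside on the index mappings above: the shuffle index is built in two independent buckets so that, when separate_last_epoch is set, samples from the final (partial) epoch are never mixed into the globally shuffled earlier epochs. Below is a minimal pure-NumPy sketch of that behaviour; the function name sketch_build_shuffle_idx and the toy sizes are invented for illustration and are not part of the patch.

import numpy as np

def sketch_build_shuffle_idx(num_samples, total_size, seed=1234):
    # Mirror _build_shuffle_idx: shuffle [0, num_samples) and
    # [num_samples, total_size) independently, then concatenate.
    rng = np.random.RandomState(seed)
    first = np.arange(num_samples, dtype=np.int64)
    rng.shuffle(first)
    if num_samples == total_size:
        return first
    last = np.arange(num_samples, total_size, dtype=np.int64)
    rng.shuffle(last)
    return np.concatenate((first, last))

idx = sketch_build_shuffle_idx(6, 10)
assert sorted(idx[:6]) == list(range(6))       # earlier-epoch samples stay in front
assert sorted(idx[6:]) == list(range(6, 10))   # separated last epoch stays at the back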
+ +from functools import lru_cache +import os +import shutil +import struct +from itertools import accumulate + +import numpy as np +import torch +from domino.utils import print_rank_0 + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ['lazy', 'cached', 'mmap'] + + +def infer_dataset_impl(path): + if IndexedDataset.exists(path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + if magic == IndexedDataset._HDR_MAGIC: + return 'cached' + elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: + return 'mmap' + else: + return None + else: + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + + +def make_builder(out_file, impl, vocab_size=None): + if impl == 'mmap': + return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) + else: + return IndexedDatasetBuilder(out_file) + + +def make_dataset(path, impl, skip_warmup=False, multimodal=False): + if not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + if impl == 'infer': + impl = infer_dataset_impl(path) + if impl == 'lazy' and IndexedDataset.exists(path): + return IndexedDataset(path) + elif impl == 'cached' and IndexedDataset.exists(path): + return IndexedCachedDataset(path) + elif impl == 'mmap' and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup, multimodal) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == 'mmap': + return MMapIndexedDataset.exists(path) + else: + return IndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float64, + 7: np.float32, + 8: np.uint16, +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + '.idx' + + +def data_file_path(prefix_path): + return prefix_path + '.bin' + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +class IndexedDataset(torch.utils.data.Dataset): + """Loader for IndexedDataset""" + _HDR_MAGIC = b'TNTIDX\x00\x00' + + def __init__(self, path): + super().__init__() + self.path = path + self.data_file = None + self.read_index(path) + + def read_index(self, path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + assert magic == self._HDR_MAGIC, ( + 'Index file doesn\'t match expected format. ' + 'Make sure that --dataset-impl is configured properly.' 
+ ) + version = f.read(8) + assert struct.unpack('= self._len: + raise IndexError('index out of range') + + def __del__(self): + if self.data_file: + self.data_file.close() + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if not self.data_file: + self.read_data(self.path) + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + return a + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] + size = sum(sizes) + a = np.empty(size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[start] * self.element_size) + self.data_file.readinto(a) + offsets = list(accumulate(sizes)) + sents = np.split(a, offsets[:-1]) + return sents + + def __len__(self): + return self._len + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + @staticmethod + def exists(path): + return ( + os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + ) + + @property + def supports_prefetch(self): + return False # avoid prefetching to save memory + + +class IndexedCachedDataset(IndexedDataset): + + def __init__(self, path): + super().__init__(path) + self.cache = None + self.cache_index = {} + + @property + def supports_prefetch(self): + return True + + def prefetch(self, indices): + if all(i in self.cache_index for i in indices): + return + if not self.data_file: + self.read_data(self.path) + indices = sorted(set(indices)) + total_size = 0 + for i in indices: + total_size += self.data_offsets[i + 1] - self.data_offsets[i] + self.cache = np.empty(total_size, dtype=self.dtype) + ptx = 0 + self.cache_index.clear() + for i in indices: + self.cache_index[i] = ptx + size = self.data_offsets[i + 1] - self.data_offsets[i] + a = self.cache[ptx: ptx + size] + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + ptx += size + if self.data_file: + # close and delete data file after prefetch so we can pickle + self.data_file.close() + self.data_file = None + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + ptx = self.cache_index[i] + np.copyto(a, self.cache[ptx: ptx + a.size]) + return a + elif isinstance(idx, slice): + # Hack just to make this work, can optimizer later if necessary + sents = [] + for i in range(*idx.indices(len(self))): + sents.append(self[i]) + return sents + + +class IndexedDatasetBuilder(object): + element_sizes = { + np.uint8: 1, + np.int8: 1, + np.int16: 2, + np.int32: 4, + np.int64: 8, + np.float32: 4, + np.float64: 8, + } + + def __init__(self, out_file, dtype=np.int32): + self.out_file = open(out_file, 'wb') + self.dtype = dtype + self.data_offsets = [0] + self.dim_offsets = [0] + self.sizes = [] + self.element_size = self.element_sizes[self.dtype] + self.doc_idx = [0] + + def add_item(self, tensor): + bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) + self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) + for s in tensor.size(): + self.sizes.append(s) + 
self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) + + def end_document(self): + self.doc_idx.append(len(self.sizes)) + + def merge_file_(self, another_file): + index = IndexedDataset(another_file) + assert index.dtype == self.dtype + + doc_offset = len(self.sizes) + + begin = self.data_offsets[-1] + for data_offset in index.data_offsets[1:]: + self.data_offsets.append(begin + data_offset) + self.sizes.extend(index.sizes) + + begin = self.dim_offsets[-1] + for dim_offset in index.dim_offsets[1:]: + self.dim_offsets.append(begin + dim_offset) + + self.doc_idx.extend((doc_offset + index.doc_idx)[1:]) + + with open(data_file_path(another_file), 'rb') as f: + while True: + data = f.read(1024) + if data: + self.out_file.write(data) + else: + break + + def finalize(self, index_file): + self.out_file.close() + index = open(index_file, 'wb') + index.write(b'TNTIDX\x00\x00') + index.write(struct.pack(' 0: + # Ensure that different pipeline MP stages get different seeds. + # seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) + seed = seed_ + # Ensure different data parallel ranks get different seeds + if data_parallel_random_init: + seed = seed + (10 * mpu.get_data_parallel_rank()) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.device_count() > 0: + model_parallel_cuda_manual_seed(seed) + + # Compile dependencies. + _compile_dependencies() + + # No continuation function + return None + + +def _compile_dependencies(): + + args = get_args() + + # ========================= + # Compile dataset C++ code. + # ========================= + # TODO: move this to ninja + if torch.distributed.get_rank() == 0: + start_time = time.time() + print("> compiling dataset index builder ...") + from megatron.data.dataset_utils import compile_helper + + compile_helper() + print( + ">>> done with dataset index builder. Compilation time: {:.3f} " + "seconds".format(time.time() - start_time), + flush=True, + ) + + # ================== + # Load fused kernels + # ================== + + # Custom kernel constraints check. + seq_len = args.seq_length + attn_batch_size = ( + args.num_attention_heads / args.tensor_model_parallel_size + ) * args.micro_batch_size + # Constraints on sequence length and attn_batch_size to enable warp based + # optimization and upper triangular optimization (for causal mask) + custom_kernel_constraint = ( + seq_len > 16 + and seq_len <= 16384 + and seq_len % 4 == 0 + and attn_batch_size % 4 == 0 + ) + # Print a warning. + if not ( + (args.fp16 or args.bf16) + and custom_kernel_constraint + and args.masked_softmax_fusion + ): + if args.rank == 0: + print( + "WARNING: constraints for invoking optimized" + " fused softmax kernel are not met. We default" + " back to unfused kernel invocations.", + flush=True, + ) + + # Always build on rank zero first. + if torch.distributed.get_rank() == 0: + start_time = time.time() + print("> compiling and loading fused kernels ...", flush=True) + fused_kernels.load(args) + torch.distributed.barrier() + else: + torch.distributed.barrier() + fused_kernels.load(args) + # Simple barrier to make sure all ranks have passed the + # compilation phase successfully before moving on to the + # rest of the program. We think this might ensure that + # the lock is released. + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print( + ">>> done with compiling and loading fused kernels. 
" + "Compilation time: {:.3f} seconds".format(time.time() - start_time), + flush=True, + ) + + +def set_jit_fusion_options(): + """Set PyTorch JIT layer fusion options.""" + # flags required to enable jit fusion kernels + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10): + # nvfuser + torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_profiling_mode(True) + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(True) + torch._C._debug_set_autodiff_subgraph_inlining(False) + else: + # legacy pytorch fuser + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + + _warmup_jit_function() + + +def _warmup_jit_function(): + """Compilie JIT functions before the main training steps""" + args = get_args() + if args.bf16: + dtype = torch.bfloat16 + elif args.fp16: + dtype = torch.float16 + else: + dtype = torch.float32 + + # Warmup fused bias+gelu + bias = torch.rand( + args.ffn_hidden_size // args.tensor_model_parallel_size, + dtype=dtype, + device="cuda", + ) + input = torch.rand( + ( + args.seq_length, + args.micro_batch_size, + args.ffn_hidden_size // args.tensor_model_parallel_size, + ), + dtype=dtype, + device="cuda", + ) + # Warmup JIT fusions with the input grad_enable state of both forward + # prop and recomputation + for bias_grad, input_grad in zip([True, True], [False, True]): + bias.requires_grad, input.requires_grad = bias_grad, input_grad + for _ in range(5): + output = bias_gelu(bias, input) + del bias, input, output + + # Warmup fused bias+dropout+add + seq_length = args.seq_length + input = torch.rand( + (seq_length, args.micro_batch_size, args.hidden_size), + dtype=dtype, + device="cuda", + ) + residual = torch.rand( + (seq_length, args.micro_batch_size, args.hidden_size), + dtype=dtype, + device="cuda", + ) + bias = torch.rand((args.hidden_size), dtype=dtype, device="cuda").expand_as( + residual + ) + dropout_rate = 0.1 + # Warmup JIT fusions with the input grad_enable state of both forward + # prop and recomputation + for input_grad, bias_grad, residual_grad in zip( + [False, True], [True, True], [True, True] + ): + input.requires_grad = input_grad + bias.requires_grad = bias_grad + residual.requires_grad = residual_grad + for _ in range(5): + output = bias_dropout_add_fused_train(input, bias, residual, dropout_rate) + del bias, input, residual, output + torch.cuda.empty_cache() diff --git a/training/DeepSpeed-Domino/domino/language_model.py b/training/DeepSpeed-Domino/domino/language_model.py new file mode 100644 index 000000000..2cfb2f9fd --- /dev/null +++ b/training/DeepSpeed-Domino/domino/language_model.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# This file is adapted from language_model.py in Megatron-LM + +import torch +from torch import einsum, nn +from domino.arguments import get_args +from domino.modules.enums import ModelType +import domino.parallel_state as mpu +from domino.modules.module import DominoModule +from domino.tensor_parallel.comm import GatherFromModelParallelRegion +from domino.tensor_parallel.partition import VocabParallelEmbedding, linear_with_grad_accumulation_and_async_allreduce +from domino.modules.fused_layer_norm import MixedFusedLayerNorm as fused_layer_norm +from domino.modules.fused_func import bias_dropout_add_fused_train, bias_dropout_add_fused_inference, apply_rotary_pos_emb +from domino.tensor_parallel.partition import _initialize_affine_weight_gpu, set_tensor_model_parallel_attributes +from domino.tensor_parallel.partition import ColumnParallelLinear, RowParallelLinearNoComm + +from deepspeed.runtime.domino.transformer import DominoTransformer + +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, + bias=None): + """LM logits using word embedding weights.""" + args = get_args() + # Parallel logits. + if args.async_tensor_model_parallel_allreduce: + input_parallel = input_ + model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 + async_grad_allreduce = args.async_tensor_model_parallel_allreduce and model_parallel + + # Matrix multiply. + logits_parallel = linear_with_grad_accumulation_and_async_allreduce( + input=input_parallel, + weight=word_embeddings_weight, + bias=bias, + gradient_accumulation_fusion=args.gradient_accumulation_fusion, + async_grad_allreduce=async_grad_allreduce, + sequence_parallel=False) + # Gather if needed. + + if parallel_output: + return logits_parallel + + return GatherFromModelParallelRegion.apply(logits_parallel) + + +def get_language_model(config, num_tokentypes, + encoder_attn_mask_type, + pre_process=True, post_process=True): + language_model = TransformerLanguageModel( + config, + encoder_attn_mask_type, + num_tokentypes=num_tokentypes, + pre_process=pre_process, + post_process=post_process + ) + + return language_model + + +class Embedding(DominoModule): + def __init__(self, hidden_dim, vocab_size, max_seq_len, dropout_prob, config): + super(Embedding, self).__init__() + self.hidden_dim = hidden_dim + self.init_method = config.init_method + args = get_args() + self.word_embeddings = VocabParallelEmbedding( + vocab_size, self.hidden_dim, config=config, init_method=config.init_method + ) + self.use_position_embedding = args.position_embedding_type == 'learned_absolute' + if self.use_position_embedding: + self.position_embeddings = torch.nn.Embedding(max_seq_len, self.hidden_dim) + self.init_method(self.position_embeddings.weight) + self.embedding_dropout = torch.nn.Dropout(dropout_prob) + + def forward(self, input_ids, position_ids): + word_embeds = self.word_embeddings(input_ids) + if self.use_position_embedding: + pos_embeds = self.position_embeddings(position_ids) + combined_embeds = word_embeds + pos_embeds + else: + combined_embeds = word_embeds + + combined_embeds = combined_embeds.transpose(0, 1).contiguous() + combined_embeds = self.embedding_dropout(combined_embeds) + + return combined_embeds + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, seq_len_interpolation_factor=None): + super().__init__() + self.seq_len_interpolation_factor = seq_len_interpolation_factor + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq, persistent=False) + + def 
forward(self, max_seq_len, offset=0): + seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset + if self.seq_len_interpolation_factor is not None: + seq = seq.type_as(self.inv_freq) + seq *= 1 / self.seq_len_interpolation_factor + freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq) + # first part even vector components, second part odd vector components, + # 2 * dim in dimension size + emb = torch.cat((freqs, freqs), dim=-1) + # emb [seq_length, .., dim] + return emb[:, None, None, :] + + # def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + # state_dict.pop(f'{prefix}inv_freq', None) + # return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +class TransformerLanguageModel(DominoModule): + def __init__(self, + config, + encoder_attn_mask_type, + num_tokentypes=0, + pre_process=True, + post_process=True): + + args = get_args() + super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=True) + + self.pre_process = pre_process + self.post_process = post_process + self.hidden_size = config.hidden_size + self.num_tokentypes = num_tokentypes + self.init_method = config.init_method + self.encoder_attn_mask_type = encoder_attn_mask_type + self.encoder_hidden_state = None + + if self.pre_process: + self.embedding = Embedding(self.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config) + + self.use_rotary_position_embeddings = \ + args.position_embedding_type == 'rope' + if self.use_rotary_position_embeddings: + self.seq_length = args.seq_length + rotary_dim = args.hidden_size // args.num_attention_heads \ + if args.kv_channels is None else args.kv_channels + if args.rotary_percent < 1.0: + rotary_dim = int(rotary_dim * args.rotary_percent) + self.rotary_pos_emb = RotaryEmbedding( + rotary_dim, + seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor + ) + + self.encoder = DominoTransformer( + config, ModelType.encoder_or_decoder, mpu, + fused_layer_norm, _initialize_affine_weight_gpu, + ColumnParallelLinear, RowParallelLinearNoComm, apply_rotary_pos_emb, + bias_dropout_add_fused_train, bias_dropout_add_fused_inference, + self_attn_mask_type=self.encoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + def set_input_tensor(self, input_tensor): + pass + # self.encoder.set_input_tensor(input_tensor[0]) + + def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, + inference_params=None): + + if self.pre_process: + encoder_input = self.embedding(enc_input_ids, enc_position_ids) + else: + encoder_input = None + + rotary_pos_emb = None + if self.use_rotary_position_embeddings: + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + rotary_pos_emb = ((rotary_pos_emb,) * 2) + + encoder_out_size = encoder_input.shape + p_batch_size = encoder_out_size[1] // 2 + dtype = encoder_input.dtype + encoder_output_t = torch.empty(encoder_out_size, dtype=dtype, device=torch.cuda.current_device()) + intra_partitions = 2 + encoder_inputs = torch.tensor_split(encoder_input, intra_partitions, dim=1) + encoder_outputs = self.encoder( + encoder_inputs, + enc_attn_mask, + rotary_pos_emb=rotary_pos_emb) + encoder_output_t[:, 0:p_batch_size, :] = encoder_outputs[0] + encoder_output_t[:, p_batch_size:2*p_batch_size, :] = encoder_outputs[1] + encoder_output = encoder_output_t + + return encoder_output + \ No newline at end of file diff --git a/training/DeepSpeed-Domino/domino/llama_model.py 
b/training/DeepSpeed-Domino/domino/llama_model.py new file mode 100644 index 000000000..3b929eeeb --- /dev/null +++ b/training/DeepSpeed-Domino/domino/llama_model.py @@ -0,0 +1,91 @@ +import torch +from domino.arguments import get_args +from domino.language_model import parallel_lm_logits +from domino.modules.enums import AttnMaskType +from domino.modules.module import DominoModule +from domino.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy +from domino.language_model import get_language_model + + +def post_language_model_processing(lm_output, labels, logit_weights, parallel_output): + output = parallel_lm_logits(lm_output, logit_weights, parallel_output) + labels = labels.transpose(0, 1).contiguous() + loss = vocab_parallel_cross_entropy(output.float(), labels) + loss = loss.transpose(0, 1).contiguous() + return loss + + +class LLaMAModel(DominoModule): + """LLaMA Language model.""" + + def __init__( + self, + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True, + ): + args = get_args() + super(LLaMAModel, self).__init__( + config=config, + share_embeddings_and_output_weights=True) + + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.padded_vocab_size = args.padded_vocab_size + self.language_model = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + encoder_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process, + ) + self.initialize_word_embeddings() + self.lm_head = torch.nn.Linear( + args.hidden_size, args.padded_vocab_size, bias=False + ) + + def set_input_tensor(self, input_tensor): + self.language_model.set_input_tensor(input_tensor) + + def _causal_lm_process(self, lm_output, labels): + lm_output = lm_output.transpose(0, 1) + logits = self.lm_head(lm_output) + loss = None + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., :-1].contiguous() + loss_fct = torch.nn.CrossEntropyLoss(ignore_index=0) + shift_logits = shift_logits.view(-1, self.padded_vocab_size) + shift_labels = shift_labels.view(-1) + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + return loss + + def forward( + self, + input_ids, + position_ids, + attention_mask, + retriever_input_ids=None, + retriever_position_ids=None, + retriever_attn_mask=None, + labels=None, + inference_params=None, + ): + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + retriever_input_ids=retriever_input_ids, + retriever_position_ids=retriever_position_ids, + retriever_attn_mask=retriever_attn_mask, + inference_params=inference_params, + ) + + if self.post_process: + return self._causal_lm_process(lm_output=lm_output, labels=labels) + else: + return lm_output diff --git a/training/DeepSpeed-Domino/domino/modules/__init__.py b/training/DeepSpeed-Domino/domino/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/DeepSpeed-Domino/domino/modules/distributed.py b/training/DeepSpeed-Domino/domino/modules/distributed.py new file mode 100644 index 000000000..4c69aa9e7 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/modules/distributed.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
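Before moving on to the DDP wrapper, here is a short, self-contained sketch of the shift-by-one causal-LM loss pattern that _causal_lm_process above is built around: the logits at position t are scored against the token at position t+1, and ignore_index skips padded labels. The shapes and names below are toy values for illustration; note that the method above drops the last position of both logits and labels rather than re-shifting the labels, which appears to assume the data pipeline already provides next-token-aligned labels.

import torch

batch, seq_len, vocab = 2, 6, 11
logits = torch.randn(batch, seq_len, vocab)          # [b, s, v], e.g. the lm_head output
labels = torch.randint(1, vocab, (batch, seq_len))   # 0 is treated as padding here

shift_logits = logits[..., :-1, :].contiguous()      # predictions for positions 0 .. s-2
shift_labels = labels[..., 1:].contiguous()          # targets are the *next* tokens
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=0)
loss = loss_fct(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(loss.item())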
+# This file is adapted from distributed.py in Megatron-LM + +import math +import torch +import domino.parallel_state as mpu + + +class FlattenMemory: + + def __init__(self, numel, dtype): + self.numel = numel + data_parallel_world_size = mpu.get_data_parallel_world_size() + self.numel_padded = data_parallel_world_size * \ + int(math.ceil(numel / data_parallel_world_size)) + self.dtype = dtype + self.data = torch.zeros(self.numel_padded, + dtype=self.dtype, + device=torch.cuda.current_device(), + requires_grad=False) + + + def get(self, shape, start_index): + end_index = start_index + shape.numel() + assert end_index <= self.numel + buffer_tensor = self.data[start_index:end_index] + buffer_tensor = buffer_tensor.view(shape) + return buffer_tensor + + +class DistributedDataParallel(torch.nn.Module): + + def __init__(self, module, + accumulate_allreduce_grads_in_fp32, + use_contiguous_buffers): + + super(DistributedDataParallel, self).__init__() + + self.module = module + self.accumulate_allreduce_grads_in_fp32 \ + = accumulate_allreduce_grads_in_fp32 + self.use_contiguous_buffers = use_contiguous_buffers + + if self.accumulate_allreduce_grads_in_fp32: + assert self.use_contiguous_buffers + + if not self.use_contiguous_buffers: + self._grad_buffers = None + self._grad_buffer_param_index_map = None + return + + self._grad_buffers = {} + self._grad_buffer_param_index_map = {} + + def _get_buffer_type(param): + return torch.float if \ + self.accumulate_allreduce_grads_in_fp32 else param.dtype + + type_num_elements = {} + for param in self.module.parameters(): + if param.requires_grad: + dtype = _get_buffer_type(param) + type_num_elements[dtype] = type_num_elements.get(dtype, 0) \ + + param.data.nelement() + + # Allocate the memory. + for dtype, num_elements in type_num_elements.items(): + self._grad_buffers[dtype] = FlattenMemory(num_elements, dtype) + + self.grad_accs = [] + for param in self.module.parameters(): + if param.requires_grad: + dtype = _get_buffer_type(param) + type_num_elements[dtype] -= param.data.nelement() + param.main_grad = self._grad_buffers[dtype].get( + param.data.shape, type_num_elements[dtype]) + if dtype not in self._grad_buffer_param_index_map: + self._grad_buffer_param_index_map[dtype] = {} + self._grad_buffer_param_index_map[dtype][param] = ( + type_num_elements[dtype], + type_num_elements[dtype] + param.data.nelement(), + ) + # Backward hook. + param_tmp = param.expand_as(param) + grad_acc = param_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(self._make_param_hook(param)) + self.grad_accs.append(grad_acc) + + + def _make_param_hook(self, param): + """Create the all-reduce hook for backprop.""" + # Hook used for back-prop. + def param_hook(*unused): + # Add the gradient to the buffer. + if param.grad is not None: + # The gradient function of linear layers is fused with GEMMs + param.main_grad.add_(param.grad.data) + # Now we can deallocate grad memory. + param.grad = None + return param_hook + + + def zero_grad_buffer(self): + assert self._grad_buffers is not None, 'buffers are not initialized.' 
+ for _, buffer_ in self._grad_buffers.items(): + buffer_.data.zero_() + + + def broadcast_params(self): + for param in self.module.parameters(): + torch.distributed.broadcast(param.data, + src=mpu.get_data_parallel_src_rank(), + group=mpu.get_data_parallel_group()) + + + def forward(self, *inputs, **kwargs): + return self.module(*inputs, **kwargs) diff --git a/training/DeepSpeed-Domino/domino/modules/enums.py b/training/DeepSpeed-Domino/domino/modules/enums.py new file mode 100644 index 000000000..7ec422a86 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/modules/enums.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from enums.py in Megatron-LM + +import enum + +class LayerType(enum.Enum): + encoder = 1 + decoder = 2 + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 diff --git a/training/DeepSpeed-Domino/domino/modules/fused_bias_gelu.py b/training/DeepSpeed-Domino/domino/modules/fused_bias_gelu.py new file mode 100644 index 000000000..5c9f341b5 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/modules/fused_bias_gelu.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from fused_bias_gelu.py in Megatron-LM + +import torch + + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + +@torch.jit.script +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff*g + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + +bias_gelu_impl = GeLUFunction.apply diff --git a/training/DeepSpeed-Domino/domino/modules/fused_func.py b/training/DeepSpeed-Domino/domino/modules/fused_func.py new file mode 100644 index 000000000..7a38dc855 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/modules/fused_func.py @@ -0,0 +1,96 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
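As a quick sanity note on the fused bias-GELU above: 0.79788456 is sqrt(2/pi) and 0.044715 is the usual coefficient of the tanh approximation of GELU, so the jitted bias_gelu should agree closely with PyTorch's built-in approximate='tanh' variant. The standalone check below is illustrative only and assumes a PyTorch version new enough (>= 1.12) to expose the approximate argument of F.gelu.

import math
import torch
import torch.nn.functional as F

def tanh_gelu(x):
    # Same formula as bias_gelu above, without the bias term.
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))

x = torch.linspace(-4.0, 4.0, steps=101)
print(abs(0.79788456 - math.sqrt(2.0 / math.pi)))                          # ~1e-9
print((tanh_gelu(x) - F.gelu(x, approximate='tanh')).abs().max().item())   # numerically tiny
print((tanh_gelu(x) - F.gelu(x)).abs().max().item())                       # small approximation error vs. erf GELU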
+ +from typing import Optional +import torch + + +class AddDropoutFuseFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input1, bias1, residual1, input2, bias2, residual2, prob, training): + if bias1 is not None and bias2 is not None: + output1, mask1, output2, mask2 = torch._C._nn.native_add_dropout_add_fuse( + input1, bias1, residual1, input2, bias2, residual2, prob, training + ) + else: + output1, mask1, output2, mask2 = torch._C._nn.native_add_dropout_fuse( + input1, residual1, input2, residual2, prob, training + ) + scale = 1.0 / (1.0 - prob) + ctx.save_for_backward(mask1, mask2) + ctx.scale = scale + ctx.with_bias = bias1 is not None and bias2 is not None + return output1, output2 + + @staticmethod + def backward(ctx, grad_output1, grad_output2): + (mask1, mask2) = ctx.saved_tensors + scale = ctx.scale + with_bias = ctx.with_bias + if with_bias: + grad_input1, grad_bias1, grad_residual1, grad_input2, grad_bias2, grad_residual2 = ( + torch._C._nn.native_add_dropout_add_fuse_2(grad_output1, mask1, grad_output2, mask2, scale) + ) + else: + grad_input1, grad_residual1, grad_input2, grad_residual2 = ( + torch._C._nn.native_add_dropout_fuse_2(grad_output1, mask1, grad_output2, mask2, scale) + ) + grad_bias1 = None + grad_bias2 = None + return grad_input1, grad_bias1, grad_residual1, grad_input2, grad_bias2, grad_residual2, None, None + + +class AddDropoutFuse(torch.nn.Module): + def __init__(self): + super(AddDropoutFuse, self).__init__() + + def forward(self, input1, bias1, residual1, input2, bias2, residual2, prob, training): + return AddDropoutFuseFunction.apply(input1, bias1, residual1, input2, bias2, residual2, prob, training) + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + + +@torch.jit.script +def bias_dropout_add_fused_train(x: torch.Tensor, + bias: Optional[torch.Tensor], + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x: torch.Tensor, + bias: Optional[torch.Tensor], + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) + + +def _rotate_half(x): + """ + change sign so the last dimension becomes [-odd, +even] + """ + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(t, freqs): + """ + input tensor t is of shape [seq_length, ..., dim] + rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] + check https://kexue.fm/archives/8265 for detailed formulas + """ + rot_dim = freqs.shape[-1] + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin()) + return torch.cat((t, t_pass), dim=-1) \ No newline at end of file diff --git a/training/DeepSpeed-Domino/domino/modules/fused_layer_norm.py b/training/DeepSpeed-Domino/domino/modules/fused_layer_norm.py new file mode 100644 index 000000000..72e6463a9 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/modules/fused_layer_norm.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. +# Copied and modified from NVIDIA apex + +import numbers +import torch +from torch.nn.parameter import Parameter +from torch.nn import init +from domino.utils import make_viewless_tensor + +try: + from apex.contrib.layer_norm.layer_norm import FastLayerNormFN + HAVE_PERSIST_LAYER_NORM = True +except: + HAVE_PERSIST_LAYER_NORM = False + +from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction + + +class MixedFusedLayerNorm(torch.nn.Module): + + def __init__(self, normalized_shape, eps=1e-5, + no_persist_layer_norm=True): + super(MixedFusedLayerNorm, self).__init__() + + persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096, + 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480, + 24576, 25600, 30720, 32768, 40960, 49152, 65536] + if normalized_shape not in persist_ln_hidden_sizes or \ + not HAVE_PERSIST_LAYER_NORM: + no_persist_layer_norm = True + + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + self.normalized_shape = torch.Size(normalized_shape) + self.eps = eps + self.weight = Parameter(torch.Tensor(*normalized_shape)) + self.bias = Parameter(torch.Tensor(*normalized_shape)) + self.reset_parameters() + + self.no_persist_layer_norm = no_persist_layer_norm + + + def reset_parameters(self): + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input): + weight = self.weight + + if self.no_persist_layer_norm: + return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps) + else: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) + + output = make_viewless_tensor(inp = output, + requires_grad = input.requires_grad, + keep_graph = True) + + return output diff --git a/training/DeepSpeed-Domino/domino/modules/module.py b/training/DeepSpeed-Domino/domino/modules/module.py new file mode 100644 index 000000000..b89bbc21f --- /dev/null +++ b/training/DeepSpeed-Domino/domino/modules/module.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from module.py in Megatron-LM + +import torch +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from domino.arguments import get_args +import domino.parallel_state as mpu + +_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + +class DominoModule(torch.nn.Module): + """extensions of torch Module.""" + + def __init__(self, config=None, share_embeddings_and_output_weights=True): + super(DominoModule, self).__init__() + self.config = config + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + + def initialize_word_embeddings(self): + self.share_embeddings_and_output_weights = True + return + + def shared_embedding_or_output_weight(self): + if self.pre_process: + return self.language_model.embedding.word_embeddings.weight + else: + if not self.share_embeddings_and_output_weights: + raise Exception('shared_embedding_or_output_weight() called for last ' + 'stage, but share_embeddings_and_output_weights is false') + return self.word_embeddings.weight + + +def conversion_helper(val, conversion): + """Apply conversion to val. 
Recursively apply conversion if `val` + #is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_float16(val): + """Convert fp32 `val` to fp16/bf16""" + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, _FLOAT_TYPES): + val = val.half() + return val + return conversion_helper(val, half_conversion) + + +def float16_to_fp32(val): + """Convert fp16/bf16 `val` to fp32""" + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): + val = val.float() + return val + return conversion_helper(val, float_conversion) + + +class Float16Module(torch.nn.Module): + + def __init__(self, module, args): + super(Float16Module, self).__init__() + self.add_module('module', module.half()) + + def set_input_tensor(self, input_tensor): + return self.module.set_input_tensor(input_tensor) + + def forward(self, *inputs, **kwargs): + if mpu.is_pipeline_first_stage(): + inputs = fp32_to_float16(inputs) + outputs = self.module(*inputs, **kwargs) + if mpu.is_pipeline_last_stage(): + outputs = float16_to_fp32(outputs) + return outputs + diff --git a/training/DeepSpeed-Domino/domino/modules/utils.py b/training/DeepSpeed-Domino/domino/modules/utils.py new file mode 100644 index 000000000..5c5938b90 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/modules/utils.py @@ -0,0 +1,29 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copied and modified from Megatron-LM + +import math +import torch +from domino.arguments import get_args + +def init_method_normal(std_dev): + def initialize(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std_dev) + return initialize + +def scaled_init_method_normal(std_dev, layer_count): + scaled_std_dev = std_dev / math.sqrt(2.0 * layer_count) + def initialize(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=scaled_std_dev) + return initialize + + +def get_linear_layer(input_dim, output_dim, init_method): + linear_layer = torch.nn.Linear(input_dim, output_dim) + if get_args().perform_initialization: + init_method(linear_layer.weight) + with torch.no_grad(): + linear_layer.bias.zero_() + return linear_layer + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared diff --git a/training/DeepSpeed-Domino/domino/optimizer/__init__.py b/training/DeepSpeed-Domino/domino/optimizer/__init__.py new file mode 100644 index 000000000..74a7e2c97 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/optimizer/__init__.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
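To make the float16 conversion path above concrete: Float16Module halves the wrapped module and relies on conversion_helper to walk arbitrarily nested tuples/lists, converting fp32 tensors to half precision on the way in and half/bf16 tensors back to fp32 on the way out. The self-contained sketch below mirrors that pattern with local stand-ins; convert_nested, to_half, and to_float are invented names for this example, not imports from the patch.

import torch

def convert_nested(val, conversion):
    # Recursively apply `conversion`, preserving tuple/list structure.
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    out = [convert_nested(v, conversion) for v in val]
    return tuple(out) if isinstance(val, tuple) else out

def to_half(v):
    return v.half() if torch.is_tensor(v) and v.dtype == torch.float32 else v

def to_float(v):
    return v.float() if torch.is_tensor(v) and v.dtype in (torch.float16, torch.bfloat16) else v

inputs = (torch.randn(2, 3), [torch.randn(4), 7])   # non-tensors pass through untouched
half_inputs = convert_nested(inputs, to_half)
restored = convert_nested(half_inputs, to_float)
print(half_inputs[0].dtype, restored[0].dtype)      # torch.float16 torch.float32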
+ +from apex.optimizers import FusedAdam as Adam +from apex.optimizers import FusedSGD as SGD + +from domino.arguments import get_args + +from .distrib_optimizer import DistributedOptimizer +from .grad_scaler import ConstantGradScaler, DynamicGradScaler +from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer + + +def get_param_groups(modules, + no_weight_decay_cond, + scale_lr_cond, + lr_mult): + """creates param groups based on weight decay condition (regularized vs non regularized) + and learning rate scale condition (args.lr vs lr_mult * args.lr) + scale_lr_cond is used during finetuning where head of the network requires a scaled + version of the base learning rate. + """ + wd_no_scale_lr = [] + wd_scale_lr = [] + no_wd_no_scale_lr = [] + no_wd_scale_lr = [] + for module in modules: + for name, param in module.named_parameters(): + if not param.requires_grad: + continue + + if no_weight_decay_cond is not None: + no_wd = no_weight_decay_cond(name, param) + else: + # do not regularize biases nor Norm parameters + no_wd = name.endswith(".bias") or len(param.shape) == 1 + + if scale_lr_cond is not None: + scale_lr = scale_lr_cond(name, param) + else: + scale_lr = False + + if not no_wd and not scale_lr: + wd_no_scale_lr.append(param) + elif not no_wd and scale_lr: + wd_scale_lr.append(param) + elif no_wd and not scale_lr: + no_wd_no_scale_lr.append(param) + else: + no_wd_scale_lr.append(param) + + param_groups = [] + if len(wd_no_scale_lr): + param_groups.append({'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0}) + if len(wd_scale_lr): + param_groups.append({'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult}) + if len(no_wd_no_scale_lr): + param_groups.append({'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0}) + if len(no_wd_scale_lr): + param_groups.append({'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult}) + + return param_groups + +def get_megatron_optimizer(model, + no_weight_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0): + args = get_args() + + # Base optimizer. + param_groups = get_param_groups(model, + no_weight_decay_cond, + scale_lr_cond, + lr_mult) + + if args.optimizer == 'adam': + optimizer = Adam(param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps) + elif args.optimizer == 'sgd': + optimizer = SGD(param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum) + else: + raise Exception('{} optimizer is not supported.'.format( + args.optimizer)) + + # Determine whether the params have main-grad field. + params_have_main_grad = False + if args.DDP_impl == 'local': + params_have_main_grad = True + + # Mixed precision optimizer. + # - Note: both the Float16Optimizer and the DistributedOptimizer inherit + # from the MixedPrecisionOptimizer, which manages any optimizer where + # the model params and main params are distinct. + if args.fp16 or args.bf16 or args.use_distributed_optimizer: + + # Grad scaler: + # if loss-scale is provided, instantiate the constant scaler. + # if we are using fp16 and loss-scale is not present, use a + # dynamic scaler. + # otherwise we are running in bf16 with no loss-scale so + # leave it as None. + grad_scaler = None + + # Constant loss scale. + if args.loss_scale: + grad_scaler = ConstantGradScaler(args.loss_scale) + + # Dynamic loss scale. 
+ else: + if args.fp16: + grad_scaler = DynamicGradScaler( + initial_scale=args.initial_loss_scale, + min_scale=args.min_loss_scale, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=args.loss_scale_window, + hysteresis=args.hysteresis) + + # Megatron optimizer. + opt_ty = DistributedOptimizer \ + if args.use_distributed_optimizer else \ + Float16OptimizerWithFloat16Params + return opt_ty(optimizer, + args.clip_grad, + args.log_num_zeros_in_grad, + params_have_main_grad, + args.use_contiguous_buffers_in_local_ddp, + args.fp16, + args.bf16, + args.params_dtype, + grad_scaler, + model) + + # FP32. + return FP32Optimizer(optimizer, args.clip_grad, + args.log_num_zeros_in_grad, + params_have_main_grad, + args.use_contiguous_buffers_in_local_ddp, + model) diff --git a/training/DeepSpeed-Domino/domino/optimizer/clip_grads.py b/training/DeepSpeed-Domino/domino/optimizer/clip_grads.py new file mode 100644 index 000000000..092e34729 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/optimizer/clip_grads.py @@ -0,0 +1,134 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Gradient clipping.""" + +import torch +from torch import inf + +from apex.multi_tensor_apply import multi_tensor_applier +import amp_C + +from domino.modules.module import param_is_not_shared +from domino.tensor_parallel.partition import param_is_not_tensor_parallel_duplicate + +def clip_grad_norm_fp32(parameters, grads_for_norm, + max_norm, norm_type=2, + model_parallel_group=None): + """Clips gradient norm of an iterable of parameters whose gradients + are in fp32. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single + Tensor that will be used for calculating the grad norm. + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + model_parallel_group (group): given the nature of the distributed + optimizer, this is passed as an argument. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + if isinstance(grads_for_norm, torch.Tensor): + grads_for_norm = [grads_for_norm] + + # Grads. + grads = [] + for param in parameters: + if param.grad is not None: + assert param.grad.type() == 'torch.cuda.FloatTensor' + grads.append(param.grad.detach()) + + # Norm parameters. + max_norm = float(max_norm) + norm_type = float(norm_type) + total_norm = 0.0 + + # Calculate norm. + if norm_type == inf: + total_norm = max(grad.abs().max() for grad in grads_for_norm) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + # Take max across all model-parallel GPUs. + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=model_parallel_group) + total_norm = total_norm_cuda[0].item() + + else: + if norm_type == 2.0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + # Use apex's multi-tensor applier for efficiency reasons. + # Multi-tensor applier takes a function and a list of list + # and performs the operation on that list all in one kernel. 
+ if grads_for_norm: + grad_norm, _ = multi_tensor_applier( + amp_C.multi_tensor_l2norm, + dummy_overflow_buf, + [grads_for_norm], + False # no per-parameter norm + ) + else: + grad_norm = torch.cuda.FloatTensor([0]) + # Since we will be summing across data parallel groups, + # we need the pow(norm-type). + total_norm = grad_norm ** norm_type + + else: + for grad in grads_for_norm: + grad_norm = torch.norm(grad, norm_type) + total_norm += grad_norm ** norm_type + + # Sum across all model-parallel GPUs. + torch.distributed.all_reduce(total_norm, + op=torch.distributed.ReduceOp.SUM, + group=model_parallel_group) + total_norm = total_norm.item() ** (1.0 / norm_type) + + # Scale. + clip_coeff = max_norm / (total_norm + 1.0e-6) + if clip_coeff < 1.0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + multi_tensor_applier(amp_C.multi_tensor_scale, + dummy_overflow_buf, + [grads, grads], + clip_coeff) + + return total_norm + + +def count_zeros_fp32(parameters, model_parallel_group): + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + total_num_zeros = torch.cuda.FloatTensor([0.0]) + for param in parameters: + grad_not_none = param.grad is not None + is_not_shared = param_is_not_shared(param) + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if grad_not_none and is_not_shared and is_not_tp_duplicate: + grad = param.grad.detach() + num_zeros = grad.numel() - torch.count_nonzero(grad) + total_num_zeros = num_zeros + total_num_zeros + + # Sum across all model-parallel GPUs. + torch.distributed.all_reduce(total_num_zeros, + op=torch.distributed.ReduceOp.SUM, + group=model_parallel_group) + + total_num_zeros = total_num_zeros.item() + + return total_num_zeros diff --git a/training/DeepSpeed-Domino/domino/optimizer/distrib_optimizer.py b/training/DeepSpeed-Domino/domino/optimizer/distrib_optimizer.py new file mode 100644 index 000000000..951b9a58e --- /dev/null +++ b/training/DeepSpeed-Domino/domino/optimizer/distrib_optimizer.py @@ -0,0 +1,1022 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Megatron distributed optimizer.""" + + +from apex.optimizers import FusedAdam as Adam +import math +import torch + +from domino.utils import print_rank_0 +import domino.parallel_state as mpu +from domino.tensor_parallel.partition import copy_tensor_model_parallel_attributes +from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper + + +class Range: + """ + A range represents a start and end points for indexing a shard + from a full tensor. + """ + def __init__(self, start, end): + self.start = start + self.end = end + self.size = end - start + def normalize(self, start = 0): + return Range(start, start + self.size) + def __str__(self): + return "%d,%d [%d]" % (self.start, self.end, self.size) + def __len__(self): + return self.end - self.start + + +class DistributedOptimizer(MixedPrecisionOptimizer): + """Distributed optimizer, for all data types (fp16, bf16, and fp32). + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + log_num_zeros_in_grad: return number of zeros in the gradients. + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. 
If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + grad_scaler: used for scaling gradients. Note that this can be + None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. + """ + + @classmethod + def build_model_gbuf_param_range_map(cls, model, dtype, gbuf_world_range): + """ + Build mapping from param reference to grad buffer shard ranges. + + This method builds a mapping from parameter references to grad + buffer shard ranges, specific to each data-parallel (DP) rank's + set of 'owned' parameters. Each grad buffer (padded to be an even + multiple of DP-world-size) is conceptually divided into DP-world-size + contiguous regions, where each DP rank 'owns' a contiguous regions. + Ownership in this sense means DP rank is responsible for reducing + the relevant subset of grads, and updating the relevant subset of + params. + + This conceptual partitioning of the grad buffer does NOT respect + parameter boundaries, and as such it is assumed that each created + range references a shard (or subset) of the full parameter. It is + easiest to think of each DP rank as operating (i.e., reducing, + gathering) purely on views into the grad buffer, for all model-to- + main & main-to-model operations. + + This method creates three ranges: + - The param's range within the entire grad buffer (i.e., world index). + - The param's range within the DP rank's local view of the grad buffer. + - The param's range within itself (i.e., its shard). + """ + + # Param range map. + param_world_index_map = model._grad_buffer_param_index_map[dtype] + param_range_map = {} + for param, param_world_indexes in param_world_index_map.items(): + + # Param range. + param_world_start, param_world_end = param_world_indexes + param_local_start = max( + 0, + param_world_start - gbuf_world_range.start) + param_local_end = min( + gbuf_world_range.size, + param_world_end - gbuf_world_range.start) + + # Add param, if within local gbuf range. + if param_local_end > param_local_start: + param_local_range = Range(param_local_start, param_local_end) + param_world_range = param_local_range.normalize( + param_local_start + gbuf_world_range.start) + sub_param_start = max(0, gbuf_world_range.start-param_world_start) + sub_param_range = param_local_range.normalize(sub_param_start) + param_range_map[param] = { + "gbuf_world" : param_world_range, + "gbuf_local" : param_local_range, + "param" : sub_param_range, + } + + return param_range_map + + + @classmethod + def build_model_gbuf_range(cls, model, dtype): + """ + Build mapping between params and their grad buffers. + + This method does the initial setup for the method above. 
This setup + includes determining the shard ranges into the DDP's grad buffer for + each data-parallel (DP) rank. Each DP rank keeps range info for + all other DP ranks, for the purpose of creating args for + reduce-scatter and all-gather. + """ + + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_world_size = mpu.get_data_parallel_world_size() + + # Grad buffer range. + grad_buffer = model._grad_buffers[dtype] + gbuf_size = grad_buffer.numel + max_gbuf_range_size = int(math.ceil(gbuf_size / data_parallel_world_size)) + + # All world ranges. (i.e., across all data parallel ranks) + gbuf_world_all_ranges = [] + for r in range(data_parallel_world_size): + gbuf_world_start = r * max_gbuf_range_size + gbuf_world_end = min(gbuf_size, gbuf_world_start+max_gbuf_range_size) + gbuf_world_range = Range(gbuf_world_start, gbuf_world_end) + gbuf_world_all_ranges.append(gbuf_world_range) + + # Local DP's ranges. + gbuf_world_range = gbuf_world_all_ranges[data_parallel_rank] + gbuf_local_range = gbuf_world_range.normalize() + + # Get each param's ranges. + param_range_map = cls.build_model_gbuf_param_range_map(model, + dtype, + gbuf_world_range) + + # Group into dict. + data = { + "local" : gbuf_local_range, + "world" : gbuf_world_range, + "world_all" : gbuf_world_all_ranges, + "param_map" : param_range_map, + "max_range_size" : max_gbuf_range_size, + } + + return data + + + @classmethod + def build_model_gbuf_range_map(cls, model): + """ + Create param-to-grad-buffer mappings, for grad buffer data types + within a specific virtual model. + """ + return { + dtype : cls.build_model_gbuf_range(model, dtype) + for dtype in model._grad_buffers + } + + + @classmethod + def build_model_param_gbuf_map(cls, model_gbuf_ranges): + """ + Create a reverse of the model_gbuf_ranges, for referencing in + opposite direction. + """ + param_gbuf_map = {} + for model_index, model_gbuf_range_map in enumerate(model_gbuf_ranges): + for dtype, gbuf_range_map in model_gbuf_range_map.items(): + for param, param_range_map in gbuf_range_map["param_map"].items(): + param_gbuf_map[param] = (model_index, dtype) + return param_gbuf_map + + + @classmethod + def build_optimizer_group_ranges(cls, param_groups, model_gbuf_ranges): + """ + Create optimizer groups. + + Given the set of parameter shard ranges that are owned by the current + data-parallel (DP) rank, gather the set of parameters that will be + used (in the method below) to create the current DP's optimizer + groups. + """ + + num_groups = len(param_groups) + + # Param group map. + # World param group map. + # - Store a mapping of for all parameters + # across all DP ranks. This is necessary because it is our first + # cross reference between the DDP mappings and the optimizer group + # parameters. This mapping only for use in the next step of building + # the local mapping over this DP rank's parameters. + world_param_group_map = {} + for group_index, group in enumerate(param_groups): + for param in group["params"]: + assert param.requires_grad + world_param_group_map[param] = group_index + + # Optimizer group ranges & param-group mapping. + # - Build a mapping from groups to their contained parameters, and also + # from parameters to their containing group index and order within + # the group. The group index and order are particularly important for + # saving and loading checkpoints. 
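+        # Hypothetical example: with two param groups (say, weight-decayed
+        # and non-decayed), a parameter stored third in group 1 is recorded
+        # below as (group_index=1, group_order=2) in local_param_group_map.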
+ local_param_group_map = {} + group_ranges = [ {"params": []} for _ in param_groups ] + for model_gbuf_range_map in model_gbuf_ranges: + for dtype, gbuf_range_map in model_gbuf_range_map.items(): + for param in gbuf_range_map["param_map"]: + group_index = world_param_group_map[param] + group_range = group_ranges[group_index] + group_range["params"].append(param) + local_param_group_map[param] = \ + (group_index, len(group_range["params"]) - 1) + + # Squeeze zero-size group ranges. + for group_index, group_range in enumerate(group_ranges): + group_range["orig_group"] = param_groups[group_index] + group_range["orig_group_idx"] = param_groups[group_index] + + return local_param_group_map, group_ranges + + + @classmethod + def build_model_and_main_param_groups(cls, + model_gbuf_ranges, + param_gbuf_map, + opt_group_ranges): + """ + Create main parameter groups needed for the optimizer step. + + These groups encompass both: 1) groups used by this class, for + reducing/gather, and 2) groups used by the inner optimizer for the + parameter update. Given that the conceptual grad buffer partitioning + (created in earlier method) doesn't respect parameter boundaries, + the optimizer operates on shards of the model parameters, rather than + the full parameters. + """ + + # Parameter groups: + # model_float16_groups: original float16 parameters + # model_fp32_groups: original fp32 parameters + # shard_float16_groups: shards of original float16 parameters + # shard_fp32_groups: shards of original fp32 parameters + # shard_fp32_from_float16_groups: fp32 copy of float16 parameters + model_float16_groups = [] + model_fp32_groups = [] + shard_float16_groups = [] + shard_fp32_groups = [] + shard_fp32_from_float16_groups = [] + + # Allocate (or slice) each group's param shard. + for group_index, group_range in enumerate(opt_group_ranges): + + # Params of this group. + model_float16_params_this_group = [] + model_fp32_params_this_group = [] + shard_float16_params_this_group = [] + shard_fp32_params_this_group = [] + shard_fp32_from_float16_params_this_group = [] + model_float16_groups.append(model_float16_params_this_group) + model_fp32_groups.append(model_fp32_params_this_group) + shard_float16_groups.append(shard_float16_params_this_group) + shard_fp32_groups.append(shard_fp32_params_this_group) + shard_fp32_from_float16_groups.append( + shard_fp32_from_float16_params_this_group) + + for model_param in group_range["params"]: + + assert model_param.requires_grad + + model_index, dtype = param_gbuf_map[model_param] + gbuf_range = model_gbuf_ranges[model_index][dtype] + param_range = gbuf_range["param_map"][model_param]["param"] + + # fp16, bf16 params. + if model_param.type() in ['torch.cuda.HalfTensor', + 'torch.cuda.BFloat16Tensor']: + + # Clone model -> main. + shard_model_param = model_param.detach().view(-1) \ + [param_range.start:param_range.end] + shard_main_param = shard_model_param.clone().float() + copy_tensor_model_parallel_attributes( + shard_model_param, model_param) + copy_tensor_model_parallel_attributes( + shard_main_param, model_param) + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + shard_main_param.shared = model_param.shared + + # Add to group. + model_float16_params_this_group.append(model_param) + shard_float16_params_this_group.append(shard_model_param) + shard_fp32_from_float16_params_this_group.append(shard_main_param) + + # fp32 params. 
+ elif model_param.type() == 'torch.cuda.FloatTensor': + shard_model_param = model_param.view(-1) \ + [param_range.start:param_range.end] + model_fp32_params_this_group.append(model_param) + shard_fp32_params_this_group.append(shard_model_param) + copy_tensor_model_parallel_attributes( + shard_model_param, model_param) + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + + else: + raise TypeError('Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor. ' + 'Received {}'.format(model_param.type())) + + # Update optimizer's params. + group_range["orig_group"]["params"] = [ + *shard_fp32_params_this_group, + *shard_fp32_from_float16_params_this_group, + ] + + return ( + model_float16_groups, + model_fp32_groups, + shard_float16_groups, + shard_fp32_groups, + shard_fp32_from_float16_groups, + ) + + + def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models): + """ + See top of class definition for argument descriptions. + + The steps in this method create the core mapping between DDP grad + buffers, parameters, and parameter shard ranges, that is needed for + converting between model param indexes and main parameter shard + indexes. This method also updates the optimizer parameter groups + with the newly created shards. + """ + + super().__init__( + optimizer, clip_grad, log_num_zeros_in_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models) + + # Verify that contiguous buffers are being used. + # - Note: this should already be checked in arguments.py. + assert use_contiguous_buffers_in_local_ddp + assert isinstance(optimizer, Adam), \ + "Only Adam currently supported, due to checkpointing requirements." + + # Model grad buffer ranges. + self.model_gbuf_ranges = [] + for model_index, model in enumerate(self.models): + self.model_gbuf_ranges.append(self.build_model_gbuf_range_map(model)) + self.model_param_gbuf_map = \ + self.build_model_param_gbuf_map(self.model_gbuf_ranges) + + # Optimizer ranges. + self.model_param_group_index_map, self.opt_group_ranges = \ + self.build_optimizer_group_ranges(self.optimizer.param_groups, + self.model_gbuf_ranges) + + # Allocate main param shards. + ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, + self.shard_fp32_groups, + self.shard_fp32_from_float16_groups, + ) = self.build_model_and_main_param_groups(self.model_gbuf_ranges, + self.model_param_gbuf_map, + self.opt_group_ranges) + + # Initialize param buffers. + # - These are views on the DDP model's grad buffers, that share + # storage & have their own dtype. This is safe because the param + # dtype size is always <= grad dtype size. + self.param_buffers = [] + for model_index, model in enumerate(self.models): + current_param_buffers = {} + for dtype, grad_buffer in model._grad_buffers.items(): + + # Handle older/newer method for getting untyped storage. + try: + storage = grad_buffer.data.storage()._untyped() + except: + storage = grad_buffer.data.storage().untyped() + + # Typed param buffer. + param_buffer = torch.tensor( + storage, + dtype = params_dtype, + device = grad_buffer.data.device) + param_buffer = param_buffer[:grad_buffer.numel_padded] + current_param_buffers[dtype] = param_buffer + self.param_buffers.append(current_param_buffers) + + # Update optimizer groups. 
+ # - Also, leverage state_dict() and load_state_dict() to + # recast preexisting per-param state tensors. + self.optimizer.param_groups = \ + [ g["orig_group"] for g in self.opt_group_ranges ] + self.optimizer.load_state_dict(self.optimizer.state_dict()) + + + def get_model_param_range_map(self, param): + """ + Given a model param, get the index sub-range of the param that this + data-parallel rank owns. + """ + model_index, dtype = self.model_param_gbuf_map[param] + gbuf_range_map = self.model_gbuf_ranges[model_index][dtype] + param_range_map = gbuf_range_map["param_map"][param] + return param_range_map + + + def get_model_parallel_group(self): + """ + With the distributed optimizer, the model parallel group is the + entire world. + """ + return None + + + def state_dict(self): + """ + The state dict contains all non-DP-rank-dependent (i.e., non-parameter- + related) optimizer variables. The returned state dict can be stored in + the standard model/RNG checkpoint file. The parameter and dependent + optimizer state (e.g., exp_avg, exp_avg_sq) are stored in a separate + checkpoint file by calling 'save_parameter_state()'. + """ + + state_dict = {} + + # Optimizer state (do not store parameter state here). + state_dict['optimizer'] = { + k : v + for k, v in self.optimizer.state_dict().items() + if k != "state" + } + for param_group in state_dict["optimizer"]["param_groups"]: + del param_group["params"] + + # Grad scaler state. + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + + return state_dict + + + def load_state_dict(self, state_dict): + """Load the state dict. + + As detailed in state_dict(), the state dict contains all non- + parameter-related variables. This method is notably longer than + state_dict(), because the Torch optimizers state has yet to be + allocated at this point, and so we must do a cross referencing between + the optimizers state (and the ordering it expects for parameter state) + and this DP rank's shards. The optimizer at this point does not contain + any tensor dimension information, so we must get these dimensions from + the DP shards mapped during DistributedOptimizer.__init__(). + + The tensor parameter state is loaded via load_parameter_state(), and + so this method also must populate the loaded state dict with dummy + tensor data (i.e., via torch.empty() below). This will be overwritten + during load_parameter_state(). + + ** Note: Torch optimizer's state structure. ** + The Torch optimizer stores its state in two levels. The top level is a + list of groups, where each group contains a list of integer indexes + (corresponding to parameters) that index into a master parameter list + that is shared by all groups. As such, three values are necessary for + maintaining this ordering: + + - group_index : The group to which a parameter belongs. + - group_order : The index of a parameter within its group. + - state_order : The index of a parameter within the shared parameter + list. + """ + + # Get the Torch optimizer's state dict. + # - This 'inner' optimizer at this point is unallocated, and only + # contains an integer odering of parameters within each group, and + # the ordering of parameters within its flattened parameter state + # list. 
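+        # Hypothetical example: two groups holding 3 and 2 parameters give
+        #   inner_state_dict["param_groups"][0]["params"] == [0, 1, 2]
+        #   inner_state_dict["param_groups"][1]["params"] == [3, 4]
+        # so a shard with (group_index=1, group_order=0) has state_order 3
+        # in the shared optimizer state rebuilt below.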
+ inner_state_dict = self.optimizer.state_dict() + state_dict_param_groups = [{ + **group, + "params" : list(inner_state_dict["param_groups"][idx]["params"]), + } for idx, group in enumerate(state_dict["optimizer"]["param_groups"])] + + # Allocate 'dummy' data for optimizer state (i.e., torch.empty() below) + # - Real data is overwritten during load_parameter_state(). + state_dict_state = [] + for gbuf_range_maps in self.model_gbuf_ranges: + for gbuf_range_map in gbuf_range_maps.values(): + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Get parameter ordering information (see method docstring + # for details). + group_index, group_order = \ + self.model_param_group_index_map[model_param] + state_order = inner_state_dict["param_groups"] \ + [group_index]["params"][group_order] + + # Allocate dummy tensors. + numel = len(param_range_map["gbuf_world"]) + init_shard = lambda : torch.empty( + (numel,), + dtype=torch.float32, + device=torch.cuda.current_device()) + + state_dict_state.append((state_order, { + "exp_avg" : init_shard(), + "exp_avg_sq" : init_shard(), + })) + + # Sort by state order (see method docstring for details). + state_dict_state.sort(key = lambda s : s[0]) + state_dict_state = {s[0]:s[1] for s in state_dict_state} + + # Optimizer. + self.optimizer.load_state_dict({ + "state" : state_dict_state, + "param_groups" : state_dict_param_groups, + }) + + # Grad scaler. + if 'grad_scaler' not in state_dict: + if self.fp16: + print_rank_0('***WARNING*** found an old checkpoint, will not ' + 'load grad scaler ...') + else: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + else: + print_rank_0('***WARNING*** fould the grad scaler in the ' + 'checkpoint but it is None in the class. ' + 'Skipping loading grad scaler ...') + + + def save_parameter_state(self, filename): + """Save parameter state (i.e., parameter & optimizer tensors). + + This method performs three steps: + - For each DP rank, copy param & optimizer shards to contiguous CPU + buffers. (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + - Gather contiguous buffers on DP rank 0 and concatenate to world + buffers. + - Save world buffers to disk (i.e., distrib_opt.pt). + """ + + # Data parallelism variables. + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group_gloo = mpu.get_data_parallel_group_gloo() + data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS) + + # Collect param states. + state = {} + for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges): + + # Iterate grad buffers (by data type). + dtype_state = {} + assert len(gbuf_range_maps) == 1, "single dtype supported, for now." + for dtype, gbuf_range_map in gbuf_range_maps.items(): + + # Compute local DP contiguous shard's size. + model = self.models[model_idx] + gbuf_world_numel = model._grad_buffers[dtype].numel_padded + gbuf_local_numel = int(gbuf_world_numel/data_parallel_world_size) + local_shards = {key:torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for key in ("param", "exp_avg", "exp_avg_sq")} + + # Build contiguous DP rank shards (for param + optim states). + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. 
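+                    # For Adam, optim_state holds 'exp_avg' and 'exp_avg_sq';
+                    # together with the fp32 main_param these are the three
+                    # per-element tensors copied into local_shards below.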
+ group_index, group_order = \ + self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups \ + [group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param" : main_param, + **optim_state, + } + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + for key in local_shards: + local_shards[key][gbuf_local_start:gbuf_local_end] \ + .data.copy_(tensors[key].detach().cpu()) + + # Gather contiguous shards on DP rank 0. + world_tensors = {} + for key, send_tensor in local_shards.items(): + + # Gather tensor list. + if data_parallel_rank == 0: + recv_tensors = [torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for _ in range(data_parallel_world_size)] + else: + recv_tensors = None + + # Gather. + torch.distributed.gather( + send_tensor, + recv_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Concatenate. + if data_parallel_rank == 0: + world_tensors[key] = torch.cat(recv_tensors) + + # Collect world state. + dtype_state[dtype] = world_tensors + state[model_idx] = dtype_state + + # Save param state. + if data_parallel_rank == 0: + torch.save(state, filename) + + + def load_parameter_state(self, filename): + """Load parameter state (i.e., parameter & optimizer tensors). + + This method performs the reverse of save_parameter_state(): + - Load world buffers from disk (i.e., distrib_opt.pt). + - Scatter contiguous buffers from DP rank 0 to each DP rank (each DP + rank receives its relevant subset of the world buffers). + - For each DP rank, copy param & optimizer shards from contiguous CPU + buffers. (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + """ + + # Data parallelism variables. + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group_gloo = mpu.get_data_parallel_group_gloo() + data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS) + + # Load on DP rank 0. + if data_parallel_rank == 0: + loaded_state = torch.load(filename) + + # Scatter tensors to all DP ranks. + for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges): + for dtype, gbuf_range_map in gbuf_range_maps.items(): + + # Compute local DP contiguous shard's size. + model = self.models[model_idx] + gbuf_world_numel = model._grad_buffers[dtype].numel_padded + gbuf_local_numel = int(gbuf_world_numel/data_parallel_world_size) + + # Contiguous local shards (received from DP rank 0). + local_shards = {key:torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for key in ("param", "exp_avg", "exp_avg_sq")} + + # Scatter local shards from DP rank 0. + for key, recv_tensor in local_shards.items(): + + # Scatter tensor list. + if data_parallel_rank == 0: + world_tensor = loaded_state[model_idx][dtype][key] + gbuf_start_idxs = \ + list(range(0, gbuf_world_numel, gbuf_local_numel)) + send_tensors = [world_tensor[i:(i+gbuf_local_numel)] + for i in gbuf_start_idxs] + else: + send_tensors = None + + # Scatter. + torch.distributed.scatter( + recv_tensor, + send_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Copy local contiguous shards to param/optim shards. + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. 
+ group_index, group_order = \ + self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups \ + [group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param" : main_param, + **optim_state, + } + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + for key in local_shards: + tensors[key].data.copy_( + local_shards[key][gbuf_local_start:gbuf_local_end]) + + + def zero_grad(self, set_to_none=True): + """ + Zero grads. + + We only need to zero the model related parameters, i.e., + model_float16_groups & model_fp32_groups. We additionally zero + the remaining groups as a memory optimization to reduce + fragmentation; in the case of set_to_none==True, the space + used by this field can be safely deallocated at this point. + """ + for groups in ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, # grad empty/unused here? + self.shard_fp32_groups, # throws grad-access warning + self.shard_fp32_from_float16_groups): + for group in groups: + _zero_grad_group_helper(group, set_to_none) + + + @staticmethod + def get_model_buffer_dp_views(model_buffers): + """ + Get shard views of each of the DDP's param/grad buffers. + + In this nested list, the top level is grouped by the virtual model + index and the buffer's data type. The sub-level is a list of + shards of that buffer, where each shard in the list represents + a contiguous view of the buffer, that is owned by a data-parallel + rank. The shard boundary does not respect parameter boundaries, and + so the elements of some parameters are split across data parallel + ranks. + + Additionally, return references to the entire buffers, for use + in _reduce_scatter_base and _all_gather_base. + """ + + data_parallel_world_size = mpu.get_data_parallel_world_size() + + # Buffer views. + view_items = [] + for model_index, buffers in enumerate(model_buffers): + for dtype, buf in buffers.items(): + + assert buf.numel() % data_parallel_world_size == 0 + shard_size = int(buf.numel() / data_parallel_world_size) + buf_views = [buf[(r*shard_size):((r+1)*shard_size)] + for r in range(data_parallel_world_size)] + view_items.append((model_index, dtype, buf, buf_views)) + + return view_items + + + def get_model_grad_buffer_dp_views(self): + return self.get_model_buffer_dp_views([ + {dtype : mem_buffer.data} + for model in self.models + for dtype, mem_buffer in model._grad_buffers.items()]) + + + def get_model_param_buffer_dp_views(self): + return self.get_model_buffer_dp_views(self.param_buffers) + + + def reduce_model_grads(self, args, timers): + """ + Reduce-scatter model grads. + + The DDP's grad buffer is used for the reduce-scatter, and thus no + tensors are dynamically allocated. + + Note: this is a different order of reduction, versus the non- + distributed optimizer, which reduces: 1) layernorm grads, 2) all + grads, 3) embedding grads. + """ + + # All-reduce layer-norm grads (for sequence parallelism). + timers('layernorm-grads-all-reduce', log_level=1).start( + barrier=args.barrier_with_L1_time) + self.allreduce_layernorm_grads(args) + timers('layernorm-grads-all-reduce').stop() + + # All-reduce embedding grads. + timers('embedding-grads-all-reduce', log_level=1).start( + barrier=args.barrier_with_L1_time) + self.allreduce_embedding_grads(args) + timers('embedding-grads-all-reduce').stop() + + # Reduce-scatter setup. 
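+        # Illustrative sketch (hypothetical sizes): with a data-parallel
+        # world size of 4 and a padded grad buffer of 1024 elements, rank r
+        # owns elements [256*r, 256*(r+1)). After the 1/world_size scaling
+        # and the reduce-scatter below, each rank holds the data-parallel
+        # average of the grads in its own 256-element shard.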
+ timers('grads-reduce-scatter', log_level=1).start( + barrier=args.barrier_with_L1_time) + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_group = mpu.get_data_parallel_group() + + # Scale grad buffers by '1 / data_parallel_world_size'. + for model in self.models: + for dtype, gbuf in model._grad_buffers.items(): + gbuf.data /= data_parallel_world_size + + # Reduce-scatter all grads. + gbuf_view_items = self.get_model_grad_buffer_dp_views() + for index, (model_index, dtype, gbuf, gbuf_views) \ + in enumerate(gbuf_view_items): + + torch.distributed._reduce_scatter_base( + gbuf_views[data_parallel_rank], + gbuf, + group = data_parallel_group, + ) + + timers('grads-reduce-scatter').stop() + + + + def gather_model_params(self, args, timers): + """ + All-gather updated model params. + + The DDP's param buffer is used for the all-gather, and thus no + tensors are dynamically allocated. After the all-gather, the params + can be copied from the param buffer to the param. + """ + + timers('params-all-gather', log_level=1).start( + barrier=args.barrier_with_L1_time) + + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group = mpu.get_data_parallel_group() + + # All-gather updated main params. + # - All param buffer views are guaranteed to have the same num elements + # across all data parallel ranks, due to grad buffer padding that is + # done in distributed.py, and extended to the param buffers. Thus, + # all sub-views will have consistent start/end indexes across data + # parallel ranks. + pbuf_view_items = self.get_model_param_buffer_dp_views() + for index, (model_index, dtype, pbuf, pbuf_views) \ + in enumerate(pbuf_view_items): + + torch.distributed._all_gather_base( + pbuf, + pbuf_views[data_parallel_rank], + group = data_parallel_group, + ) + + # Copy from param buffer to each param. + for model_id, model in enumerate(self.models): + for dtype, param_map in model._grad_buffer_param_index_map.items(): + for param, (buf_start, buf_end) in param_map.items(): + param_buf = self.param_buffers[model_id][dtype] + param_buf_shard = param_buf[buf_start:buf_end] + param.view(-1).detach().copy_(param_buf_shard) + + timers('params-all-gather').stop() + + + def _collect_main_grad_data_for_unscaling(self): + """ + Note: this should be equivalent to the float-16 optimizer's method, + but writtent differently, so the two should be combined. + """ + return [ + param.grad.data + for group in self.optimizer.param_groups + for param in group["params"] + ] + + + def _get_model_and_main_params_data_float16(self): + """ + Get aligned list of model and main params. + """ + model_data = [] + main_data = [] + for model_group, main_group in zip(self.shard_float16_groups, + self.shard_fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + + def _copy_model_grads_to_main_grads(self): + """ + Copy model grads to main grads. + + Since this step follows a reduce-scatter through the DDP's grad + buffer, this method is responsible for copying the updated grads + from the grad buffer to the main shard's grad field. + """ + + # Utility method for copying group grads. 
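+        # Hypothetical example: a 10-element model_param whose owned shard
+        # covers param_range [3, 7) contributes
+        # model_param.main_grad.view(-1)[3:7], cast to fp32, as the grad of
+        # its 4-element shard_main_param.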
+ def copy_group_grads(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, + shard_main_groups): + for model_param, shard_main_param in zip(model_group, + shard_main_group): + + param_range_map = self.get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + model_grad = model_param.main_grad + shard_model_grad = model_grad.view(-1) \ + [param_range.start:param_range.end] + shard_main_param.grad = shard_model_grad.float() + + # Copy model groups to shard groups. + copy_group_grads(self.model_float16_groups, + self.shard_fp32_from_float16_groups) + copy_group_grads(self.model_fp32_groups, + self.shard_fp32_groups) + + + def _copy_main_params_to_model_params(self): + """ + Copy main params to model params. + + Since this step is followed by an all-gather through the DDP's grad + buffer, this method is responsible for copying the updated params + from the main shards into the correct position in the grad buffer. + """ + + # Utility method for copying group params. + def copy_group_params(shard_main_groups, model_groups): + for shard_main_group, model_group in zip(shard_main_groups, + model_groups): + for shard_main_param, model_param in zip(shard_main_group, + model_group): + + param_range_map = self.get_model_param_range_map(model_param) + world_range = param_range_map["gbuf_world"] + + assert world_range.size == shard_main_param.nelement() + + model_id, dtype = self.model_param_gbuf_map[model_param] + model_param_buffer = self.param_buffers[model_id][dtype] + + shard_model_param = model_param_buffer.view(-1) \ + [world_range.start:world_range.end] + + shard_model_param.data.copy_(shard_main_param) + + # Copy shard groups to model groups. + copy_group_params(self.shard_fp32_from_float16_groups, + self.model_float16_groups) + copy_group_params(self.shard_fp32_groups, + self.model_fp32_groups) + + + def _copy_model_params_to_main_params(self): + """ + Copy model params to main params. + + During finetuning, this method is used to reload the main params from + the model params. This copy does not make use of the grad buffer as + an intermediary. + """ + + # Utility method for copying group params. + def copy_group_params(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, + shard_main_groups): + for model_param, shard_main_param in zip(model_group, + shard_main_group): + + param_range_map = self.get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + shard_model_param = model_param.view(-1) \ + [param_range.start:param_range.end] + shard_main_param.data.copy_(shard_model_param) + + # Copy model groups to shard groups. + copy_group_params(self.model_float16_groups, + self.shard_fp32_from_float16_groups) + copy_group_params(self.model_fp32_groups, + self.shard_fp32_groups) diff --git a/training/DeepSpeed-Domino/domino/optimizer/grad_scaler.py b/training/DeepSpeed-Domino/domino/optimizer/grad_scaler.py new file mode 100644 index 000000000..66f7c907a --- /dev/null +++ b/training/DeepSpeed-Domino/domino/optimizer/grad_scaler.py @@ -0,0 +1,120 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
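+
+# Rough usage sketch (for orientation; the values are hypothetical):
+#   scaler = DynamicGradScaler(initial_scale=2.0**16, min_scale=1.0,
+#                              growth_factor=2.0, backoff_factor=0.5,
+#                              growth_interval=1000, hysteresis=2)
+#   scaler.update(found_inf=True)   # after `hysteresis` inf/nan steps the
+#                                   # scale is halved (never below min_scale)
+#   scaler.update(found_inf=False)  # after `growth_interval` clean steps the
+#                                   # scale is doubled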
+ +"""Megatron grad scaler.""" + +from abc import ABC +from abc import abstractmethod + +import torch + + +class MegatronGradScaler(ABC): + + def __init__(self, initial_scale): + """Initialize scale value with the input initial scale.""" + assert initial_scale > 0.0 + self._scale = torch.cuda.FloatTensor([initial_scale]) + + @property + def scale(self): + return self._scale + + @property + def inv_scale(self): + return self._scale.double().reciprocal().float() + + @abstractmethod + def update(self, found_inf): + pass + + @abstractmethod + def state_dict(self): + pass + + @abstractmethod + def load_state_dict(self, state_dict): + pass + + + +class ConstantGradScaler(MegatronGradScaler): + + def update(self, found_inf): + pass + + def state_dict(self): + return dict() + + def load_state_dict(self, state_dict): + pass + + + +class DynamicGradScaler(MegatronGradScaler): + + def __init__(self, initial_scale, min_scale, + growth_factor, backoff_factor, + growth_interval, hysteresis): + """"Grad scaler with dynamic scale that gets adjusted + during training.""" + super(DynamicGradScaler, self).__init__(initial_scale) + + # Lower bound on the scale. + assert min_scale > 0.0 + assert min_scale <= initial_scale + self.min_scale = torch.cuda.FloatTensor([min_scale]) + # Growth and backoff factors for the scale. + assert growth_factor > 1.0 + self.growth_factor = torch.cuda.FloatTensor([growth_factor]) + assert backoff_factor < 1.0 + assert backoff_factor > 0.0 + self.backoff_factor = torch.cuda.FloatTensor([backoff_factor]) + # Interval over which if we don't see any inf/nan, + # we will scale the grad scale by the growth factor. + assert growth_interval > 0 + self.growth_interval = growth_interval + # Number of inf/nans we should see before scaling down + # the grad scale by the backoff factor. + assert hysteresis > 0 + self.hysteresis = hysteresis + + # Trackers. + self._growth_tracker = 0 + self._hysteresis_tracker = self.hysteresis + + + def update(self, found_inf): + + # If we have an inf/nan, growth tracker is set to 0 + # and hysterisis tracker is reduced by 1. + if found_inf: + self._growth_tracker = 0 + self._hysteresis_tracker -= 1 + # Now if we are out of hysteresis count, scale down the loss. + if self._hysteresis_tracker <= 0: + self._scale = torch.max(self._scale * self.backoff_factor, + self.min_scale) + else: + # If there is no nan/inf, increment the growth tracker. + self._growth_tracker += 1 + # If we have had enough consequitive intervals with no nan/inf: + if self._growth_tracker == self.growth_interval: + # Reset the tracker and hysteresis trackers, + self._growth_tracker = 0 + self._hysteresis_tracker = self.hysteresis + # and scale up the loss scale. + self._scale = self._scale * self.growth_factor + + + def state_dict(self): + state_dict = {} + state_dict['scale'] = self._scale + state_dict['growth_tracker'] = self._growth_tracker + state_dict['hysteresis_tracker'] = self._hysteresis_tracker + return state_dict + + + def load_state_dict(self, state_dict): + self._scale = state_dict['scale'].cuda(torch.cuda.current_device()) + self._growth_tracker = state_dict['growth_tracker'] + self._hysteresis_tracker = state_dict['hysteresis_tracker'] diff --git a/training/DeepSpeed-Domino/domino/optimizer/optimizer.py b/training/DeepSpeed-Domino/domino/optimizer/optimizer.py new file mode 100644 index 000000000..1578dcb8c --- /dev/null +++ b/training/DeepSpeed-Domino/domino/optimizer/optimizer.py @@ -0,0 +1,774 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Megatron optimizer.""" + +from abc import ABC +from abc import abstractmethod +from apex.multi_tensor_apply import multi_tensor_applier +import amp_C +import torch +from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from domino.utils import print_rank_0 +import domino.parallel_state as mpu +from domino.tensor_parallel.partition import copy_tensor_model_parallel_attributes, param_is_not_tensor_parallel_duplicate +from domino.modules.distributed import DistributedDataParallel as LocalDDP +from domino.modules.module import Float16Module, param_is_not_shared +from domino.utils import unwrap_model + +from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32 + + + + +def _zero_grad_group_helper(group, set_to_none): + """Zero out the gradient for a group of parameters. + Note: copied from torch.optim.optimizer.""" + for param in group: + if param.grad is not None: + if set_to_none: + param.grad = None + else: + if param.grad.grad_fn is not None: + param.grad.detach_() + else: + param.grad.requires_grad_(False) + param.grad.zero_() + + +def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None): + """Use multi-tensor-applier to copy values from one list to another. + We don't have a blfoat16 implementation so for now if the overflow_buf + is not provided, we default back to simple loop copy to be compatible + with bfloat16.""" + if overflow_buf: + overflow_buf.fill_(0) + # Scaling with factor `1.0` is equivalent to copy. + multi_tensor_applier(amp_C.multi_tensor_scale, + overflow_buf, + [this, that], + 1.0) + else: + for this_, that_ in zip(this, that): + that_.copy_(this_) + + + +class MegatronOptimizer(ABC): + + + def __init__(self, optimizer, clip_grad, + log_num_zeros_in_grad, + params_have_main_grad, + use_contiguous_buffers_in_local_ddp, + models): + + """Input optimizer is the base optimizer for example Adam.""" + self.optimizer = optimizer + assert self.optimizer, 'no optimizer is provided.' + # Set gradient clipping and logging params. + self.clip_grad = clip_grad + self.log_num_zeros_in_grad = log_num_zeros_in_grad + self.params_have_main_grad = params_have_main_grad + self.use_contiguous_buffers_in_local_ddp = use_contiguous_buffers_in_local_ddp + + # 'models' are retained for access to the contiguous grad buffers. 
+ # (see distributed optimizer) + self.models = models + + if self.use_contiguous_buffers_in_local_ddp: + assert self.params_have_main_grad, \ + "use of contiguous buffer requires that params have main grad" + + + def get_parameters(self): + params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + params.append(param) + return params + + + def get_main_grads_for_grad_norm(self): + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + params = self.get_parameters() + grads_for_norm = [] + for param in params: + grad = param.grad + grad_not_none = grad is not None + is_not_shared = param_is_not_shared(param) + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if grad_not_none and is_not_shared and is_not_tp_duplicate: + grads_for_norm.append(grad) + + return grads_for_norm + + + def get_model_parallel_group(self): + """Default returned here, but the distributed optimizer overrides this.""" + return mpu.get_model_parallel_group() + + + def clip_grad_norm(self, clip_grad): + params = self.get_parameters() + grads_for_norm = self.get_main_grads_for_grad_norm() + return clip_grad_norm_fp32( + params, grads_for_norm, clip_grad, + model_parallel_group=self.get_model_parallel_group()) + + + def count_zeros(self): + params = self.get_parameters() + return count_zeros_fp32(params, + model_parallel_group=self.get_model_parallel_group()) + + + @abstractmethod + def zero_grad(self, set_to_none=True): + pass + + + @abstractmethod + def get_loss_scale(self): + """The output should be a cuda tensor of size 1.""" + pass + + + def scale_loss(self, loss): + """Simple scaling.""" + return self.get_loss_scale() * loss + + + @abstractmethod + def reload_model_params(self): + """Refreshes any internal state from the current model parameters. + Call whenever the parameters are changed outside of the optimizer. + For example, when we load a model from a checkpoint without loading + the optimizer, the model parameters are updated but for fp16 optimizer + with main parameters, the main parameters need to also be updated.""" + pass + + + @abstractmethod + def state_dict(self): + pass + + + @abstractmethod + def load_state_dict(self, state_dict): + pass + + + # Promote state so it can be retrieved or set via + # "optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + + # Promote param_groups so it can be retrieved or set via + # "optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) + + + @abstractmethod + def step(self, args, timers): + pass + + + def gather_model_params(self, args, timers): + """ + For the case of a non-distributed-optimizer, there is nothing to + do here. + """ + pass + + + def allreduce_word_embedding_grads(self, args): + """ + All-reduce word embedding grads. + + Reduce grads across first and last stages to ensure that word_embeddings + parameters stay in sync. This should only run for models that support + pipelined model parallelism (BERT and GPT-2). 
+ """ + + if mpu.is_rank_in_embedding_group(ignore_virtual=True) and \ + mpu.get_pipeline_model_parallel_world_size() > 1: + if mpu.is_pipeline_first_stage(ignore_virtual=True): + unwrapped_model = self.models[0] + elif mpu.is_pipeline_last_stage(ignore_virtual=True): + unwrapped_model = self.models[-1] + else: # We do not support the interleaved schedule for T5 yet. + unwrapped_model = self.models[0] + unwrapped_model = unwrap_model( + unwrapped_model, (torchDDP, LocalDDP, Float16Module)) + + if unwrapped_model.share_embeddings_and_output_weights: + weight = unwrapped_model.shared_embedding_or_output_weight() + if args.DDP_impl == 'local': + grad = weight.main_grad + else: + grad = weight.grad + torch.distributed.all_reduce(grad, group=mpu.get_embedding_group()) + + + def allreduce_position_embedding_grads(self, args): + """ + All-reduce position_embeddings grad across first (encoder) and + split (decoder) stages to ensure that position embeddings parameters + stay in sync. This should only run for T5 models with pipeline + parallelism. + """ + if mpu.is_rank_in_position_embedding_group() and \ + mpu.get_pipeline_model_parallel_world_size() > 1 and \ + args.pipeline_model_parallel_split_rank is not None: + unwrapped_model = self.models[0] + unwrapped_model = unwrap_model( + unwrapped_model, (torchDDP, LocalDDP, Float16Module)) + assert args.DDP_impl == 'local', \ + 'T5 model is only supported with local DDP mode' + grad = unwrapped_model.language_model.embedding.position_embeddings.weight.main_grad + torch.distributed.all_reduce(grad, group=mpu.get_position_embedding_group()) + + + def allreduce_embedding_grads(self, args): + """All-reduce both word and position embeddings.""" + self.allreduce_word_embedding_grads(args) + self.allreduce_position_embedding_grads(args) + + + def allreduce_layernorm_grads(self, args): + """All-reduce layernorm grads (for sequence parallelism).""" + + # All-reduce layernorm parameters across model parallel nodes + # when sequence parallelism is used + if mpu.get_tensor_model_parallel_world_size() > 1 and \ + args.sequence_parallel: + grads = [] + for model_module in self.models: + unwrapped_model = unwrap_model( + model_module, (torchDDP, LocalDDP, Float16Module)) + for param in unwrapped_model.parameters(): + if getattr(param, 'sequence_parallel', False): + grad = param.main_grad if args.DDP_impl == 'local' else param.grad + grads.append(grad.data) + coalesced = _flatten_dense_tensors(grads) + torch.distributed.all_reduce( + coalesced, group=mpu.get_tensor_model_parallel_group()) + for buf, synced in zip(grads, _unflatten_dense_tensors( + coalesced, grads)): + buf.copy_(synced) + + def reduce_model_grads(self, args, timers): + """All-reduce all grads, and all-reduce embeddings.""" + + # All-reduce layer-norm grads (for sequence parallelism). + timers('layernorm-grads-all-reduce', log_level=1).start( + barrier=args.barrier_with_L1_time) + self.allreduce_layernorm_grads(args) + timers('layernorm-grads-all-reduce').stop() + + # All-reduce if needed. + if args.DDP_impl == 'local': + timers('grads-all-reduce', log_level=1).start( + barrier=args.barrier_with_L1_time) + for model in self.models: + model.allreduce_gradients() + timers('grads-all-reduce').stop() + + # All-reduce embedding grads. 
+ timers('embedding-grads-all-reduce', log_level=1).start( + barrier=args.barrier_with_L1_time) + self.allreduce_embedding_grads(args) + timers('embedding-grads-all-reduce').stop() + + +class MixedPrecisionOptimizer(MegatronOptimizer): + """Base class for both the float-16 and the distributed optimizer. + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + log_num_zeros_in_grad: return number of zeros in the gradients. + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + params_dtype: used by distributed optimizer. + grad_scaler: used for scaling gradients. Note that this can be + None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. + """ + + def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, + models): + + super().__init__( + optimizer, clip_grad, log_num_zeros_in_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + models) + + self.fp16 = fp16 + self.bf16 = bf16 + self.params_dtype = params_dtype + self.grad_scaler = grad_scaler + + # None grad scaler is only supported for bf16. + if self.grad_scaler is None: + assert not self.fp16, 'fp16 expects a grad scaler.' + + # Tensor used to determine if a nan/if has happend. + # Any non-zero value indicates inf/nan. + # Note that we keep this for the cases that grad scaler is none. + # We still record nan/inf if we have a bfloat16 with a grad scaler. + if self.grad_scaler: + self.found_inf = torch.cuda.FloatTensor([0.0]) + + # Dummy tensor needed for apex multi-apply tensor. + # For bfloat, we don't have multi-tensor apply and for now + # we set it to none so the multi-tensor apply gets ignored. + if bf16: + self._dummy_overflow_buf = None + else: + self._dummy_overflow_buf = torch.cuda.IntTensor([0]) + + # In case grad scaler is not passed, define the unity scale. + if self.grad_scaler is None: + self._scale_one = torch.cuda.FloatTensor([1.0]) + + + def get_loss_scale(self): + if self.grad_scaler is None: + return self._scale_one + return self.grad_scaler.scale + + + def reload_model_params(self): + self._copy_model_params_to_main_params() + + + def _unscale_main_grads_and_check_for_nan(self): + + # Collect main grads. + main_grads = self._collect_main_grad_data_for_unscaling() + + # Reset found inf. 
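+        # found_inf is a one-element float tensor: 0.0 means all grads are
+        # finite, any non-zero value means an inf/nan was seen. It is
+        # max-reduced across the model-parallel group below so that every
+        # rank skips (or takes) the optimizer step together.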
+ self.found_inf.fill_(0.0) + + # Unscale and set found inf/nan + torch._amp_foreach_non_finite_check_and_unscale_( + main_grads, self.found_inf, self.grad_scaler.inv_scale) + + # Update across all model parallel instances. + torch.distributed.all_reduce(self.found_inf, + op=torch.distributed.ReduceOp.MAX, + group=self.get_model_parallel_group()) + + # Check for nan. + found_inf_flag = (self.found_inf.item() > 0) + + return found_inf_flag + + + @torch.no_grad() + def step(self, args, timers): + + # Copy gradients from model params to main params. + # timers('optimizer-copy-to-main-grad', log_level=1).start( + # barrier=args.barrier_with_L1_time) + self._copy_model_grads_to_main_grads() + # timers('optimizer-copy-to-main-grad').stop() + + # Do unscale, check for inf, and update grad scaler only for + # the case that grad scaler is provided. + if self.grad_scaler: + + # Unscale and check for inf/nan. + # timers('optimizer-unscale-and-check-inf', log_level=1).start( + # barrier=args.barrier_with_L1_time) + found_inf_flag = self._unscale_main_grads_and_check_for_nan() + # timers('optimizer-unscale-and-check-inf').stop() + + # We are done with scaling gradients + # so we can update the loss scale. + self.grad_scaler.update(found_inf_flag) + + # If we found inf/nan, skip the update. + if found_inf_flag: + return False, None, None + + # Clip the main gradients. + # timers('optimizer-clip-main-grad', log_level=1).start( + # barrier=args.barrier_with_L1_time) + grad_norm = None + if self.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.clip_grad) + # timers('optimizer-clip-main-grad').stop() + + # Count the zeros in the grads. + # timers('optimizer-count-zeros', log_level=1).start( + # barrier=args.barrier_with_L1_time) + num_zeros_in_grad = self.count_zeros() if \ + self.log_num_zeros_in_grad else None + # timers('optimizer-count-zeros').stop() + + # Step the optimizer. + # timers('optimizer-inner-step', log_level=1).start( + # barrier=args.barrier_with_L1_time) + self.optimizer.step() + # timers('optimizer-inner-step').stop() + + # Update params from main params. + # timers('optimizer-copy-main-to-model-params', log_level=1).start( + # barrier=args.barrier_with_L1_time) + self._copy_main_params_to_model_params() + # timers('optimizer-copy-main-to-model-params').stop() + + # Successful update. + return True, grad_norm, num_zeros_in_grad + + +class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer): + """Float16 optimizer for fp16 and bf16 data types. + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + log_num_zeros_in_grad: return number of zeros in the gradients. + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + grad_scaler: used for scaling gradients. Note that this can be + None. 
This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. + """ + + def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models): + + super().__init__( + optimizer, clip_grad, log_num_zeros_in_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models) + + # ====================== + # main parameter stuff + # ====================== + + # Three groups of parameters: + # float16_groups: original float16 parameters + # fp32_from_float16_groups: fp32 copy of float16 parameters + # fp32_from_fp32_groups: original fp32 parameters + self.float16_groups = [] + self.fp32_from_float16_groups = [] + self.fp32_from_fp32_groups = [] + + # For all the groups in the original optimizer: + for param_group in self.optimizer.param_groups: + float16_params_this_group = [] + fp32_params_this_group = [] + fp32_from_float16_params_this_group = [] + # For all the parameters in this group: + for i, param in enumerate(param_group['params']): + if param.requires_grad: + + # float16 params: + if param.type() in ['torch.cuda.HalfTensor', + 'torch.cuda.BFloat16Tensor']: + float16_params_this_group.append(param) + # Create a copy + main_param = param.detach().clone().float() + # Copy tensor model parallel attributes. + copy_tensor_model_parallel_attributes(main_param, + param) + if hasattr(param, 'shared'): + main_param.shared = param.shared + # Replace the optimizer params with the new fp32 copy. + param_group['params'][i] = main_param + + fp32_from_float16_params_this_group.append(main_param) + # Reset existing state dict key to the new main param. + if param in self.optimizer.state: + self.optimizer.state[main_param] \ + = self.optimizer.state.pop(param) + # fp32 params. + elif param.type() == 'torch.cuda.FloatTensor': + fp32_params_this_group.append(param) + param_group['params'][i] = param + + else: + raise TypeError('Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor. ' + 'Received {}'.format(param.type())) + + self.float16_groups.append(float16_params_this_group) + self.fp32_from_float16_groups.append( + fp32_from_float16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + + def zero_grad(self, set_to_none=True): + """We only need to zero the model related parameters, i.e., + float16_groups & fp32_from_fp32_groups. We additionally zero + fp32_from_float16_groups as a memory optimization to reduce + fragmentation; in the case of set_to_none==True, the space + used by this field can be safely deallocated at this point.""" + for group in self.float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_fp32_groups: + _zero_grad_group_helper(group, set_to_none) + + + def _collect_main_grad_data_for_unscaling(self): + + main_grads = [] + + # fp32 params from float16 ones. 
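+ # Gradients are collected as a flat list of tensors (fp32 main copies
+ # of the float16 params first, then native fp32 params); parameters
+ # whose grad is None this step are simply skipped. The list feeds the
+ # fused unscale/inf-check in _unscale_main_grads_and_check_for_nan.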
+ for main_group in self.fp32_from_float16_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + # Append fp32 parameters. + for main_group in self.fp32_from_fp32_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + return main_grads + + + def _get_model_and_main_params_data_float16(self): + model_data = [] + main_data = [] + for model_group, main_group in zip(self.float16_groups, + self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + + def _copy_model_grads_to_main_grads(self): + # This only needs to be done for the float16 group. + for model_group, main_group in zip(self.float16_groups, + self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + if self.params_have_main_grad and hasattr(model_param, 'main_grad'): + main_param.grad = model_param.main_grad.float() + else: + if model_param.grad is not None: + main_param.grad = model_param.grad.float() + + # Safe to deallocate model's grad/main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + model_param.grad = None + if self.params_have_main_grad and \ + not self.use_contiguous_buffers_in_local_ddp: + model_param.main_grad = None + + # For fp32 grads, we need to reset the grads to main grad. + if self.params_have_main_grad: + for model_group in self.fp32_from_fp32_groups: + for model_param in model_group: + model_param.grad = model_param.main_grad + + # Safe to de-reference model's main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + if not self.use_contiguous_buffers_in_local_ddp: + model_param.main_grad = None + + + def _copy_main_params_to_model_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that(this=main_data, that=model_data, + overflow_buf=self._dummy_overflow_buf) + + + def _copy_model_params_to_main_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that(this=model_data, that=main_data, + overflow_buf=self._dummy_overflow_buf) + + + def state_dict(self): + state_dict = {} + state_dict['optimizer'] = self.optimizer.state_dict() + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups + return state_dict + + + def load_state_dict(self, state_dict): + # Optimizer. + optimizer_key = 'optimizer' + if optimizer_key not in state_dict: + optimizer_key = 'optimizer_state_dict' + print_rank_0('***WARNING*** loading optimizer from ' + 'an old checkpoint ...') + self.optimizer.load_state_dict(state_dict[optimizer_key]) + + # Grad scaler. + if 'grad_scaler' not in state_dict: + if self.fp16: + print_rank_0('***WARNING*** found an old checkpoint, will not ' + 'load grad scaler ...') + else: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + else: + print_rank_0('***WARNING*** fould the grad scaler in the ' + 'checkpoint but it is None in the class. 
' + 'Skipping loading grad scaler ...') + + # Copy data for the main params. + fp32_from_float16_params_key = 'fp32_from_fp16_params' + if fp32_from_float16_params_key not in state_dict: + fp32_from_float16_params_key = 'fp32_from_fp16' + for current_group, saved_group in zip( + self.fp32_from_float16_groups, + state_dict[fp32_from_float16_params_key]): + for current_param, saved_param in zip(current_group, saved_group): + current_param.data.copy_(saved_param.data) + + +class FP32Optimizer(MegatronOptimizer): + + def __init__(self, optimizer, clip_grad, + log_num_zeros_in_grad, + params_have_main_grad, + use_contiguous_buffers_in_local_ddp, + models): + + super(FP32Optimizer, self).__init__( + optimizer, clip_grad, log_num_zeros_in_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + models) + + self._scale = torch.cuda.FloatTensor([1.0]) + + + def zero_grad(self, set_to_none=True): + """Copied from torch.optim.optimizer""" + for group in self.optimizer.param_groups: + _zero_grad_group_helper(group['params'], set_to_none) + + + def get_loss_scale(self): + """FP32 optimizer does not do any scaling.""" + return self._scale + + + @torch.no_grad() + def step(self, args, timers): + """Clip gradients (if needed) and step the base optimizer. + Always return successful since there is no overflow.""" + + # Copy main_grads to grads. + timers('optimizer-copy-to-main-grad', log_level=1).start( + barrier=args.barrier_with_L1_time) + if self.params_have_main_grad: + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + param.grad = param.main_grad + + # Safe to de-reference model's main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + if not self.use_contiguous_buffers_in_local_ddp: + param.main_grad = None + timers('optimizer-copy-to-main-grad').stop() + + # Clip gradients. + timers('optimizer-clip-main-grad', log_level=1).start( + barrier=args.barrier_with_L1_time) + grad_norm = None + if self.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.clip_grad) + timers('optimizer-clip-main-grad').stop() + + # count the zeros in the grads + timers('optimizer-count-zeros', log_level=1).start( + barrier=args.barrier_with_L1_time) + num_zeros_in_grad = self.count_zeros() if \ + self.log_num_zeros_in_grad else None + timers('optimizer-count-zeros').stop() + + # Update parameters. + timers('optimizer-inner-step', log_level=1).start( + barrier=args.barrier_with_L1_time) + self.optimizer.step() + timers('optimizer-inner-step').stop() + + # No overflow for FP32 optimizer. + return True, grad_norm, num_zeros_in_grad + + + def reload_model_params(self): + pass + + + def state_dict(self): + return self.optimizer.state_dict() + + + def load_state_dict(self, state_dict): + self.optimizer.load_state_dict(state_dict) diff --git a/training/DeepSpeed-Domino/domino/optimizer_param_scheduler.py b/training/DeepSpeed-Domino/domino/optimizer_param_scheduler.py new file mode 100644 index 000000000..1c9aab34c --- /dev/null +++ b/training/DeepSpeed-Domino/domino/optimizer_param_scheduler.py @@ -0,0 +1,234 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# This file is adapted from optimizer_param_scheduler.py in Megatron-LM + + +import math +from domino.utils import print_rank_0 + +class OptimizerParamScheduler(object): + """Anneals learning rate and weight decay""" + + def __init__(self, optimizer, init_lr, max_lr, min_lr, + lr_warmup_steps, lr_decay_steps, lr_decay_style, + start_wd, end_wd, wd_incr_steps, wd_incr_style, + use_checkpoint_opt_param_scheduler=True, + override_opt_param_scheduler=False): + + # Class values. + self.optimizer = optimizer + + self.init_lr = init_lr + self.max_lr = float(max_lr) + self.min_lr = min_lr + assert self.min_lr >= 0.0 + assert self.max_lr >= self.min_lr + assert self.init_lr <= self.max_lr + + self.lr_warmup_steps = lr_warmup_steps + self.num_steps = 0 + self.lr_decay_steps = lr_decay_steps + assert self.lr_decay_steps > 0 + assert self.lr_warmup_steps < self.lr_decay_steps + + self.lr_decay_style = lr_decay_style + + self.start_wd = start_wd + self.end_wd = end_wd + assert self.start_wd >= 0.0 + assert self.end_wd >= self.start_wd + self.wd_incr_steps = wd_incr_steps + self.wd_incr_style = wd_incr_style + + self.override_opt_param_scheduler = override_opt_param_scheduler + self.use_checkpoint_opt_param_scheduler = use_checkpoint_opt_param_scheduler + if self.override_opt_param_scheduler: + assert not self.use_checkpoint_opt_param_scheduler, 'both override and '\ + 'use-checkpoint are set.' + + # Set the learning rate + self.step(0) + print_rank_0('> learning rate decay style: {}'.format(self.lr_decay_style)) + + + def get_wd(self): + """ Weight decay incr functions""" + if self.num_steps > self.wd_incr_steps: + return self.end_wd + + if self.wd_incr_style == 'constant': + assert self.start_wd == self.end_wd + return self.end_wd + + incr_ratio = float(self.num_steps) / float(self.wd_incr_steps) + assert incr_ratio >= 0.0 + assert incr_ratio <= 1.0 + delta_wd = self.end_wd - self.start_wd + + if self.wd_incr_style == 'linear': + coeff = incr_ratio + elif self.wd_incr_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * (1 - incr_ratio)) + 1.0) + else: + raise Exception('{} weight decay increment style is not supported.'.format( + self.wd_incr_style)) + + return self.start_wd + coeff * delta_wd + + + def get_lr(self): + """Learning rate decay functions from: + https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + + # Use linear warmup for the initial part. + if self.lr_warmup_steps > 0 and self.num_steps <= self.lr_warmup_steps: + return ( + self.init_lr + + ( + (self.max_lr - self.init_lr) + * float(self.num_steps) + / float(self.lr_warmup_steps) + ) + ) + + # If the learning rate is constant, just return the initial value. + if self.lr_decay_style == 'constant': + return self.max_lr + + # For any steps larger than `self.lr_decay_steps`, use `self.min_lr`. + if self.num_steps > self.lr_decay_steps: + return self.min_lr + + # If we are done with the warmup period, use the decay style. 
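+ # 'inverse-square-root' decays as max_lr * sqrt(warmup_steps / num_steps)
+ # (floored at min_lr); 'linear' and 'cosine' anneal from max_lr to min_lr
+ # over the remaining (lr_decay_steps - lr_warmup_steps) steps, e.g.
+ # cosine: lr = min_lr + 0.5 * (max_lr - min_lr) * (1 + cos(pi * decay_ratio)).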
+ if self.lr_decay_style == 'inverse-square-root': + warmup_steps = max(self.lr_warmup_steps, 1) + num_steps = max(self.num_steps, 1) + lr = self.max_lr * warmup_steps ** 0.5 / (num_steps ** 0.5) + return max(self.min_lr, lr) + + num_steps_ = self.num_steps - self.lr_warmup_steps + decay_steps_ = self.lr_decay_steps - self.lr_warmup_steps + decay_ratio = float(num_steps_) / float(decay_steps_) + assert decay_ratio >= 0.0 + assert decay_ratio <= 1.0 + delta_lr = self.max_lr - self.min_lr + + if self.lr_decay_style == 'linear': + coeff = (1.0 - decay_ratio) + elif self.lr_decay_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + else: + raise Exception('{} decay style is not supported.'.format( + self.lr_decay_style)) + + return self.min_lr + coeff * delta_lr + + + def step(self, increment): + """Set lr for all parameters groups.""" + self.num_steps += increment + new_lr = self.get_lr() + new_wd = self.get_wd() + for group in self.optimizer.param_groups: + group['lr'] = new_lr * group.get('lr_mult', 1.0) + group['weight_decay'] = new_wd * group.get('wd_mult', 1.0) + + + def state_dict(self): + state_dict = { + 'max_lr': self.max_lr, + 'lr_warmup_steps': self.lr_warmup_steps, + 'num_steps': self.num_steps, + 'lr_decay_style': self.lr_decay_style, + 'lr_decay_steps': self.lr_decay_steps, + 'min_lr': self.min_lr, + 'start_wd': self.start_wd, + 'end_wd': self.end_wd, + 'wd_incr_style': self.wd_incr_style, + 'wd_incr_steps': self.wd_incr_steps + } + return state_dict + + + def _check_and_set(self, cls_value, sd_value, name): + """Auxiliary function for checking the values in the checkpoint and + setting them.""" + if self.override_opt_param_scheduler: + print_rank_0(' > overriding {} value to {}'.format(name, cls_value)) + return cls_value + + if not self.use_checkpoint_opt_param_scheduler: + assert cls_value == sd_value, \ + f'OptimizerParamScheduler: class input value {cls_value} and checkpoint' \ + f'value {sd_value} for {name} do not match' + print_rank_0(' > using checkpoint value {} for {}'.format(sd_value, + name)) + return sd_value + + + def load_state_dict(self, sd): + + if 'start_lr' in sd: + max_lr_ = sd['start_lr'] + else: + max_lr_ = sd['max_lr'] + self.max_lr = self._check_and_set(self.max_lr, max_lr_, + 'learning rate') + + self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'], + 'minimum learning rate') + + if 'warmup_iter' in sd: + lr_warmup_steps_ = sd['warmup_iter'] + elif 'warmup_steps' in sd: + lr_warmup_steps_ = sd['warmup_steps'] + else: + lr_warmup_steps_ = sd['lr_warmup_steps'] + self.lr_warmup_steps = self._check_and_set(self.lr_warmup_steps, + lr_warmup_steps_, + 'warmup iterations') + + if 'end_iter' in sd: + lr_decay_steps_ = sd['end_iter'] + elif 'decay_steps' in sd: + lr_decay_steps_ = sd['decay_steps'] + else: + lr_decay_steps_ = sd['lr_decay_steps'] + self.lr_decay_steps = self._check_and_set(self.lr_decay_steps, lr_decay_steps_, + 'total number of iterations') + + if 'decay_style' in sd: + lr_decay_style_ = sd['decay_style'] + else: + lr_decay_style_ = sd['lr_decay_style'] + self.lr_decay_style = self._check_and_set(self.lr_decay_style, + lr_decay_style_, + 'learning rate decay style') + + if 'num_iters' in sd: + num_steps = sd['num_iters'] + else: + num_steps = sd['num_steps'] + self.step(increment=num_steps) + + + if 'start_wd' in sd: + self.start_wd = self._check_and_set(self.start_wd, + sd['start_wd'], + "start weight decay") + self.end_wd = self._check_and_set(self.end_wd, + sd['end_wd'], + "end weight decay") + 
self.wd_incr_steps = self._check_and_set(self.wd_incr_steps, + sd['wd_incr_steps'], + "total number of weight decay iterations") + self.wd_incr_style = self._check_and_set(self.wd_incr_style, + sd['wd_incr_style'], + "weight decay incr style") + + + + + + + + diff --git a/training/DeepSpeed-Domino/domino/parallel_state.py b/training/DeepSpeed-Domino/domino/parallel_state.py new file mode 100644 index 000000000..aceba20d0 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/parallel_state.py @@ -0,0 +1,260 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from parallel_state.py in Megatron-LM + +import operator +from functools import reduce +import torch + + +class GlobalMemoryBuffer: + """Global buffer to avoid dynamic memory allocations. + Caller should ensure that buffers of the same name + are not used concurrently.""" + + def __init__(self): + self.buffer = {} + + def get_tensor(self, tensor_shape, dtype, name): + required_len = reduce(operator.mul, tensor_shape, 1) + if ( + self.buffer.get((name, dtype), None) is None + or self.buffer[(name, dtype)].numel() < required_len + ): + self.buffer[(name, dtype)] = torch.empty( + required_len, dtype=dtype, device=torch.cuda.current_device(), requires_grad=False + ) + + return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape) + + +_TENSOR_MODEL_PARALLEL_GROUP = None +_PIPELINE_MODEL_PARALLEL_GROUP = None +_MODEL_PARALLEL_GROUP = None +_DATA_PARALLEL_GROUP = None + +_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_TENSOR_MODEL_PARALLEL_RANK = None +_MPU_PIPELINE_MODEL_PARALLEL_RANK = None + +_PIPELINE_GLOBAL_RANKS = None +_DATA_PARALLEL_GLOBAL_RANKS = None + +_GLOBAL_MEMORY_BUFFER = None + + +def initialize_model_parallel( + tensor_model_parallel_size: int = 1 +) -> None: + """Initialize model data parallel groups. + """ + + pipeline_model_parallel_size = 1 + assert torch.distributed.is_initialized() + world_size: int = torch.distributed.get_world_size() + data_parallel_size: int = world_size // ( + tensor_model_parallel_size * pipeline_model_parallel_size + ) + + num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size + num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size + num_data_parallel_groups: int = world_size // data_parallel_size + + rank = torch.distributed.get_rank() + + global _DATA_PARALLEL_GROUP + global _DATA_PARALLEL_GLOBAL_RANKS + all_data_parallel_group_ranks = [] + for i in range(pipeline_model_parallel_size): + start_rank = i * num_pipeline_model_parallel_groups + end_rank = (i + 1) * num_pipeline_model_parallel_groups + for j in range(tensor_model_parallel_size): + ranks = range(start_rank + j, end_rank, tensor_model_parallel_size) + all_data_parallel_group_ranks.append(list(ranks)) + group = torch.distributed.new_group(ranks) + if rank in ranks: + _DATA_PARALLEL_GROUP = group + _DATA_PARALLEL_GLOBAL_RANKS = ranks + + # Build the model-parallel groups. + global _MODEL_PARALLEL_GROUP + for i in range(data_parallel_size): + ranks = [ + data_parallel_group_ranks[i] + for data_parallel_group_ranks in all_data_parallel_group_ranks + ] + group = torch.distributed.new_group(ranks) + if rank in ranks: + _MODEL_PARALLEL_GROUP = group + + # Build the tensor model-parallel groups. 
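+ # Example layout (illustrative): with 8 ranks and
+ # tensor_model_parallel_size=2 (pipeline parallelism is fixed to 1),
+ # tensor-parallel groups are [0,1] [2,3] [4,5] [6,7] and data-parallel
+ # groups are [0,2,4,6] [1,3,5,7]; with a single pipeline stage the
+ # model-parallel groups coincide with the tensor-parallel ones.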
+ global _TENSOR_MODEL_PARALLEL_GROUP + for i in range(num_tensor_model_parallel_groups): + ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) + group = torch.distributed.new_group(ranks) + if rank in ranks: + _TENSOR_MODEL_PARALLEL_GROUP = group + + global _PIPELINE_MODEL_PARALLEL_GROUP + global _PIPELINE_GLOBAL_RANKS + for i in range(num_pipeline_model_parallel_groups): + ranks = range(i, world_size, num_pipeline_model_parallel_groups) + group = torch.distributed.new_group(ranks) + if rank in ranks: + _PIPELINE_MODEL_PARALLEL_GROUP = group + _PIPELINE_GLOBAL_RANKS = ranks + + _set_global_memory_buffer() + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + assert _MODEL_PARALLEL_GROUP is not None, 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_tensor_model_parallel_group(check_initialized=True): + """Get the tensor model parallel group the caller rank belongs to.""" + if check_initialized: + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is not None + ), 'tensor model parallel group is not initialized' + return _TENSOR_MODEL_PARALLEL_GROUP + + +def get_pipeline_model_parallel_group(): + """Get the pipeline model parallel group the caller rank belongs to.""" + assert ( + _PIPELINE_MODEL_PARALLEL_GROUP is not None + ), 'pipeline_model parallel group is not initialized' + return _PIPELINE_MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_tensor_model_parallel_world_size(): + """Return world size for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + return torch.distributed.get_world_size(group=get_tensor_model_parallel_group()) + + +def get_pipeline_model_parallel_world_size(): + """Return world size for the pipeline model parallel group.""" + global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group()) + + +def get_tensor_model_parallel_rank(): + """Return my rank for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None: + return _MPU_TENSOR_MODEL_PARALLEL_RANK + return torch.distributed.get_rank(group=get_tensor_model_parallel_group()) + + +def get_pipeline_model_parallel_rank(): + """Return my rank for the pipeline model parallel group.""" + return 0 + # global _MPU_PIPELINE_MODEL_PARALLEL_RANK + # if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None: + # return _MPU_PIPELINE_MODEL_PARALLEL_RANK + # return torch.distributed.get_rank(group=get_pipeline_model_parallel_group()) + + +def is_pipeline_first_stage(ignore_virtual=False): + """Return True if in the first pipeline model-parallel stage, False otherwise.""" + return get_pipeline_model_parallel_rank() == 0 + + +def is_pipeline_last_stage(ignore_virtual=False): + """Return True if in the last pipeline model-parallel stage, False otherwise.""" + # if not ignore_virtual: + # virtual_pipeline_model_parallel_world_size = ( + # get_virtual_pipeline_model_parallel_world_size() + # ) + # if virtual_pipeline_model_parallel_world_size is not None and 
get_virtual_pipeline_model_parallel_rank() != ( + # virtual_pipeline_model_parallel_world_size - 1 + # ): + # return False + # return get_pipeline_model_parallel_rank() == (get_pipeline_model_parallel_world_size() - 1) + return True + + +def get_tensor_model_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the tensor model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_tensor_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the data parallel group.""" + assert _DATA_PARALLEL_GLOBAL_RANKS is not None, "Data parallel group is not initialized" + return _DATA_PARALLEL_GLOBAL_RANKS[0] + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_world_size(group=get_data_parallel_group()) + else: + return 0 + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank(group=get_data_parallel_group()) + else: + return 0 + + +def _set_global_memory_buffer(): + """Initialize global buffer""" + global _GLOBAL_MEMORY_BUFFER + assert _GLOBAL_MEMORY_BUFFER is None, 'global memory buffer is already initialized' + _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer() + + +def get_global_memory_buffer(): + """Return the global GlobalMemoryBuffer object""" + assert _GLOBAL_MEMORY_BUFFER is not None, 'global memory buffer is not initialized' + return _GLOBAL_MEMORY_BUFFER + + +def destroy_global_memory_buffer(): + """Sets the global memory buffer to None""" + global _GLOBAL_MEMORY_BUFFER + _GLOBAL_MEMORY_BUFFER = None + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _TENSOR_MODEL_PARALLEL_GROUP + _TENSOR_MODEL_PARALLEL_GROUP = None + global _PIPELINE_MODEL_PARALLEL_GROUP + _PIPELINE_MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_TENSOR_MODEL_PARALLEL_RANK + _MPU_TENSOR_MODEL_PARALLEL_RANK = None + global _MPU_PIPELINE_MODEL_PARALLEL_RANK + _MPU_PIPELINE_MODEL_PARALLEL_RANK = None + global _GLOBAL_MEMORY_BUFFER + _GLOBAL_MEMORY_BUFFER = None diff --git a/training/DeepSpeed-Domino/domino/schedules.py b/training/DeepSpeed-Domino/domino/schedules.py new file mode 100644 index 000000000..aa8dff81a --- /dev/null +++ b/training/DeepSpeed-Domino/domino/schedules.py @@ -0,0 +1,354 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
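+# Non-pipelined forward/backward schedule: each call runs num_microbatches
+# forward/backward passes on a single model chunk, accumulating gradients
+# locally and deferring the gradient sync to the last microbatch via the
+# DDP no_sync context.
+#
+# Usage sketch (illustrative; argument values are placeholders):
+#   forward_backward_func = get_forward_backward_func()
+#   losses_reduced = forward_backward_func(
+#       forward_step_func=forward_step_func,
+#       data_iterator=train_data_iterator,
+#       model=model,
+#       num_microbatches=num_microbatches,
+#       seq_length=seq_length,
+#       micro_batch_size=micro_batch_size,
+#       forward_only=False)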
+# This file is adapted from schedules.py in Megatron-LM + +import contextlib +from typing import Iterator, List, Union +import torch +from torch.autograd.variable import Variable +from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP +import domino.parallel_state as parallel_state +from domino.modules.enums import ModelType +from domino.utils import get_attr_wrapped_model, get_model_config, get_model_type +from domino.utils import average_losses_across_data_parallel_group + +from megatron.core.pipeline_parallel import p2p_communication + + +def get_forward_backward_func(): + forward_backward_func = forward_backward_no_pipelining + return forward_backward_func + + +def custom_backward(output, grad_output): + '''Directly call C++ autograd engine. + + To make the 'deallocate_output_tensor' (above) optimization work, the C++ + autograd engine must be called directly, bypassing Pytorch's + torch.autograd.backward. Pytorch's 'backward' checks that the output and + grad have the same shape, while C++'s 'backward' does not. + ''' + + assert output.numel() == 1, "output should be pseudo-'freed' in schedule, to optimize memory" + assert isinstance(output, torch.Tensor), "output == '%s'." % type(output).__name__ + assert isinstance(grad_output, (torch.Tensor, type(None))), ( + "grad_output == '%s'." % type(grad_output).__name__ + ) + + # Handle scalar output + if grad_output is None: + assert output.numel() == 1, "implicit grad requires scalar output." + grad_output = torch.ones_like(output, memory_format=torch.preserve_format,) + + # Call c++ engine [ see torch/csrc/autograd/python_engine.cpp ] + Variable._execution_engine.run_backward( + tensors=(output,), + grad_tensors=(grad_output,), + keep_graph=False, + create_graph=False, + inputs=tuple(), + allow_unreachable=True, + accumulate_grad=True, + ) + + +def forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data=False, + checkpoint_activations_microbatch=None, +): + """Forward step for passed-in model. + + If first stage, input tensor is obtained from data_iterator, otherwise + passed-in input_tensor is used. 
+ + Returns output tensor.""" + if config.timers is not None: + config.timers('forward-compute', log_level=2).start() + + unwrap_output_tensor = False + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + unwrap_output_tensor = True + + set_input_tensor = get_attr_wrapped_model(model, "set_input_tensor") + set_input_tensor(input_tensor) + + if config.enable_autocast: + context_manager = torch.autocast("cuda", dtype=config.autocast_dtype) + else: + context_manager = contextlib.nullcontext() + with context_manager: + if checkpoint_activations_microbatch is None: + output_tensor, loss_func = forward_step_func(data_iterator, model) + else: + output_tensor, loss_func = forward_step_func( + data_iterator, model, checkpoint_activations_microbatch + ) + + if parallel_state.is_pipeline_last_stage(): + if not collect_non_loss_data: + output_tensor = loss_func(output_tensor) + loss = output_tensor + averaged_loss = average_losses_across_data_parallel_group([loss]) + loss_reduced = {"lm loss": averaged_loss[0]} + output_tensor = loss / num_microbatches + forward_data_store.append(loss_reduced) + else: + data = loss_func(output_tensor, non_loss_data=True) + forward_data_store.append(data) + + if config.timers is not None: + config.timers('forward-compute').stop() + + if unwrap_output_tensor: + return output_tensor + return [output_tensor] + + +def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config): + """Backward step through passed-in output tensor. + + If last stage, output_tensor_grad is None, otherwise gradient of loss + with respect to stage's output tensor. + + Returns gradient of loss with respect to input tensor (None if first + stage).""" + + # NOTE: This code currently can handle at most one skip connection. It + # needs to be modified slightly to support arbitrary numbers of skip + # connections. + + if config.timers is not None: + config.timers('backward-compute', log_level=2).start() + + # Retain the grad on the input_tensor. + unwrap_input_tensor_grad = False + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + unwrap_input_tensor_grad = True + for x in input_tensor: + if x is not None: + x.retain_grad() + + if not isinstance(output_tensor, list): + output_tensor = [output_tensor] + if not isinstance(output_tensor_grad, list): + output_tensor_grad = [output_tensor_grad] + + # Backward pass. + if output_tensor_grad[0] is None and config.grad_scale_func is not None: + output_tensor[0] = config.grad_scale_func(output_tensor[0]) + + if config.deallocate_pipeline_outputs: + custom_backward(output_tensor[0], output_tensor_grad[0]) + else: + torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0]) + + # Collect the grad of the input_tensor. + input_tensor_grad = [None] + if input_tensor is not None: + input_tensor_grad = [] + for x in input_tensor: + if x is None: + input_tensor_grad.append(None) + else: + input_tensor_grad.append(x.grad) + + # Handle single skip connection if it exists (encoder_hidden_state in + # model with encoder and decoder). 
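+ # With domino.parallel_state in this patch, pipeline parallelism is fixed
+ # to a single stage, so this encoder-and-decoder branch is effectively
+ # never taken; it is retained from the Megatron-LM schedule this file was
+ # adapted from.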
+ if ( + parallel_state.get_pipeline_model_parallel_world_size() > 1 + and parallel_state.is_pipeline_stage_after_split() + and model_type == ModelType.encoder_and_decoder + ): + if output_tensor_grad[1] is not None: + input_tensor_grad[-1].add_(output_tensor_grad[1]) + if unwrap_input_tensor_grad: + input_tensor_grad = input_tensor_grad[0] + + if config.timers is not None: + config.timers('backward-compute').stop() + + return input_tensor_grad + + +def forward_backward_no_pipelining( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, # unused + micro_batch_size: int, # unused + decoder_seq_length: int = None, # unused + forward_only: bool = False, + collect_non_loss_data: bool = False, +): + """Run forward and backward passes with no pipeline parallelism + (no inter-stage communication). + + Returns dictionary with losses. + + + See get_forward_backward_func() for argument details + """ + + if isinstance(model, list): + assert len(model) == 1, "non-pipeline-parallel schedule does not support model chunking" + model = model[0] + if isinstance(data_iterator, list): + assert ( + len(data_iterator) == 1 + ), "non-pipeline-parallel schedule does not support model chunking" + data_iterator = data_iterator[0] + + config = get_model_config(model) + + no_sync_func = config.no_sync_func + if no_sync_func is None and isinstance(model, torchDDP): + no_sync_func = model.no_sync + if no_sync_func is None: + no_sync_func = contextlib.nullcontext + + model_type = get_model_type(model) + + forward_data_store = [] + input_tensor, output_tensor_grad = None, None + with no_sync_func(): + for i in range(num_microbatches - 1): + output_tensor = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + ) + if not forward_only: + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + # Run computation for last microbatch out of context handler (want to + # synchronize gradients). + output_tensor = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + ) + + if not forward_only: + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + return forward_data_store + + +def get_tensor_shapes( + *, + rank: int, + model_type: ModelType, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int, + config, +): + # Determine right tensor sizes (based on position of rank with respect to split + # rank) and model size. 
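+ # Shapes are (seq_length, micro_batch_size, hidden_size); under sequence
+ # parallelism the sequence dimension is further divided by the
+ # tensor-model-parallel world size.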
+ + tensor_shapes = [] + + if config.sequence_parallel: + seq_length = seq_length // parallel_state.get_tensor_model_parallel_world_size() + if model_type == ModelType.encoder_and_decoder: + decoder_seq_length = ( + decoder_seq_length // parallel_state.get_tensor_model_parallel_world_size() + ) + + if model_type == ModelType.encoder_and_decoder: + if parallel_state.is_pipeline_stage_before_split(rank): + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + else: + tensor_shapes.append((decoder_seq_length, micro_batch_size, config.hidden_size)) + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + else: + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + return tensor_shapes + + +def recv_forward(tensor_shapes, config): + input_tensors = [] + for tensor_shape in tensor_shapes: + if tensor_shape is None: + input_tensors.append(None) + else: + input_tensors.append(p2p_communication.recv_forward(tensor_shape, config)) + return input_tensors + + +def recv_backward(tensor_shapes, config): + output_tensor_grads = [] + for tensor_shape in tensor_shapes: + if tensor_shape is None: + output_tensor_grads.append(None) + else: + output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape, config)) + return output_tensor_grads + + +def send_forward(output_tensors, tensor_shapes, config): + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes): + if tensor_shape is None: + continue + p2p_communication.send_forward(output_tensor, config) + + +def send_backward(input_tensor_grads, tensor_shapes, config): + if not isinstance(input_tensor_grads, list): + input_tensor_grads = [input_tensor_grads] + for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes): + if tensor_shape is None: + continue + p2p_communication.send_backward(input_tensor_grad, config) + + +def send_forward_recv_backward(output_tensors, tensor_shapes, config): + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + output_tensor_grads = [] + for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes): + if tensor_shape is None: + output_tensor_grads.append(None) + continue + output_tensor_grad = p2p_communication.send_forward_recv_backward( + output_tensor, tensor_shape, config + ) + output_tensor_grads.append(output_tensor_grad) + return output_tensor_grads + + +def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config): + if not isinstance(input_tensor_grads, list): + input_tensor_grads = [input_tensor_grads] + input_tensors = [] + for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes): + if tensor_shape is None: + input_tensors.append(None) + continue + input_tensor = p2p_communication.send_backward_recv_forward( + input_tensor_grad, tensor_shape, config + ) + input_tensors.append(input_tensor) + return input_tensors diff --git a/training/DeepSpeed-Domino/domino/tensor_parallel/__init__.py b/training/DeepSpeed-Domino/domino/tensor_parallel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/DeepSpeed-Domino/domino/tensor_parallel/comm.py b/training/DeepSpeed-Domino/domino/tensor_parallel/comm.py new file mode 100644 index 000000000..0f7f2d7ca --- /dev/null +++ b/training/DeepSpeed-Domino/domino/tensor_parallel/comm.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
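+# Autograd "regions" used by tensor parallelism. Copy/Reduce are conjugate
+# pairs (identity forward + all-reduce backward, and all-reduce forward +
+# identity backward); Scatter/Gather split or gather activations along the
+# last dimension across the tensor-parallel group.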
+# This file is adapted from comm.py in Megatron-LM + +import torch + +from domino.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) + +from .utils import split_tensor_along_last_dim + + +def reduce_tensor(tensor): + if get_tensor_model_parallel_world_size() == 1: + return tensor + + torch.distributed.all_reduce(tensor, group=get_tensor_model_parallel_group()) + return tensor + + +def split_tensor_last_dim(tensor): + world_size = get_tensor_model_parallel_world_size() + if world_size == 1: + return tensor + + tensor_splits = split_tensor_along_last_dim(tensor, world_size) + rank = get_tensor_model_parallel_rank() + return tensor_splits[rank].contiguous() + + +def gather_tensor_last_dim(tensor): + world_size = get_tensor_model_parallel_world_size() + if world_size == 1: + return tensor + + last_dim = tensor.dim() - 1 + rank = get_tensor_model_parallel_rank() + gathered_tensors = [torch.empty_like(tensor) for _ in range(world_size)] + gathered_tensors[rank] = tensor + torch.distributed.all_gather(gathered_tensors, tensor, group=get_tensor_model_parallel_group()) + return torch.cat(gathered_tensors, dim=last_dim).contiguous() + + +class CopyToModelParallelRegion(torch.autograd.Function): + @staticmethod + def forward(ctx, input_): + return input_ + + @staticmethod + def backward(ctx, grad_output): + return reduce_tensor(grad_output) + + +class ReduceFromModelParallelRegion(torch.autograd.Function): + @staticmethod + def forward(ctx, input_): + return reduce_tensor(input_) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +class ScatterToModelParallelRegion(torch.autograd.Function): + @staticmethod + def forward(ctx, input_): + return split_tensor_last_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return gather_tensor_last_dim(grad_output) + + +class GatherFromModelParallelRegion(torch.autograd.Function): + @staticmethod + def forward(ctx, input_): + return gather_tensor_last_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return split_tensor_last_dim(grad_output) diff --git a/training/DeepSpeed-Domino/domino/tensor_parallel/cross_entropy.py b/training/DeepSpeed-Domino/domino/tensor_parallel/cross_entropy.py new file mode 100644 index 000000000..a87c1f521 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/tensor_parallel/cross_entropy.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
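+# Cross entropy over a vocabulary sharded across tensor-parallel ranks: the
+# max logit, the target logit and the sum of exponentials are all-reduced,
+# so the result matches an unsharded softmax cross entropy without gathering
+# the full logits on any rank.
+#
+# Usage sketch (illustrative): `logits` has this rank's vocabulary shard as
+# its last dimension and `labels` holds global token ids:
+#   loss = vocab_parallel_cross_entropy(logits.float(), labels)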
+# This file is adapted from cross_entropy.py in Megatron-LM + +import torch + +from domino.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) + +from .utils import VocabUtility + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, logits, target): + max_logits = torch.max(logits, dim=-1)[0] + torch.distributed.all_reduce(max_logits, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group()) + logits = logits - max_logits.unsqueeze(dim=-1) + + partition_vocab_size = logits.size()[-1] + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() + vocab_start, vocab_end = VocabUtility.vocab_range_from_per_partition_vocab_size(partition_vocab_size, rank, world_size) + + target_mask = (target < vocab_start) | (target >= vocab_end) + adjusted_target = target.clone() - vocab_start + adjusted_target[target_mask] = 0 + + logits_2d = logits.view(-1, partition_vocab_size) + adjusted_target_1d = adjusted_target.view(-1) + batch_indices = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[batch_indices, adjusted_target_1d].clone().contiguous() + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=get_tensor_model_parallel_group()) + + exp_logits = torch.exp(logits) + sum_exp_logits = exp_logits.sum(dim=-1) + torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=get_tensor_model_parallel_group()) + + loss = torch.log(sum_exp_logits) - predicted_logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + ctx.save_for_backward(exp_logits, target_mask, adjusted_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + softmax, target_mask, adjusted_target_1d = ctx.saved_tensors + + grad_input = softmax.view(-1, softmax.size()[-1]) + batch_indices = torch.arange(start=0, end=grad_input.size()[0], device=grad_input.device) + softmax_update = 1.0 - target_mask.view(-1).float() + grad_input[batch_indices, adjusted_target_1d] -= softmax_update + grad_input = grad_input.view_as(softmax) + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None + + +def vocab_parallel_cross_entropy(vocab_parallel_logits, target): + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/training/DeepSpeed-Domino/domino/tensor_parallel/data.py b/training/DeepSpeed-Domino/domino/tensor_parallel/data.py new file mode 100644 index 000000000..8cc529521 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/tensor_parallel/data.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
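+# Broadcasts a dictionary of CPU tensors from tensor-parallel rank 0 to the
+# other ranks of its group: tensor sizes are broadcast first, then all values
+# are flattened into a single buffer and broadcast in one call before being
+# unpacked per key.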
+# This file is adapted from data.py in Megatron-LM + +import torch + +from domino.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_src_rank, +) + +_MAX_DATA_DIM = 5 + + +def _check_data_types(keys, data, target_dtype): + """Check that all the keys have the same target data type.""" + for key in keys: + assert data[key].dtype == target_dtype, ( + '{} has data type {} which ' + 'is different than {}'.format(key, data[key].dtype, target_dtype) + ) + + +def _build_key_size_numel_dictionaries(keys, data): + """Build the size on rank 0 and broadcast.""" + max_dim = _MAX_DATA_DIM + sizes = [0 for _ in range(max_dim) for _ in keys] + + # Pack the sizes on rank zero. + if get_tensor_model_parallel_rank() == 0: + offset = 0 + for key in keys: + assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' + size = data[key].size() + for i, s in enumerate(size): + sizes[i + offset] = s + offset += max_dim + + # Move to GPU and broadcast. + sizes_cuda = torch.cuda.LongTensor(sizes) + torch.distributed.broadcast( + sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group() + ) + + # Move back to cpu and unpack. + sizes_cpu = sizes_cuda.cpu() + key_size = {} + key_numel = {} + total_numel = 0 + offset = 0 + for key in keys: + i = 0 + size = [] + numel = 1 + while sizes_cpu[offset + i] > 0: + this_size = sizes_cpu[offset + i] + size.append(this_size) + numel *= this_size + i += 1 + key_size[key] = size + key_numel[key] = numel + total_numel += numel + offset += max_dim + + return key_size, key_numel, total_numel + + +def broadcast_data(keys, data, datatype): + """Broadcast data from rank zero of each model parallel group to the + members of the same model parallel group. + + Arguments: + keys: list of keys in the data disctionary to be broadcasted + data: data dictionary of string keys and cpu tensor values. + datatype: torch data type of all tensors in data associated + with keys. + """ + # Build (key, size) and (key, number of elements) dictionaries along + # with the total number of elements on all ranks. + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) + + # Pack on rank zero. + if get_tensor_model_parallel_rank() == 0: + # Check that all keys have the same data type. + _check_data_types(keys, data, datatype) + # Flatten the data associated with the keys + flatten_data = torch.cat([data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + else: + flatten_data = torch.empty(total_numel, device=torch.cuda.current_device(), dtype=datatype) + + # Broadcast + torch.distributed.broadcast( + flatten_data, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group() + ) + + # Unpack + output = {} + offset = 0 + for key in keys: + size = key_size[key] + numel = key_numel[key] + output[key] = flatten_data.narrow(0, offset, numel).view(size) + offset += numel + + return output diff --git a/training/DeepSpeed-Domino/domino/tensor_parallel/partition.py b/training/DeepSpeed-Domino/domino/tensor_parallel/partition.py new file mode 100644 index 000000000..b5f3b148b --- /dev/null +++ b/training/DeepSpeed-Domino/domino/tensor_parallel/partition.py @@ -0,0 +1,756 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
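+# Vocabulary-parallel embedding and column-parallel linear layers, plus the
+# linear autograd function that can fuse gradient accumulation and overlap
+# the input-gradient all-reduce (or reduce-scatter) with the weight-gradient
+# GEMM.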
+# This file is adapted from partition.py in Megatron-LM + +import math +import os +import warnings +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.nn.parameter import Parameter + +from megatron.core.model_parallel_config import ModelParallelConfig +from domino.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank, get_tensor_model_parallel_group +from domino.tensor_parallel.comm import ( + CopyToModelParallelRegion, + ReduceFromModelParallelRegion, + GatherFromModelParallelRegion, + ScatterToModelParallelRegion, +) + +# from .random import get_cuda_rng_tracker +from .utils import VocabUtility + +_grad_accum_fusion_available = True +try: + import fused_weight_gradient_mlp_cuda +except ImportError: + _grad_accum_fusion_available = False + +_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = { + 'tensor_model_parallel': False, + 'partition_dim': -1, + 'partition_stride': 1, +} + + +def param_is_not_tensor_parallel_duplicate(param): + return (hasattr(param, 'tensor_model_parallel') and param.tensor_model_parallel) or ( + get_tensor_model_parallel_rank() == 0 + ) + + +def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): + # Make sure the attributes are not set. + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + assert not hasattr(tensor, attribute) + # Set the attributes. + setattr(tensor, 'tensor_model_parallel', is_parallel) + setattr(tensor, 'partition_dim', dim) + setattr(tensor, 'partition_stride', stride) + + +def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor): + def maybe_set(attribute, value): + if not hasattr(tensor, attribute): + setattr(tensor, attribute, value) + + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute]) + + +def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor): + def maybe_copy(attribute): + if hasattr(source_tensor, attribute): + setattr(destination_tensor, attribute, getattr(source_tensor, attribute)) + + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + maybe_copy(attribute) + + +def _initialize_affine_weight_gpu(weight, init_method, partition_dim, stride=1): + """Initialize affine weight for model parallel on GPU.""" + + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) + + # with get_cuda_rng_tracker().fork(): + init_method(weight) + + +class VocabParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the vocabulary dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + + Keyword Arguments: + config: A megatron.core.ModelParallelConfig object + """ + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + *, + init_method: Callable, + config: ModelParallelConfig, + ): + super(VocabParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set the detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2.0 + self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + self.tensor_model_parallel_size = get_tensor_model_parallel_world_size() + # Divide the weight matrix along the vocaburaly dimension. 
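+ # Each rank owns a contiguous slice of the vocabulary; e.g. (illustrative,
+ # assuming the padded vocabulary divides evenly) a vocab of 50304 with
+ # tensor_model_parallel_size=4 gives 12576 rows per rank. Token ids outside
+ # this rank's range are masked to 0 in forward() and their embeddings
+ # zeroed before the cross-rank all-reduce.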
+ ( + self.vocab_start_index, + self.vocab_end_index, + ) = VocabUtility.vocab_range_from_global_vocab_size( + self.num_embeddings, get_tensor_model_parallel_rank(), self.tensor_model_parallel_size + ) + self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index + + # Allocate weights and initialize. + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.num_embeddings_per_partition, self.embedding_dim, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + _initialize_affine_weight_cpu( + self.weight, + self.num_embeddings, + self.embedding_dim, + self.num_embeddings_per_partition, + 0, + init_method, + params_dtype=config.params_dtype, + ) + else: + self.weight = Parameter( + torch.empty( + self.num_embeddings_per_partition, + self.embedding_dim, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) + + def forward(self, input_): + if self.tensor_model_parallel_size > 1: + # Build the mask. + input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index) + # Mask the input. + masked_input = input_.clone() - self.vocab_start_index + masked_input[input_mask] = 0 + else: + masked_input = input_ + # Get the embeddings. + output_parallel = F.embedding( + masked_input, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + # Mask the output embedding. + if self.tensor_model_parallel_size > 1: + output_parallel[input_mask, :] = 0.0 + # Reduce across all the model parallel GPUs. + output = ReduceFromModelParallelRegion.apply(output_parallel) + return output + + +class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function): + """See linear_with_grad_accumulation_and_async_allreduce""" + + @staticmethod + @custom_fwd + def forward( + ctx, + input, + weight, + bias, + gradient_accumulation_fusion, + async_grad_allreduce, + sequence_parallel, + ): + ctx.save_for_backward(input, weight) + ctx.use_bias = bias is not None + ctx.gradient_accumulation_fusion = gradient_accumulation_fusion + ctx.async_grad_allreduce = async_grad_allreduce + ctx.sequence_parallel = sequence_parallel + + if sequence_parallel: + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu") + torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group() + ) + total_input = all_gather_buffer + else: + total_input = input + + output = torch.matmul(total_input, weight.t()) + if bias is not None: + output = output + bias + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + use_bias = ctx.use_bias + + if ctx.sequence_parallel: + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu") + handle = torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=True + ) + + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # gather is scheduled before the input gradient computation + total_input = all_gather_buffer + else: + total_input = input 
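+ # The input-gradient GEMM is issued first so that its asynchronous
+ # all-reduce (or reduce-scatter under sequence parallelism) launched below
+ # can overlap with the weight-gradient GEMM; as noted below, this overlap
+ # relies on CUDA_DEVICE_MAX_CONNECTIONS=1.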
+ grad_input = grad_output.matmul(weight) + + if ctx.sequence_parallel: + handle.wait() + + # Doing gather + slicing during the NeMo forward pass can make this tensor + # not be contiguous. PyTorch only checks if the tensor is contiguous, and only + # clones it if it's not contiguous: + # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761 + grad_output = grad_output.contiguous() + # Convert the tensor shapes to 2D for execution compatibility + grad_output = grad_output.view( + grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2] + ) + total_input = total_input.view( + total_input.shape[0] * total_input.shape[1], total_input.shape[2] + ) + + if ctx.async_grad_allreduce: + # Asynchronous all-reduce + handle = torch.distributed.all_reduce( + grad_input, group=get_tensor_model_parallel_group(), async_op=True + ) + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # all-reduce is scheduled before the weight gradient computation + + if ctx.sequence_parallel: + assert not ctx.async_grad_allreduce + dim_size = list(input.size()) + sub_grad_input = torch.empty( + dim_size, dtype=input.dtype, device=torch.cuda.current_device(), requires_grad=False + ) + # reduce_scatter + handle = torch.distributed._reduce_scatter_base( + sub_grad_input, grad_input, group=get_tensor_model_parallel_group(), async_op=True + ) + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # reduce scatter is scheduled before the weight gradient computation + + if ctx.gradient_accumulation_fusion: + if weight.main_grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32( + total_input, grad_output, weight.main_grad + ) + elif weight.main_grad.dtype in (torch.float16, torch.bfloat16): + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16( + total_input, grad_output, weight.main_grad + ) + else: + raise RuntimeError("Unsupported gradient type for gradient accumulation fusion") + grad_weight = None + else: + grad_weight = grad_output.t().matmul(total_input) + grad_bias = grad_output.sum(dim=0) if use_bias else None + + if ctx.sequence_parallel: + handle.wait() + return sub_grad_input, grad_weight, grad_bias, None, None, None + + if ctx.async_grad_allreduce: + handle.wait() + + return grad_input, grad_weight, grad_bias, None, None, None + + +def linear_with_grad_accumulation_and_async_allreduce( + input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + gradient_accumulation_fusion: bool, + async_grad_allreduce: bool, + sequence_parallel: bool, +) -> torch.Tensor: + """Linear layer execution with asynchronous communication and + gradient accumulation fusion in backprop. + + This has the option to accumulate the result of backprop + calculation into an existing gradient buffer, preventing the need + to do an additional addition kernel after the gradient + calculation. + + Additionally, the tensor parallel all reduce of the input + gradients can be done asynchronously with the calculation of + the weight gradients. + + In the case of sequence parallelism, the reduce scatter of the + input gradients is done asynchronously with the calcluation of the + weight gradients. + + Use of this module requires that the environment variable + CUDA_DEVICE_MAX_CONNECTIONS=1. 
There are a few collective + operations, noted in the code, that should be scheduled before + compute kernels to overlap the communication with the computation, + which is necessary for a speedup but not for correctness so that + ordering isn't imposed by the scheduler. Setting + CUDA_DEVICE_MAX_CONNECTIONS=1 forces the kernels to be scheduled + in the order they are called. + + Arguments: + + input (torch.Tensor required): input like torch.nn.functional.linear + + weight (torch.Tensor required): weight like torch.nn.functional.linear + + bias (torch.Tensor optional): bias like torch.nn.functional.linear + + gradient_accumulation_fusion (bool required): Perform the gradient + accumulation fusion, requires the custom CUDA extension + fused_weight_gradient_mlp_cuda module. To use + gradient_accumulation_fusion you must install APEX with + --cpp_ext and --cuda_ext. For example: "pip install + --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" + " Note that the extension requires CUDA>=11. Otherwise, you + must turn off gradient accumulation fusion." + + async_grad_allreduce (bool required): Do the allreduce of input + gradients asyncronously with the computation of weight + gradients. If sequence_parallel is True, this must be + False, as no all reduce is performed. + + sequence_parallel (bool required): Indicates that sequence + parallelism is used and thus in the forward pass the input is + all gathered, and the backward pass the input gradients are + reduce scattered. + """ + args = [ + input, + weight, + bias, + gradient_accumulation_fusion, + async_grad_allreduce, + sequence_parallel, + ] + + if not linear_with_grad_accumulation_and_async_allreduce.warned: + if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": + if sequence_parallel: + warnings.warn( + "When using sequence parallelism it is recommended to set the " + "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for " + "maximum speedup" + ) + linear_with_grad_accumulation_and_async_allreduce.warned = True + + if async_grad_allreduce: + warnings.warn( + "When using async grad allreduce it is recommended to set the " + "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for " + "maximum speedup" + ) + linear_with_grad_accumulation_and_async_allreduce.warned = True + + return LinearWithGradAccumulationAndAsyncCommunication.apply(*args) + + +linear_with_grad_accumulation_and_async_allreduce.warned = False + + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + + Keyword Arguments + bias: If true, add bias + gather_output: If true, call all-gather on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: If True, do not add the bias term, instead + return it to be added by the caller. This + enables performance optimations where bias can + be fused with other elementwise operations. 
+ + skip_weight_param_allocation: If True, weight parameter is not allocated and must be passed + as a keyword argument `weight` during the forward pass. Note + that this does not affect bias, which will be allocated if + bias is True. Defaults to False. + + config: ModelParallelConfig object + + """ + + def __init__( + self, + input_size, + output_size, + *, + config: ModelParallelConfig, + init_method: Callable, + bias=True, + gather_output=False, + stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + skip_weight_param_allocation: bool = False, + ): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + # Divide the weight matrix along the last dimension. + world_size = get_tensor_model_parallel_world_size() + self.output_size_per_partition = output_size // world_size + self.skip_bias_add = skip_bias_add + self.config = config + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + if not skip_weight_param_allocation: + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, self.input_size, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + ) + else: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, + self.input_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight, init_method, partition_dim=0, stride=stride + ) + else: + self.weight = None + + if bias: + if config.use_cpu_initialization: + self.bias = Parameter( + torch.empty(self.output_size_per_partition, dtype=config.params_dtype) + ) + else: + self.bias = Parameter( + torch.empty( + self.output_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, stride) + if config.perform_initialization: + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + self.async_tensor_model_parallel_allreduce = ( + config.async_tensor_model_parallel_allreduce and world_size > 1 + ) + + self.sequence_parallel = config.sequence_parallel + if self.sequence_parallel and world_size <= 1: + warnings.warn( + f"`sequence_parallel` is set to `True`, but tensor model parallel size is {world_size}. " + f"Disabling sequence parallel." + ) + self.sequence_parallel = False + + if config.gradient_accumulation_fusion and not _grad_accum_fusion_available: + raise RuntimeError( + "ColumnParallelLinear was called with gradient_accumulation_fusion set " + "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda " + "module is not found. To use gradient_accumulation_fusion you must " + "install APEX with --cpp_ext and --cuda_ext. For example: " + "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" " + "Note that the extension requires CUDA>=11. Otherwise, you must turn off " + "gradient accumulation fusion." 
+ ) + self.gradient_accumulation_fusion = config.gradient_accumulation_fusion + + if self.async_tensor_model_parallel_allreduce and self.sequence_parallel: + raise RuntimeError( + "`async_tensor_model_parallel_allreduce` and `sequence_parallel` " + "cannot be enabled at the same time." + ) + + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): + """Forward of ColumnParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + weight (optional): weight tensor to use, compulsory when + skip_weight_param_allocation is True. + + Returns: + - output + - bias + + """ + if weight is None: + if self.weight is None: + raise RuntimeError( + "weight was not supplied to ColumnParallelLinear forward pass " + "and skip_weight_param_allocation is True." + ) + weight = self.weight + else: + # Check the weight passed in is the correct shape + expected_shape = (self.output_size_per_partition, self.input_size) + if weight.shape != expected_shape: + raise RuntimeError( + f"supplied weight's shape is {tuple(weight.shape)}, " + f"not {expected_shape} as expected" + ) + + bias = self.bias if not self.skip_bias_add else None + + if self.async_tensor_model_parallel_allreduce or self.sequence_parallel: + input_parallel = input_ + else: + input_parallel = CopyToModelParallelRegion.apply(input_) + # Matrix multiply. + if not weight.requires_grad: + self._forward_impl = linear_with_frozen_weight + else: + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + output_parallel = self._forward_impl( + input=input_parallel, + weight=weight, + bias=bias, + gradient_accumulation_fusion=self.gradient_accumulation_fusion, + async_grad_allreduce=self.async_tensor_model_parallel_allreduce, + sequence_parallel=self.sequence_parallel, + ) + if self.gather_output: + # All-gather across the partitions. + assert not self.sequence_parallel + output = GatherFromModelParallelRegion.apply(output_parallel) + else: + output = output_parallel + output_bias = self.bias if self.skip_bias_add else None + return output, output_bias + + + +class RowParallelLinearNoComm(torch.nn.Module): + """Linear layer with row parallelism without communication. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + + Keyword Arguments: + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: If True, do not add the bias term, instead + return it to be added by the caller. This + enables performance optimations where bias can + be fused with other elementwise operations. 
+ config: ModelParallelConfig object + + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool = True, + input_is_parallel: bool = False, + stride: int = 1, + keep_master_weight_for_test: bool = False, + skip_bias_add: bool = False, + ): + super(RowParallelLinearNoComm, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + # Divide the weight matrix along the last dimension. + world_size = get_tensor_model_parallel_world_size() + self.input_size_per_partition = input_size // world_size + self.skip_bias_add = skip_bias_add + self.config = config + self.gradient_accumulation_fusion = config.gradient_accumulation_fusion + self.sequence_parallel = config.sequence_parallel + if self.sequence_parallel and not self.input_is_parallel: + raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`") + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + self.weight = Parameter( + torch.empty( + self.output_size, + self.input_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight, init_method, partition_dim=1, stride=stride + ) + if bias: + if config.use_cpu_initialization: + self.bias = Parameter(torch.empty(self.output_size, dtype=config.params_dtype)) + else: + self.bias = Parameter( + torch.empty( + self.output_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) + + if config.perform_initialization: + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + def forward(self, input_): + """Forward of RowParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + Returns: + - output + - bias + """ + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + assert not self.sequence_parallel + input_parallel = ScatterToModelParallelRegion.apply(input_) + # Matrix multiply. + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + output_parallel = self._forward_impl( + input=input_parallel, + weight=self.weight, + bias=None, + gradient_accumulation_fusion=self.gradient_accumulation_fusion, + async_grad_allreduce=False, + sequence_parallel=False, + ) + + # All-reduce across all the partitions. 
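+        # NOTE: unlike a standard Megatron RowParallelLinear, this "NoComm"
+        # variant returns the partial per-rank result without reducing it here;
+        # the reduce-scatter / all-reduce (kept commented out below for
+        # reference) is left to the caller, presumably so the communication can
+        # be scheduled and overlapped elsewhere.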
+ # if self.sequence_parallel: + # output_ = reduce_scatter_to_sequence_parallel_region(output_parallel) + # else: + # output_ = reduce_from_tensor_model_parallel_region(output_parallel) + # if not self.skip_bias_add: + # output = output_ + self.bias if self.bias is not None else output_ + # output_bias = None + # else: + # output = output_ + # output_bias = self.bias + + output = output_parallel + output_bias = self.bias + return output, output_bias \ No newline at end of file diff --git a/training/DeepSpeed-Domino/domino/tensor_parallel/random.py b/training/DeepSpeed-Domino/domino/tensor_parallel/random.py new file mode 100644 index 000000000..822794ed2 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/tensor_parallel/random.py @@ -0,0 +1,241 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from random.py in Megatron-LM + +import contextlib + +import torch +from torch import _C +from torch.cuda import _lazy_call +from torch.cuda import device as device_ctx_manager +from torch.utils.checkpoint import detach_variable + +from domino.parallel_state import get_tensor_model_parallel_rank +from domino.utils import safely_set_viewless_tensor_data +from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks + +# Default name for the model parallel rng tracker. +_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' + + +def _set_cuda_rng_state(new_state, device=-1): + """Sets the random number generator state of the current GPU. + + Argumentss: + new_state (torch.ByteTensor): The desired state + This function is adapted from PyTorch repo (torch.cuda.set_rng_state) + with a single change: the input state is not cloned. Cloning caused + major performance issues for +4 GPU cases. + """ + if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): + # older PyTorch + def cb(): + with device_ctx_manager(device): + _C._cuda_setRNGState(new_state) + + else: + # newer PyTorch + if device == -1: + device = torch.device('cuda') + elif isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device('cuda', device) + + def cb(): + idx = device.index + if idx is None: + idx = torch.cuda.current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.set_state(new_state) + + _lazy_call(cb) + + +class CudaRNGStatesTracker: + """Tracker for the cuda RNG states. + + Using the `add` method, a cuda rng state is initialized based on + the input `seed` and is assigned to `name`. Later, by forking the + rng state, we can perform operations and return to our starting + cuda state. + """ + + def __init__(self): + # Map from a string name to the cuda rng state. + self.states_ = {} + # Seeds are just for book keeping and ensure no seed is set twice. + self.seeds_ = set() + + def reset(self): + """Set to the initial state (no tracker).""" + self.states_ = {} + self.seeds_ = set() + + def get_states(self): + """Get rng states. Copy the dictionary so we have direct + pointers to the states, not just a pointer to the dictionary.""" + states = {} + for name in self.states_: + states[name] = self.states_[name] + return states + + def set_states(self, states): + """Set the rng states. For efficiency purposes, we do not check + the size of seed for compatibility.""" + self.states_ = states + + def add(self, name, seed): + """Track the rng state.""" + # Check seed is not already used. 
+        if seed in self.seeds_:
+            raise Exception('seed {} already exists'.format(seed))
+        self.seeds_.add(seed)
+        # Check that state is not already defined.
+        if name in self.states_:
+            raise Exception('cuda rng state {} already exists'.format(name))
+        # Get the current rng state.
+        orig_rng_state = torch.cuda.get_rng_state()
+        # Set the new state and store it.
+        torch.cuda.manual_seed(seed)
+        self.states_[name] = torch.cuda.get_rng_state()
+        # Reset rng state to what it was.
+        _set_cuda_rng_state(orig_rng_state)
+
+    @contextlib.contextmanager
+    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
+        """Fork the cuda rng state, perform operations, and exit with
+        the original state."""
+        # Check if we have added the state.
+        if name not in self.states_:
+            raise Exception('cuda rng state {} is not added'.format(name))
+        # Store current rng state.
+        orig_cuda_rng_state = torch.cuda.get_rng_state()
+        # Set rng state to the desired one.
+        _set_cuda_rng_state(self.states_[name])
+        # Do the stuff we wanted to do.
+        try:
+            yield
+        finally:
+            # Update the current rng state for later use.
+            self.states_[name] = torch.cuda.get_rng_state()
+            # And set the state to the original state we started with.
+            _set_cuda_rng_state(orig_cuda_rng_state)
+
+
+# RNG tracker object.
+_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
+
+
+def get_cuda_rng_tracker():
+    """Get cuda rng tracker."""
+    return _CUDA_RNG_STATE_TRACKER
+
+
+def model_parallel_cuda_manual_seed(seed):
+    """Initialize model parallel cuda seed.
+
+    This function should be called after the model parallel is
+    initialized. Also, no torch.cuda.manual_seed should be called
+    after this function. Basically, this is a replacement for that
+    function.
+    Two sets of RNG states are tracked:
+        default state: This is for data parallelism and is the same among a
+            set of model parallel GPUs but different across
+            different model parallel groups. This is used for
+            example for dropout in the non-tensor-model-parallel regions.
+        tensor-model-parallel state: This state is different among a set of model
+            parallel GPUs, but the same across data parallel
+            groups. This is used for example for dropout in
+            model parallel regions.
+    """
+    # 2718 is just for fun and any POSITIVE value will work.
+    offset = seed + 2718
+    tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
+    # Data parallel gets the original seed.
+    data_parallel_seed = seed
+
+    _CUDA_RNG_STATE_TRACKER.reset()
+    # Set the default state.
+    torch.cuda.manual_seed(data_parallel_seed)
+    # and model parallel state.
+    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tensor_model_parallel_seed)
+
+
+class CheckpointFunction(torch.autograd.Function):
+    """This function is adapted from torch.utils.checkpoint with
+    two main changes:
+    1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
+    2) the states in the model parallel tracker are also properly
+       tracked/set/reset.
+    """
+
+    @staticmethod
+    def forward(ctx, run_function, distribute_saved_activations, *args):
+        ctx.run_function = run_function
+        ctx.distribute_saved_activations = distribute_saved_activations
+
+        # Copy the rng states.
+        ctx.fwd_cpu_rng_state = torch.get_rng_state()
+        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
+        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+        with torch.no_grad():
+            outputs = run_function(*args)
+
+        # Divide hidden states across model parallel group and only keep
+        # the chunk corresponding to the current rank.
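+        # When enabled, each tensor-parallel rank keeps only a 1/world_size
+        # flat chunk of the saved activation; backward() gathers the chunks
+        # again before recomputation, trading communication for memory.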
+ if distribute_saved_activations: + ctx.input_0_shape = args[0].data.shape + safely_set_viewless_tensor_data( + args[0], split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True) + ) + + # Store everything. + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError( + "Checkpointing is not compatible with .grad(), " + "please use .backward() if possible" + ) + inputs = ctx.saved_tensors + if ctx.distribute_saved_activations: + safely_set_viewless_tensor_data( + inputs[0], gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape) + ) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. + torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + # Compute the forward pass. + detached_inputs = detach_variable(inputs) + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. + torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, torch.Tensor): + outputs = (outputs,) + torch.autograd.backward(outputs, args) + grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in detached_inputs) + return (None, None) + grads + + +def checkpoint(function, distribute_saved_activations, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, distribute_saved_activations, *args) diff --git a/training/DeepSpeed-Domino/domino/tensor_parallel/utils.py b/training/DeepSpeed-Domino/domino/tensor_parallel/utils.py new file mode 100644 index 000000000..a4a604fa2 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/tensor_parallel/utils.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from utils.py in Megatron-LM + + +from typing import List, Sequence +import torch +from domino import parallel_state + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, num_partitions: int, contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """ Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): + """ Break a tensor into equal 1D chunks across tensor parallel ranks. + + Returns a Tensor or View with this rank's portion of the data. + + Arguments: + tensor: The tensor to split + + Keyword Arguments: + new_buffer (bool): If True, returns a new Tensor. 
+ If False, returns a view into the existing Tensor. + Default is False + + """ + partition_size = torch.numel(tensor) // parallel_state.get_tensor_model_parallel_world_size() + start_index = partition_size * parallel_state.get_tensor_model_parallel_rank() + end_index = start_index + partition_size + if new_buffer: + data = torch.empty( + partition_size, + dtype=tensor.dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + data.copy_(tensor.view(-1)[start_index:end_index]) + else: + data = tensor.view(-1)[start_index:end_index] + return data + + +def gather_split_1d_tensor(tensor): + """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor + model parallel ranks. + + Returns a new Tensor with the gathered data. + + Arguments: + tensor: A Tensor or view of this rank's portion of the data. + """ + numel_gathered = torch.numel(tensor) * parallel_state.get_tensor_model_parallel_world_size() + gathered = torch.empty( + numel_gathered, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False + ) + # TODO: This API is experimental in pytorch (as of Feb 2022) and + # this might break in future pytorch releases. We chose this API + # as opposed to torch.distributed.all_gather for efficiency reasons. + # This API calls directly NCCL all-gather versus the former does + # internal copies and can potentially cause slow down. + torch.distributed._all_gather_base( + gathered, tensor, group=parallel_state.get_tensor_model_parallel_group() + ) + return gathered + + +class VocabUtility: + """ Split the vocabulary into `world_size` chunks and return the first + and last index of the vocabulary belonging to the `rank` + partition: Note that indices in [fist, last) + + """ + + @staticmethod + def vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size: int, rank, world_size: int + ) -> Sequence[int]: + index_f = rank * per_partition_vocab_size + index_l = index_f + per_partition_vocab_size + return index_f, index_l + + @staticmethod + def vocab_range_from_global_vocab_size( + global_vocab_size: int, rank: int, world_size: int + ) -> Sequence[int]: + per_partition_vocab_size = global_vocab_size // world_size + return VocabUtility.vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, world_size + ) diff --git a/training/DeepSpeed-Domino/domino/timer.py b/training/DeepSpeed-Domino/domino/timer.py new file mode 100644 index 000000000..a9478fa01 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/timer.py @@ -0,0 +1,304 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron timers.""" + +from abc import ABC +from abc import abstractmethod +import time + +import torch + + + +class TimerBase(ABC): + + def __init__(self, name): + self.name = name + + @abstractmethod + def start(self, barrier=False): + pass + + @abstractmethod + def stop(self, barrier=False): + pass + + @abstractmethod + def reset(self): + pass + + @abstractmethod + def elapsed(self, reset=True, barrier=False): + pass + + + +class DummyTimer(TimerBase): + + def __init__(self): + super().__init__('dummy timer') + + def start(self, barrier=False): + return + + def stop(self, barrier=False): + return + + def reset(self): + return + + def elapsed(self, reset=True, barrier=False): + raise Exception('dummy timer should not be used to ' + 'calculate elapsed time') + + + +class Timer(TimerBase): + """ + Comment on using `barrier`: If this flag is passed, then all + the caller processes will wait till all reach the timing routine. 
+    It is up to the user to make sure all the ranks in `barrier_group`
+    call it; otherwise, it will result in a hang.
+    Comment on `barrier_group`: By default it is set to None, which
+    in torch distributed land means the global communicator.
+    """
+
+    def __init__(self, name):
+        super().__init__(name)
+        self._elapsed = 0.0
+        self._started = False
+        # Note that None will default to the global process group.
+        self._barrier_group = None
+        self._start_time = time.time()
+
+
+    def set_barrier_group(self, barrier_group):
+        self._barrier_group = barrier_group
+
+
+    def start(self, barrier=False):
+        """Start the timer."""
+        assert not self._started, 'timer has already been started'
+        if barrier:
+            torch.distributed.barrier(group=self._barrier_group)
+        torch.cuda.synchronize()
+        self._start_time = time.time()
+        self._started = True
+
+
+    def stop(self, barrier=False):
+        """Stop the timer."""
+        assert self._started, 'timer is not started'
+        if barrier:
+            torch.distributed.barrier(group=self._barrier_group)
+        torch.cuda.synchronize()
+        self._elapsed += (time.time() - self._start_time)
+        self._started = False
+
+
+    def reset(self):
+        """Reset timer."""
+        self._elapsed = 0.0
+        self._started = False
+
+
+    def elapsed(self, reset=True, barrier=False):
+        """Calculate the elapsed time."""
+        _started = self._started
+        # If timing is in progress, end it first.
+        if self._started:
+            self.stop(barrier=barrier)
+        # Get the elapsed time.
+        _elapsed = self._elapsed
+        # Reset the elapsed time.
+        if reset:
+            self.reset()
+        # If timing was in progress, set it back.
+        if _started:
+            self.start(barrier=barrier)
+        return _elapsed
+
+
+
+class Timers:
+    """Group of timers."""
+
+    def __init__(self, log_level, log_option):
+        self._log_level = log_level
+        self._log_option = log_option
+        self._timers = {}
+        self._log_levels = {}
+        self._dummy_timer = DummyTimer()
+        self._max_log_level = 2
+
+
+    def __call__(self, name, log_level=None):
+        # If the timer already exists and a log level is provided, check that
+        # it matches the log level the timer was created with.
+        if name in self._timers:
+            if log_level is not None:
+                assert log_level == self._log_levels[name], \
+                    'input log level {} does not match already existing '\
+                    'log level {} for {} timer'.format(
+                        log_level, self._log_levels[name], name)
+            return self._timers[name]
+        # If timer does not exist and no log level is provided,
+        # set it to the max log level which is 2.
+        if log_level is None:
+            log_level = self._max_log_level
+        assert log_level <= self._max_log_level, \
+            'log level {} is larger than max supported log level {}'.format(
+                log_level, self._max_log_level)
+        # Now if the input log level is larger than the one set for
+        # the timers class, just ignore it and return a dummy timer.
+        if log_level > self._log_level:
+            return self._dummy_timer
+        # Otherwise, initialize the timer and set the level.
+        self._timers[name] = Timer(name)
+        self._log_levels[name] = log_level
+        return self._timers[name]
+
+
+    def _get_elapsed_time_all_ranks(self, names, reset, barrier):
+        """
+        Assumptions:
+            - All the ranks call this function.
+            - `names` are identical on all ranks.
+        If the above assumptions are not met, calling this function will
+        result in a hang.
+        Arguments:
+            - names: list of timer names
+            - reset: reset the timer after recording the elapsed time
+            - barrier: if set, do a global barrier before time measurements
+        """
+
+        # First make sure all the callers are in sync.
+ if barrier: + torch.distributed.barrier() + + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + + # Here we can use gather on the rank we want to print the + # timing, however, there is no gather_base support in + # pytorch yet. It is simpler to deal with a single tensor + # and since we are only gathering a small amount of data, + # it should be ok to use all-gather instead of gather. + rank_name_to_time = torch.zeros((world_size, len(names)), + dtype=torch.float, + device=torch.cuda.current_device()) + for i, name in enumerate(names): + if name in self._timers: + # Here we don't need to pass the barrier flag as all + # the processes are already in sync. This avoids the + # issue of different timers having different barrier + # groups inside their class. + rank_name_to_time[rank, i] = self._timers[name].elapsed( + reset=reset) + + # See the note above for why we are not using gather. + torch.distributed._all_gather_base(rank_name_to_time.view(-1), + rank_name_to_time[rank, :].view(-1)) + + return rank_name_to_time + + + def _get_global_min_max_time(self, names, reset, barrier, normalizer): + """Report only min and max times across all ranks.""" + + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, + barrier) + name_to_min_max_time = {} + for i, name in enumerate(names): + rank_to_time = rank_name_to_time[:, i] + # filter out the ones we did not have any timings for + rank_to_time = rank_to_time[rank_to_time > 0.0] + # If the timer exists: + if rank_to_time.numel() > 0: + name_to_min_max_time[name] = ( + rank_to_time.min().item() / normalizer, + rank_to_time.max().item() / normalizer) + return name_to_min_max_time + + + def _get_global_min_max_time_string(self, names, reset, barrier, + normalizer, max_only): + name_to_min_max_time = self._get_global_min_max_time( + names, reset, barrier, normalizer) + if not name_to_min_max_time: + return None + output_string = '(min, max) time across ranks (ms):' + for name in name_to_min_max_time: + min_time, max_time = name_to_min_max_time[name] + if max_only: + output_string += '\n {}: {:.2f}'.format( + (name+' ').ljust(48, '.'), max_time) + else: + output_string += '\n {}: ({:.2f}, {:.2f})'.format( + (name+' ').ljust(48, '.'), min_time, max_time) + return output_string + + + def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): + """Report times across all ranks.""" + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, + barrier) + + output_string = 'times across ranks (ms):' + no_reported_timing = True + for i, name in enumerate(names): + not_yet_found = True + for rank in range(torch.distributed.get_world_size()): + if rank_name_to_time[rank, i] > 0: + no_reported_timing = False + if not_yet_found: + not_yet_found = False + output_string += '\n {}:'.format(name) + output_string += '\n rank {:2d}: {:.2f}'.format( + rank, rank_name_to_time[rank, i] / normalizer) + if no_reported_timing: + return None + return output_string + + + def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): + """Log a group of timers.""" + + # Print. 
+        assert normalizer > 0.0
+        if self._log_option in ['max', 'minmax']:
+            max_only = False
+            if self._log_option == 'max':
+                max_only = True
+            output_string = self._get_global_min_max_time_string(
+                names, reset, barrier, normalizer/1000.0, max_only)
+        elif self._log_option == 'all':
+            output_string = self._get_all_ranks_time_string(names,
+                                                            reset, barrier,
+                                                            normalizer/1000.0)
+        else:
+            raise Exception('unknown timing log option {}'.format(
+                self._log_option))
+
+        # If no input rank is provided, log on last rank.
+        if rank is None:
+            rank = torch.distributed.get_world_size() - 1
+        if rank == torch.distributed.get_rank() and output_string is not None:
+            print(output_string, flush=True)
+
+
+    def write(self, names, writer, iteration, normalizer=1.0,
+              reset=False, barrier=False):
+        """Write timers to a tensorboard writer.
+        Note that we only report the maximum time across ranks to tensorboard.
+        """
+        # Currently, when using add_scalars,
+        # torch.utils.add_scalars makes each timer its own run, which
+        # pollutes the runs list, so we just add each as a scalar.
+        assert normalizer > 0.0
+        name_to_min_max_time = self._get_global_min_max_time(
+            names, reset, barrier, normalizer)
+        if writer is not None:
+            for name in name_to_min_max_time:
+                _, max_time = name_to_min_max_time[name]
+                writer.add_scalar(name + '-time', max_time, iteration)
diff --git a/training/DeepSpeed-Domino/domino/training.py b/training/DeepSpeed-Domino/domino/training.py
new file mode 100644
index 000000000..59e253fcf
--- /dev/null
+++ b/training/DeepSpeed-Domino/domino/training.py
@@ -0,0 +1,431 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# This file is adapted from training.py in Megatron-LM
+
+import torch
+from domino.arguments import get_args, get_tokenizer, get_num_microbatches, get_timers
+from domino.utils import print_rank_0, get_model_config, get_ltor_masks_and_position_ids
+import domino.parallel_state as mpu
+from domino.tensor_parallel.partition import set_defaults_if_not_set_tensor_model_parallel_attributes
+from domino.modules.enums import ModelType
+from domino.schedules import get_forward_backward_func
+from domino.data.data_samplers import build_pretraining_data_loader
+from domino.modules.distributed import DistributedDataParallel as LocalDDP
+from domino.modules.module import Float16Module
+from domino.optimizer import get_megatron_optimizer
+from domino.optimizer_param_scheduler import OptimizerParamScheduler
+from domino.initialize import set_jit_fusion_options
+from domino.tensor_parallel.data import broadcast_data
+
+
+def is_rank_0():
+    # if torch.cuda.current_device() == 0:
+    return torch.distributed.get_rank() == 0
+
+
+def forward_step(data_iterator, model):
+    input_tokens, target_labels, loss_mask, attention_mask, position_ids = prepare_batch(data_iterator)
+    model_output = model(input_tokens, position_ids, attention_mask, labels=target_labels)
+    return model_output, lambda output: compute_loss(loss_mask, output)
+
+
+def prepare_batch(data_iterator):
+    args = get_args()
+    tokenizer = get_tokenizer()
+
+    data_keys = ['text']
+    data_type = torch.int64
+
+    if data_iterator is not None:
+        data = next(data_iterator)
+    else:
+        data = None
+
+    broadcasted_data = broadcast_data(data_keys, data, data_type)
+    full_tokens = broadcasted_data['text'].long()
+    input_tokens = full_tokens[:, :-1].contiguous()
+    target_labels = full_tokens[:, 1:].contiguous()
+
+    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+        input_tokens,
+        tokenizer.eod,
+
args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss + ) + + return input_tokens, target_labels, loss_mask, attention_mask, position_ids + + +def compute_loss(loss_mask, model_output): + flattened_output = model_output.view(-1).float() + flattened_loss_mask = loss_mask.view(-1).float() + loss = torch.sum(flattened_output * flattened_loss_mask) / flattened_loss_mask.sum() + return loss + + +def pretrain(base_model, train_ds, valid_ds, test_ds): + args = get_args() + + # Model, optimizer, and learning rate. + model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + base_model, ModelType.encoder_or_decoder) + config = get_model_config(model) + + # Do not use virtual pipeline parallelism for data parallel + train_data_iterator, valid_data_iterator, test_data_iterator \ + = get_dataset_iterator(train_ds, valid_ds, test_ds) + + # Train and eval. + print_rank_0('training ...') + + if args.do_train and args.train_iters > 0: + train(forward_step, + model, optimizer, opt_param_scheduler, + train_data_iterator, valid_data_iterator, config) + + # if args.do_valid: + # total_loss_dict = evaluate(forward_step, valid_data_iterator, model, config, True) + # print_rank_0(total_loss_dict) + + # if args.do_test: + # total_loss_dict = evaluate(forward_step, test_data_iterator, model, config, True) + # print_rank_0(total_loss_dict) + + +def setup_model_and_optimizer(base_model, + model_type, + no_wd_decay_cond=None, + scale_lr_cond=None): + """Setup model and optimizer.""" + args = get_args() + + model = get_model(base_model, model_type) + + if isinstance(model, list): + models = model + else: + models = [model] + optimizer = get_megatron_optimizer(models, no_wd_decay_cond, scale_lr_cond) + opt_param_scheduler = get_optimizer_param_scheduler(optimizer) + + args.iteration = 0 + + return model, optimizer, opt_param_scheduler + + +def get_model(base_model, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): + """Build the model.""" + args = get_args() + args.model_type = model_type + + # Build model. + model = base_model + model.model_type = model_type + + for param in model.parameters(): + set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + # Print number of parameters. + if mpu.get_data_parallel_rank() == 0: + print(' > number of parameters on (tensor, pipeline) ' + 'model parallel rank ({}, {}): {}'.format( + mpu.get_tensor_model_parallel_rank(), + mpu.get_pipeline_model_parallel_rank(), + sum([p.nelement() for p in model.parameters()])), flush=True) + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if args.fp16 or args.bf16: + model = Float16Module(model, args) + + if wrap_with_ddp: + if args.DDP_impl == 'local': + model = LocalDDP(model, + args.accumulate_allreduce_grads_in_fp32, + args.use_contiguous_buffers_in_local_ddp) + # broad cast params from data parallel src rank to other data parallel ranks + if args.data_parallel_random_init: + model.broadcast_params() + else: + raise NotImplementedError('Unknown DDP implementation specified: ' + '{}. Exiting.'.format(args.DDP_impl)) + return model + + +def get_optimizer_param_scheduler(optimizer): + """Build the learning rate scheduler.""" + args = get_args() + + # Iteration-based training. + # Remove sample-based training. 
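+    # The scheduler counts progress in samples, so iteration counts are
+    # converted by multiplying with the global batch size; this matches the
+    # `opt_param_scheduler.step(increment=...)` call in train_step.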
+    if args.train_iters:
+        if args.lr_decay_iters is None:
+            args.lr_decay_iters = args.train_iters
+        lr_decay_steps = args.lr_decay_iters * args.global_batch_size
+        wd_incr_steps = args.train_iters * args.global_batch_size
+        if args.lr_warmup_fraction is not None:
+            lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps
+        else:
+            lr_warmup_steps = args.lr_warmup_iters * args.global_batch_size
+    else:
+        raise Exception(
+            'either train-iters or train-samples should be provided.')
+
+    opt_param_scheduler = OptimizerParamScheduler(
+        optimizer,
+        init_lr=args.lr_warmup_init,
+        max_lr=args.lr,
+        min_lr=args.min_lr,
+        lr_warmup_steps=lr_warmup_steps,
+        lr_decay_steps=lr_decay_steps,
+        lr_decay_style=args.lr_decay_style,
+        start_wd=args.start_weight_decay,
+        end_wd=args.end_weight_decay,
+        wd_incr_steps=wd_incr_steps,
+        wd_incr_style=args.weight_decay_incr_style,
+        use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler,
+        override_opt_param_scheduler=args.override_opt_param_scheduler)
+
+    return opt_param_scheduler
+
+
+def get_dataset_iterator(train_ds, valid_ds, test_ds):
+    """Build pretraining data iterators."""
+    args = get_args()
+
+    # Build loaders.
+    train_dataloader, valid_dataloader, test_dataloader = \
+        get_data_loader(train_ds, valid_ds, test_ds)
+
+    # Build iterators.
+    dl_type = args.dataloader_type
+    assert dl_type == 'single'
+
+    if train_dataloader is not None:
+        train_data_iterator = iter(train_dataloader)
+    else:
+        train_data_iterator = None
+
+    if valid_dataloader is not None:
+        valid_data_iterator = iter(valid_dataloader)
+    else:
+        valid_data_iterator = None
+
+    if test_dataloader is not None:
+        test_data_iterator = iter(test_dataloader)
+    else:
+        test_data_iterator = None
+
+    return train_data_iterator, valid_data_iterator, test_data_iterator
+
+
+def get_data_loader(train_ds, valid_ds, test_ds):
+    """Build pretraining data loaders."""
+    args = get_args()
+
+    (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None)
+
+    print_rank_0('> building train, validation, and test datasets ...')
+
+    # Backward compatibility, assume fixed batch size.
+    if args.iteration > 0 and args.consumed_train_samples == 0:
+        assert args.train_samples is None, \
+            'only backward compatibility support for iteration-based training'
+        args.consumed_train_samples = args.iteration * args.global_batch_size
+    if args.iteration > 0 and args.consumed_valid_samples == 0:
+        if args.train_samples is None:
+            args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
+                args.eval_iters * args.global_batch_size
+
+    # Data loader only on rank 0 of each model parallel group.
+    if mpu.get_tensor_model_parallel_rank() == 0:
+        # Build dataloaders.
+        train_dataloader = build_pretraining_data_loader(
+            train_ds, args.consumed_train_samples)
+        valid_dataloader = build_pretraining_data_loader(
+            valid_ds, args.consumed_valid_samples)
+        test_dataloader = build_pretraining_data_loader(test_ds, 0)
+
+        # Flags to know if we need to do training/validation/testing.
+        do_train = train_dataloader is not None and args.train_iters > 0
+        do_valid = valid_dataloader is not None and args.eval_iters > 0
+        do_test = test_dataloader is not None and args.eval_iters > 0
+        # These flags need to be broadcast to the other tensor-parallel ranks.
+        flags = torch.cuda.LongTensor(
+            [int(do_train), int(do_valid), int(do_test)])
+    else:
+        flags = torch.cuda.LongTensor([0, 0, 0])
+
+    # Broadcast the do-train/do-valid/do-test flags.
+ torch.distributed.broadcast(flags, + mpu.get_tensor_model_parallel_src_rank(), + group=mpu.get_tensor_model_parallel_group()) + args.do_train = flags[0].item() + args.do_valid = flags[1].item() + args.do_test = flags[2].item() + + return train_dataloader, valid_dataloader, test_dataloader + + +def train(forward_step_func, model, optimizer, opt_param_scheduler, + train_data_iterator, valid_data_iterator, config): + """Train the model function.""" + args = get_args() + timers = get_timers() + + model.train() + + # Iterations. + iteration = args.iteration + + # Setup some training config params + config.grad_scale_func = optimizer.scale_loss + config.timers = None + + timers('ite-time', log_level=0).start(barrier=True) + while iteration < args.train_iters: + args.curr_iteration = iteration + loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ + train_step(forward_step_func, + train_data_iterator, + model, + optimizer, + opt_param_scheduler, + config) + + iteration += 1 + args.consumed_train_samples += mpu.get_data_parallel_world_size() * \ + args.micro_batch_size * get_num_microbatches() + + ite_time = timers('ite-time').elapsed(barrier=True) + if iteration % args.log_interval == 0 and is_rank_0(): + loss = loss_dict['lm loss'].item() + print( 'iteration: {} | loss: {:.3f} | iteration time (ms): {} '.format(iteration, loss, ite_time*1000.0)) + # loss_scale = optimizer.cur_scale + # lr = optimizer.param_groups[0]['lr'] + # print( 'lr: {} loss scale: {:.1f} |'.format(lr, loss_scale))' + + return iteration + + +def train_step(forward_step_func, data_iterator, + model, optimizer, opt_param_scheduler, config): + """Single training step.""" + args = get_args() + + # Set grad to zero. + if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_local_ddp: + model.zero_grad_buffer() + optimizer.zero_grad() + + forward_backward_func = get_forward_backward_func() + + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=data_iterator, + model=model, + num_microbatches=get_num_microbatches(), + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, + forward_only=False) + + timers = None + # reset timers if necessary + if config.timers is None: + config.timers = timers + + # Empty unused memory. + if args.empty_unused_memory_level >= 1: + torch.cuda.empty_cache() + + update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers) + + # Update learning rate. + if update_successful: + increment = get_num_microbatches() * \ + args.micro_batch_size * \ + args.data_parallel_size + opt_param_scheduler.step(increment=increment) + skipped_iter = 0 + else: + skipped_iter = 1 + + # Empty unused memory. + if args.empty_unused_memory_level >= 2: + torch.cuda.empty_cache() + + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # Average loss across microbatches. + loss_reduced = {} + for key in losses_reduced[0]: + losses_reduced_for_key = [x[key] for x in losses_reduced] + loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key) + return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad + return {}, skipped_iter, grad_norm, num_zeros_in_grad + + +def evaluate(forward_step_func, + data_iterator, + model, + config, + verbose=False): + """Evaluation.""" + args = get_args() + + # Turn on evaluation mode which disables dropout. 
+ model.eval() + + total_loss_dict = {} + + # make validation batch size independent from training batch size + eval_batch_size = args.global_batch_size + eval_num_microbatches = eval_batch_size // \ + (args.micro_batch_size * args.data_parallel_size) + + with torch.no_grad(): + iteration = 0 + if verbose: + print_rank_0( + f'Evaluating on {args.eval_iters * eval_batch_size} samples') + while iteration < args.eval_iters: + iteration += 1 + if verbose: + print_rank_0(f'Evaluating iter {iteration}/{args.eval_iters}') + + forward_backward_func = get_forward_backward_func() + # Don't care about timing during evaluation + config.timers = None + loss_dicts = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=data_iterator, + model=model, + num_microbatches=eval_num_microbatches, + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, + forward_only=True) + + # Empty unused memory + if args.empty_unused_memory_level >= 1: + torch.cuda.empty_cache() + + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # Reduce across processes. + for loss_dict in loss_dicts: + for key in loss_dict: + total_loss_dict[key] = total_loss_dict.get( + key, torch.cuda.FloatTensor([0.0])) + loss_dict[key] + + args.consumed_valid_samples += eval_batch_size + + # Move model back to the train mode. + model.train() + + for key in total_loss_dict: + total_loss_dict[key] /= args.eval_iters * eval_num_microbatches + + return total_loss_dict diff --git a/training/DeepSpeed-Domino/domino/utils.py b/training/DeepSpeed-Domino/domino/utils.py new file mode 100644 index 000000000..af90f6684 --- /dev/null +++ b/training/DeepSpeed-Domino/domino/utils.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
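+# Helper utilities used by Domino training: model unwrapping, parameter L2-norm
+# computation, rank-0 printing, attention/loss-mask construction, and
+# viewless-tensor helpers (adapted from the corresponding Megatron-LM code).
+#
+# Illustrative (hypothetical) usage from a training script, assuming `tokens`
+# is a [batch, seq] LongTensor and `tokenizer.eod` is the end-of-document id:
+#
+#   attn_mask, loss_mask, pos_ids = get_ltor_masks_and_position_ids(
+#       tokens, tokenizer.eod, reset_position_ids=False,
+#       reset_attention_mask=False, eod_mask_loss=True)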
+ +import sys +import torch +from torch.nn.parallel import DistributedDataParallel as torchDDP +from apex.multi_tensor_apply import multi_tensor_applier +import amp_C +from domino.arguments import get_args +import domino.parallel_state as mpu +from domino.tensor_parallel.partition import param_is_not_tensor_parallel_duplicate + + +def unwrap_model(model, module_instances=(torchDDP)): + return_list = True + if not isinstance(model, list): + model = [model] + return_list = False + unwrapped_model = [] + for model_module in model: + while isinstance(model_module, module_instances): + model_module = model_module.module + unwrapped_model.append(model_module) + if not return_list: + return unwrapped_model[0] + return unwrapped_model + + +def get_attr_wrapped_model(model, attr, allow_none=True): + """Get an attribute from a wrapped model""" + if isinstance(model, list): + raise RuntimeError("_get_attr_wrapped_model given a list of models") + + if allow_none: + def condition(model, attr): + return not hasattr(model, attr) + else: + def condition(model, attr): + return getattr(model, attr, None) is None + + while condition(model, attr): + if not hasattr(model, "module"): + raise RuntimeError(f"_get_attr_wrapped_model couldn't find attribute {attr}") + + model = model.module + return getattr(model, attr) + + +def get_model_type(model): + return get_attr_wrapped_model(model, 'model_type') + + +def get_model_config(model): + # args = get_args() + # if args.deepspeed: + # return get_attr_wrapped_model(model.module, 'config', allow_none=False) + return get_attr_wrapped_model(model, 'config', allow_none=False) + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + +def calc_params_l2_norm(model): + """Calculate l2 norm of parameters """ + args = get_args() + if not isinstance(model, list): + model = [model] + # Remove duplicate params. + params_data = [] + for model_ in model: + for param in model_.parameters(): + is_not_shared = param_is_not_shared(param) + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if is_not_shared and is_not_tp_duplicate: + if args.bf16: + params_data.append(param.data.float()) + else: + params_data.append(param.data) + # Calculate norm + dummy_overflow_buf = torch.cuda.IntTensor([0]) + norm, _ = multi_tensor_applier( + amp_C.multi_tensor_l2norm, + dummy_overflow_buf, + [params_data], + False # no per-parameter norm + ) + norm_2 = norm * norm + # Sum across all model-parallel GPUs. + torch.distributed.all_reduce(norm_2, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) + return norm_2.item() ** 0.5 + + +def print_rank_0(message): + """If distributed is initialized, print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + + +def get_ltor_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + micro_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if reset_attention_mask: + att_mask_batch = micro_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril(torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device)).view( + att_mask_batch, 1, seq_length, seq_length) + + # Loss mask. 
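+    # Start from an all-ones mask; if eod_mask_loss is set, positions holding
+    # the EOD token are zeroed out so they do not contribute to the loss.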
+    loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
+    if eod_mask_loss:
+        loss_mask[data == eod_token] = 0.0
+
+    # Position ids.
+    position_ids = torch.arange(seq_length, dtype=torch.long,
+                                device=data.device)
+    position_ids = position_ids.unsqueeze(0).expand_as(data)
+    # We need to clone as the ids will be modified based on batch index.
+    if reset_position_ids:
+        position_ids = position_ids.clone()
+
+    if reset_position_ids or reset_attention_mask:
+        # Loop through the batches:
+        for b in range(micro_batch_size):
+
+            # Find indices where EOD token is.
+            eod_index = position_ids[b, data[b] == eod_token]
+            # Detach indices from positions if going to modify positions.
+            if reset_position_ids:
+                eod_index = eod_index.clone()
+
+            # Loop through EOD indices:
+            prev_index = 0
+            for j in range(eod_index.size()[0]):
+                i = eod_index[j]
+                # Mask attention loss.
+                if reset_attention_mask:
+                    attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
+                # Reset positions.
+                if reset_position_ids:
+                    position_ids[b, (i + 1):] -= (i + 1 - prev_index)
+                    prev_index = i + 1
+
+    # Convert attention mask to binary:
+    attention_mask = (attention_mask < 0.5)
+
+    return attention_mask, loss_mask, position_ids
+
+
+def average_losses_across_data_parallel_group(losses):
+    """Reduce a tensor of losses across all GPUs."""
+    averaged_losses = torch.cat(
+        [loss.clone().detach().view(1) for loss in losses])
+    torch.distributed.all_reduce(averaged_losses,
+                                 group=mpu.get_data_parallel_group())
+    averaged_losses = averaged_losses / \
+        torch.distributed.get_world_size(group=mpu.get_data_parallel_group())
+
+    return averaged_losses
+
+
+def _kernel_make_viewless_tensor(inp, requires_grad):
+    '''Make a viewless tensor.
+
+    View tensors have the undesirable side-effect of retaining a reference
+    to the originally-viewed tensor, even after manually setting the '.data'
+    field. This method creates a new tensor that links to the old tensor's
+    data, without linking the viewed tensor, referenced via the '._base'
+    field.
+    '''
+    out = torch.empty(
+        (1,),
+        dtype = inp.dtype,
+        device = inp.device,
+        requires_grad = requires_grad,
+    )
+    out.data = inp.data
+    return out
+
+class MakeViewlessTensor(torch.autograd.Function):
+    '''
+    Autograd function to make a viewless tensor.
+
+    This function should be used in cases where the computation graph needs
+    to be propagated, but we only want a viewless tensor (e.g.,
+    ParallelTransformer's hidden_states). Call this function by passing
+    'keep_graph = True' to 'make_viewless_tensor()'.
+    '''
+    @staticmethod
+    def forward(ctx, inp, requires_grad):
+        return _kernel_make_viewless_tensor(inp, requires_grad)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output, None
+
+def make_viewless_tensor(inp, requires_grad, keep_graph):
+    '''
+    Entry-point for creating viewless tensors.
+
+    This method should be used, rather than calling 'MakeViewlessTensor'
+    or '_kernel_make_viewless_tensor' directly. This method acts as a
+    switch for determining if an autograd function or a regular method
+    should be used to create the tensor.
+    '''
+
+    # Return tensor as-is, if not a 'view'.
+    if inp._base is None:
+        return inp
+
+    # Create viewless tensor.
+    if keep_graph:
+        return MakeViewlessTensor.apply(inp, requires_grad)
+    else:
+        return _kernel_make_viewless_tensor(inp, requires_grad)
+
+
+def safely_set_viewless_tensor_data(tensor, new_data_tensor):
+    '''Safely set tensor's '.data' field.
+
+    Check first that the tensor is viewless (i.e., '._base' not set).
If not, + raise an exception. + ''' + # assert_viewless_tensor( + # tensor, + # extra_msg="FYI, tensor._base has shape %s, and new_data_tensor has shape %s." + # % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape), + # ) + tensor.data = new_data_tensor diff --git a/training/DeepSpeed-Domino/megatron/__init__.py b/training/DeepSpeed-Domino/megatron/__init__.py new file mode 100644 index 000000000..aa99c0665 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + +from .global_vars import get_args, get_retro_args +from .global_vars import get_current_global_batch_size +from .global_vars import get_num_microbatches +from .global_vars import get_signal_handler +from .global_vars import update_num_microbatches +from .global_vars import get_tokenizer +from .global_vars import get_tensorboard_writer +from .global_vars import get_adlr_autoresume +from .global_vars import get_timers +from .initialize import initialize_megatron + +from .utils import (print_rank_0, + is_last_rank, + print_rank_last) diff --git a/training/DeepSpeed-Domino/megatron/arguments.py b/training/DeepSpeed-Domino/megatron/arguments.py new file mode 100644 index 000000000..ba398aa44 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/arguments.py @@ -0,0 +1,1305 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron arguments.""" + +import argparse +import dataclasses +import json +import os +import torch +import types + +import torch.nn.functional as F +from megatron.global_vars import set_retro_args, get_retro_args +# from tools.retro.utils import get_args_path as get_retro_args_path + +from megatron.core.transformer import TransformerConfig + +def parse_args(extra_args_provider=None, ignore_unknown_args=False): + """Parse all arguments.""" + parser = argparse.ArgumentParser(description='Megatron-LM Arguments', + allow_abbrev=False) + + # Standard arguments. + parser = _add_network_size_args(parser) + parser = _add_regularization_args(parser) + parser = _add_training_args(parser) + parser = _add_initialization_args(parser) + parser = _add_learning_rate_args(parser) + parser = _add_checkpointing_args(parser) + parser = _add_mixed_precision_args(parser) + parser = _add_distributed_args(parser) + parser = _add_validation_args(parser) + parser = _add_data_args(parser) + parser = _add_autoresume_args(parser) + parser = _add_biencoder_args(parser) + parser = _add_vision_args(parser) + parser = _add_logging_args(parser) + parser = _add_inference_args(parser) + parser = _add_transformer_engine_args(parser) + parser = _add_retro_args(parser) + + # Custom arguments. + if extra_args_provider is not None: + parser = extra_args_provider(parser) + + # Parse. + if ignore_unknown_args: + args, _ = parser.parse_known_args() + else: + args = parser.parse_args() + + # Args from environment + args.rank = int(os.getenv('RANK', '0')) + args.world_size = int(os.getenv("WORLD_SIZE", '1')) + + return args + +def validate_args(args, defaults={}): + # Tensor model parallel size. + args.tensor_model_parallel_size = min( + args.tensor_model_parallel_size, args.world_size) + assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\ + ' ({}) is not divisible by tensor model parallel size ({})'.format( + args.world_size, args.tensor_model_parallel_size) + # Pipeline model parallel size. 
+ args.pipeline_model_parallel_size = min( + args.pipeline_model_parallel_size, + (args.world_size // args.tensor_model_parallel_size)) + args.transformer_pipeline_model_parallel_size = ( + args.pipeline_model_parallel_size - 1 + if args.standalone_embedding_stage else + args.pipeline_model_parallel_size + ) + # Checks. + model_parallel_size = args.pipeline_model_parallel_size * \ + args.tensor_model_parallel_size + assert args.world_size % model_parallel_size == 0, 'world size ({}) is not'\ + ' divisible by tensor parallel size ({}) times pipeline parallel ' \ + 'size ({})'.format(args.world_size, args.tensor_model_parallel_size, + args.pipeline_model_parallel_size) + args.data_parallel_size = args.world_size // model_parallel_size + if args.rank == 0: + print('using world size: {}, data-parallel-size: {}, ' + 'tensor-model-parallel size: {}, ' + 'pipeline-model-parallel size: {} '.format( + args.world_size, args.data_parallel_size, + args.tensor_model_parallel_size, + args.pipeline_model_parallel_size), flush=True) + if args.pipeline_model_parallel_size > 1: + if args.pipeline_model_parallel_split_rank is not None: + assert args.pipeline_model_parallel_split_rank < \ + args.pipeline_model_parallel_size, 'split rank needs'\ + ' to be less than pipeline model parallel size ({})'.format( + args.pipeline_model_parallel_size) + + # Deprecated arguments + assert args.batch_size is None, '--batch-size argument is no longer ' \ + 'valid, use --micro-batch-size instead' + del args.batch_size + assert args.warmup is None, '--warmup argument is no longer valid, use ' \ + '--lr-warmup-fraction instead' + del args.warmup + assert args.model_parallel_size is None, '--model-parallel-size is no ' \ + 'longer valid, use --tensor-model-parallel-size instead' + del args.model_parallel_size + + if args.checkpoint_activations: + if args.rank == 0: + print('--checkpoint-activations is no longer valid, use --recompute-activations, ' + 'or, for more control, --recompute-granularity and --recompute-method.') + exit() + del args.checkpoint_activations + + if args.recompute_activations: + args.recompute_granularity = 'selective' + del args.recompute_activations + + # Set input defaults. + for key in defaults: + # For default to be valid, it should not be provided in the + # arguments that are passed to the program. We check this by + # ensuring the arg is set to None. + if getattr(args, key, None) is not None: + if args.rank == 0: + print('WARNING: overriding default arguments for {key}:{v} \ + with {key}:{v2}'.format(key=key, v=defaults[key], + v2=getattr(args, key)), + flush=True) + else: + setattr(args, key, defaults[key]) + + # Batch size. 
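# A minimal worked example of the default applied below (illustrative values
# only): with micro_batch_size = 4 and data_parallel_size = 8, leaving
# --global-batch-size unset gives global_batch_size = 4 * 8 = 32, which
# corresponds to a single micro-batch per data-parallel rank per step; a larger
# explicit --global-batch-size is what introduces gradient accumulation.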
+ assert args.micro_batch_size is not None + assert args.micro_batch_size > 0 + if args.global_batch_size is None: + args.global_batch_size = args.micro_batch_size * args.data_parallel_size + if args.rank == 0: + print('setting global batch size to {}'.format( + args.global_batch_size), flush=True) + assert args.global_batch_size > 0 + if args.num_layers_per_virtual_pipeline_stage is not None: + assert args.pipeline_model_parallel_size > 2, \ + 'pipeline-model-parallel size should be greater than 2 with ' \ + 'interleaved schedule' + assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \ + 'number of layers is not divisible by number of layers per virtual ' \ + 'pipeline stage' + args.virtual_pipeline_model_parallel_size = \ + (args.num_layers // args.transformer_pipeline_model_parallel_size) // \ + args.num_layers_per_virtual_pipeline_stage + else: + args.virtual_pipeline_model_parallel_size = None + + # Parameters dtype. + args.params_dtype = torch.float + if args.fp16: + assert not args.bf16 + args.params_dtype = torch.half + if args.bf16: + assert not args.fp16 + args.params_dtype = torch.bfloat16 + # bfloat16 requires gradient accumulation and all-reduce to + # be done in fp32. + if not args.accumulate_allreduce_grads_in_fp32: + args.accumulate_allreduce_grads_in_fp32 = True + if args.rank == 0: + print('accumulate and all-reduce gradients in fp32 for ' + 'bfloat16 data type.', flush=True) + + if args.rank == 0: + print('using {} for parameters ...'.format(args.params_dtype), + flush=True) + + # If we do accumulation and all-reduces in fp32, we need to have local DDP + # and we should make sure use-contiguous-buffers-in-local-ddp is not off. + if args.accumulate_allreduce_grads_in_fp32: + assert args.DDP_impl == 'local' + assert args.use_contiguous_buffers_in_local_ddp + + # If we use the distributed optimizer, we need to have local DDP + # and we should make sure use-contiguous-buffers-in-local-ddp is on. + if args.use_distributed_optimizer: + assert args.DDP_impl == 'local' + assert args.use_contiguous_buffers_in_local_ddp + + # For torch DDP, we do not use contiguous buffer + if args.DDP_impl == 'torch': + args.use_contiguous_buffers_in_local_ddp = False + + if args.dataloader_type is None: + args.dataloader_type = 'single' + + # Consumed tokens. + args.consumed_train_samples = 0 + args.consumed_valid_samples = 0 + + # Support for variable sequence lengths across batches/microbatches. + # set it if the dataloader supports generation of variable sequence lengths + # across batches/microbatches. Due to additional communication overhead + # during pipeline parallelism, it should not be set if sequence length + # is constant during training. + args.variable_seq_lengths = False + + # Iteration-based training. + if args.train_iters: + # If we use iteration-based training, make sure the + # sample-based options are off. + assert args.train_samples is None, \ + 'expected iteration-based training' + assert args.lr_decay_samples is None, \ + 'expected iteration-based learning rate decay' + assert args.lr_warmup_samples == 0, \ + 'expected iteration-based learning rate warmup' + assert args.rampup_batch_size is None, \ + 'expected no batch-size rampup for iteration-based training' + if args.lr_warmup_fraction is not None: + assert args.lr_warmup_iters == 0, \ + 'can only specify one of lr-warmup-fraction and lr-warmup-iters' + + # Sample-based training. + if args.train_samples: + # If we use sample-based training, make sure the + # iteration-based options are off. 
+ assert args.train_iters is None, \ + 'expected sample-based training' + assert args.lr_decay_iters is None, \ + 'expected sample-based learning rate decay' + assert args.lr_warmup_iters == 0, \ + 'expected sample-based learnig rate warmup' + if args.lr_warmup_fraction is not None: + assert args.lr_warmup_samples == 0, \ + 'can only specify one of lr-warmup-fraction ' \ + 'and lr-warmup-samples' + + if args.num_layers is not None: + assert args.encoder_num_layers is None, \ + 'cannot have both num-layers and encoder-num-layers specified' + args.encoder_num_layers = args.num_layers + else: + assert args.encoder_num_layers is not None, \ + 'either num-layers or encoder-num-layers should be specified' + args.num_layers = args.encoder_num_layers + + # Check required arguments. + required_args = ['num_layers', 'hidden_size', 'num_attention_heads', + 'max_position_embeddings'] + for req_arg in required_args: + _check_arg_is_not_none(args, req_arg) + + # Checks. + if args.ffn_hidden_size is None: + args.ffn_hidden_size = 4 * args.hidden_size + + if args.swiglu: + # reduce the dimnesion for MLP since projections happens on + # two linear layers. this keeps the number of paramters in + # the same ballpark as the counterpart with 4*h size + # we keep it a multiple of 64, which means the actual tensor size + # will be a multiple of 64 / tp_size + args.ffn_hidden_size = int((4 * args.hidden_size * 2 / 3) / 64) * 64 + + if args.kv_channels is None: + assert args.hidden_size % args.num_attention_heads == 0 + args.kv_channels = args.hidden_size // args.num_attention_heads + + if args.seq_length is not None: + assert args.encoder_seq_length is None + args.encoder_seq_length = args.seq_length + else: + assert args.encoder_seq_length is not None + args.seq_length = args.encoder_seq_length + + if args.seq_length is not None: + assert args.max_position_embeddings >= args.seq_length + if args.decoder_seq_length is not None: + assert args.max_position_embeddings >= args.decoder_seq_length + if args.lr is not None: + assert args.min_lr <= args.lr + if args.save is not None: + assert args.save_interval is not None + # Mixed precision checks. + if args.fp16_lm_cross_entropy: + assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.' + if args.fp32_residual_connection: + assert args.fp16 or args.bf16, \ + 'residual connection in fp32 only supported when using fp16 or bf16.' + + if args.weight_decay_incr_style == 'constant': + assert args.start_weight_decay is None + assert args.end_weight_decay is None + args.start_weight_decay = args.weight_decay + args.end_weight_decay = args.weight_decay + else: + assert args.start_weight_decay is not None + assert args.end_weight_decay is not None + + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + # Persistent fused layer norm. + if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11): + args.no_persist_layer_norm = True + if args.rank == 0: + print('Persistent fused layer norm kernel is supported from ' + 'pytorch v1.11 (nvidia pytorch container paired with v1.11). ' + 'Defaulting to no_persist_layer_norm=True') + + # Activation recomputing. 
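# One flag combination that satisfies the checks below (an illustrative example,
# not a recommendation): --recompute-granularity full --recompute-method uniform
# --recompute-num-layers 1 --distribute-saved-activations, together with
# --tensor-model-parallel-size > 1. Plain --recompute-activations instead maps to
# the 'selective' granularity handled earlier in this function.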
+ if args.distribute_saved_activations: + assert args.tensor_model_parallel_size > 1, 'can distribute ' \ + 'recomputed activations only across tensor model ' \ + 'parallel groups' + assert args.recompute_granularity == 'full', \ + 'distributed recompute activations is only '\ + 'application to full recompute granularity' + assert args.recompute_method is not None, \ + 'for distributed recompute activations to work you '\ + 'need to use a recompute method ' + assert (TORCH_MAJOR, TORCH_MINOR) >= (1, 10), \ + 'distributed recompute activations are supported for pytorch ' \ + 'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \ + 'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR) + + if args.recompute_granularity == 'selective': + assert args.recompute_method is None, \ + 'recompute method is not yet supported for ' \ + 'selective recomputing granularity' + + # disable sequence parallelism when tp=1 + # to avoid change in numerics when + # sequence_parallelism is enabled. + if args.tensor_model_parallel_size == 1: + args.sequence_parallel = False + + # disable async_tensor_model_parallel_allreduce when + # model parallel memory optimization is enabled + if args.sequence_parallel: + args.async_tensor_model_parallel_allreduce = False + + if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": + if args.sequence_parallel: + raise RuntimeError( + "Using sequence parallelism requires setting the environment variable " + "CUDA_DEVICE_MAX_CONNECTIONS to 1") + if args.async_tensor_model_parallel_allreduce: + raise RuntimeError( + "Using async gradient all reduce requires setting the environment " + "variable CUDA_DEVICE_MAX_CONNECTIONS to 1") + + # Disable bias gelu fusion if we are disabling bias altogether + if not args.add_bias_linear: + args.bias_gelu_fusion = False + + # Retro checks. + if args.retro_add_retriever: + + # Sequence parallelism unsupported. + assert not args.sequence_parallel, \ + "retro currently does not support sequence parallelism." + + # Pipeline parallelism unsupported. + assert args.pipeline_model_parallel_size == 1, \ + "retro currently does not support pipeline parallelism." + + # Load retro args. + # retro_args_path = get_retro_args_path(args.retro_workdir) + retro_args_path = None + assert os.path.exists(retro_args_path), "retro workdir missing args.json" + with open(retro_args_path) as f: + retro_args = types.SimpleNamespace(**json.load(f)) + retro_args.retro_return_doc_ids = args.retro_return_doc_ids + retro_args.retro_gpt_retrieved_length = \ + args.retro_num_retrieved_chunks * \ + retro_args.retro_gpt_chunk_length + set_retro_args(retro_args) + + # Legacy RoPE arguments + if args.use_rotary_position_embeddings: + args.position_embedding_type = 'rope' + + # Would just need to add 'NoPE' as a position_embedding_type to support this, but for now + # don't allow it to keep things simple + if not args.add_position_embedding and args.position_embedding_type != 'rope': + raise RuntimeError('--no-position-embedding is deprecated, use --position-embedding-type') + + # Print arguments. 
+ _print_args("arguments", args) + retro_args = get_retro_args() + if retro_args and args != retro_args: + _print_args("retro arguments", types.SimpleNamespace(**{k:v for k,v in vars(retro_args).items() if k.startswith("retro")}, rank=args.rank)) + + return args + + +def _print_args(title, args): + """Print arguments.""" + if args.rank == 0: + print(f'------------------------ {title} ------------------------', + flush=True) + str_list = [] + for arg in vars(args): + dots = '.' * (48 - len(arg)) + str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg))) + for arg in sorted(str_list, key=lambda x: x.lower()): + print(arg, flush=True) + print(f'-------------------- end of {title} ---------------------', + flush=True) + + +def _check_arg_is_not_none(args, arg): + assert getattr(args, arg) is not None, '{} argument is None'.format(arg) + +def core_transformer_config_from_args(args): + + # Translate args to core transformer configuration + kw_args = {} + for f in dataclasses.fields(TransformerConfig): + if hasattr(args, f.name): + kw_args[f.name] = getattr(args, f.name) + kw_args['persist_layer_norm'] = not args.no_persist_layer_norm + kw_args['layernorm_zero_centered_gamma'] = args.apply_layernorm_1p + kw_args['deallocate_pipeline_outputs'] = True + kw_args['pipeline_dtype'] = args.params_dtype + kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm + if args.swiglu: + kw_args['activation_func'] = F.silu + kw_args['gated_linear_unit'] = True + kw_args['bias_gelu_fusion'] = False + if args.init_method_xavier_uniform: + kw_args['init_method'] = torch.nn.init.xavier_uniform_ + kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_ + if args.group_query_attention: + kw_args['num_query_groups'] = args.num_query_groups + else: + kw_args['num_query_groups'] = None + + return TransformerConfig(**kw_args) + +def _add_transformer_engine_args(parser): + group = parser.add_argument_group(title='Transformer-Engine') + + group.add_argument('--fp8-format', default=None, + choices=['e4m3', 'hybrid'], + help='Which fp8 format scheme to use for FP8 tensors in the forward and backward pass', + dest='fp8') + group.add_argument('--fp8-margin', type=int, default=0, + help='Scaling margin for fp8', + dest='fp8_margin') + group.add_argument('--fp8-interval', type=int, default=1, + help='Scaling update interval for fp8', + dest='fp8_interval') + group.add_argument('--fp8-amax-history-len', type=int, default=1, + help='Number of steps for which amax history is recorded per tensor', + dest='fp8_amax_history_len') + group.add_argument('--fp8-amax-compute-algo', default='most_recent', + choices=['most_recent', 'max'], + help='Algorithm for computing amax from history', + dest='fp8_amax_compute_algo') + group.add_argument('--no-fp8-wgrad', action='store_false', + help='Execute wgrad in higher precision even for FP8 runs', + dest='fp8_wgrad') + group.add_argument('--transformer-impl', default='local', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.', + dest='transformer_impl') + group.add_argument('--normalization', default='LayerNorm', + choices=['LayerNorm', 'RMSNorm'], + help='Which normalization technique to use.', + dest='normalization') + + return parser + +def _add_inference_args(parser): + group = parser.add_argument_group(title='inference') + + group.add_argument('--inference-batch-times-seqlen-threshold', + type=int, default=512, + help='During inference, if batch-size times ' + 'sequence-length is smaller than this threshold ' + 'then we will not use 
pipelining, otherwise we will.') + group.add_argument('--max-tokens-to-oom', + type=int, default=12000, + help='Maximum number of tokens during inference' + 'tokens here is # in prompt + # to generate' + 'Allows us to throw an error before OOM crashes server') + group.add_argument('--output-bert-embeddings', action='store_true', + help='Output Bert embeddings (via mean pooling) from ' + 'model, rather than its binary head output or entire ' + 'hidden batch.') + group.add_argument('--bert-embedder-type', default="megatron", + choices=["megatron", "huggingface"], + help='Select either Megatron or Huggingface as the ' + 'Bert embedder.') + + return parser + + +def _add_retro_args(parser): + group = parser.add_argument_group(title='retro') + + group.add_argument('--retro-workdir', default=None, + help='Retro working directory, which contains the ' + 'preprocessed data for for pretraining. This directory ' + 'is built during preprocessing (see ' + 'tools/retro/README.md), and contains subdirectories ' + 'for the chunk database and pretraining neighbors.') + group.add_argument('--retro-add-retriever', + action='store_true', default=False, + help='Add a retriever to the transformer, for use in ' + 'pretraining a Retro model.') + group.add_argument('--retro-cyclic-train-iters', type=int, default=None, + help='Set number of training iterations for cyclic ' + 'Retro training.') + group.add_argument('--retro-encoder-layers', type=int, default=2, + help='Number of layers to use for the retrieval ' + 'encoder.') + group.add_argument('--retro-encoder-hidden-dropout', + type=float, default=0.1, help='Hidden dropout for ' + 'retrieval encoder.') + group.add_argument('--retro-encoder-attention-dropout', + type=float, default=0.1, help='Attention dropout for ' + 'retrieval encoder.') + group.add_argument("--retro-num-neighbors", type=int, default=2, + help='Number of neighbors to retrieve during ' + 'pretraining.') + group.add_argument("--retro-num-retrieved-chunks", type=int, default=2, + help='Number of chunks to retrieve from the retrieval ' + 'database.') + group.add_argument("--retro-return-doc-ids", action="store_true", + help="Turn this on when preprocessing retro data.") + + # Enforce argument naming convention. + for action in group._group_actions: + prefix = action.dest.split("_")[0] + assert prefix == "retro", \ + "Retro args must be prefixed with '--retro-*', for consistent " \ + "styling. Please fix '%s'." % ", ".join(action.option_strings) + + return parser + + +def _add_network_size_args(parser): + group = parser.add_argument_group(title='network size') + + group.add_argument('--num-layers', type=int, default=None, + help='Number of transformer layers.') + group.add_argument('--encoder-num-layers', type=int, default=None, + help='Number of encoder transformer layers.') + group.add_argument('--decoder-num-layers', type=int, default=None, + help='Number of decoder transformer layers.') + group.add_argument('--hidden-size', type=int, default=None, + help='Tansformer hidden size.') + group.add_argument('--ffn-hidden-size', type=int, default=None, + help='Transformer Feed-Forward Network hidden size. ' + 'This is set to 4*hidden-size if not provided') + group.add_argument('--num-attention-heads', type=int, default=None, + help='Number of transformer attention heads.') + group.add_argument('--kv-channels', type=int, default=None, + help='Projection weights dimension in multi-head ' + 'attention. 
This is set to ' + ' args.hidden_size // args.num_attention_heads ' + 'if not provided.') + group.add_argument('--group-query-attention', action='store_true', + help='Use group-query attention.') + group.add_argument('--num-query-groups', type=int, default=1) + + group.add_argument('--max-position-embeddings', type=int, default=None, + help='Maximum number of position embeddings to use. ' + 'This is the size of position embedding.') + group.add_argument('--position-embedding-type', type=str, default='learned_absolute', + choices=['learned_absolute', 'rope'], + help='Position embedding type.') + group.add_argument('--use-rotary-position-embeddings', action='store_true', + help='Use rotary positional embeddings or not. ' + 'Deprecated: use --position-embedding-type') + group.add_argument('--rotary-percent', type=float, default=1.0, + help='Percent of rotary dimension to use, default 100%%') + group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None, + help='Sequence length interpolation factor for rotary embeddings.') + group.add_argument('--no-position-embedding', + action='store_false', + help='Disable position embedding. Deprecated: use --position-embedding-type', + dest='add_position_embedding') + group.add_argument('--make-vocab-size-divisible-by', type=int, default=128, + help='Pad the vocab size to be divisible by this value.' + 'This is added for computational efficieny reasons.') + group.add_argument('--layernorm-epsilon', type=float, default=1e-5, + help='Layer norm epsilon.') + group.add_argument('--apply-layernorm-1p', action='store_true', + help='Adjust LayerNorm weights such that they are centered ' + 'around zero. This improves numerical stability.') + group.add_argument('--apply-residual-connection-post-layernorm', + action='store_true', + help='If set, use original BERT residula connection ' + 'ordering.') + group.add_argument('--openai-gelu', action='store_true', + help='Use OpenAIs GeLU implementation. This option' + 'should not be used unless for backward compatibility' + 'reasons.') + group.add_argument('--squared-relu', action='store_true', + help='Use squared relu activation instead of default gelu') + group.add_argument('--swiglu', action='store_true', + help='Use gated linear units and SiLU activation instead of default gelu') + group.add_argument('--onnx-safe', type=bool, required=False, + help='Use workarounds for known problems with ' + 'Torch ONNX exporter') + group.add_argument('--bert-no-binary-head', action='store_false', + help='Disable BERT binary head.', + dest='bert_binary_head') + group.add_argument('--num-experts', type=int, default=None, + help='Number of Experts in Switch Transformer (None means no Switch)') + group.add_argument('--untie-embeddings-and-output-weights', action='store_true', + help='Untie embeddings and output weights.'), + group.add_argument('--embedding-weights-in-fp32', action='store_true', + help='Cast word embedding weights to fp32 before embedding fwd.'), + return parser + + +def _add_logging_args(parser): + group = parser.add_argument_group(title='logging') + + group.add_argument('--log-params-norm', action='store_true', + help='If set, calculate and log parameters norm.') + group.add_argument('--log-num-zeros-in-grad', action='store_true', + help='If set, calculate and log the number of zeros in gradient.') + group.add_argument('--timing-log-level', type=int, + default=0, choices=range(0,3), + help='Granularity level to measure and report timing. 
' + ' 0: report only iteration time and make sure timing ' + ' does not introduce extra overhead.' + ' 1: report timing for operations that are executed ' + ' very limited times (basically once) during ' + ' each iteration (such as gradient all-reduce) ' + ' 2: report timing for operations that migh be ' + ' executed numerous times during each iteration. ' + 'Note that setting the level to 1 or 2 might ' + 'cause increase in iteration time.') + group.add_argument('--no-barrier-with-level-1-timing', action='store_false', + help='If not set, use barrier with level 1 time ' + 'measurements. Note that this is up to the user ' + 'to make sure calling barrier with their timers ' + 'will not result in hangs. This can happen if for ' + 'example the user adds a level 1 timer that is not ' + 'called by all ranks.', + dest='barrier_with_L1_time') + group.add_argument('--timing-log-option', type=str, default='minmax', + choices=['max', 'minmax', 'all'], + help='Options for logging timing:' + ' max: report the max timing across all ranks' + ' minmax: report min and max timings across all ranks' + ' all: report timings of all ranks.') + group.add_argument('--tensorboard-log-interval', type=int, default=1, + help='Report to tensorboard interval.') + group.add_argument('--tensorboard-queue-size', type=int, default=1000, + help='Size of the tensorboard queue for pending events ' + 'and summaries before one of the ‘add’ calls forces a ' + 'flush to disk.') + group.add_argument('--log-timers-to-tensorboard', action='store_true', + help='If set, write timers to tensorboard.') + group.add_argument('--log-batch-size-to-tensorboard', action='store_true', + help='If set, write batch-size to tensorboard.') + group.add_argument('--no-log-learnig-rate-to-tensorboard', + action='store_false', + help='Disable learning rate logging to tensorboard.', + dest='log_learning_rate_to_tensorboard') + group.add_argument('--no-log-loss-scale-to-tensorboard', + action='store_false', + help='Disable loss-scale logging to tensorboard.', + dest='log_loss_scale_to_tensorboard') + group.add_argument('--log-validation-ppl-to-tensorboard', + action='store_true', + help='If set, write validation perplexity to ' + 'tensorboard.') + group.add_argument('--log-memory-to-tensorboard', + action='store_true', + help='Enable memory logging to tensorboard.') + group.add_argument('--log-world-size-to-tensorboard', + action='store_true', + help='Enable world size logging to tensorboard.') + + return parser + + +def _add_regularization_args(parser): + group = parser.add_argument_group(title='regularization') + + group.add_argument('--attention-dropout', type=float, default=0.1, + help='Post attention dropout probability.') + group.add_argument('--hidden-dropout', type=float, default=0.1, + help='Dropout probability for hidden state transformer.') + group.add_argument('--weight-decay', type=float, default=0.01, + help='Weight decay coefficient for L2 regularization.') + group.add_argument('--start-weight-decay', type=float, + help='Initial weight decay coefficient for L2 regularization.') + group.add_argument('--end-weight-decay', type=float, + help='End of run weight decay coefficient for L2 regularization.') + group.add_argument('--weight-decay-incr-style', type=str, default='constant', + choices=['constant', 'linear', 'cosine'], + help='Weight decay increment function.') + group.add_argument('--clip-grad', type=float, default=1.0, + help='Gradient clipping based on global L2 norm.') + group.add_argument('--adam-beta1', type=float, default=0.9, + 
help='First coefficient for computing running averages ' + 'of gradient and its square') + group.add_argument('--adam-beta2', type=float, default=0.999, + help='Second coefficient for computing running averages ' + 'of gradient and its square') + group.add_argument('--adam-eps', type=float, default=1e-08, + help='Term added to the denominator to improve' + 'numerical stability') + group.add_argument('--sgd-momentum', type=float, default=0.9, + help='Momentum factor for sgd') + + return parser + + +def _add_training_args(parser): + group = parser.add_argument_group(title='training') + + group.add_argument('--micro-batch-size', type=int, default=None, + help='Batch size per model instance (local batch size). ' + 'Global batch size is local batch size times data ' + 'parallel size times number of micro batches.') + group.add_argument('--batch-size', type=int, default=None, + help='Old batch size parameter, do not use. ' + 'Use --micro-batch-size instead') + group.add_argument('--global-batch-size', type=int, default=None, + help='Training batch size. If set, it should be a ' + 'multiple of micro-batch-size times data-parallel-size. ' + 'If this value is None, then ' + 'use micro-batch-size * data-parallel-size as the ' + 'global batch size. This choice will result in 1 for ' + 'number of micro-batches.') + group.add_argument('--rampup-batch-size', nargs='*', default=None, + help='Batch size ramp up with the following values:' + ' --rampup-batch-size ' + ' ' + ' ' + 'For example:' + ' --rampup-batch-size 16 8 300000 \ ' + ' --global-batch-size 1024' + 'will start with global batch size 16 and over ' + ' (1024 - 16) / 8 = 126 intervals will increase' + 'the batch size linearly to 1024. In each interval' + 'we will use approximately 300000 / 126 = 2380 samples.') + group.add_argument('--recompute-activations', action='store_true', + help='recompute activation to allow for training ' + 'with larger models, sequences, and batch sizes.') + group.add_argument('--recompute-granularity', type=str, default=None, + choices=['full', 'selective'], + help='Checkpoint activations to allow for training ' + 'with larger models, sequences, and batch sizes. ' + 'It is supported at two granularities 1) full: ' + 'whole transformer layer is recomputed, ' + '2) selective: core attention part of the transformer ' + 'layer is recomputed.') + group.add_argument('--distribute-saved-activations', + action='store_true', + help='If set, distribute recomputed activations ' + 'across model parallel group.') + group.add_argument('--recompute-method', type=str, default=None, + choices=['uniform', 'block'], + help='1) uniform: uniformly divide the total number of ' + 'Transformer layers and recompute the input activation of ' + 'each divided chunk at specified granularity, ' + '2) recompute the input activations of only a set number of ' + 'individual Transformer layers per pipeline stage and do the ' + 'rest without any recomputing at specified granularity' + 'default) do not apply activations recompute to any layers') + group.add_argument('--recompute-num-layers', type=int, default=None, + help='1) uniform: the number of Transformer layers in each ' + 'uniformly divided recompute unit, ' + '2) block: the number of individual Transformer layers ' + 'to recompute within each pipeline stage.') + group.add_argument('--profile', action='store_true', + help='Enable nsys profiling. When using this option, nsys ' + 'options should be specified in commandline. 
An example '
+                       'nsys commandline is `nsys profile -s none -t nvtx,cuda '
+                       '-o --force-overwrite true '
+                       '--capture-range=cudaProfilerApi '
+                       '--capture-range-end=stop`.')
+    group.add_argument('--profile-step-start', type=int, default=10,
+                       help='Global step to start profiling.')
+    group.add_argument('--profile-step-end', type=int, default=12,
+                       help='Global step to stop profiling.')
+    group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
+                       help='Global ranks to profile.')
+
+
+    # deprecated
+    group.add_argument('--checkpoint-activations', action='store_true',
+                       help='Checkpoint activation to allow for training '
+                       'with larger models, sequences, and batch sizes.')
+    group.add_argument('--train-iters', type=int, default=None,
+                       help='Total number of iterations to train over all '
+                       'training runs. Note that either train-iters or '
+                       'train-samples should be provided.')
+    group.add_argument('--train-samples', type=int, default=None,
+                       help='Total number of samples to train over all '
+                       'training runs. Note that either train-iters or '
+                       'train-samples should be provided.')
+    group.add_argument('--log-interval', type=int, default=100,
+                       help='Report loss and timing interval.')
+    group.add_argument('--exit-interval', type=int, default=None,
+                       help='Exit the program after the iteration is divisible '
+                       'by this value.')
+    group.add_argument('--exit-duration-in-mins', type=int, default=None,
+                       help='Exit the program after this many minutes.')
+    group.add_argument('--exit-signal-handler', action='store_true',
+                       help='Dynamically save the checkpoint and shutdown the '
+                       'training if SIGTERM is received')
+    group.add_argument('--tensorboard-dir', type=str, default=None,
+                       help='Write TensorBoard logs to this directory.')
+    group.add_argument('--no-masked-softmax-fusion',
+                       action='store_false',
+                       help='Disable fusion of query_key_value scaling, '
+                       'masking, and softmax.',
+                       dest='masked_softmax_fusion')
+    group.add_argument('--no-bias-gelu-fusion', action='store_false',
+                       help='Disable bias and gelu fusion.',
+                       dest='bias_gelu_fusion')
+    group.add_argument('--no-bias-dropout-fusion', action='store_false',
+                       help='Disable bias and dropout fusion.',
+                       dest='bias_dropout_fusion')
+    group.add_argument('--use-flash-attn', action='store_true',
+                       help='use FlashAttention implementation of attention. '
+                       'https://arxiv.org/abs/2205.14135')
+    group.add_argument('--disable-bias-linear', action='store_false',
+                       help='Disable bias in the linear layers',
+                       dest='add_bias_linear')
+    group.add_argument('--optimizer', type=str, default='adam',
+                       choices=['adam', 'sgd'],
+                       help='Optimizer function')
+    group.add_argument('--dataloader-type', type=str, default=None,
+                       choices=['single', 'cyclic'],
+                       help='Single pass vs multiple pass data loader')
+    group.add_argument('--no-async-tensor-model-parallel-allreduce',
+                       action='store_false',
+                       help='Disable asynchronous execution of '
+                       'tensor-model-parallel all-reduce with weight '
+                       'gradient computation of a column-linear layer.',
+                       dest='async_tensor_model_parallel_allreduce')
+    group.add_argument('--no-persist-layer-norm', action='store_true',
+                       help='Disable using persistent fused layer norm kernel. '
+                       'This kernel supports only a set of hidden sizes. Please '
+                       'check persist_ln_hidden_sizes if your hidden '
+                       'size is supported.')
+    group.add_argument('--sequence-parallel', action='store_true',
+                       help='Enable sequence parallel optimization.')
+    group.add_argument('--no-gradient-accumulation-fusion',
+                       action='store_false',
+                       help='Disable fusing gradient accumulation to weight '
+                       'gradient computation of linear layers',
+                       dest='gradient_accumulation_fusion')
+    return parser
+
+
+def _add_initialization_args(parser):
+    group = parser.add_argument_group(title='initialization')
+
+    group.add_argument('--seed', type=int, default=1234,
+                       help='Random seed used for python, numpy, '
+                       'pytorch, and cuda.')
+    group.add_argument('--data-parallel-random-init', action='store_true',
+                       help='Enable random initialization of params '
+                       'across data parallel ranks')
+    group.add_argument('--init-method-std', type=float, default=0.02,
+                       help='Standard deviation of the zero mean normal '
+                       'distribution used for weight initialization.')
+    group.add_argument('--init-method-xavier-uniform', action='store_true',
+                       help='Enable Xavier uniform parameter initialization')
+
+    return parser
+
+
+def _add_learning_rate_args(parser):
+    group = parser.add_argument_group(title='learning rate')
+
+    group.add_argument('--lr', type=float, default=None,
+                       help='Initial learning rate. Depending on decay style '
+                       'and initial warmup, the learning rate at each '
+                       'iteration would be different.')
+    group.add_argument('--lr-decay-style', type=str, default='linear',
+                       choices=['constant', 'linear', 'cosine', 'inverse-square-root'],
+                       help='Learning rate decay function.')
+    group.add_argument('--lr-decay-iters', type=int, default=None,
+                       help='number of iterations to decay learning rate over,'
+                       ' If None defaults to `--train-iters`')
+    group.add_argument('--lr-decay-samples', type=int, default=None,
+                       help='number of samples to decay learning rate over,'
+                       ' If None defaults to `--train-samples`')
+    group.add_argument('--lr-warmup-fraction', type=float, default=None,
+                       help='fraction of lr-warmup-(iters/samples) to use '
+                       'for warmup (as a float)')
+    group.add_argument('--lr-warmup-iters', type=int, default=0,
+                       help='number of iterations to linearly warmup '
+                       'learning rate over.')
+    group.add_argument('--lr-warmup-samples', type=int, default=0,
+                       help='number of samples to linearly warmup '
+                       'learning rate over.')
+    group.add_argument('--lr-warmup-init', type=float, default=0.0,
+                       help='Initial value for learning rate warmup. The '
+                       'scheduler starts warmup from this value.')
+    group.add_argument('--warmup', type=int, default=None,
+                       help='Old lr warmup argument, do not use. Use one of the '
+                       '--lr-warmup-* arguments above')
+    group.add_argument('--min-lr', type=float, default=0.0,
+                       help='Minimum value for learning rate. The scheduler '
+                       'clips values below this threshold.')
+    group.add_argument('--override-opt_param-scheduler', action='store_true',
+                       help='Reset the values of the scheduler (learning rate, '
+                       'warmup iterations, minimum learning rate, maximum '
+                       'number of iterations, and decay style from input '
+                       'arguments and ignore values from checkpoints. 
Note' + 'that all the above values will be reset.') + group.add_argument('--use-checkpoint-opt_param-scheduler', action='store_true', + help='Use checkpoint to set the values of the scheduler ' + '(learning rate, warmup iterations, minimum learning ' + 'rate, maximum number of iterations, and decay style ' + 'from checkpoint and ignore input arguments.') + + return parser + + +def _add_checkpointing_args(parser): + group = parser.add_argument_group(title='checkpointing') + + group.add_argument('--save', type=str, default=None, + help='Output directory to save checkpoints to.') + group.add_argument('--save-interval', type=int, default=None, + help='Number of iterations between checkpoint saves.') + group.add_argument('--no-save-optim', action='store_true', default=None, + help='Do not save current optimizer.') + group.add_argument('--no-save-rng', action='store_true', default=None, + help='Do not save current rng state.') + group.add_argument('--load', type=str, default=None, + help='Directory containing a model checkpoint.') + group.add_argument('--no-load-optim', action='store_true', default=None, + help='Do not load optimizer when loading checkpoint.') + group.add_argument('--no-load-rng', action='store_true', default=None, + help='Do not load rng state when loading checkpoint.') + group.add_argument('--finetune', action='store_true', + help='Load model for finetuning. Do not load optimizer ' + 'or rng state from checkpoint and set iteration to 0. ' + 'Assumed when loading a release checkpoint.') + group.add_argument('--no-initialization', action='store_false', + help='Do not perform initialization when building model, ' + 'can reduce startup time when definitely loading from a ' + 'checkpoint', + dest='perform_initialization') + group.add_argument('--use-checkpoint-args', action='store_true', + help='Override any command line arguments with arguments ' + 'from the checkpoint') + group.add_argument('--exit-on-missing-checkpoint', action='store_true', + help="If '--load' is set, but checkpoint is not found " + "(e.g., path typo), then exit instead of random " + "initialization.") + + return parser + + +def _add_mixed_precision_args(parser): + group = parser.add_argument_group(title='mixed precision') + + group.add_argument('--fp16', action='store_true', + help='Run model in fp16 mode.') + group.add_argument('--bf16', action='store_true', + help='Run model in bfloat16 mode.') + group.add_argument('--loss-scale', type=float, default=None, + help='Static loss scaling, positive power of 2 ' + 'values can improve fp16 convergence. If None, dynamic' + 'loss scaling is used.') + group.add_argument('--initial-loss-scale', type=float, default=2**32, + help='Initial loss-scale for dynamic loss scaling.') + group.add_argument('--min-loss-scale', type=float, default=1.0, + help='Minimum loss scale for dynamic loss scale.') + group.add_argument('--loss-scale-window', type=float, default=1000, + help='Window over which to raise/lower dynamic scale.') + group.add_argument('--hysteresis', type=int, default=2, + help='hysteresis for dynamic loss scaling') + group.add_argument('--fp32-residual-connection', action='store_true', + help='Move residual connections to fp32.') + group.add_argument('--no-query-key-layer-scaling', action='store_false', + help='Do not scale Q * K^T by 1 / layer-number.', + dest='apply_query_key_layer_scaling') + group.add_argument('--attention-softmax-in-fp32', action='store_true', + help='Run attention masking and softmax in fp32. 
' + 'This flag is ignored unless ' + '--no-query-key-layer-scaling is specified.') + group.add_argument('--accumulate-allreduce-grads-in-fp32', + action='store_true', + help='Gradient accumulation and all-reduce in fp32.') + group.add_argument('--fp16-lm-cross-entropy', action='store_true', + help='Move the cross entropy unreduced loss calculation' + 'for lm head to fp16.') + + return parser + + +def _add_distributed_args(parser): + group = parser.add_argument_group(title='distributed') + + group.add_argument('--tensor-model-parallel-size', type=int, default=1, + help='Degree of tensor model parallelism.') + group.add_argument('--pipeline-model-parallel-size', type=int, default=1, + help='Degree of pipeline model parallelism.') + group.add_argument('--pipeline-model-parallel-split-rank', + type=int, default=None, + help='Rank where encoder and decoder should be split.') + group.add_argument('--model-parallel-size', type=int, default=None, + help='Old model parallel argument, do not use. Use ' + '--tensor-model-parallel-size instead.') + group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None, + help='Number of layers per virtual pipeline stage') + group.add_argument('--overlap-p2p-communication', + action='store_true', + help='overlap pipeline parallel communication with forward and backward chunks', + dest='overlap_p2p_comm') + group.add_argument('--distributed-backend', default='nccl', + choices=['nccl', 'gloo'], + help='Which backend to use for distributed training.') + group.add_argument('--distributed-timeout-minutes', type=int, default=10, + help='Timeout minutes for torch.distributed.') + group.add_argument('--DDP-impl', default='local', + choices=['local', 'torch'], + help='which DistributedDataParallel implementation ' + 'to use.') + group.add_argument('--no-contiguous-buffers-in-local-ddp', + action='store_false', help='If set, dont use ' + 'contiguous buffer in local DDP.', + dest='use_contiguous_buffers_in_local_ddp') + group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false', + help='Use scatter/gather to optimize communication of tensors in pipeline', + dest='scatter_gather_tensors_in_pipeline') + group.add_argument('--use-ring-exchange-p2p', action='store_true', + default=False, help='If set, use custom-built ring exchange ' + 'for p2p communications. Note that this option will require ' + 'a custom built image that support ring-exchange p2p.') + group.add_argument('--local_rank', type=int, default=None, + help='local rank passed from distributed launcher.') + group.add_argument('--lazy-mpu-init', type=bool, required=False, + help='If set to True, initialize_megatron() ' + 'skips DDP initialization and returns function to ' + 'complete it instead.Also turns on ' + '--use-cpu-initialization flag. This is for ' + 'external DDP manager.' ) + group.add_argument('--use-cpu-initialization', action='store_true', + default=None, help='If set, affine parallel weights ' + 'initialization uses CPU' ) + group.add_argument('--empty-unused-memory-level', default=0, type=int, + choices=[0, 1, 2], + help='Call torch.cuda.empty_cache() each iteration ' + '(training and eval), to reduce fragmentation.' + '0=off, 1=moderate, 2=aggressive.') + group.add_argument('--standalone-embedding-stage', action='store_true', + default=False, help='If set, *input* embedding layer ' + 'is placed on its own pipeline stage, without any ' + 'transformer layers. 
(For T5, this flag currently only ' + 'affects the encoder embedding.)') + group.add_argument('--use-distributed-optimizer', action='store_true', + help='Use distributed optimizer.') + + return parser + + +def _add_validation_args(parser): + group = parser.add_argument_group(title='validation') + + group.add_argument('--eval-iters', type=int, default=100, + help='Number of iterations to run for evaluation' + 'validation/test for.') + group.add_argument('--eval-interval', type=int, default=1000, + help='Interval between running evaluation on ' + 'validation set.') + group.add_argument('--skip-train', action='store_true', + default=False, help='If set, bypass the training loop, ' + 'optionally do evaluation for validation/test, and exit.') + + return parser + + +def _add_data_args(parser): + group = parser.add_argument_group(title='data and dataloader') + + group.add_argument('--data-path', nargs='*', default=None, + help='Path to the training dataset. Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-weight dataset1-path dataset2-weight ' + 'dataset2-path ... It is used with --split when a ' + 'single dataset used for all three: train, valid ' + 'and test. It is exclusive to the other ' + '--*-data-path args') + group.add_argument('--split', type=str, default='969, 30, 1', + help='Comma-separated list of proportions for training,' + ' validation, and test split. For example the split ' + '`90,5,5` will use 90%% of data for training, 5%% for ' + 'validation and 5%% for test.') + group.add_argument('--train-data-path', nargs='*', default=None, + help='Path to the training dataset. Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-weight dataset1-path dataset2-weight ' + 'dataset2-path ...') + group.add_argument('--valid-data-path', nargs='*', default=None, + help='Path to the validation dataset. Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-weight dataset1-path dataset2-weight ' + 'dataset2-path ...') + group.add_argument('--test-data-path', nargs='*', default=None, + help='Path to the test dataset. Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-weight dataset1-path dataset2-weight ' + 'dataset2-path ...') + group.add_argument('--data-cache-path', default=None, + help='Path to a directory to hold cached index files.') + + group.add_argument('--vocab-size', type=int, default=None, + help='Size of vocab before EOD or padding.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file.') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file.') + group.add_argument('--vocab-extra-ids', type=int, default=0, + help='Number of additional vocabulary tokens. ' + 'They are used for span masking in the T5 model') + group.add_argument('--seq-length', type=int, default=None, + help='Maximum sequence length to process.') + group.add_argument('--encoder-seq-length', type=int, default=None, + help='Maximum encoder sequence length to process.' + 'This should be exclusive of --seq-length') + group.add_argument('--decoder-seq-length', type=int, default=None, + help="Maximum decoder sequence length to process.") + group.add_argument('--retriever-seq-length', type=int, default=256, + help='Maximum sequence length for the biencoder model ' + 'for retriever') + group.add_argument('--sample-rate', type=float, default=1.0, + help='sample rate for training data. 
Supposed to be 0 ' + ' < sample_rate < 1') + group.add_argument('--mask-prob', type=float, default=0.15, + help='Probability of replacing a token with mask.') + group.add_argument('--short-seq-prob', type=float, default=0.1, + help='Probability of producing a short sequence.') + group.add_argument('--mmap-warmup', action='store_true', + help='Warm up mmap files.') + group.add_argument('--num-workers', type=int, default=2, + help="Dataloader number of workers.") + group.add_argument('--tokenizer-type', type=str, + default=None, + choices=['BertWordPieceLowerCase', + 'BertWordPieceCase', + 'GPT2BPETokenizer', + 'SentencePieceTokenizer', + 'GPTSentencePieceTokenizer', + 'NullTokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--tokenizer-model', type=str, default=None, + help='Sentencepiece tokenizer model.') + group.add_argument('--data-impl', type=str, default='infer', + choices=['mmap', 'infer'], + help='Implementation of indexed datasets.') + group.add_argument('--reset-position-ids', action='store_true', + help='Reset posistion ids after end-of-document token.') + group.add_argument('--reset-attention-mask', action='store_true', + help='Reset self attention maske after ' + 'end-of-document token.') + group.add_argument('--eod-mask-loss', action='store_true', + help='Mask loss for the end of document tokens.') + + return parser + + +def _add_autoresume_args(parser): + group = parser.add_argument_group(title='autoresume') + + group.add_argument('--adlr-autoresume', action='store_true', + help='Enable autoresume on adlr cluster.') + group.add_argument('--adlr-autoresume-interval', type=int, default=1000, + help='Intervals over which check for autoresume' + 'termination signal') + + return parser + + +def _add_biencoder_args(parser): + group = parser.add_argument_group(title='biencoder') + + # network size + group.add_argument('--ict-head-size', type=int, default=None, + help='Size of block embeddings to be used in ICT and ' + 'REALM (paper default: 128)') + group.add_argument('--biencoder-projection-dim', type=int, default=0, + help='Size of projection head used in biencoder (paper' + ' default: 128)') + group.add_argument('--biencoder-shared-query-context-model', action='store_true', + help='Whether to share the parameters of the query ' + 'and context models or not') + + # checkpointing + group.add_argument('--ict-load', type=str, default=None, + help='Directory containing an ICTBertModel checkpoint') + group.add_argument('--bert-load', type=str, default=None, + help='Directory containing an BertModel checkpoint ' + '(needed to start ICT and REALM)') + + # data + group.add_argument('--titles-data-path', type=str, default=None, + help='Path to titles dataset used for ICT') + group.add_argument('--query-in-block-prob', type=float, default=0.1, + help='Probability of keeping query in block for ' + 'ICT dataset') + group.add_argument('--use-one-sent-docs', action='store_true', + help='Whether to use one sentence documents in ICT') + group.add_argument('--evidence-data-path', type=str, default=None, + help='Path to Wikipedia Evidence frm DPR paper') + + # training + group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int, + default=[], help="Which top-k accuracies to report " + "(e.g. 
'1 5 20')") + group.add_argument('--retriever-score-scaling', action='store_true', + help='Whether to scale retriever scores by inverse ' + 'square root of hidden size') + + # faiss index + group.add_argument('--block-data-path', type=str, default=None, + help='Where to save/load BlockData to/from') + group.add_argument('--embedding-path', type=str, default=None, + help='Where to save/load Open-Retrieval Embedding' + ' data to/from') + + # indexer + group.add_argument('--indexer-batch-size', type=int, default=128, + help='How large of batches to use when doing indexing ' + 'jobs') + group.add_argument('--indexer-log-interval', type=int, default=1000, + help='After how many batches should the indexer ' + 'report progress') + return parser + + +def _add_vision_args(parser): + group = parser.add_argument_group(title="vision") + + # general vision arguements + group.add_argument('--num-classes', type=int, default=1000, + help='num of classes in vision classificaiton task') + group.add_argument('--img-h', type=int, default=224, + help='Image height for vision classification task') + group.add_argument('--img-w', type=int, default=224, + help='Image height for vision classification task') + group.add_argument('--num-channels', type=int, default=3, + help='Number of channels in input image data') + group.add_argument('--patch-dim', type=int, default=16, + help='patch dimension') + group.add_argument('--classes-fraction', type=float, default=1.0, + help='training with fraction of classes.') + group.add_argument('--data-per-class-fraction', type=float, default=1.0, + help='training with fraction of data per class.') + group.add_argument('--no-data-sharding', action='store_false', + help='Disable data sharding.', + dest='data_sharding') + group.add_argument('--head-lr-mult', type=float, default=1.0, + help='learning rate multiplier for head during finetuning') + + # pretraining type and backbone selection` + group.add_argument('--vision-pretraining', action='store_true', + help='flag to indicate vision pretraining') + group.add_argument('--vision-pretraining-type', type=str, default='classify', + choices=['classify', 'inpaint', 'dino'], + help='pretraining objectives') + group.add_argument('--vision-backbone-type', type=str, default='vit', + choices=['vit', 'mit', 'swin'], + help='backbone types types') + group.add_argument('--swin-backbone-type', type=str, default='tiny', + choices=['tiny', 'base', 'h3'], + help='pretraining objectives') + + # inpainting arguments + group.add_argument('--mask-type', type=str, default='random', + choices=['random', 'row'], + help='mask types') + group.add_argument('--mask-factor', type=float, default=1.0, + help='mask size scaling parameter') + + # dino arguments + group.add_argument('--iter-per-epoch', type=int, default=1250, + help='iterations per epoch') + group.add_argument('--dino-local-img-size', type=int, default=96, + help='Image size for vision classification task') + group.add_argument('--dino-local-crops-number', type=int, default=10, + help='Number of local crops') + group.add_argument('--dino-head-hidden-size', type=int, default=2048, + help='Hidden dimension size in dino head') + group.add_argument('--dino-bottleneck-size', type=int, default=256, + help='Bottle neck dimension in dino head ') + group.add_argument('--dino-freeze-last-layer', type=float, default=1, + help='Freezing last layer weights') + group.add_argument('--dino-norm-last-layer', action='store_true', + help='Disable Norm in last layer.') + 
group.add_argument('--dino-warmup-teacher-temp', type=float, default=0.04, + help='warump teacher temperature') + group.add_argument('--dino-teacher-temp', type=float, default=0.07, + help='teacher temperature') + group.add_argument('--dino-warmup-teacher-temp-epochs', type=int, default=30, + help='warmup teacher temperaure epochs') + + return parser diff --git a/training/DeepSpeed-Domino/megatron/checkpointing.py b/training/DeepSpeed-Domino/megatron/checkpointing.py new file mode 100644 index 000000000..e88b58513 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/checkpointing.py @@ -0,0 +1,686 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Input/output checkpointing.""" + +import os +import random +import sys +import numpy as np + +import torch + +from megatron import update_num_microbatches +from megatron.core import mpu, tensor_parallel +from .global_vars import get_args +from .utils import (unwrap_model, + print_rank_0) + + +_CHECKPOINT_VERSION = None + + +def set_checkpoint_version(value): + global _CHECKPOINT_VERSION + if _CHECKPOINT_VERSION is not None: + assert _CHECKPOINT_VERSION == value, \ + "checkpoint versions do not match" + _CHECKPOINT_VERSION = value + + +def get_checkpoint_version(): + global _CHECKPOINT_VERSION + return _CHECKPOINT_VERSION + + +def check_checkpoint_args(checkpoint_args): + """Ensure fixed arguments for a model are the same for the input + arguments and the one retrieved from checkpoint.""" + args = get_args() + + def _compare(arg_name, old_arg_name=None, default=None): + if old_arg_name is not None: + ckpt_arg_name = old_arg_name + else: + ckpt_arg_name = arg_name + if default is not None: + checkpoint_value = getattr(checkpoint_args, ckpt_arg_name, default) + else: + checkpoint_value = getattr(checkpoint_args, ckpt_arg_name) + args_value = getattr(args, arg_name) + error_message = '{} value from checkpoint ({}) is not equal to the ' \ + 'input argument value ({}).'.format( + arg_name, checkpoint_value, args_value) + assert checkpoint_value == args_value, error_message + + _compare('num_layers') + _compare('hidden_size') + _compare('num_attention_heads') + _compare('add_position_embedding', default=True) + if args.vocab_file: + _compare('max_position_embeddings') + _compare('make_vocab_size_divisible_by') + _compare('padded_vocab_size') + _compare('tokenizer_type') + if args.data_parallel_random_init: + _compare('data_parallel_random_init') + if get_checkpoint_version() < 3.0: + _compare('tensor_model_parallel_size', + old_arg_name='model_parallel_size') + if get_checkpoint_version() >= 3.0: + _compare('tensor_model_parallel_size') + _compare('pipeline_model_parallel_size') + + +def ensure_directory_exists(filename): + """Build filename's path if it does not already exists.""" + dirname = os.path.dirname(filename) + os.makedirs(dirname, exist_ok = True) + + +def get_checkpoint_name(checkpoints_path, iteration, release=False, + pipeline_parallel=None, + tensor_rank=None, pipeline_rank=None): + """Determine the directory name for this rank's checkpoint.""" + if release: + directory = 'release' + else: + directory = 'iter_{:07d}'.format(iteration) + + # Use both the tensor and pipeline MP rank. + if pipeline_parallel is None: + pipeline_parallel = (mpu.get_pipeline_model_parallel_world_size() > 1) + if tensor_rank is None: + tensor_rank = mpu.get_tensor_model_parallel_rank() + if pipeline_rank is None: + pipeline_rank = mpu.get_pipeline_model_parallel_rank() + + # Use both the tensor and pipeline MP rank. 
If using the distributed + # optimizer, then the optimizer's path must additionally include the + # data parallel rank. + if not pipeline_parallel: + common_path = os.path.join(checkpoints_path, directory, + f'mp_rank_{tensor_rank:02d}') + else: + common_path = os.path.join(checkpoints_path, directory, + f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}') + + return os.path.join(common_path, "model_optim_rng.pt") + + +def get_distributed_optimizer_checkpoint_name(model_checkpoint_name): + return os.path.join(os.path.dirname(model_checkpoint_name), + "distrib_optim.pt") + + +def find_checkpoint_rank_0(checkpoints_path, iteration, release=False): + """Finds the checkpoint for rank 0 without knowing if we are using + pipeline parallelism or not. + + Since the checkpoint naming scheme changes if pipeline parallelism + is present, we need to look for both naming schemes if we don't + know if the checkpoint has pipeline parallelism. + """ + + # Look for checkpoint with no pipelining + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=False, + tensor_rank=0, pipeline_rank=0) + if os.path.isfile(filename): + return filename + + # Look for checkpoint with pipelining + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=True, + tensor_rank=0, pipeline_rank=0) + if os.path.isfile(filename): + return filename + + return None, None + + +def get_checkpoint_tracker_filename(checkpoints_path): + + """Tracker file rescords the latest chckpoint during + training to restart from.""" + return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt') + + +def read_metadata(tracker_filename): + # Read the tracker file and either set the iteration or + # mark it as a release checkpoint. + iteration = 0 + release = False + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + except ValueError: + release = metastring == 'release' + if not release: + print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format( + tracker_filename)) + sys.exit() + assert iteration > 0 or release, 'error parsing metadata file {}'.format( + tracker_filename) + + # Get the max iteration retrieved across the ranks. + if torch.distributed.is_initialized(): + iters_cuda = torch.cuda.LongTensor([iteration]) + torch.distributed.all_reduce(iters_cuda, op=torch.distributed.ReduceOp.MAX) + max_iter = iters_cuda[0].item() + + # We should now have all the same iteration. + # If not, print a warning and chose the maximum + # iteration across all ranks. 
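+        # Ranks can disagree here, e.g. if one of them read a stale tracker
+        # file on a shared filesystem; in that case the maximum iteration is
+        # used and a warning is printed.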
+ if iteration != max_iter: + rank = torch.distributed.get_rank() + print('WARNING: on rank {} found iteration {} in the ' + 'metadata while max iteration across the ranks ' + 'is {}, replacing it with max iteration.'.format( + rank, iteration, max_iter), flush=True) + else: + # When loading a checkpoint outside of training (for example, + # when editing it), we might not have torch distributed + # initialized, in this case, just assume we have the latest + max_iter = iteration + return max_iter, release + + +def get_rng_state(): + """ collect rng state across data parallel ranks """ + args = get_args() + rng_state = { + 'random_rng_state': random.getstate(), + 'np_rng_state': np.random.get_state(), + 'torch_rng_state': torch.get_rng_state(), + 'cuda_rng_state': torch.cuda.get_rng_state(), + 'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states()} + + rng_state_list = None + if torch.distributed.is_initialized() and \ + mpu.get_data_parallel_world_size() > 1 and \ + args.data_parallel_random_init: + rng_state_list = \ + [None for i in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather_object( + rng_state_list, + rng_state, + group=mpu.get_data_parallel_group()) + else: + rng_state_list = [rng_state] + + return rng_state_list + + +def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): + """Save a model checkpoint.""" + args = get_args() + + # Only rank zero of the data parallel writes to the disk. + model = unwrap_model(model) + + print_rank_0('saving checkpoint at iteration {:7d} to {}'.format( + iteration, args.save)) + + # Collect rng state across data parallel ranks. + rng_state = get_rng_state() + + # Checkpoint name. + checkpoint_name = get_checkpoint_name(args.save, iteration) + + # Save distributed optimizer's custom parameter state. + if args.use_distributed_optimizer: + optim_checkpoint_name = \ + get_distributed_optimizer_checkpoint_name(checkpoint_name) + ensure_directory_exists(optim_checkpoint_name) + optimizer.save_parameter_state(optim_checkpoint_name) + + # Collect args, model, RNG. + if not torch.distributed.is_initialized() \ + or mpu.get_data_parallel_rank() == 0: + + # Arguments, iteration, and model. + state_dict = {} + state_dict['args'] = args + state_dict['checkpoint_version'] = 3.0 + state_dict['iteration'] = iteration + if len(model) == 1: + state_dict['model'] = model[0].state_dict_for_save_checkpoint() + else: + for i in range(len(model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + state_dict['model%d' % i] = \ + model[i].state_dict_for_save_checkpoint() + + # Optimizer stuff. + if not args.no_save_optim: + if optimizer is not None: + state_dict['optimizer'] = optimizer.state_dict() + if opt_param_scheduler is not None: + state_dict['opt_param_scheduler'] = \ + opt_param_scheduler.state_dict() + + # RNG states. + if not args.no_save_rng: + state_dict["rng_state"] = rng_state + + # Save. 
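+        # Only the first data-parallel rank (or a non-distributed run) reaches
+        # this point; the path returned by get_checkpoint_name() already
+        # encodes the tensor- and pipeline-parallel ranks, so every
+        # model-parallel shard gets its own file.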
+ ensure_directory_exists(checkpoint_name) + torch.save(state_dict, checkpoint_name) + + # Wait so everyone is done (necessary) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + print_rank_0(' successfully saved checkpoint at iteration {:7d} to {}' \ + .format(iteration, args.save)) + + # And update the latest iteration + if not torch.distributed.is_initialized() \ + or torch.distributed.get_rank() == 0: + tracker_filename = get_checkpoint_tracker_filename(args.save) + with open(tracker_filename, 'w') as f: + f.write(str(iteration)) + + # Wait so everyone is done (not necessary) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + +def _transpose_first_dim(t, num_splits, num_splits_first, model): + input_shape = t.size() + # We use a self_attention module but the values extracted aren't + # specific to self attention so should work for cross attention as well + while hasattr(model, 'module'): + model = model.module + attention_module = model.language_model.encoder.layers[0].self_attention + hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head + num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition + if num_splits_first: + """[num_splits * np * hn, h] + -->(view) [num_splits, np, hn, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_splits, num_attention_heads_per_partition, + hidden_size_per_attention_head) + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(0, 1).contiguous() + else: + """[np * hn * num_splits, h] + -->(view) [np, hn, num_splits, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_attention_heads_per_partition, + hidden_size_per_attention_head, num_splits) +\ + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(1, 2).contiguous() + t = t.view(*input_shape) + + return t + + +def fix_query_key_value_ordering(model, checkpoint_version): + """Fix up query/key/value matrix ordering if checkpoint + version is smaller than 2.0 + """ + if checkpoint_version < 2.0: + if isinstance(model, list): + assert len(model)==1 + model = model[0] + for name, param in model.named_parameters(): + if name.endswith(('.query_key_value.weight', '.query_key_value.bias')): + if checkpoint_version == 0: + fixed_param = _transpose_first_dim(param.data, 3, True, model) + elif checkpoint_version == 1.0: + fixed_param = _transpose_first_dim(param.data, 3, False, model) + else: + print_rank_0(f"Invalid checkpoint version {checkpoint_version}.") + sys.exit() + param.data.copy_(fixed_param) + if name.endswith(('.key_value.weight', '.key_value.bias')): + if checkpoint_version == 0: + fixed_param = _transpose_first_dim(param.data, 2, True, model) + elif checkpoint_version == 1.0: + fixed_param = _transpose_first_dim(param.data, 2, False, model) + else: + print_rank_0(f"Invalid checkpoint version {checkpoint_version}.") + sys.exit() + param.data.copy_(fixed_param) + print_rank_0(" succesfully fixed query-key-values ordering for" + " checkpoint version {}".format(checkpoint_version)) + + +def _load_base_checkpoint(load_dir, rank0=False): + """ Load the base state_dict from the given directory + + If rank0 is true, just loads rank 0 checkpoint, ignoring arguments. + """ + + # Read the tracker file and set the iteration. 
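+    # The tracker file holds either the latest iteration number or the string
+    # 'release' (see get_checkpoint_tracker_filename / read_metadata).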
+ tracker_filename = get_checkpoint_tracker_filename(load_dir) + + # If no tracker file, return nothing + if not os.path.isfile(tracker_filename): + if not rank0: + print_rank_0('WARNING: could not find the metadata file {} '.format( + tracker_filename)) + print_rank_0(' will not load any checkpoints and will start from ' + 'random') + return None, "", False + + # Otherwise, read the tracker file and either set the iteration or + # mark it as a release checkpoint. + iteration, release = read_metadata(tracker_filename) + + # Checkpoint. + if rank0: + checkpoint_name = find_checkpoint_rank_0(load_dir, iteration, release) + else: + checkpoint_name = get_checkpoint_name(load_dir, iteration, release) + if release: + print_rank_0(f' loading release checkpoint from {load_dir}') + else: + print_rank_0(f' loading checkpoint from {load_dir} at iteration {iteration}') + + # Load the checkpoint. + try: + state_dict = torch.load(checkpoint_name, map_location='cpu') + except ModuleNotFoundError: + from megatron.fp16_deprecated import loss_scaler + # For backward compatibility. + if not rank0: + print_rank_0(' > deserializing using the old code structure ...') + sys.modules['fp16.loss_scaler'] = sys.modules[ + 'megatron.fp16_deprecated.loss_scaler'] + sys.modules['megatron.fp16.loss_scaler'] = sys.modules[ + 'megatron.fp16_deprecated.loss_scaler'] + state_dict = torch.load(checkpoint_name, map_location='cpu') + sys.modules.pop('fp16.loss_scaler', None) + sys.modules.pop('megatron.fp16.loss_scaler', None) + except BaseException as e: + print_rank_0('could not load the checkpoint') + print_rank_0(e) + sys.exit() + + return state_dict, checkpoint_name, release + + +def load_args_from_checkpoint(args, load_arg='load'): + """Set required arguments from the checkpoint specified in the + arguments. + + Will overwrite arguments that have a non-None default value, but + will leave any arguments that default to None as set. + + Returns the same args NameSpace with the new values added/updated. + + If no checkpoint is specified in args, or if the checkpoint is + there but invalid, the arguments will not be modified + + """ + load_dir = getattr(args, load_arg) + + if load_dir is None: + print_rank_0('No load directory specified, using provided arguments.') + return args + + state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=True) + + # Args. 
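+    # _load_base_checkpoint returns state_dict=None when no tracker file was
+    # found; older checkpoints may also lack an 'args' entry.  In both cases
+    # fall back to the arguments that were passed in.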
+ if not state_dict: + print_rank_0('Checkpoint not found to provide arguments, using provided arguments.') + return args + + if 'args' not in state_dict: + print_rank_0('Checkpoint provided does not have arguments saved, using provided arguments.') + return args + + checkpoint_args = state_dict['args'] + checkpoint_version = state_dict.get('checkpoint_version', 0) + args.iteration = state_dict['iteration'] + + # One-off conversion for foundation models + if hasattr(checkpoint_args, 'disable_bias_linear'): + setattr(checkpoint_args, 'add_bias_linear', not getattr(checkpoint_args, 'disable_bias_linear')) + + def _set_arg(arg_name, old_arg_name=None, force=False): + if not force and getattr(args, arg_name, None) is not None: + return + + if old_arg_name is not None: + checkpoint_value = getattr(checkpoint_args, old_arg_name, None) + else: + checkpoint_value = getattr(checkpoint_args, arg_name, None) + + if checkpoint_value is not None: + print_rank_0(f"Setting {arg_name} to {checkpoint_value} from checkpoint") + setattr(args, arg_name, checkpoint_value) + else: + print_rank_0(f"Checkpoint did not provide arguments {arg_name}") + + _set_arg('num_layers') + _set_arg('hidden_size') + _set_arg('ffn_hidden_size') + _set_arg('seq_length') + _set_arg('num_attention_heads') + _set_arg('kv_channels') + _set_arg('max_position_embeddings') + _set_arg('position_embedding_type', force=True) + _set_arg('add_position_embedding', force=True) + _set_arg('use_rotary_position_embeddings', force=True) + _set_arg('rotary_percent', force=True) + _set_arg('add_bias_linear', force=True) + _set_arg('swiglu', force=True) + _set_arg('untie_embeddings_and_output_weights', force=True) + _set_arg('apply_layernorm_1p', force=True) + _set_arg('tokenizer_type') + _set_arg('padded_vocab_size') + if checkpoint_version < 3.0: + _set_arg('tensor_model_parallel_size', + 'model_parallel_size') + else: + _set_arg('tensor_model_parallel_size', force=True) + _set_arg('pipeline_model_parallel_size', force=True) + _set_arg('virtual_pipeline_model_parallel_size', force=True) + _set_arg('num_layers_per_virtual_pipeline_stage') + return args, checkpoint_args + + +def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True): + """Load a model checkpoint and return the iteration. + strict (bool): whether to strictly enforce that the keys in + :attr:`state_dict` of the checkpoint match the names of + parameters and buffers in model. + """ + args = get_args() + load_dir = getattr(args, load_arg) + + model = unwrap_model(model) + + state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=False) + + # Checkpoint not loaded. + if state_dict is None: + + # Conditionally exit at this point. + if args.exit_on_missing_checkpoint: + print_rank_0(">> '--exit-on-missing-checkpoint' set ... exiting. <<") + torch.distributed.barrier() + sys.exit() + + # Iteration defaults to 0. + return 0 + + # Set checkpoint version. + set_checkpoint_version(state_dict.get('checkpoint_version', 0)) + + # Set iteration. + if args.finetune or release: + iteration = 0 + else: + try: + iteration = state_dict['iteration'] + except KeyError: + try: # Backward compatible with older checkpoints + iteration = state_dict['total_iters'] + except KeyError: + print_rank_0('A metadata file exists but unable to load ' + 'iteration from checkpoint {}, exiting'.format( + checkpoint_name)) + sys.exit() + + # Check arguments. 
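+    # The consumed-sample counters must still be at their initial value here;
+    # they are restored from the checkpoint arguments below (unless
+    # finetuning) so that data loading and the micro-batch calculator resume
+    # from the right position.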
+ assert args.consumed_train_samples == 0 + assert args.consumed_valid_samples == 0 + if 'args' in state_dict and not args.finetune: + checkpoint_args = state_dict['args'] + check_checkpoint_args(checkpoint_args) + args.consumed_train_samples = getattr(checkpoint_args, + 'consumed_train_samples', 0) + update_num_microbatches(consumed_samples=args.consumed_train_samples) + args.consumed_valid_samples = getattr(checkpoint_args, + 'consumed_valid_samples', 0) + else: + print_rank_0('could not find arguments in the checkpoint ...') + + # Model. + if len(model) == 1: + model[0].load_state_dict(state_dict['model'], strict=strict) + else: + for i in range(len(model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + model[i].load_state_dict(state_dict['model%d' % i], strict=strict) + + # Fix up query/key/value matrix ordering if needed. + checkpoint_version = get_checkpoint_version() + print_rank_0(f' checkpoint version {checkpoint_version}') + fix_query_key_value_ordering(model, checkpoint_version) + + # Optimizer. + if not release and not args.finetune and not args.no_load_optim: + try: + # Load state dict. + if optimizer is not None: + optimizer.load_state_dict(state_dict['optimizer']) + + # Load distributed optimizer's custom parameter state. + if args.use_distributed_optimizer: + tracker_filename = get_checkpoint_tracker_filename(load_dir) + iteration, release = read_metadata(tracker_filename) + model_checkpoint_name = \ + get_checkpoint_name(load_dir, iteration, release) + optim_checkpoint_name = \ + get_distributed_optimizer_checkpoint_name( + model_checkpoint_name) + optimizer.load_parameter_state(optim_checkpoint_name) + + # Load scheduler. + if opt_param_scheduler is not None: + if 'lr_scheduler' in state_dict: # backward compatbility + opt_param_scheduler.load_state_dict(state_dict['lr_scheduler']) + else: + opt_param_scheduler.load_state_dict(state_dict['opt_param_scheduler']) + except KeyError: + print_rank_0('Unable to load optimizer from checkpoint {}. ' + 'Specify --no-load-optim or --finetune to prevent ' + 'attempting to load the optimizer state, ' + 'exiting ...'.format(checkpoint_name)) + sys.exit() + else: + if (args.fp16 or args.bf16) and optimizer is not None: + optimizer.reload_model_params() + + # rng states. + if not release and not args.finetune and not args.no_load_rng: + try: + if 'rng_state' in state_dict: + # access rng_state for data parallel rank + if args.data_parallel_random_init: + + rng_state = state_dict['rng_state'][mpu.get_data_parallel_rank()] + else: + rng_state = state_dict['rng_state'][0] + random.setstate(rng_state['random_rng_state']) + np.random.set_state(rng_state['np_rng_state']) + torch.set_rng_state(rng_state['torch_rng_state']) + torch.cuda.set_rng_state(rng_state['cuda_rng_state']) + # Check for empty states array + if not rng_state['rng_tracker_states']: + raise KeyError + tensor_parallel.get_cuda_rng_tracker().set_states( + rng_state['rng_tracker_states']) + else: # backward compatability + random.setstate(state_dict['random_rng_state']) + np.random.set_state(state_dict['np_rng_state']) + torch.set_rng_state(state_dict['torch_rng_state']) + torch.cuda.set_rng_state(state_dict['cuda_rng_state']) + # Check for empty states array + if not state_dict['rng_tracker_states']: + raise KeyError + tensor_parallel.get_cuda_rng_tracker().set_states( + state_dict['rng_tracker_states']) + except KeyError: + print_rank_0('Unable to load rng state from checkpoint {}. 
' + 'Specify --no-load-rng or --finetune to prevent ' + 'attempting to load the rng state, ' + 'exiting ...'.format(checkpoint_name)) + sys.exit() + + # Some utilities want to load a checkpoint without distributed being initialized + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + print_rank_0(f' successfully loaded checkpoint from {args.load} ' + f'at iteration {iteration}') + + return iteration + + +def load_biencoder_checkpoint(model, only_query_model=False, + only_context_model=False, custom_load_path=None): + """ + selectively load retrieval models for indexing/retrieving + from saved checkpoints + """ + + args = get_args() + + model = unwrap_model(model) + + load_path = custom_load_path if custom_load_path is not None else args.load + + tracker_filename = get_checkpoint_tracker_filename(load_path) + with open(tracker_filename, 'r') as f: + iteration = int(f.read().strip()) + + checkpoint_name = get_checkpoint_name(load_path, iteration, + args.use_distributed_optimizer, + release=False) + + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + state_dict = torch.load(checkpoint_name, map_location='cpu') + ret_state_dict = state_dict['model'] + + if only_query_model: + ret_state_dict.pop('context_model') + if only_context_model: + ret_state_dict.pop('query_model') + + assert len(model) == 1 + model[0].load_state_dict(ret_state_dict) + torch.distributed.barrier() + + if mpu.get_data_parallel_rank() == 0: + print(' successfully loaded {}'.format(checkpoint_name)) + + return model diff --git a/training/DeepSpeed-Domino/megatron/core/README.md b/training/DeepSpeed-Domino/megatron/core/README.md new file mode 100644 index 000000000..0c8c61738 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/README.md @@ -0,0 +1 @@ +Megatron Core is a library for efficient and scalable training of transformer based models. diff --git a/training/DeepSpeed-Domino/megatron/core/__init__.py b/training/DeepSpeed-Domino/megatron/core/__init__.py new file mode 100644 index 000000000..25a663c0c --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/__init__.py @@ -0,0 +1,11 @@ +import megatron.core.parallel_state +import megatron.core.tensor_parallel +import megatron.core.utils + +from .inference_params import InferenceParams +from .model_parallel_config import ModelParallelConfig + +# Alias parallel_state as mpu, its legacy name +mpu = parallel_state + +__all__ = ["parallel_state", "tensor_parallel", "utils", "InferenceParams", "ModelParallelConfig"] diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/__init__.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/__init__.py new file mode 100644 index 000000000..70bc6869b --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +from .core import check_is_distributed_checkpoint +from .mapping import LocalNonpersitentObject, ShardedTensor +from .serialization import load, load_common_state_dict, save diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/core.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/core.py new file mode 100644 index 000000000..f20a0c3a2 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/core.py @@ -0,0 +1,41 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
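+
+# The CheckpointingConfig defined below is serialized to `metadata.json`
+# inside a checkpoint directory; the presence of that file is what marks a
+# directory as a distributed checkpoint.  A rough usage sketch (the directory
+# path is illustrative and must already exist):
+#
+#     from megatron.core.dist_checkpointing.core import (
+#         CheckpointingConfig, check_is_distributed_checkpoint, save_config)
+#     save_config(CheckpointingConfig(sharded_backend='zarr'), '/tmp/ckpt')
+#     assert check_is_distributed_checkpoint('/tmp/ckpt')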
+ +import json +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Optional + +CONFIG_FNAME = 'metadata.json' + + +class CheckpointingException(Exception): + pass + + +@dataclass +class CheckpointingConfig: + """ Documents backends used in the checkpoint. """ + + sharded_backend: str + sharded_backend_version: int = 1 + common_backend: str = 'torch' + common_backend_version: int = 1 + + +def check_is_distributed_checkpoint(checkpoint_dir): + return maybe_load_config(checkpoint_dir) is not None + + +def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConfig]: + config_path = Path(checkpoint_dir, CONFIG_FNAME) + if not config_path.exists(): + return None + with config_path.open() as f: + config_dict = json.load(f) + return CheckpointingConfig(**config_dict) + + +def save_config(config: CheckpointingConfig, checkpoint_dir: str): + config_path = Path(checkpoint_dir, CONFIG_FNAME) + with config_path.open('w') as f: + json.dump(asdict(config), f) diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/dict_utils.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/dict_utils.py new file mode 100644 index 000000000..c6baf4f11 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/dict_utils.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Utilities for operating with dicts and lists. """ + +from collections import defaultdict +from typing import Any, Callable, Iterable, Optional, Tuple, Union + +import torch + + +def extract_matching_values( + x: Union[dict, list], predicate: Callable +) -> Tuple[Union[dict, list], Union[dict, list]]: + """ Return matching and nonmatching values. Keeps hierarchy. """ + if isinstance(x, dict): + matching_vals = {} + nonmatching_vals = {} + for k, v in x.items(): + if isinstance(v, (list, dict)): + match, nonmatch = extract_matching_values(v, predicate) + if match: + matching_vals[k] = match + if nonmatch or not v: + nonmatching_vals[k] = nonmatch + elif predicate(v): + matching_vals[k] = v + else: + nonmatching_vals[k] = v + else: + assert isinstance(x, list) + matching_vals = [] + nonmatching_vals = [] + for v in x: + if isinstance(v, (list, dict)) and v: + match, nonmatch = extract_matching_values(v, predicate) + if match: + matching_vals.append(match) + if nonmatch or not v: + nonmatching_vals.append(nonmatch) + elif predicate(v): + matching_vals.append(v) + else: + nonmatching_vals.append(v) + return matching_vals, nonmatching_vals + + +def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]: + mismatch = [] + if isinstance(x1, dict) and isinstance(x2, dict): + only_left = [prefix + (k,) for k in x1.keys() - x2.keys()] + only_right = [prefix + (k,) for k in x2.keys() - x1.keys()] + for k in x2.keys() & x1.keys(): + _left, _right, _mismatch = diff(x1[k], x2[k], prefix + (k,)) + only_left.extend(_left) + only_right.extend(_right) + mismatch.extend(_mismatch) + elif isinstance(x1, list) and isinstance(x2, list): + only_left = list(range(len(x1) - 1, len(x2) - 1, -1)) + only_right = list(range(len(x1) - 1, len(x2) - 1, -1)) + for i, (v1, v2) in enumerate(zip(x1, x2)): + _left, _right, _mismatch = diff(v1, v2, prefix + (i,)) + only_left.extend(_left) + only_right.extend(_right) + mismatch.extend(_mismatch) + else: + only_left = [] + only_right = [] + if isinstance(x1, torch.Tensor) and isinstance(x2, torch.Tensor): + _is_mismatch = not torch.all(x1 == x2) + else: + try: + _is_mismatch = bool(x1 != 
x2) + except RuntimeError: + _is_mismatch = True + + if _is_mismatch: + mismatch.append((prefix, type(x1), type(x2))) + + return only_left, only_right, mismatch + + +def inspect_keys_types(d: dict, prefix: Tuple = (), indent: int = 4): + print_indent = lambda: print(' ' * indent * len(prefix), end='') + for k, v in d.items(): + if isinstance(v, dict): + print_indent() + print(f'> {k}:') + inspect_keys_types(v, prefix + (k,), indent) + else: + print_indent() + if isinstance(v, torch.Tensor): + print(f'> {k}: {type(v)} of shape {v.shape}') + else: + print(f'> {k}: {type(v)}') + + +def inspect_types(x: Any, prefix: Tuple = (), indent: int = 4): + print_indent = lambda: print(' ' * indent * len(prefix), end='') + if isinstance(x, dict): + print() + for k, v in x.items(): + print_indent() + print(f'> {k}: ', end='') + inspect_types(v, prefix + (k,), indent) + elif isinstance(x, list): + print() + for i, v in enumerate(x): + print_indent() + print(f'- {i}: ', end='') + inspect_types(v, prefix + (i,), indent) + else: + if isinstance(x, torch.Tensor): + print(f'Tensor of shape {x.shape}') + else: + try: + x_str = str(x) + except: + x_str = '' + if len(x_str) > 30: + x_str = x_str[:30] + '... (truncated)' + print(f'[{type(x)}]: {x_str}') + + +def nested_values(x: Union[dict, list]): + x_iter = x.values() if isinstance(x, dict) else x + for v in x_iter: + if isinstance(v, (dict, list)): + yield from nested_values(v) + else: + yield v + + +def nested_items_iter(x: Union[dict, list]): + x_iter = x.items() if isinstance(x, dict) else enumerate(x) + for k, v in x_iter: + if isinstance(v, (dict, list)): + yield from nested_items_iter(v) + else: + yield x, k, v + + +def dict_map(f: Callable, d: dict): + for sub_d, k, v in nested_items_iter(d): + sub_d[k] = f(v) + + +def dict_map_with_key(f: Callable, d: dict): + for sub_d, k, v in nested_items_iter(d): + sub_d[k] = f(k, v) + + +def dict_list_map_inplace(f: Callable, x: Union[dict, list]): + if isinstance(x, dict): + for k, v in x.items(): + x[k] = dict_list_map_inplace(f, v) + elif isinstance(x, list): + x[:] = (dict_list_map_inplace(f, v) for v in x) + else: + return f(x) + return x + + +def dict_list_map_outplace(f: Callable, x: Union[dict, list]): + if isinstance(x, dict): + return {k: dict_list_map_outplace(f, v) for k, v in x.items()} + elif isinstance(x, list): + return [dict_list_map_outplace(f, v) for v in x] + else: + return f(x) + + +def merge(x1: dict, x2: dict): + if isinstance(x1, dict) and isinstance(x2, dict): + for k, v2 in x2.items(): + if k not in x1: + x1[k] = v2 + else: + x1[k] = merge(x1[k], v2) + elif isinstance(x1, list) and isinstance(x2, list): + if len(x1) != len(x2): + raise ValueError('Cannot merge two lists with different lengths') + for i, v2 in enumerate(x2): + x1[i] = merge(x1[i], v2) + else: + raise ValueError(f'Duplicate non-dict and non-list values encountered: `{x1}` and `{x2}`') + return x1 + + +def map_reduce( + xs: Iterable, + key_fn: Callable = lambda x: x, + value_fn: Callable = lambda x: x, + reduce_fn: Callable = lambda x: x, +) -> dict: + res = defaultdict(list) + for x in xs: + res[key_fn(x)].append(value_fn(x)) + for k in res: + res[k] = reduce_fn(res[k]) + return dict(res) diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/mapping.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/mapping.py new file mode 100644 index 000000000..ec71411d6 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/mapping.py @@ -0,0 +1,289 @@ +# Copyright (c) 2022-2023, 
NVIDIA CORPORATION. All rights reserved. + +""" Core library classes. """ + +from dataclasses import dataclass, replace +from itertools import chain +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import numpy as np +import torch + +from .core import CheckpointingException +from .dict_utils import dict_list_map_inplace, dict_list_map_outplace + +# These type definitions are just hints to differentiate a plain model state +# dict (StateDict) from a state dict with tensors replaced with ShardedTensors +# (ShardedStateDict). +StateDict = Dict[str, Any] +ShardedStateDict = Dict[str, Any] +ReplicaId = Union[int, Tuple[int, ...]] + + +@dataclass +class ShardedTensor: + """Represents a mapping between a local tensor and a global tensor. + + Global tensor is assumed to consist of many local tensors distributed + between different processes. + + Attributes: + key: unique identifier of a global tensor + data: local tensor data. Can be None only for consistency validation + dtype: tensor dtype + local_shape: local tensor shape + global_shape: global tensor shape + global_offset: offset of a local tensor in a global tensor, specified + in number of tensor elements + axis_fragmentations: global tensor fragmentation of each axis + replica_id: indicates given local tensor's replication wrt. local + tensors in different processes + prepend_axis_num: number of axes prepended to the local tensor + to reflect global tensor shape. + The behavior is similar to unsqueezing the local tensor. + allow_shape_mismatch: if True, during loading, the global shape of a + stored tensor does not have to match the expected global shape. + Useful for representing tensors with flexible shape, e.g. padded. + flattened_range: specifies a slice that should be applied to a flattened + tensor with `local_shape` in order to get the tensor stored as `data` + """ + + key: str + data: Optional[torch.Tensor] + dtype: torch.dtype + local_shape: Tuple[int, ...] + global_shape: Tuple[int, ...] + global_offset: Tuple[int, ...] + axis_fragmentations: Optional[Tuple[int, ...]] + replica_id: ReplicaId = 0 + prepend_axis_num: int = 0 + allow_shape_mismatch: bool = False + flattened_range: Optional[slice] = None + + def global_slice(self) -> Tuple[Union[int, slice], ...]: + assert len(self.global_offset) == len(self.local_shape) + self.prepend_axis_num + return tuple( + chain( + (off for off in self.global_offset[: self.prepend_axis_num]), + ( + slice(off, off + sh) + for off, sh in zip( + self.global_offset[self.prepend_axis_num :], self.local_shape + ) + ), + ) + ) + + def global_coordinates(self) -> Tuple[np.ndarray, ...]: + if self.flattened_range is None: + raise CheckpointingException( + f'`global_coordinates` is undefined for' + f' {self.__class__.__name__} without `flattened_range`' + ) + + local_coords = self.local_coordinates() + assert len(local_coords) + self.prepend_axis_num == len(self.global_offset), ( + len(local_coords), + self, + ) + global_coords = tuple( + c + off + for c, off in zip((0,) * self.prepend_axis_num + local_coords, self.global_offset) + ) + return global_coords + + def local_coordinates(self) -> Tuple[np.ndarray, ...]: + if self.flattened_range is None: + raise CheckpointingException( + f'`local_coordinates` is undefined for' + f' {self.__class__.__name__} without `flattened_range`' + ) + + # TODO: np.unravel_index? 
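+        # Mark the positions covered by `flattened_range` on a 1-D boolean
+        # mask, then reshape to `local_shape` and let np.nonzero recover the
+        # N-D coordinates of those positions.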
+ mask = np.zeros(np.product(self.local_shape), dtype=bool) + mask[self.flattened_range] = True + return np.nonzero(mask.reshape(self.local_shape)) + + def max_allowed_chunks(self) -> Tuple[int, ...]: + chunks = [] + for axis_sh, axis_fragm in zip(self.global_shape, self.axis_fragmentations): + if not self.allow_shape_mismatch and axis_sh % axis_fragm != 0: + raise CheckpointingException( + f'Axis shape ({axis_sh}) not divisible' f' by axis fragmentation ({axis_fragm}' + ) + axis_chunk_size = axis_sh // axis_fragm + chunks.append(axis_chunk_size) + return tuple(chunks) + + def without_data(self): + return replace(self, data=None) + + @classmethod + def from_rank_offsets( + cls, + key: str, + data: torch.Tensor, + *rank_offsets: Tuple[int, int, int], + replica_id: ReplicaId = 0, + prepend_axis_num: int = 0, + allow_shape_mismatch: bool = False, + ): + """Allows to construct the ShardedTensor given offset specified in process ranks. + Arguments: + key: unique key + data: local tensor data + rank_offsets: each tuple (axis, axis_rank_offset, axis_fragm) + says that if global tensor is divided into `axis_fragm` + fragment along `axis` axis, then local tensor data + corresponds to the `axis_rank_offset` chunk. + replica_id: see ShardedTensor + prepend_axis_num: see ShardedTensor + allow_shape_mismatch: see ShardedTensor + """ + global_offset = [0] * (data.ndim + prepend_axis_num) + global_shape = ([1] * prepend_axis_num) + list(data.shape) + axis_fragmentations = [1] * (data.ndim + prepend_axis_num) + _seen_axis = set() + for axis, axis_rank_offset, axis_fragm in rank_offsets: + assert axis >= 0 and axis_rank_offset >= 0 and axis_fragm >= 0, ( + axis, + axis_rank_offset, + axis_fragm, + ) + assert ( + axis_rank_offset < axis_fragm + ), 'Rank offset must be lower than axis fragmentation' + if axis in _seen_axis: + raise CheckpointingException('Duplicated axis specified') + _seen_axis.add(axis) + + local_axis_shape = 1 if axis < prepend_axis_num else data.shape[axis - prepend_axis_num] + global_shape[axis] = axis_fragm * local_axis_shape + global_offset[axis] = axis_rank_offset * local_axis_shape + axis_fragmentations[axis] = axis_fragm + + return cls( + key, + data, + data.dtype, + tuple(data.shape), + tuple(global_shape), + tuple(global_offset), + tuple(axis_fragmentations), + replica_id, + prepend_axis_num, + allow_shape_mismatch, + ) + + def __str__(self): + return f'{self.__class__.__name__}(key=\'{self.key}\')' + + +def is_main_replica(replica_id): + if isinstance(replica_id, int): + return replica_id == 0 + return all(r == 0 for r in replica_id) + + +class LocalNonpersitentObject: + """Object that should not be stored in a checkpoint, but restored locally. + + Wrapping any object inside the state dict with LocalNonpersitentObject + will result in: + - during saving, this object will *not* be stored in the checkpoint + - during loading, a local version of this object will be placed in a state dict + """ + + def __init__(self, obj): + self.obj = obj + + def unwrap(self): + return self.obj + + +@dataclass +class ShardedObject: + """Represents a mapping between a local object and a global object. + + Global object is assumed to consist of many local objects distributed + between different processes. + + NOTE: Contrary to ShardedTensor, it's impossible to change global object + sharding. Conceptually, ShardedObject is a fully-sharded ShardedTensor + with atomic arbitrary typed elements. + + Attributes: + key: unique identifier of a global tensor + data: local object data. 
Can be None only for consistency validation + global_shape: global object shape + global_offset: offset of a local object in a global object, specified + in number of shards + replica_id: indicates local object replication wrt. local + objects in different processes + """ + + key: str + data: object + global_shape: Tuple[int, ...] + global_offset: Tuple[int, ...] + replica_id: ReplicaId = 0 + + def without_data(self): + return replace(self, data=None) + + @property + def unique_key(self): + return f'{self.key}/shard_{".".join(map(str, self.global_offset))}_{".".join(map(str, self.global_shape))}' + + def __str__(self): + return f'{self.__class__.__name__}(key=\'{self.key}\')' + + +@dataclass +class ShardedTensorFactory: + """ Allows to apply transformations to tensors before/after serialization. + + The essence of those transformations is that they can be applied to + optimizer states the same way they are applied to the model params. + + Builder creates a sub-state-dict out of a tensor before saving, and merger + merges the corresponding state dict after loading. + """ + + key: str + data: torch.Tensor + build_fn: Callable[[str, torch.Tensor], ShardedStateDict] + merge_fn: Callable[[StateDict], torch.Tensor] + + def build(self): + return self.build_fn(self.key, self.data) + + +def apply_factories(sharded_state_dict: ShardedStateDict): + def apply(x): + if isinstance(x, ShardedTensorFactory): + x = x.build() + return x + + dict_list_map_inplace(apply, sharded_state_dict) + + +def apply_factory_merges(x1: StateDict, x2: ShardedStateDict): + if isinstance(x2, ShardedTensorFactory): + return x2.merge_fn(x1) + + # There rest is almost the same as the `merge` function from `dict_utils` + if isinstance(x1, dict) and isinstance(x2, dict): + for k, v2 in x2.items(): + if k not in x1: + raise ValueError('Different dict keys encountered in `apply_factory_merges`') + else: + x1[k] = apply_factory_merges(x1[k], v2) + elif isinstance(x1, list) and isinstance(x2, list): + if len(x1) != len(x2): + raise ValueError('Cannot merge two lists with different lengths') + for i, v2 in enumerate(x2): + x1[i] = apply_factory_merges(x1[i], v2) + else: + raise ValueError(f'Duplicate non-dict and non-list values encountered: `{x1}` and `{x2}`') + return x1 diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/optimizer.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/optimizer.py new file mode 100644 index 000000000..4ed35fa6d --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/optimizer.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Optimizer related helpers. 
""" + +import logging +from copy import deepcopy +from dataclasses import replace +from itertools import chain +from typing import Dict, Iterable, List, Union + +logger = logging.getLogger(__name__) + +import torch + +from .dict_utils import nested_values +from .mapping import ( + LocalNonpersitentObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + StateDict, +) +from .utils import extract_sharded_tensors, extract_sharded_tensors_and_factories + + +def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Parameter]) -> Dict[int, int]: + param_mappings = {} + for i, param in enumerate(optim_params_iter): + if id(param) not in param_mappings: + param_mappings[id(param)] = i + return param_mappings + + +def get_param_id_to_sharded_param_map( + model_sharded_state_dict: ShardedStateDict, optim_params_iter: Iterable[torch.nn.Parameter] +) -> Dict[int, Union[ShardedTensor, ShardedTensorFactory]]: + model_sharded_state_dict, _ = extract_sharded_tensors_and_factories(model_sharded_state_dict) + id_to_sharded_param_map = {} + param_to_id_map = get_optim_param_to_id_map(optim_params_iter) + for ten in nested_values(model_sharded_state_dict): + if id(ten.data) in param_to_id_map: + id_to_sharded_param_map[param_to_id_map[id(ten.data)]] = ten + else: + logger.debug(f'{ten} is not tracked by the optimizer') + + if not id_to_sharded_param_map: + logger.warning( + "Sharded parameters mapping is empty. It means tensors in model state dict" + " do not correspond to tensors in optimizer parameters map." + " Make sure to call state_dict with `keep_vars=True`." + ) + return id_to_sharded_param_map + + +def make_sharded_optimizer_tensor( + model_param: Union[ShardedTensor, ShardedTensorFactory], optim_param: torch.Tensor, prefix: str +) -> Union[ShardedTensor, ShardedTensorFactory]: + if isinstance(model_param, ShardedTensorFactory): + return replace(model_param, key=f'{prefix}.{model_param.key}', data=optim_param) + + assert ( + tuple(optim_param.shape) == model_param.local_shape + ), f'Optimizer shape ({tuple(optim_param.shape)} does not match model shape ({model_param.local_shape})' + return replace( + model_param, key=f'{prefix}.{model_param.key}', data=optim_param, dtype=optim_param.dtype + ) + + +def optim_state_to_sharding_state( + optim_state_dict: StateDict, id_to_sharded_param_map: Dict[int, ShardedTensor] +): + sharded_state = {} + for param_id, param_state in optim_state_dict['state'].items(): + sharded_state[param_id] = {} + for state_key, param in param_state.items(): + if param_id in id_to_sharded_param_map: + sharded_state[param_id][state_key] = make_sharded_optimizer_tensor( + id_to_sharded_param_map[param_id], param, prefix=f'optimizer.state.{state_key}' + ) + else: + raise ValueError(f'Param id {param_id} does not match any model sharded param') + + optim_state_dict['param_groups'] = deepcopy(optim_state_dict['param_groups']) + for group in optim_state_dict['param_groups']: + group['params'] = LocalNonpersitentObject(group['params']) + optim_state_dict['state'] = sharded_state diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/serialization.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/serialization.py new file mode 100644 index 000000000..9827e9636 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/serialization.py @@ -0,0 +1,334 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ +import logging +import os +from collections import Counter, defaultdict +from itertools import chain +from pathlib import Path +from typing import Iterable, List, Tuple, Union + +import numpy as np +import torch + +from .core import CheckpointingConfig, maybe_load_config, save_config +from .dict_utils import ( + dict_list_map_inplace, + diff, + extract_matching_values, + map_reduce, + merge, + nested_values, +) +from .mapping import ( + CheckpointingException, + ShardedObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + StateDict, + apply_factories, + apply_factory_merges, + is_main_replica, +) +from .strategies.base import ( + LoadCommonStrategy, + LoadShardedStrategy, + SaveCommonStrategy, + SaveShardedStrategy, + StrategyAction, + get_default_strategy, +) +from .utils import extract_sharded_tensors, extract_sharded_tensors_or_nonpersistent + +COMMON_STATE_FNAME = 'common.pt' + +logger = logging.getLogger(__name__) + + +def load( + sharded_state_dict: ShardedStateDict, + checkpoint_dir: str, + sharded_strategy: Union[LoadShardedStrategy, None] = None, + common_strategy: Union[LoadCommonStrategy, None] = None, +) -> StateDict: + """Loading entrypoint. + + Arguments: + sharded_state_dict: state dict of the existing model populated with + ShardedTensors. Used as a mapping to determine which parts of + global tensors stored in the checkpoint should be loaded. + checkpoint_dir: directory with the checkpoint + sharded_strategy: configures loading behavior for sharded tensors + common_strategy: configures loading behavior for common data + """ + if common_strategy is not None: + raise NotImplementedError('The only supported common strategy is torch') + + checkpoint_dir = Path(checkpoint_dir) + common_state_dict = load_common_state_dict(checkpoint_dir) + if not sharded_state_dict: + return common_state_dict + + sharded_objects, sharded_state_dict = load_sharded_objects(sharded_state_dict, checkpoint_dir) + merge(common_state_dict, sharded_objects) + + saved_config = maybe_load_config(checkpoint_dir) + if saved_config is None: + raise CheckpointingException(f'{checkpoint_dir} is not a distributed checkpoint') + + sh_ten_factories, _ = extract_matching_values( + sharded_state_dict, lambda x: isinstance(x, ShardedTensorFactory) + ) + apply_factories(sharded_state_dict) + sharded_state_dict, _ = extract_sharded_tensors_or_nonpersistent(sharded_state_dict) + sharded_state_dict, nonpersistent_state_dict = extract_sharded_tensors(sharded_state_dict) + dict_list_map_inplace(lambda o: o.unwrap(), nonpersistent_state_dict) + merge(common_state_dict, nonpersistent_state_dict) + + validate_sharding_integrity(nested_values(sharded_state_dict)) + + if sharded_strategy is None: + sharded_strategy = get_default_strategy( + StrategyAction.LOAD_SHARDED, + saved_config.sharded_backend, + saved_config.sharded_backend_version, + ) + else: + # TODO: implement consistency checks here + pass + loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir) + + loaded_state_dict = apply_factory_merges(loaded_state_dict, sh_ten_factories) + + merge(common_state_dict, loaded_state_dict) + return common_state_dict + + +# TODO: implement it as common torch strategy +def load_common_state_dict(checkpoint_dir: Path): + return torch.load(Path(checkpoint_dir) / COMMON_STATE_FNAME, map_location='cpu') + + +def load_sharded_objects(sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + sharded_objects, sharded_state_dict = extract_matching_values( + sharded_state_dict, lambda v: 
isinstance(v, ShardedObject) + ) + + def load_sharded_object(sh_obj: ShardedObject): + sh_obj.data = None + load_path = (checkpoint_dir / sh_obj.unique_key).with_suffix('.pt') + loaded_obj = torch.load(load_path) + return loaded_obj + + return dict_list_map_inplace(load_sharded_object, sharded_objects), sharded_state_dict + + +def save( + sharded_state_dict: ShardedStateDict, + checkpoint_dir: str, + sharded_strategy: Union[SaveShardedStrategy, None] = None, + common_strategy: Union[SaveCommonStrategy, None] = None, +): + """Saving entrypoint. + + Extracts ShardedTensors from the given state dict. Rank 0 saves the + "regular" part of the checkpoint to common torch file. + The ShardedTensors are saved according to a strategy specified by the + config. + + Arguments: + sharded_state_dict: state dict of the populated with + ShardedTensors. Used as a mapping to determine how local tensors + should be saved as global tensors in the checkpoint. + checkpoint_dir: directory to save the checkpoint to + sharded_strategy: configures sharded tensors saving behavior and backend + common_strategy: configures common data saving behavior and backend + """ + checkpoint_dir = Path(checkpoint_dir) + + if torch.distributed.get_rank() == 0: + if not checkpoint_dir.exists(): + raise CheckpointingException( + f'Checkpoint destination directory does not exist: {checkpoint_dir}' + ) + + if next(checkpoint_dir.iterdir(), None) is not None: + raise CheckpointingException( + f'Checkpoint destination directory ({checkpoint_dir}) is not empty' + ) + + if common_strategy is not None: + raise NotImplementedError('The only supported common strategy is torch') + + if sharded_strategy is None: + sharded_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, 'zarr', 1) + + apply_factories(sharded_state_dict) + sharded_state_dict, state_dict = extract_sharded_tensors_or_nonpersistent(sharded_state_dict) + sharded_state_dict, _ = extract_sharded_tensors(sharded_state_dict) + sharded_tensors = list(nested_values(sharded_state_dict)) + validate_sharding_integrity(sharded_tensors) + + _save_common_dict(state_dict, checkpoint_dir, True) + + sharded_strategy.save(sharded_tensors, checkpoint_dir) + save_config( + CheckpointingConfig(sharded_strategy.backend, sharded_strategy.version), checkpoint_dir + ) + + +# TODO: implement it as common torch strategy +def _save_common_dict( + state_dict: StateDict, checkpoint_dir: Path, validate_consistency: bool = False +): + common_state_dict = _extract_and_save_sharded_objects( + state_dict, checkpoint_dir, validate_consistency + ) + if torch.distributed.get_rank() == 0: + torch.save(common_state_dict, checkpoint_dir / COMMON_STATE_FNAME) + if validate_consistency: + # TODO: implement checking consistency with rank 0 common dict on other ranks + pass + # torch.distributed.barrier() + # if not torch.distributed.get_rank() == 0: + # rank_0_state_dict = torch.load(checkpoint_dir / COMMON_STATE_FNAME) + # print(diff(common_state_dict, rank_0_state_dict)) + + +def _extract_and_save_sharded_objects( + state_dict: StateDict, checkpoint_dir: Path, validate_consistency: bool = False +): + sharded_objects, state_dict = extract_matching_values( + state_dict, lambda v: isinstance(v, ShardedObject) + ) + sharded_objects = list(nested_values(sharded_objects)) + if validate_consistency: + validate_objects_sharding_integrity(sharded_objects) + for sh_obj in sharded_objects: + if is_main_replica(sh_obj.replica_id): + save_path = (checkpoint_dir / sh_obj.unique_key).with_suffix('.pt') + 
os.makedirs(save_path.parent, exist_ok=True) + torch.save(sh_obj.data, save_path) + return state_dict + + +def validate_sharding_integrity(sharded_tensors: Iterable[ShardedTensor]): + sharding = [ten.without_data() for ten in sharded_tensors] + all_sharding = [None] * torch.distributed.get_world_size() + torch.distributed.all_gather_object(all_sharding, sharding) + if torch.distributed.get_rank() != 0: + return + + key_shardings = defaultdict(list) + for rank, rank_shardings in enumerate(all_sharding): + for sharding in rank_shardings: + key_shardings[sharding.key].append((rank, sharding)) + for key, shardings in key_shardings.items(): + _validate_sharding_for_key(shardings) + + +def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): + some_rank_shard = rank_sharding[0][1] + global_shape = some_rank_shard.global_shape + local_shape = some_rank_shard.local_shape + dtype = some_rank_shard.dtype + has_flattened_range = some_rank_shard.flattened_range is not None + for rank, sharding in rank_sharding: + assert sharding.dtype == dtype, (sharding.dtype, dtype, some_rank_shard) + assert sharding.global_shape == global_shape, ( + sharding.global_shape, + global_shape, + some_rank_shard, + ) + assert sharding.local_shape == local_shape, ( + sharding.local_shape, + local_shape, + some_rank_shard, + ) + assert (sharding.flattened_range is not None) == has_flattened_range, ( + (sharding.flattened_range is not None), + has_flattened_range, + some_rank_shard, + ) + + shard_access_cnt = _compute_shards_access(rank_sharding) + if has_flattened_range: + map_reduce( + rank_sharding, + lambda x: x[1].global_offset, + lambda x: x[1], + _validate_sharding_for_key_flattened, + ) + else: + if not torch.all(shard_access_cnt == 1): + logger.error(f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}') + raise CheckpointingException(f'Invalid access pattern for {rank_sharding[0][1]}') + + +def _compute_shards_access(rank_sharding): + def chunk_offset(sharding): + assert len(sharding.global_offset) == len(sharding.local_shape) + sharding.prepend_axis_num + return tuple( + chain( + (off for off in sharding.global_offset[: sharding.prepend_axis_num]), + ( + off // sh + for off, sh in zip( + sharding.global_offset[sharding.prepend_axis_num :], sharding.local_shape + ) + ), + ) + ) + + shard_access_cnt = torch.zeros( + rank_sharding[0][1].axis_fragmentations, dtype=torch.int, device='cpu' + ) + for rank, sharding in rank_sharding: + if is_main_replica(sharding.replica_id): + shard_access_cnt[chunk_offset(sharding)] += 1 + # TODO: consider validating different replicas too + return shard_access_cnt + + +def _validate_sharding_for_key_flattened(tensors_by_shard): + all_slices = [] + local_shape = tensors_by_shard[0].local_shape + for sharding in tensors_by_shard: + assert sharding.local_shape == local_shape + sharding: ShardedTensor + if not is_main_replica(sharding.replica_id): + # TODO: this checks only saving (and loading replica_id=0) consistency + continue + + all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop)) + + starts, stops = map(np.asarray, zip(*sorted(all_slices))) + if ( + starts[0] != 0 + or stops[-1] != np.product(local_shape) + or not np.all(starts[1:] == stops[:-1]) + ): + logger.error( + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. 
Ranges: {(starts, stops)}' + ) + raise CheckpointingException( + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}' + ) + + +def validate_objects_sharding_integrity(sharded_objects: List[ShardedObject]): + """ Ensure uniqueness of saved objects. """ + local_sh_objs = [sh_obj.without_data() for sh_obj in sharded_objects] + all_sh_objs = [None] * torch.distributed.get_world_size() + torch.distributed.all_gather_object(all_sh_objs, local_sh_objs) + if torch.distributed.get_rank() != 0: + return + unique_keys = [ + sh_obj.unique_key + for sh_obj in chain.from_iterable(all_sh_objs) + if is_main_replica(sh_obj.replica_id) + ] + if len(unique_keys) != len(set(unique_keys)): + duplicates = {k: cnt for k, cnt in Counter(unique_keys).items() if cnt > 1} + logger.error(f'Duplicate ShardedObject keys and counts: {duplicates}') + raise CheckpointingException(f'Duplicate ShardedObject keys: {list(duplicates.keys())}') diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/__init__.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/__init__.py new file mode 100644 index 000000000..7177d973c --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Various loading and saving strategies """ + +import logging + +logger = logging.getLogger(__name__) + +try: + import tensorstore + import zarr + + from .tensorstore import _import_trigger + from .zarr import _import_trigger +except ImportError: + logger.warning('Zarr-based strategies will not be registered because of missing packages') diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/base.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/base.py new file mode 100644 index 000000000..92ded320f --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/base.py @@ -0,0 +1,68 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
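+
+# Concrete strategies register themselves in `default_strategies`, keyed by a
+# StrategyAction value and a (backend, version) pair; `get_default_strategy`
+# is the lookup used by serialization.save/load.  A custom sharded-save
+# backend would, roughly, subclass SaveShardedStrategy and register itself the
+# same way the built-in zarr strategies do (names here are hypothetical):
+#
+#     class MySaveStrategy(SaveShardedStrategy):
+#         def save(self, sharded_tensors, checkpoint_dir):
+#             ...  # write each ShardedTensor under checkpoint_dir
+#
+#     default_strategies[StrategyAction.SAVE_SHARDED.value][('mybackend', 1)] = \
+#         MySaveStrategy('mybackend', 1)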
+ +from abc import ABC, abstractmethod +from collections import defaultdict +from enum import Enum +from pathlib import Path +from typing import Dict, List, Optional + +from ..mapping import CheckpointingException, ShardedStateDict, ShardedTensor, StateDict + + +class StrategyAction(Enum): + LOAD_COMMON = 'load_common' + LOAD_SHARDED = 'load_sharded' + SAVE_COMMON = 'save_common' + SAVE_SHARDED = 'save_sharded' + + +default_strategies = defaultdict(dict) + + +def get_default_strategy(action: StrategyAction, backend: str, version: int): + try: + return default_strategies[action.value][(backend, version)] + except KeyError as e: + raise CheckpointingException( + f'Cannot find default strategy for: {(action, backend, version)}' + ) from e + + +class LoadStrategyBase(ABC): + @abstractmethod + def check_backend_compatibility(self, loaded_version): + raise NotImplementedError + + @abstractmethod + def check_version_compatibility(self, loaded_version): + raise NotImplementedError + + +class SaveStrategyBase(ABC): + def __init__(self, backend: str, version: int): + self.backend = backend + self.version = version + + +class LoadCommonStrategy(LoadStrategyBase): + @abstractmethod + def load(self, checkpoint_dir: Path): + raise NotImplementedError + + +class LoadShardedStrategy(LoadStrategyBase): + @abstractmethod + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + raise NotImplementedError + + +class SaveCommonStrategy(SaveStrategyBase): + @abstractmethod + def save(self, common_state_dict: StateDict, checkpoint_dir: Path): + raise NotImplementedError + + +class SaveShardedStrategy(SaveStrategyBase): + @abstractmethod + def save(self, sharded_tensors: List[ShardedTensor], checkpoint_dir: Path): + raise NotImplementedError diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/tensorstore.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/tensorstore.py new file mode 100644 index 000000000..34355a0f4 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/tensorstore.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies using TensorStore to load and save Zarr arrays. 
""" + +from functools import partial +from itertools import starmap +from pathlib import Path + +import tensorstore as ts +import torch + +from ..core import CheckpointingException +from ..dict_utils import dict_list_map_inplace +from ..mapping import ShardedStateDict, ShardedTensor +from .base import LoadShardedStrategy, StrategyAction, default_strategies +from .zarr import postprocess_numpy_array + +_import_trigger = None + + +class TensorStoreLoadShardedStrategy(LoadShardedStrategy): + def __init__(self, load_directly_on_device: bool = False): + super().__init__() + self.load_directly_on_device = load_directly_on_device + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + if torch.distributed.get_rank() == 0: + print(f'Loading distributed checkpoint with {self.__class__.__name__}') + if self.load_directly_on_device: + print(f'Loading distributed checkpoint directly on the GPU') + load_fn = partial( + _load_from_array, + checkpoint_dir=checkpoint_dir, + load_directly_on_device=self.load_directly_on_device, + ) + dict_list_map_inplace(load_fn, sharded_state_dict) + return sharded_state_dict + + def check_backend_compatibility(self, loaded_version): + pass # TODO + + def check_version_compatibility(self, loaded_version): + pass # TODO + + +def merge_global_slice_with_shape(global_slice, actual_shape, key): + def _merge_slice(dim_slice, dim_size): + if isinstance(dim_slice, slice): + assert ( + dim_slice.start < dim_size + ), f'Got empty slice for ShardedTensor {key} ({dim_slice}, {dim_size})' + if dim_slice.stop > dim_size: + dim_slice = slice(dim_slice.start, dim_size, dim_slice.step) + return dim_slice + + assert len(global_slice) == len(actual_shape), (global_slice, actual_shape, key) + return tuple(starmap(_merge_slice, zip(global_slice, actual_shape))) + + +def _load_from_array( + sharded_tensor: ShardedTensor, + checkpoint_dir: Path, + load_directly_on_device: bool = False, + apply_flattened_range: bool = True, +): + x = _load_regular_chunk(sharded_tensor, checkpoint_dir) + ten = postprocess_numpy_array(x, sharded_tensor, apply_flattened_range) + if load_directly_on_device: + sharded_tensor.data.data.copy_(ten) + return sharded_tensor.data + else: + return ten + + +def _load_regular_chunk(sharded_tensor: ShardedTensor, checkpoint_dir: Path): + assert isinstance(sharded_tensor, ShardedTensor), type(sharded_tensor) + spec = {'driver': 'zarr', 'metadata_key': '.zarray', 'kvstore': {}} + spec['kvstore'] = { + 'driver': 'file', + 'path': str(checkpoint_dir / sharded_tensor.key), + } + try: + arr = ts.open(ts.Spec(spec), open=True).result() + except Exception as e: + raise CheckpointingException( + f'Array {checkpoint_dir / sharded_tensor.key} could not be loaded. 
Error: {e}' + ) from e + + if sharded_tensor.global_shape == arr.shape: + x = ( + arr[sharded_tensor.global_slice()].read().result() + ) # flattened tensors loading is delayed + elif sharded_tensor.allow_shape_mismatch: + global_slice = merge_global_slice_with_shape( + sharded_tensor.global_slice(), arr.shape, sharded_tensor.key + ) + x = arr[global_slice].read().result() # flattened tensors loading is delayed + else: + _msg = ( + f'Global shape mismatch for loaded ({arr.shape})' + f' and expected ({sharded_tensor.global_shape}) tensor' + f' for key {sharded_tensor.key}' + ) + raise CheckpointingException(_msg) + return x + + +default_strategies[StrategyAction.LOAD_SHARDED.value][ + ('zarr', 1) +] = TensorStoreLoadShardedStrategy() diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/two_stage.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/two_stage.py new file mode 100644 index 000000000..f35fb0a69 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/two_stage.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" 2-stage checkpoint loading. """ +import os +import time +from collections import defaultdict +from dataclasses import dataclass +from functools import partial, wraps +from itertools import chain +from logging import DEBUG, INFO, StreamHandler, getLogger +from operator import attrgetter, itemgetter +from pathlib import Path +from typing import Iterable, List, NamedTuple, Optional, Tuple, Union + +import torch + +from ..dict_utils import dict_list_map_inplace, map_reduce, nested_values +from ..mapping import ShardedStateDict, ShardedTensor, StateDict +from .base import LoadShardedStrategy +from .tensorstore import _load_from_array +from .zarr import flatten_range + +_import_trigger = None + + +timers = defaultdict(list) + +logger = getLogger(__name__) + + +def timed(verbose=True): + def timed_dec(fn): + name = fn.__name__ + + @wraps(fn) + def wrapped(*args, **kwargs): + if verbose: + logger.debug(f'{name} init') + start = time.time() + ret = fn(*args, **kwargs) + took = time.time() - start + if verbose: + logger.debug(f'{name} took {took}s') + timers[name].append(took) + return ret + + return wrapped + + return timed_dec + + +@dataclass +class _ShardedTensorMetadata: + global_rank: int + sharded_tensor_no_data: ShardedTensor + dist_group_rank: Tuple[int] # id of distributed group + dist_group_ranks: Tuple[int] # id of distributed group + data_size: Optional[int] = None # bytes + + +def sharded_tensor_chunk_id(sharded_tensor: ShardedTensor): + return ( + sharded_tensor.key, + sharded_tensor.global_offset, + ) + + +class TwoStageDataParallelLoadShardedStrategy(LoadShardedStrategy): + """ Loads one checkpoint replica from storage and broadcasts to other nodes. + + This strategy loads checkpoint from storage on minimal set of nodes + and distributes the checkpoint to other nodes with torch.distributed. + Loading is performed with tensorstore. + + Steps: + 0. (optional) create Gloo distributed groups + 1. Exchange ShardedTensors metadata between all nodes + 2. Align needed tensors within DP groups + 3. For each globally unique tensor: + a) on one of the ranks load it from storage to CPU and move to CUDA + b) allocate CUDA tensor on other ranks + c) broadcast within DP group + d) copy tensor content to the model param location + e) free tensor buffers from a) and b) + + Notes: + 1. 
Loading and broadcasting is done sequentially to avoid both host and device OOMs + 2. There is a lot of overlap potential between all three steps done for each tensor: + a) loading from storage to numpy + b) moving CPU tensors to CUDA + c) broadcast + + """ + + def __init__(self, data_parallel_group, cpu_transfer=True): + super().__init__() + + self.cpu_transfer = cpu_transfer + self.data_parallel_group_orig = data_parallel_group + self.data_parallel_group = None if cpu_transfer else data_parallel_group + self.dp_group_ranks = tuple( + sorted(torch.distributed.get_process_group_ranks(data_parallel_group)) + ) + self.dp_group_rank = torch.distributed.get_rank(self.data_parallel_group_orig) + self.global_rank = torch.distributed.get_rank() + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + self.maybe_init_gloo_group() + all_tensors_sorted = self._build_load_plan(sharded_state_dict) + self._exchange_loaded_tensors(all_tensors_sorted, sharded_state_dict, checkpoint_dir) + self.summarize_load_times() + return sharded_state_dict + + def summarize_load_times(self): + torch.distributed.barrier() + logger.info('Checkpoint loading finished. Summary:') + for key, times in sorted(timers.items()): + times_sum = sum(times) + max_times = torch.tensor([times_sum], device='cuda') + avg_times = torch.tensor([times_sum], device='cuda') + torch.distributed.all_reduce(max_times, op=torch.distributed.ReduceOp.MAX) + torch.distributed.all_reduce(avg_times, op=torch.distributed.ReduceOp.SUM) + avg_times /= torch.distributed.get_world_size() + if torch.distributed.get_rank() == 0: + logger.info(f'{key}: max {max_times[0]}, avg {avg_times[0]}') + + @timed(verbose=False) + def load_tensor_from_storage(self, checkpoint_dir, ten_meta: _ShardedTensorMetadata): + logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) init') + ret = _load_from_array( + ten_meta.sharded_tensor_no_data, + checkpoint_dir, + load_directly_on_device=False, + apply_flattened_range=False, + ) + logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) DONE') + return ret + + @timed() + def maybe_init_gloo_group(self): + if not self.cpu_transfer: + return + all_groups = [None] * torch.distributed.get_world_size() + torch.distributed.all_gather_object(all_groups, self.dp_group_ranks) + all_groups = set(tuple(sorted(gr)) for gr in all_groups) + for group_ranks in sorted(all_groups): + gloo_pg = torch.distributed.new_group(ranks=group_ranks, backend='gloo') + if self.global_rank in group_ranks: + self.data_parallel_group = gloo_pg + assert self.dp_group_rank == torch.distributed.get_rank(self.data_parallel_group) + + def check_backend_compatibility(self, loaded_version): + pass # TODO + + def check_version_compatibility(self, loaded_version): + pass # TODO + + @timed() + def _build_load_plan( + self, sharded_state_dict: ShardedStateDict + ) -> List[_ShardedTensorMetadata]: + local_meta = [ + _ShardedTensorMetadata( + self.global_rank, + sharded_ten.without_data(), + self.dp_group_rank, + self.dp_group_ranks, + ) + for sharded_ten in nested_values(sharded_state_dict) + ] + all_meta = [None] * torch.distributed.get_world_size(group=self.data_parallel_group) + torch.distributed.all_gather_object(all_meta, local_meta, group=self.data_parallel_group) + all_meta = list(chain.from_iterable(all_meta)) + all_tensors_sorted = self.deduplicate_chunks(all_meta) + return all_tensors_sorted + + @timed() + def deduplicate_chunks(self, ten_metas: List[_ShardedTensorMetadata]): + """ Group tensors by chunk 
and then pick the tensor with the lowest rank. + + NOTE: with proper loading overlap, loading from randomized ranks + (instead of the smallest one) could be beneficial here. + """ + ten_metas = map_reduce( + ten_metas, + key_fn=lambda meta: sharded_tensor_chunk_id(meta.sharded_tensor_no_data), + reduce_fn=partial(min, key=attrgetter('dist_group_rank')), + ) + all_metas_sorted = list(map(itemgetter(1), sorted(ten_metas.items()))) + return all_metas_sorted + + @timed() + def _exchange_loaded_tensors( + self, ten_metas: List[_ShardedTensorMetadata], sharded_state_dict, checkpoint_dir + ): + logger.debug(f'_exchange_loaded_tensors, num ten_metas: {len(ten_metas)}') + for ten_meta in ten_metas: + + src_rank = torch.distributed.get_global_rank( + self.data_parallel_group, ten_meta.dist_group_rank + ) + + if self.dp_group_rank == ten_meta.dist_group_rank: + exchange_tensor = self.load_tensor_from_storage(checkpoint_dir, ten_meta) + if not self.cpu_transfer: + exchange_tensor = exchange_tensor.cuda() + else: + # TODO: for non-flattened ranges we could reuse the buffer from the start here + exchange_tensor = torch.empty( + ten_meta.sharded_tensor_no_data.local_shape, + device='cpu' if self.cpu_transfer else 'cuda', + dtype=ten_meta.sharded_tensor_no_data.dtype, + ) + + logger.debug( + f'exchange {ten_meta.sharded_tensor_no_data.key}, {exchange_tensor.shape}({exchange_tensor.numel()}), broadcast({src_rank} -> {self.dp_group_ranks})' + ) + torch.distributed.broadcast( + exchange_tensor, group=self.data_parallel_group, src=src_rank + ) + self._distribute_data_to_state_dict(ten_meta, exchange_tensor, sharded_state_dict) + logger.debug(f'exchange {ten_meta.sharded_tensor_no_data.key} done') + + # free buffer memory + exchange_tensor = None + + @timed(verbose=False) + def _distribute_data_to_state_dict( + self, + ten_meta: _ShardedTensorMetadata, + loaded_ten: torch.Tensor, + sharded_state_dict: ShardedStateDict, + ): + tensor_key = sharded_tensor_chunk_id(ten_meta.sharded_tensor_no_data) + + def _fill_in_data(t: Union[ShardedTensor, torch.Tensor]): + if not isinstance(t, ShardedTensor) or sharded_tensor_chunk_id(t) != tensor_key: + # already filled-in or key not matching + return t + sharded_tensor: ShardedTensor = t + x = loaded_ten + if sharded_tensor.flattened_range is not None: + x = flatten_range(sharded_tensor, x) + + # Reuse existing buffer + sharded_tensor.data.data.copy_(x) + return sharded_tensor.data + + dict_list_map_inplace(_fill_in_data, sharded_state_dict) diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/zarr.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/zarr.py new file mode 100644 index 000000000..4c61f2d97 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/strategies/zarr.py @@ -0,0 +1,230 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies using Zarr as an underlying format. 
""" +import os +from functools import partial +from pathlib import Path +from typing import List + +import numpy as np +import torch +import zarr + +from ..core import CheckpointingException +from ..dict_utils import dict_list_map_inplace +from ..mapping import ShardedStateDict, ShardedTensor, is_main_replica +from .base import LoadShardedStrategy, SaveShardedStrategy, StrategyAction, default_strategies + +numpy_to_torch_dtype_dict = { + np.bool_: torch.bool, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.complex64: torch.complex64, + np.complex128: torch.complex128, +} + +torch_to_numpy_dtype_dict = {v: k for k, v in numpy_to_torch_dtype_dict.items()} + + +try: + import tensorstore + + HAS_BFLOAT16 = True + numpy_to_torch_dtype_dict[np.dtype('bfloat16')] = torch.bfloat16 + torch_to_numpy_dtype_dict[torch.bfloat16] = np.dtype('bfloat16') +except ImportError: + HAS_BFLOAT16 = False + +_import_trigger = None + + +class ZarrSaveShardedStrategy(SaveShardedStrategy): + def save(self, sharded_tensors: List[ShardedTensor], checkpoint_dir: Path): + arrays = _create_or_open_zarr_arrays(sharded_tensors, checkpoint_dir) + for ten, arr in zip(sharded_tensors, arrays): + _save_to_existing_array(ten, arr) + torch.distributed.barrier() + + +def _create_or_open_zarr_arrays( + sharded_tensors: List[ShardedTensor], checkpoint_dir: Path +) -> List[zarr.Array]: + arrays = [] + for ten in sharded_tensors: + if _should_create_array(ten): + _create_zarr_array(ten, checkpoint_dir) + # TODO: maybe reuse the opened arrays + + torch.distributed.barrier() + for ten in sharded_tensors: + # if is_main_replica(ten.replica_id) and set(ten.global_offset) == {0}: + # continue + open_kwargs = {} + if ten.flattened_range is not None: + open_kwargs['synchronizer'] = zarr.ProcessSynchronizer( + str(checkpoint_dir / f'{ten.key}.sync') + ) + arr = zarr.open(checkpoint_dir / ten.key, 'r+', **open_kwargs) + arrays.append(arr) + return arrays + + +def _should_create_array(ten: ShardedTensor): + return ( + is_main_replica(ten.replica_id) + and set(ten.global_offset) == {0} + and (ten.flattened_range is None or ten.flattened_range.start == 0) + ) + + +def _save_to_existing_array(sharded_tensor: ShardedTensor, arr: zarr.Array): + if not is_main_replica(sharded_tensor.replica_id): + return + x = sharded_tensor.data + x = x.detach().cpu() + torch.cuda.synchronize() + if x.dtype == torch.bfloat16: + x = x.float() + x = x.numpy() + x = x.astype('bfloat16') + else: + x = x.numpy() + + if sharded_tensor.flattened_range is None: + arr[sharded_tensor.global_slice()] = x + else: + arr.set_coordinate_selection(sharded_tensor.global_coordinates(), x) + + +def _create_zarr_array(sharded_tensor: ShardedTensor, checkpoint_dir: Path): + np_dtype = torch_to_numpy_dtype_dict[sharded_tensor.dtype] + try: + arr = zarr.create( + sharded_tensor.global_shape, + dtype=np_dtype, + store=checkpoint_dir / sharded_tensor.key, + chunks=sharded_tensor.max_allowed_chunks(), + compressor=None, + fill_value=None, + write_empty_chunks=True, + ) + except zarr.errors.ContainsArrayError as e: + raise CheckpointingException( + f'Array {checkpoint_dir / sharded_tensor.key} already exists' + ) from e + + if HAS_BFLOAT16 and np_dtype == np.dtype('bfloat16'): + arr._dtype = np_dtype + zarray = arr.store['.zarray'] + arr.store['.zarray'] = zarray.replace(b' exp_sh: + assert ( + False + ), f'Expected shape 
({exp_sh}) smaller than actual ({x_sh}) for {repr(expected_sharded_ten)}' + else: + pad_args.extend((0, exp_sh - x_sh)) + # TODO: behavior control with envvar is for testing purposes only, remove it + if not int(os.environ.get('DIST_CKPT_PAD_REPLICATE', 0)): + return torch.nn.functional.pad(x, pad_args) + + # unsqueeze and squeeze to get shapes supported by cudnn + print(f'Replicating last row for {expected_sharded_ten.key}') + if x.dtype == torch.bfloat16: + return ( + torch.nn.functional.pad(x.float().unsqueeze(0), pad_args, mode='replicate') + .squeeze(0) + .bfloat16() + ) + return torch.nn.functional.pad(x.unsqueeze(0), pad_args, mode='replicate').squeeze(0) + + +# default_strategies[StrategyAction.LOAD_SHARDED.value][('zarr', 1)] = ZarrLoadShardedStrategy() +default_strategies[StrategyAction.SAVE_SHARDED.value][('zarr', 1)] = ZarrSaveShardedStrategy( + 'zarr', 1 +) diff --git a/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/utils.py b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/utils.py new file mode 100644 index 000000000..f7976f007 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/dist_checkpointing/utils.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +from typing import Tuple + +from .dict_utils import dict_list_map_inplace, extract_matching_values +from .mapping import ( + LocalNonpersitentObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + StateDict, +) + + +def extract_sharded_tensors( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + return extract_matching_values(sharded_state_dict, lambda v: isinstance(v, ShardedTensor)) + + +def extract_sharded_tensors_and_factories( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + return extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, (ShardedTensor, ShardedTensorFactory)) + ) + + +def extract_sharded_tensors_or_nonpersistent( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + return extract_matching_values( + sharded_state_dict, + lambda v: isinstance(v, (ShardedTensor, LocalNonpersitentObject, ShardedTensorFactory)), + ) + + +def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix: str): + def add_prefix(t): + if isinstance(t, ShardedTensor): + t.key = f'{prefix}.{t.key}' + return t + + dict_list_map_inplace(add_prefix, sharded_state_dict) diff --git a/training/DeepSpeed-Domino/megatron/core/enums.py b/training/DeepSpeed-Domino/megatron/core/enums.py new file mode 100644 index 000000000..46e7d3b76 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/enums.py @@ -0,0 +1,10 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import enum + + +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + retro_encoder = 3 + retro_decoder = 4 diff --git a/training/DeepSpeed-Domino/megatron/core/fusions/__init__.py b/training/DeepSpeed-Domino/megatron/core/fusions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/DeepSpeed-Domino/megatron/core/fusions/fused_bias_dropout.py b/training/DeepSpeed-Domino/megatron/core/fusions/fused_bias_dropout.py new file mode 100644 index 000000000..e0d6ffbda --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/fusions/fused_bias_dropout.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
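
The ('zarr', 1) registration at the end of zarr.py above relies on the strategy registry defined earlier in strategies/base.py: default_strategies maps an action value to a dict keyed by (backend, version), and get_default_strategy turns a missing key into a CheckpointingException. The following standalone sketch mirrors that lookup pattern with toy classes so it can run outside this tree; ToySaveStrategy, get_strategy and the printed message are illustrative stand-ins, not part of the patch.

from collections import defaultdict
from enum import Enum


class StrategyAction(Enum):
    SAVE_SHARDED = 'save_sharded'


class ToySaveStrategy:
    """Toy stand-in for a SaveShardedStrategy registered under (backend, version)."""

    def __init__(self, backend: str, version: int):
        self.backend, self.version = backend, version

    def describe(self) -> str:
        return f'save with {self.backend} v{self.version}'


# Same shape as default_strategies in strategies/base.py:
# registry[action.value][(backend, version)] -> strategy instance.
registry = defaultdict(dict)
registry[StrategyAction.SAVE_SHARDED.value][('zarr', 1)] = ToySaveStrategy('zarr', 1)


def get_strategy(action: StrategyAction, backend: str, version: int):
    try:
        return registry[action.value][(backend, version)]
    except KeyError as e:
        # base.py raises CheckpointingException here; a plain KeyError suffices for the sketch.
        raise KeyError(f'Cannot find default strategy for: {(action, backend, version)}') from e


print(get_strategy(StrategyAction.SAVE_SHARDED, 'zarr', 1).describe())  # save with zarr v1
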
+ +from typing import Optional, Tuple + +import torch + + +def _bias_dropout_add_func(x, bias, residual, prob, training): + # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor + # NOTE: Previously, the argument `bias` used to be passed as + # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the + # transformer layer but broadcasting should automatically take care of that. + # Also, looking at broadcasting semantics, `expand_as` and broadcasting + # seem to be identical performance-wise (both just change the view). + + # If we want to train mixed precision, then the output of this function + # should be half precision. However, in AMP O1, the input (residual) is + # in fp32, and it will up-cast the result to fp32, causing pipeline parallel + # GPU communication to hang. Therefore, we need to cast residual to the same + # dtype as x. + residual = residual if residual.dtype == x.dtype else residual.to(x.dtype) + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + + +@torch.jit.script +def bias_dropout_add_fused_train( + x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float, +) -> torch.Tensor: + x, bias = x_with_bias # unpack + return _bias_dropout_add_func(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference( + x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float, +) -> torch.Tensor: + x, bias = x_with_bias # unpack + return _bias_dropout_add_func(x, bias, residual, prob, False) + + +def get_bias_dropout_add(training, fused): + def unfused_bias_dropout_add(x_with_bias, residual, prob): + x, bias = x_with_bias # unpack + return _bias_dropout_add_func(x, bias, residual, prob, training) + + if fused: + # jit scripting for a nn.module (with dropout) is not + # triggering the fusion kernel. For now, we use two + # different nn.functional routines to account for varying + # dropout semantics during training and inference phases. + if training: + return bias_dropout_add_fused_train + else: + return bias_dropout_add_fused_inference + else: + return unfused_bias_dropout_add diff --git a/training/DeepSpeed-Domino/megatron/core/fusions/fused_bias_gelu.py b/training/DeepSpeed-Domino/megatron/core/fusions/fused_bias_gelu.py new file mode 100644 index 000000000..9c791c180 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/fusions/fused_bias_gelu.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + + +@torch.jit.script +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) + return ff * g + + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + + +bias_gelu_impl = GeLUFunction.apply diff --git a/training/DeepSpeed-Domino/megatron/core/fusions/fused_layer_norm.py b/training/DeepSpeed-Domino/megatron/core/fusions/fused_layer_norm.py new file mode 100644 index 000000000..e4f098424 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/fusions/fused_layer_norm.py @@ -0,0 +1,119 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import importlib +import numbers + +import torch +from torch.nn import init +from torch.nn.parameter import Parameter + +from megatron.core.utils import make_viewless_tensor + +try: + from apex.contrib.layer_norm.layer_norm import FastLayerNormFN + + HAVE_PERSIST_LAYER_NORM = True +except: + HAVE_PERSIST_LAYER_NORM = False + +try: + from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction + + HAVE_FUSED_LAYER_NORM = True +except: + HAVE_FUSED_LAYER_NORM = False + + +class FusedLayerNorm(torch.nn.Module): + def __init__( + self, + hidden_size, + eps=1e-5, + persist_layer_norm=True, + sequence_parallel=False, + zero_centered_gamma=False, + ): + super().__init__() + + self.zero_centered_gamma = zero_centered_gamma + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. 
+ persist_ln_hidden_sizes = [ + 1024, + 1536, + 2048, + 2304, + 3072, + 3840, + 4096, + 5120, + 6144, + 8192, + 10240, + 12288, + 12800, + 15360, + 16384, + 18432, + 20480, + 24576, + 25600, + 30720, + 32768, + 40960, + 49152, + 65536, + ] + if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM: + persist_layer_norm = False + + if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM: + # TODO: Add pytorch only layer norm + raise ValueError(f'Apex must currently be installed to use megatron core.') + + if isinstance(hidden_size, numbers.Integral): + hidden_size = (hidden_size,) + self.hidden_size = torch.Size(hidden_size) + self.eps = eps + self.weight = Parameter(torch.Tensor(*hidden_size)) + self.bias = Parameter(torch.Tensor(*hidden_size)) + self.reset_parameters() + self.persist_layer_norm = persist_layer_norm + self.sequence_parallel = sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.weight, 'sequence_parallel', self.sequence_parallel) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) + + def reset_parameters(self): + + if self.zero_centered_gamma: + init.zeros_(self.weight) + init.zeros_(self.bias) + else: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input): + + weight = self.weight + 1 if self.zero_centered_gamma else self.weight + + if self.persist_layer_norm: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + output = make_viewless_tensor( + inp=output, requires_grad=input.requires_grad, keep_graph=True + ) + + else: + output = FusedLayerNormAffineFunction.apply( + input, weight, self.bias, self.hidden_size, self.eps + ) + + return output diff --git a/training/DeepSpeed-Domino/megatron/core/fusions/fused_softmax.py b/training/DeepSpeed-Domino/megatron/core/fusions/fused_softmax.py new file mode 100644 index 000000000..56eb2e801 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/fusions/fused_softmax.py @@ -0,0 +1,204 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +import torch +import torch.nn as nn + +from megatron.core.transformer.enums import AttnMaskType + + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_upper_triang_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0]) + + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_upper_triang_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + + return input_grads, None + + +class ScaledMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. 
+ """ + + @staticmethod + def forward(ctx, inputs, mask, scale): + import scaled_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0]) + return input_grads, None, None + + +class ScaledSoftmax(torch.autograd.Function): + """ + Fused operation which performs following two operations in sequence + 1. Scale the tensor. + 2. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0]) + return input_grads, None, None + + +class FusedScaleMaskSoftmax(nn.Module): + """ + fused operation: scaling + mask + softmax + + Arguments: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. + """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super(FusedScaleMaskSoftmax, self).__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + assert not ( + self.input_in_fp16 and self.input_in_bf16 + ), "both fp16 and bf16 flags cannot be active at the same time." 
+ self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled" + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + + if self.is_kernel_available(mask, *input.size()): + return self.forward_fused_softmax(input, mask) + else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and 16 < sk <= 4096 # sk must be 16 ~ 2048 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 4096: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + b, np, sq, sk = input.size() + scale = self.scale if self.scale is not None else 1.0 + + if self.attn_mask_type == AttnMaskType.causal: + assert sq == sk, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, sq, sk) + input = input.view(-1, sq, sk) + probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) + return probs.view(b, np, sq, sk) + else: + # input is 4D tensor (b, np, sq, sk) + if mask is not None: + return ScaledMaskedSoftmax.apply(input, mask, scale) + else: + return ScaledSoftmax.apply(input, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + import scaled_masked_softmax_cuda + + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/training/DeepSpeed-Domino/megatron/core/inference_params.py b/training/DeepSpeed-Domino/megatron/core/inference_params.py new file mode 100644 index 000000000..287902460 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/inference_params.py @@ -0,0 +1,27 @@ +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + def __init__(self, max_batch_size, max_sequence_length): + self.max_sequence_length = max_sequence_length + self.max_batch_size = max_batch_size + self.sequence_len_offset = 0 + self.batch_size_offset = 0 + self.key_value_memory_dict = {} + + def swap_key_value_dict(self, batch_idx): + "swap between batches" + if len(self.key_value_memory_dict) == 0: + raise ValueError("should not swap when dict in empty") + + for layer_number in self.key_value_memory_dict.keys(): + inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] + assert ( + len(batch_idx) == inference_key_memory.shape[1] + ) # 
make sure batch size is the same + new_inference_key_memory = inference_key_memory[:, batch_idx] + new_inference_value_memory = inference_value_memory[:, batch_idx] + self.key_value_memory_dict[layer_number] = ( + new_inference_key_memory, + new_inference_value_memory, + ) diff --git a/training/DeepSpeed-Domino/megatron/core/model_parallel_config.py b/training/DeepSpeed-Domino/megatron/core/model_parallel_config.py new file mode 100644 index 000000000..85d3c8e7b --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/model_parallel_config.py @@ -0,0 +1,167 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Callable, Optional + +import torch + + +@dataclass +class ModelParallelConfig: + """Base configuration for Megatron Core + + Model Parallelism + ----------------- + + tensor_model_parallel_size (int): Intra-layer model parallelism. Splits tensors across GPU ranks. Defaults to 1. + + pipeline_model_parallel_size (int): Inter-layer model parallelism. Splits transformer layers across GPU + ranks. Defaults to 1. + + virtual_pipeline_model_parallel_size (int): Interleaved pipeline parallelism is used to improve performance by + reducing the pipeline bubble. Considers a transformer block as a list of smaller transformer (virtual) blocks. + The number of virtual blocks per pipeline model parallel rank is the virtual model parallel size. See Efficient + Large-Scale Language Model Training on GPU Clusters Using Megatron-LM: https://arxiv.org/pdf/2104.04473.pdf for + more details. Defaults to None. + + sequence_parallel (bool): Makes tensor parallelism more memory efficient for LLMs (20B+) by + parallelizing layer norms and dropout sequentially. See Reducing Activation Recomputation in Large Transformer + Models: https://arxiv.org/abs/2205.05198 for more details. Defaults to False. + + Initialization + -------------- + + perform_initialization (bool, default=True): If true, weights are initialized. This option can be useful when you + know you are going to load values from a checkpoint. + + use_cpu_initialization: (bool, default=False): When set to False, we initialize the weights directly on the GPU. + Transferring weights from CPU to GPU can take a significant amount of time for large models. Defaults to False. + + Training + -------- + + fp16 (bool): If true, train with fp16 mixed precision training. Defaults to False. + + bf16 (bool): If true, train with bf16 mixed precision training. Defaults to False. + + params_dtype (torch.dtype): dtype used when intializing the weights. Defaults to torch.float32 + + timers (optional, default=None): TODO + + Optimizations + ------------- + + gradient_accumulation_fusion (bool): If true, fuses weight gradient accumulation to GEMMs. Requires the custom CUDA + extension fused_weight_gradient_mlp_cuda module. To use gradient_accumulation_fusion you must install APEX with + --cpp_ext and --cuda_ext. For example: "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" + ". Note that the extension requires CUDA>=11. Otherwise, you must turn off gradient accumulation fusion. + Defaults to False. + + async_tensor_model_parallel_allreduce (bool, default=True): If true, enables asynchronous execution of + tensor-model-parallel all-reduce with weight gradient compuation of a column-linear layer. Defaults to False. 
+ + Pipeline Parallelism + -------------------- + + pipeline_dtype (required): dtype used in p2p communication, usually params_dtype + + grad_scale_func (optional, default=None): If using loss scaling, this function should take the loss and return the + scaled loss. If None, no function is called on the loss. + + enable_autocast (bool): If true runs the forward step function inside torch.autocast context. Default is False. + + autocast_dtype (torch.dtype): dtype to pass to torch.amp.autocast when enabled. Default is pipeline_dtype. + + variable_seq_lengths (bool, default=False): Support for variable sequence lengths across microbatches. Setting this + communicates the size of tensors during pipeline parallelism communication, because of this extra overhead it + should only be set if the sequence length varies by microbatch within a global batch. + + num_microbatches_with_partial_activation_checkpoints (int, default=None): If int, set the number of microbatches + where not all of the layers will be checkpointed and recomputed. The rest of the microbatches within the window + of maximum outstanding microbatches will recompute all layers (either full recompute or selective recompute). If + None, the checkpoint and recompute will be left up to the forward_step function. + + overlap_p2p_comm (bool, optional, default=False): When True some of the peer to peer communication for pipeline + parallelism will overlap with computation. Must be False if batch_p2p_comm is true. + + batch_p2p_comm (bool, default=True): Use batch_isend_irecv instead of individual isend/irecv calls. Must be False + if overlap_p2p_comm is True. + + batch_p2p_sync (bool, default=True): When using batch_isend_irecv, do a cuda.device.synchronize afterward to work + around a bug in older version of PyTorch. + + use_ring_exchange_p2p (bool, default = False): Use custom ring_exchange kernel instead of + torch.distributed.batch_isend_irecv(). Requires custom built torch with torch.distributed.ring_exchange. + + deallocate_pipeline_outputs (optional, default=False): If True, output data is deallocated after the tensor is sent + to the next pipeline stage. Helps with saving memory, does nothing when pipeline parallel is not used. + + no_sync_func (optional): Function that creates a context that suppresses asynchronous data-parallel + communication. If the model is an instance of torch.nn.DistributedDataParallel, the default is to use + torch.nn.DistributedDataParallel.no_sync. + + grad_sync_func (optional): Function that launches asynchronous gradient reductions (e.g. distributed optimizer + gradient reduce-scatters). The function should take one argument: an iterable of parameters whose gradients are + to be synchronized. + + param_sync_func (optional): Function that launches asynchronous parameter synchronizations (e.g. distributed + optimizer parameter all-gathers). The function should take one argument: an iterable of parameters to be + synchronized. 
+ + """ + + # Model parallelism + tensor_model_parallel_size: int = 1 + pipeline_model_parallel_size: int = 1 + virtual_pipeline_model_parallel_size: Optional[int] = None + sequence_parallel: bool = False + + # Initialization + perform_initialization: bool = True + use_cpu_initialization: bool = False + + # Training + fp16: bool = False + bf16: bool = False + params_dtype: torch.dtype = torch.float32 + timers: Callable = None + + # Optimizations + gradient_accumulation_fusion: bool = False + async_tensor_model_parallel_allreduce: bool = False + + # Pipeline Parallel + pipeline_dtype: torch.dtype = None + grad_scale_func: Callable = None + enable_autocast: bool = False + autocast_dtype: torch.dtype = None + variable_seq_lengths: bool = False + num_microbatches_with_partial_activation_checkpoints: Optional[int] = None + overlap_p2p_comm: bool = False + batch_p2p_comm: bool = True + batch_p2p_sync: bool = True + use_ring_exchange_p2p: bool = False + deallocate_pipeline_outputs: bool = False + no_sync_func: Callable = None + grad_sync_func: Callable = None + param_sync_func: Callable = None + + def __post_init__(self): + """ Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + """ + if self.sequence_parallel: + if self.tensor_model_parallel_size <= 1: + raise ValueError("Can not use sequence paralllelism without tensor parallelism") + if self.async_tensor_model_parallel_allreduce: + # sequence_parallelism already does this async + self.async_tensor_model_parallel_allreduce = False + + if self.pipeline_model_parallel_size > 1: + if self.pipeline_dtype is None: + raise ValueError( + "When using pipeline parallelism, pipeline_dtype must be specified" + ) + + if self.autocast_dtype is None: + self.autocast_dtype = self.params_dtype diff --git a/training/DeepSpeed-Domino/megatron/core/models/__init__.py b/training/DeepSpeed-Domino/megatron/core/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/DeepSpeed-Domino/megatron/core/models/common/__init__.py b/training/DeepSpeed-Domino/megatron/core/models/common/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/DeepSpeed-Domino/megatron/core/models/common/rotary_pos_embedding.py b/training/DeepSpeed-Domino/megatron/core/models/common/rotary_pos_embedding.py new file mode 100644 index 000000000..291b10df7 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/models/common/rotary_pos_embedding.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
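
Because ModelParallelConfig above is a plain dataclass with defaults, its __post_init__ checks can be exercised directly. This is a minimal usage sketch, assuming the megatron.core package added by this patch is importable (e.g. with training/DeepSpeed-Domino on PYTHONPATH); the particular sizes and dtypes are arbitrary illustrations.

import torch

from megatron.core.model_parallel_config import ModelParallelConfig

# sequence_parallel requires tensor parallelism, and __post_init__ also disables
# async_tensor_model_parallel_allreduce because sequence parallelism already overlaps it.
cfg = ModelParallelConfig(
    tensor_model_parallel_size=2,
    sequence_parallel=True,
    async_tensor_model_parallel_allreduce=True,
)
assert cfg.async_tensor_model_parallel_allreduce is False

# With pipeline parallelism, pipeline_dtype must be provided or __post_init__ raises ValueError.
try:
    ModelParallelConfig(pipeline_model_parallel_size=2)
except ValueError as err:
    print(f'expected: {err}')

ok = ModelParallelConfig(pipeline_model_parallel_size=2, pipeline_dtype=torch.bfloat16)
assert ok.autocast_dtype == ok.params_dtype  # autocast_dtype falls back to params_dtype
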
+ +import importlib.util + +import torch +from torch import einsum, nn + +__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim, seq_len_interpolation_factor=None): + super().__init__() + self.seq_len_interpolation_factor = seq_len_interpolation_factor + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq, persistent=False) + + def forward(self, max_seq_len, offset=0): + seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset + if self.seq_len_interpolation_factor is not None: + seq = seq.type_as(self.inv_freq) + seq *= 1 / self.seq_len_interpolation_factor + freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq) + # first part even vector components, second part odd vector components, + # 2 * dim in dimension size + emb = torch.cat((freqs, freqs), dim=-1) + # emb [seq_length, .., dim] + return emb[:, None, None, :] + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + state_dict.pop(f'{prefix}inv_freq', None) + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + +def _rotate_half(x): + """ + change sign so the last dimension becomes [-odd, +even] + """ + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(t, freqs): + """ + input tensor t is of shape [seq_length, ..., dim] + rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] + check https://kexue.fm/archives/8265 for detailed formulas + """ + rot_dim = freqs.shape[-1] + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin()) + return torch.cat((t, t_pass), dim=-1) diff --git a/training/DeepSpeed-Domino/megatron/core/models/gpt/__init__.py b/training/DeepSpeed-Domino/megatron/core/models/gpt/__init__.py new file mode 100644 index 000000000..2d5eb8674 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/models/gpt/__init__.py @@ -0,0 +1 @@ +from .gpt_model import GPTModel diff --git a/training/DeepSpeed-Domino/megatron/core/models/gpt/gpt_embedding.py b/training/DeepSpeed-Domino/megatron/core/models/gpt/gpt_embedding.py new file mode 100644 index 000000000..578ae803c --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/models/gpt/gpt_embedding.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import torch + +from megatron.core import tensor_parallel +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import ( + make_sharded_tensor_for_checkpoint, + make_tp_sharded_tensor_for_checkpoint, +) + + +class GPTEmbedding(MegatronModule): + """Language model embeddings. + + Arguments: + config (TransformerConfig): config object with all necessary configs for TransformerBlock + vocab_size (int): vocabulary size + max_sequence_length (int): maximum size of sequence. This + is used for positional embedding + add_position_embedding (bool): Add a position embedding. 
+ embedding_dropout_prob float): dropout probability for embeddings + """ + + def __init__( + self, + config: TransformerConfig, + vocab_size: int, + max_sequence_length: int, + add_position_embedding: bool, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + self.vocab_size: int = vocab_size + self.max_sequence_length: int = max_sequence_length + self.add_position_embedding: bool = add_position_embedding + + # Word embeddings (parallel). + self.word_embeddings = tensor_parallel.VocabParallelEmbedding( + num_embeddings=self.vocab_size, + embedding_dim=self.config.hidden_size, + init_method=self.config.init_method, + config=self.config, + ) + + # Position embedding (serial). + if self.add_position_embedding: + self.position_embeddings = torch.nn.Embedding( + self.max_sequence_length, self.config.hidden_size + ) + + # Initialize the position embeddings. + if self.config.perform_initialization: + self.config.init_method(self.position_embeddings.weight) + + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + + def forward(self, input_ids, position_ids): + # Embeddings. + word_embeddings = self.word_embeddings(input_ids) + if self.add_position_embedding: + position_embeddings = self.position_embeddings(position_ids) + embeddings = word_embeddings + position_embeddings + else: + embeddings = word_embeddings + + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + # If the input flag for fp32 residual connection is set, convert for float. + if self.config.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + if self.config.sequence_parallel: + embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) + with tensor_parallel.get_cuda_rng_tracker().fork(): + embeddings = self.embedding_dropout(embeddings) + else: + embeddings = self.embedding_dropout(embeddings) + + return embeddings + + def sharded_state_dict(self, prefix=''): + + sharded_state_dict = {} + + word_embeddings_prefix = f'{prefix}word_embeddings.' + word_embeddings_state_dict = self.word_embeddings.state_dict( + prefix=word_embeddings_prefix, keep_vars=True + ) + + sharded_word_embeddings_key = f'{word_embeddings_prefix}weight' + sharded_word_embeddings_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=word_embeddings_state_dict[sharded_word_embeddings_key], + key=sharded_word_embeddings_key, + allow_shape_mismatch=True, + ) + sharded_state_dict[sharded_word_embeddings_key] = sharded_word_embeddings_tensor + + if self.add_position_embedding: + position_embeddings_prefix = f'{prefix}position_embeddings.' 
+ position_embeddings_state_dict = self.position_embeddings.state_dict( + prefix=position_embeddings_prefix, keep_vars=True + ) + sharded_position_embeddings_key = f'{position_embeddings_prefix}weight' + sharded_position_embeddings_tensor = make_sharded_tensor_for_checkpoint( + tensor=position_embeddings_state_dict[sharded_position_embeddings_key], + key=sharded_position_embeddings_key, + ) + sharded_state_dict[sharded_position_embeddings_key] = sharded_position_embeddings_tensor + + return sharded_state_dict diff --git a/training/DeepSpeed-Domino/megatron/core/models/gpt/gpt_model.py b/training/DeepSpeed-Domino/megatron/core/models/gpt/gpt_model.py new file mode 100644 index 000000000..f1c304b7a --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/models/gpt/gpt_model.py @@ -0,0 +1,308 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import logging +from typing import Literal, Optional + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.gpt.gpt_embedding import GPTEmbedding +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint + + +class GPTModel(MegatronModule): + """Transformer language model. + + Arguments: + config (TransformerConfig): transformer config + + vocab_size (int): vocabulary size + + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + + pre_process (bool): Include embedding layer (used with pipeline parallelism) + post_process (bool): Include an output layer (used with pipeline parallelism) + + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks + + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are + shared. Defaults to False. + + position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope']. + Defaults is 'learned_absolute'. + + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. + + seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences. + The value must be a float larger than 1.0. Defaults to None. 
+ """ + + def __init__( + self, + config: TransformerConfig, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + ): + super(GPTModel, self).__init__(config=config) + + self.config: TransformerConfig = config + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + # Embeddings. + if self.pre_process: + self.embedding = GPTEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + add_position_embedding=(self.position_embedding_type == 'learned_absolute'), + ) + + # Rotary Position Embeddings + if self.position_embedding_type == 'rope': + rotary_dim = self.config.kv_channels + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) + + self.rotary_pos_emb = RotaryEmbedding(rotary_dim, seq_len_interpolation_factor) + else: + self.rotary_pos_emb = None + + # Transformer. + self.decoder = TransformerBlock( + config=self.config, + self_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + ) + + if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): + self.initialize_last_stage_with_word_embeddings() + + def set_input_tensor(self, input_tensor): + """ See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params=None, + ): + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. 
+ if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.rotary_pos_emb is not None: + if inference_params is not None: + rotary_seq_len = inference_params.max_sequence_length + else: + if self.decoder.input_tensor is not None: + rotary_seq_len = self.decoder.input_tensor.size(0) + else: + rotary_seq_len = decoder_input.size(0) + + # Decoder input is split along sequence dimension, but RoPE is applied in tensor parallel region + if self.config.sequence_parallel: + rotary_seq_len *= self.config.tensor_model_parallel_size + + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. + hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer(hidden_states, weight=output_weight) + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels) + + # [s b] => [b, s] + loss = loss.transpose(0, 1).contiguous() + return loss + + def shared_embedding_or_output_weight(self): + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + + def initialize_last_stage_with_word_embeddings(self): + + # This function just initializes the word embeddings in the final stage + # when we are using pipeline parallelism and sharing word + # embeddings. Nothing to do if we aren't sharing weights or aren't using + # pipeline parallelism. + if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process): + return + + if self.post_process and not self.pre_process: + assert not parallel_state.is_pipeline_first_stage() + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.output_layer.weight.data.fill_(0) + self.output_layer.weight.shared = True + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + + # Ensure that first and last stages have the same initial parameter + # values. 
+ if torch.distributed.is_initialized(): + if parallel_state.is_rank_in_embedding_group(): + weight = self.shared_embedding_or_output_weight() + torch.distributed.all_reduce( + weight.data, group=parallel_state.get_embedding_group() + ) + + elif not getattr(GPTModel, "embedding_warning_printed", False): + logging.getLogger(__name__).warning( + "Distributed processes aren't initialized, so the output layer " + "is not initialized with weights from the word embeddings. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong." + ) + GPTModel.embedding_warning_printed = True + + def sharded_state_dict(self, prefix=''): + sharded_state_dict = {} + + if self.pre_process: + embedding_prefix = f'{prefix}embedding.' + embedding_sharded_state_dict = self.embedding.sharded_state_dict( + prefix=embedding_prefix + ) + sharded_state_dict.update(embedding_sharded_state_dict) + + decoder_prefix = f'{prefix}decoder.' + decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix) + sharded_state_dict.update(decoder_sharded_state_dict) + + if self.post_process: + output_layer_prefix = f'{prefix}output_layer.' + output_layer_key = f'{output_layer_prefix}weight' + if self.share_embeddings_and_output_weights: + if not self.pre_process: + # when sharing embeddings with last stage, we need to use the weights from the first stage + # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight + tensor = self.shared_embedding_or_output_weight() + first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + last_stage_word_emb_replica_id = ( + dp_rank + dp_size + ) # copy of first stage embedding + + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + else: + output_layer_state_dict = self.output_layer.state_dict( + prefix=output_layer_prefix, keep_vars=True + ) + output_layer_tensor = output_layer_state_dict[output_layer_key] + # independent output layer + sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint( + tensor=output_layer_tensor, + key=output_layer_key, + replica_id=parallel_state.get_data_parallel_rank(), + allow_shape_mismatch=True, + ) + + sharded_state_dict[output_layer_key] = sharded_output_layer_tensor + + return sharded_state_dict diff --git a/training/DeepSpeed-Domino/megatron/core/package_info.py b/training/DeepSpeed-Domino/megatron/core/package_info.py new file mode 100644 index 000000000..47025af14 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/package_info.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
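
The tied-embedding initialization described in the comments above (zero the last stage's copy of the word embeddings, then all-reduce so both pipeline ends start from identical values) boils down to a very small collective. The sketch below is not this patch's code path: it assumes a two-process gloo launch (e.g. torchrun --nproc_per_node=2 sketch.py), with rank 0 and rank 1 standing in for the first and last pipeline stages and the default world group in place of the embedding group.

import torch
import torch.distributed as dist

dist.init_process_group(backend='gloo')
rank = dist.get_rank()

if rank == 0:
    weight = torch.arange(4, dtype=torch.float32)  # "first stage" word embeddings
else:
    weight = torch.zeros(4)  # "last stage" copy, zero-initialized as in the code above

# Sum all-reduce: zeros plus the first stage's values leaves both ranks identical.
dist.all_reduce(weight)
print(f'rank {rank}: {weight.tolist()}')  # both ranks print [0.0, 1.0, 2.0, 3.0]

dist.destroy_process_group()
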
+ + +MAJOR = 0 +MINOR = 3 +PATCH = 0 +PRE_RELEASE = '' + +# Use the following formatting: (major, minor, patch, pre-release) +VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) + +__shortversion__ = '.'.join(map(str, VERSION[:3])) +__version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) + +__package_name__ = 'megatron_core' +__contact_names__ = 'NVIDIA' +__contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email +__homepage__ = ( + 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage +) +__repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' +__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' +__description__ = ( + 'Megatron Core - a library for efficient and scalable training of transformer based models' +) +__license__ = 'BSD-3' +__keywords__ = ( + 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' +) diff --git a/training/DeepSpeed-Domino/megatron/core/parallel_state.py b/training/DeepSpeed-Domino/megatron/core/parallel_state.py new file mode 100644 index 000000000..52e7c46b9 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/parallel_state.py @@ -0,0 +1,651 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Model and data parallel groups.""" + +import os +from typing import Optional + +import torch + +from .utils import GlobalMemoryBuffer + +# Intra-layer model parallel group that the current rank belongs to. +_TENSOR_MODEL_PARALLEL_GROUP = None +# Inter-layer model parallel group that the current rank belongs to. +_PIPELINE_MODEL_PARALLEL_GROUP = None +# Model parallel group (both intra- and pipeline) that the current rank belongs to. +_MODEL_PARALLEL_GROUP = None +# Embedding group. +_EMBEDDING_GROUP = None +# Position embedding group. +_POSITION_EMBEDDING_GROUP = None +# Data parallel group that the current rank belongs to. +_DATA_PARALLEL_GROUP = None +_DATA_PARALLEL_GROUP_GLOO = None +# FP8 amax reduction group. +_AMAX_REDUCTION_GROUP = None + +_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None +_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None +_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None + +# These values enable us to change the mpu sizes on the fly. +_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_TENSOR_MODEL_PARALLEL_RANK = None +_MPU_PIPELINE_MODEL_PARALLEL_RANK = None + +# A list of ranks that have a copy of the embedding. +_EMBEDDING_GLOBAL_RANKS = None + +# A list of ranks that have a copy of the position embedding. +_POSITION_EMBEDDING_GLOBAL_RANKS = None + +# A list of global ranks for each pipeline group to ease calculation of the source +# rank when broadcasting from the first or last pipeline stage. +_PIPELINE_GLOBAL_RANKS = None + +# A list of global ranks for each data parallel group to ease calculation of the source +# rank when broadcasting weights from src to all other data parallel ranks +_DATA_PARALLEL_GLOBAL_RANKS = None + +# Memory buffers to avoid dynamic memory allocation +_GLOBAL_MEMORY_BUFFER = None + + +def initialize_model_parallel( + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + virtual_pipeline_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_split_rank: Optional[int] = None, + use_fp8: bool = False, + use_sharp: bool = False, +) -> None: + """Initialize model data parallel groups. 
+ + Arguments: + tensor_model_parallel_size (int, default = 1): + The number of GPUs to split individual tensors across. + + pipeline_model_parallel_size (int, default = 1): + The number of tensor parallel GPU groups to split the + Transformer layers across. For example, if + tensor_model_parallel_size is 4 and + pipeline_model_parallel_size is 2, the model will be split + into 2 groups of 4 GPUs. + + virtual_pipeline_model_parallel_size (int, optional): + The number of stages that each pipeline group will have, + interleaving as necessary. If None, no interleaving is + performed. For example, if tensor_model_parallel_size is 1, + pipeline_model_parallel_size is 4, + virtual_pipeline_model_parallel_size is 2, and there are + 16 transformer layers in the model, the model will be + split into 8 stages with two layers each and each GPU + would get 2 stages as such (layer number starting with 1): + + GPU 0: [1, 2] [9, 10] + GPU 1: [3, 4] [11, 12] + GPU 2: [5, 6] [13, 14] + GPU 3: [7, 8] [15, 16] + + pipeline_model_parallel_split_rank (int, optional): + For models with both an encoder and decoder, the rank in + pipeline to switch between encoder and decoder (i.e. the + first rank of the decoder). This allows the user to set + the pipeline parallel size of the encoder and decoder + independently. For example, if + pipeline_model_parallel_size is 8 and + pipeline_model_parallel_split_rank is 3, then ranks 0-2 + will be the encoder and ranks 3-7 will be the decoder. + + use_fp8 (bool, default = False): + Construct GPU groups needed for FP8 training, namely for + amax reduction across the product of the data-parallel and + tensor-parallel groups. + + use_sharp (bool, default = False): + Set the use of SHARP for the collective communications of + data-parallel process groups. When `True`, run barrier + within each data-parallel process group, which specifies + the SHARP application target groups. + + Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. The present function will + create 8 tensor model-parallel groups, 4 pipeline model-parallel groups + and 8 data-parallel groups as: + 8 data_parallel groups: + [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15] + 8 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15] + 4 pipeline model-parallel groups: + [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + + """ + # Get world size and rank. Ensure some consistencies. 
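+    # For example, with world_size=16, tensor_model_parallel_size=2 and
+    # pipeline_model_parallel_size=4, the arithmetic below gives
+    # data_parallel_size = 16 // (2 * 4) = 2, i.e. 8 tensor-parallel,
+    # 4 pipeline-parallel and 8 data-parallel groups (see docstring above).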
+    assert torch.distributed.is_initialized()
+    world_size: int = torch.distributed.get_world_size()
+
+    if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0:
+        raise RuntimeError(
+            f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
+            f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})"
+        )
+
+    data_parallel_size: int = world_size // (
+        tensor_model_parallel_size * pipeline_model_parallel_size
+    )
+
+    num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
+    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
+    num_data_parallel_groups: int = world_size // data_parallel_size
+
+    if virtual_pipeline_model_parallel_size is not None:
+        if not pipeline_model_parallel_size > 2:
+            raise RuntimeError(
+                "pipeline-model-parallel size should be greater than 2 with interleaved schedule"
+            )
+        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+        _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
+        _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size
+
+    if pipeline_model_parallel_split_rank is not None:
+        global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
+        _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = pipeline_model_parallel_split_rank
+
+    rank = torch.distributed.get_rank()
+
+    # Build the data-parallel groups.
+    global _DATA_PARALLEL_GROUP
+    global _DATA_PARALLEL_GROUP_GLOO
+    global _DATA_PARALLEL_GLOBAL_RANKS
+    assert _DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized'
+    all_data_parallel_group_ranks = []
+    for i in range(pipeline_model_parallel_size):
+        start_rank = i * num_pipeline_model_parallel_groups
+        end_rank = (i + 1) * num_pipeline_model_parallel_groups
+        for j in range(tensor_model_parallel_size):
+            ranks = range(start_rank + j, end_rank, tensor_model_parallel_size)
+            all_data_parallel_group_ranks.append(list(ranks))
+            group = torch.distributed.new_group(ranks)
+            group_gloo = torch.distributed.new_group(ranks, backend="gloo")
+            if rank in ranks:
+                _DATA_PARALLEL_GROUP = group
+                _DATA_PARALLEL_GROUP_GLOO = group_gloo
+                _DATA_PARALLEL_GLOBAL_RANKS = ranks
+
+    # Apply SHARP to DP process groups
+    if use_sharp:
+        if rank == 0:
+            print(
+                "The number of process groups to use SHARP with depends on the type "
+                "of the network switch. Nvidia QM1 switch supports SHARP up to 8 "
+                "process groups and QM2 supports up to 256 process groups. We apply "
+                "SHARP to the communications of the data-parallel domain. If the "
+                "number of data-parallel process groups is larger than the max "
+                "process groups that the network switch supports, the communication "
+                "will fall back to non-SHARP operators. To enable SHARP, "
+                "`#SBATCH_NETWORK=sharp` should be set in the sbatch script."
+            )
+        torch.distributed.barrier(
+            group=get_data_parallel_group(), device_ids=[torch.cuda.current_device()]
+        )
+        # Set `NCCL_SHARP_DISABLE=1` to restrict SHARP application to DP process groups
+        os.environ["NCCL_SHARP_DISABLE"] = "1"
+
+    # Build the model-parallel groups.
+    global _MODEL_PARALLEL_GROUP
+    assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
+    for i in range(data_parallel_size):
+        ranks = [
+            data_parallel_group_ranks[i]
+            for data_parallel_group_ranks in all_data_parallel_group_ranks
+        ]
+        group = torch.distributed.new_group(ranks)
+        if rank in ranks:
+            _MODEL_PARALLEL_GROUP = group
+
+    # Build the tensor model-parallel groups.
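+    # Tensor model-parallel groups are contiguous blocks of ranks, e.g. with
+    # tensor_model_parallel_size=2: [g0, g1], [g2, g3], ..., matching the
+    # layout described in the docstring above.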
+ global _TENSOR_MODEL_PARALLEL_GROUP + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is None + ), 'tensor model parallel group is already initialized' + for i in range(num_tensor_model_parallel_groups): + ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) + group = torch.distributed.new_group(ranks) + if rank in ranks: + _TENSOR_MODEL_PARALLEL_GROUP = group + + # Build the pipeline model-parallel groups and embedding groups + # (first and last rank in each pipeline model-parallel group). + global _PIPELINE_MODEL_PARALLEL_GROUP + global _PIPELINE_GLOBAL_RANKS + assert ( + _PIPELINE_MODEL_PARALLEL_GROUP is None + ), 'pipeline model parallel group is already initialized' + global _EMBEDDING_GROUP + global _EMBEDDING_GLOBAL_RANKS + assert _EMBEDDING_GROUP is None, 'embedding group is already initialized' + global _POSITION_EMBEDDING_GROUP + global _POSITION_EMBEDDING_GLOBAL_RANKS + assert _POSITION_EMBEDDING_GROUP is None, 'position embedding group is already initialized' + for i in range(num_pipeline_model_parallel_groups): + ranks = range(i, world_size, num_pipeline_model_parallel_groups) + group = torch.distributed.new_group(ranks) + if rank in ranks: + _PIPELINE_MODEL_PARALLEL_GROUP = group + _PIPELINE_GLOBAL_RANKS = ranks + # Setup embedding group (to exchange gradients between + # first and last stages). + if len(ranks) > 1: + embedding_ranks = [ranks[0], ranks[-1]] + position_embedding_ranks = [ranks[0]] + if pipeline_model_parallel_split_rank is not None: + if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks: + embedding_ranks = [ + ranks[0], + ranks[pipeline_model_parallel_split_rank], + ranks[-1], + ] + if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks: + position_embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank]] + else: + embedding_ranks = ranks + position_embedding_ranks = ranks + + group = torch.distributed.new_group(embedding_ranks) + if rank in embedding_ranks: + _EMBEDDING_GROUP = group + if rank in ranks: + _EMBEDDING_GLOBAL_RANKS = embedding_ranks + + group = torch.distributed.new_group(position_embedding_ranks) + if rank in position_embedding_ranks: + _POSITION_EMBEDDING_GROUP = group + if rank in ranks: + _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks + + # Build the FP8 groups. + global _AMAX_REDUCTION_GROUP + assert _AMAX_REDUCTION_GROUP is None, 'FP8 amax reduction group is already initialized' + if use_fp8: + amax_group_size: int = tensor_model_parallel_size * data_parallel_size + num_amax_groups: int = world_size // amax_group_size + for i in range(num_amax_groups): + start_rank = i * amax_group_size + end_rank = (i + 1) * amax_group_size + ranks = range(start_rank, end_rank) + group = torch.distributed.new_group(ranks) + if rank in ranks: + _AMAX_REDUCTION_GROUP = group + + # Initialize global memory buffer + # This isn't really "parallel state" but there isn't another good place to + # put this. 
If we end up with a more generic initialization of megatron-core + # we could stick it there + _set_global_memory_buffer() + + +def is_unitialized(): + """Useful for code segments that may be accessed with or without mpu initialization""" + return _DATA_PARALLEL_GROUP is None + + +def model_parallel_is_initialized(): + """Check if model and data parallel groups are initialized.""" + if ( + _TENSOR_MODEL_PARALLEL_GROUP is None + or _PIPELINE_MODEL_PARALLEL_GROUP is None + or _DATA_PARALLEL_GROUP is None + ): + return False + return True + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + assert _MODEL_PARALLEL_GROUP is not None, 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_tensor_model_parallel_group(check_initialized=True): + """Get the tensor model parallel group the caller rank belongs to.""" + if check_initialized: + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is not None + ), 'tensor model parallel group is not initialized' + return _TENSOR_MODEL_PARALLEL_GROUP + + +def get_pipeline_model_parallel_group(): + """Get the pipeline model parallel group the caller rank belongs to.""" + assert ( + _PIPELINE_MODEL_PARALLEL_GROUP is not None + ), 'pipeline_model parallel group is not initialized' + return _PIPELINE_MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_data_parallel_group_gloo(): + """Get the data parallel group-gloo the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP_GLOO is not None, 'data parallel group-gloo is not initialized' + return _DATA_PARALLEL_GROUP_GLOO + + +def get_embedding_group(): + """Get the embedding group the caller rank belongs to.""" + assert _EMBEDDING_GROUP is not None, 'embedding group is not initialized' + return _EMBEDDING_GROUP + + +def get_position_embedding_group(): + """Get the position embedding group the caller rank belongs to.""" + assert _POSITION_EMBEDDING_GROUP is not None, 'position embedding group is not initialized' + return _POSITION_EMBEDDING_GROUP + + +def get_amax_reduction_group(): + """Get the FP8 amax reduction group the caller rank belongs to.""" + assert _AMAX_REDUCTION_GROUP is not None, 'FP8 amax reduction group is not initialized' + return _AMAX_REDUCTION_GROUP + + +def set_tensor_model_parallel_world_size(world_size): + """Set the tensor model parallel size""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def set_pipeline_model_parallel_world_size(world_size): + """Set the pipeline model parallel size""" + global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def set_virtual_pipeline_model_parallel_world_size(world_size): + """Set the pipeline model parallel size""" + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def get_tensor_model_parallel_world_size(): + """Return world size for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + return torch.distributed.get_world_size(group=get_tensor_model_parallel_group()) + + +def get_pipeline_model_parallel_world_size(): + """Return world size for the pipeline model parallel group.""" + 
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group()) + + +def set_tensor_model_parallel_rank(rank): + """Set tensor model parallel rank.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + _MPU_TENSOR_MODEL_PARALLEL_RANK = rank + + +def set_pipeline_model_parallel_rank(rank): + """Set pipeline model parallel rank.""" + global _MPU_PIPELINE_MODEL_PARALLEL_RANK + _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank + + +def set_pipeline_model_parallel_split_rank(rank): + """Set pipeline model parallel split rank.""" + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = rank + + +def get_tensor_model_parallel_rank(): + """Return my rank for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None: + return _MPU_TENSOR_MODEL_PARALLEL_RANK + return torch.distributed.get_rank(group=get_tensor_model_parallel_group()) + + +def get_pipeline_model_parallel_rank(): + """Return my rank for the pipeline model parallel group.""" + global _MPU_PIPELINE_MODEL_PARALLEL_RANK + if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None: + return _MPU_PIPELINE_MODEL_PARALLEL_RANK + return torch.distributed.get_rank(group=get_pipeline_model_parallel_group()) + + +def get_pipeline_model_parallel_split_rank(): + """Return pipeline model parallel split rank.""" + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + return _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + + +def is_pipeline_first_stage(ignore_virtual=False): + """Return True if in the first pipeline model-parallel stage, False otherwise.""" + if not ignore_virtual: + if ( + get_virtual_pipeline_model_parallel_world_size() is not None + and get_virtual_pipeline_model_parallel_rank() != 0 + ): + return False + return get_pipeline_model_parallel_rank() == 0 + + +def is_pipeline_last_stage(ignore_virtual=False): + """Return True if in the last pipeline model-parallel stage, False otherwise.""" + if not ignore_virtual: + virtual_pipeline_model_parallel_world_size = ( + get_virtual_pipeline_model_parallel_world_size() + ) + if virtual_pipeline_model_parallel_world_size is not None and get_virtual_pipeline_model_parallel_rank() != ( + virtual_pipeline_model_parallel_world_size - 1 + ): + return False + return get_pipeline_model_parallel_rank() == (get_pipeline_model_parallel_world_size() - 1) + + +def is_rank_in_embedding_group(ignore_virtual=False): + """Return true if current rank is in embedding group, False otherwise.""" + rank = torch.distributed.get_rank() + global _EMBEDDING_GLOBAL_RANKS + if ignore_virtual: + return rank in _EMBEDDING_GLOBAL_RANKS + if rank in _EMBEDDING_GLOBAL_RANKS: + if rank == _EMBEDDING_GLOBAL_RANKS[0]: + return is_pipeline_first_stage(ignore_virtual=False) + elif rank == _EMBEDDING_GLOBAL_RANKS[-1]: + return is_pipeline_last_stage(ignore_virtual=False) + else: + return True + return False + + +def is_rank_in_position_embedding_group(): + """Return true if current rank is in position embedding group, False otherwise.""" + rank = torch.distributed.get_rank() + global _POSITION_EMBEDDING_GLOBAL_RANKS + return rank in _POSITION_EMBEDDING_GLOBAL_RANKS + + +def is_pipeline_stage_before_split(rank=None): + """Return True if pipeline stage executes encoder block for a model + with both encoder and decoder.""" + if get_pipeline_model_parallel_world_size() == 1: + return True + if rank is None: + 
rank = get_pipeline_model_parallel_rank() + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None: + return True + if rank < _PIPELINE_MODEL_PARALLEL_SPLIT_RANK: + return True + return False + + +def is_pipeline_stage_after_split(rank=None): + """Return True if pipeline stage executes decoder block for a model + with both encoder and decoder.""" + if get_pipeline_model_parallel_world_size() == 1: + return True + if rank is None: + rank = get_pipeline_model_parallel_rank() + global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK + if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None: + return True + if rank >= _PIPELINE_MODEL_PARALLEL_SPLIT_RANK: + return True + return False + + +def is_pipeline_stage_at_split(): + """Return true if pipeline stage executes decoder block and next + stage executes encoder block for a model with both encoder and + decoder.""" + rank = get_pipeline_model_parallel_rank() + return is_pipeline_stage_before_split(rank) and is_pipeline_stage_after_split(rank + 1) + + +def get_virtual_pipeline_model_parallel_rank(): + """Return the virtual pipeline-parallel rank.""" + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + + +def set_virtual_pipeline_model_parallel_rank(rank): + """Set the virtual pipeline-parallel rank.""" + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank + + +def get_virtual_pipeline_model_parallel_world_size(): + """Return the virtual pipeline-parallel world size.""" + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + + +def get_tensor_model_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the tensor model parallel group.""" + global_rank = torch.distributed.get_rank() + local_world_size = get_tensor_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the data parallel group.""" + assert _DATA_PARALLEL_GLOBAL_RANKS is not None, "Data parallel group is not initialized" + return _DATA_PARALLEL_GLOBAL_RANKS[0] + + +def get_pipeline_model_parallel_first_rank(): + """Return the global rank of the first process in the pipeline for the + current tensor parallel group""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + return _PIPELINE_GLOBAL_RANKS[0] + + +def get_pipeline_model_parallel_last_rank(): + """Return the global rank of the last process in the pipeline for the + current tensor parallel group""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + last_rank_local = get_pipeline_model_parallel_world_size() - 1 + return _PIPELINE_GLOBAL_RANKS[last_rank_local] + + +def get_pipeline_model_parallel_next_rank(): + """Return the global rank that follows the caller in the pipeline""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + rank_in_pipeline = get_pipeline_model_parallel_rank() + world_size = get_pipeline_model_parallel_world_size() + return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size] + + +def get_pipeline_model_parallel_prev_rank(): + """Return the global rank that preceeds the caller in the pipeline""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + rank_in_pipeline = get_pipeline_model_parallel_rank() + world_size 
= get_pipeline_model_parallel_world_size() + return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size] + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_world_size(group=get_data_parallel_group()) + else: + return 0 + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank(group=get_data_parallel_group()) + else: + return 0 + + +def _set_global_memory_buffer(): + """Initialize global buffer""" + global _GLOBAL_MEMORY_BUFFER + assert _GLOBAL_MEMORY_BUFFER is None, 'global memory buffer is already initialized' + _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer() + + +def get_global_memory_buffer(): + """Return the global GlobalMemoryBuffer object""" + assert _GLOBAL_MEMORY_BUFFER is not None, 'global memory buffer is not initialized' + return _GLOBAL_MEMORY_BUFFER + + +def destroy_global_memory_buffer(): + """Sets the global memory buffer to None""" + global _GLOBAL_MEMORY_BUFFER + _GLOBAL_MEMORY_BUFFER = None + + +def destroy_model_parallel(): + """Set the groups to none.""" + global _MODEL_PARALLEL_GROUP + _MODEL_PARALLEL_GROUP = None + global _TENSOR_MODEL_PARALLEL_GROUP + _TENSOR_MODEL_PARALLEL_GROUP = None + global _PIPELINE_MODEL_PARALLEL_GROUP + _PIPELINE_MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = None + global _EMBEDDING_GROUP + _EMBEDDING_GROUP = None + global _POSITION_EMBEDDING_GROUP + _POSITION_EMBEDDING_GROUP = None + global _AMAX_REDUCTION_GROUP + _AMAX_REDUCTION_GROUP = None + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK + _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE + _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_TENSOR_MODEL_PARALLEL_RANK + _MPU_TENSOR_MODEL_PARALLEL_RANK = None + global _MPU_PIPELINE_MODEL_PARALLEL_RANK + _MPU_PIPELINE_MODEL_PARALLEL_RANK = None + global _GLOBAL_MEMORY_BUFFER + _GLOBAL_MEMORY_BUFFER = None diff --git a/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/__init__.py b/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/__init__.py new file mode 100644 index 000000000..00cd1ff38 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/__init__.py @@ -0,0 +1 @@ +from .schedules import get_forward_backward_func diff --git a/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/p2p_communication.py b/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/p2p_communication.py new file mode 100644 index 000000000..29ee34df8 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/p2p_communication.py @@ -0,0 +1,571 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
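+# Point-to-point (send/recv) helpers used to exchange activations and
+# gradients between adjacent pipeline stages.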
+ +import operator +from functools import reduce +from typing import Callable, List, Optional, Tuple, Union + +import torch + +from megatron import core +from megatron.core import ModelParallelConfig +from megatron.core.parallel_state import ( + get_pipeline_model_parallel_group, + get_pipeline_model_parallel_next_rank, + get_pipeline_model_parallel_prev_rank, + get_pipeline_model_parallel_rank, +) + +# Types +Shape = Union[List[int], torch.Size] + + +def _communicate_shapes(tensor_send_next, tensor_send_prev, recv_prev, recv_next, config): + """Communicate tensor shapes between stages. Used to communicate + tensor shapes before the actual tensor communication happens. + This is required when the sequence lengths across micro batches + are not uniform. + + Takes the following arguments: + tensor_send_next: tensor to send to next rank (no tensor sent if + set to None). + tensor_send_prev: tensor to send to prev rank (no tensor sent if + set to None). + recv_prev: boolean for whether tensor should be received from + previous rank. + recv_next: boolean for whether tensor should be received from + next rank. + Returns: + (recv_prev_shape, recv_next_shape) + """ + + recv_prev_shape_tensor = None + recv_next_shape_tensor = None + send_prev_shape_tensor = None + send_next_shape_tensor = None + if recv_prev: + recv_prev_shape_tensor = torch.empty( + (3), device=torch.cuda.current_device(), dtype=torch.int64 + ) + if recv_next: + recv_next_shape_tensor = torch.empty( + (3), device=torch.cuda.current_device(), dtype=torch.int64 + ) + if tensor_send_prev is not None: + send_prev_shape_tensor = torch.tensor( + tensor_send_prev.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + if tensor_send_next is not None: + send_next_shape_tensor = torch.tensor( + tensor_send_next.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + + if config.use_ring_exchange_p2p: + torch.distributed.ring_exchange( + tensor_send_prev=send_prev_shape_tensor, + tensor_recv_prev=recv_prev_shape_tensor, + tensor_send_next=send_next_shape_tensor, + tensor_recv_next=recv_next_shape_tensor, + group=get_pipeline_model_parallel_group(), + ) + else: + ops = [] + if send_prev_shape_tensor is not None: + send_prev_op = torch.distributed.P2POp( + torch.distributed.isend, + send_prev_shape_tensor, + get_pipeline_model_parallel_prev_rank(), + ) + ops.append(send_prev_op) + if recv_prev_shape_tensor is not None: + recv_prev_op = torch.distributed.P2POp( + torch.distributed.irecv, + recv_prev_shape_tensor, + get_pipeline_model_parallel_prev_rank(), + ) + ops.append(recv_prev_op) + if send_next_shape_tensor is not None: + send_next_op = torch.distributed.P2POp( + torch.distributed.isend, + send_next_shape_tensor, + get_pipeline_model_parallel_next_rank(), + ) + ops.append(send_next_op) + if recv_next_shape_tensor is not None: + recv_next_op = torch.distributed.P2POp( + torch.distributed.irecv, + recv_next_shape_tensor, + get_pipeline_model_parallel_next_rank(), + ) + ops.append(recv_next_op) + if len(ops) > 0: + reqs = torch.distributed.batch_isend_irecv(ops) + for req in reqs: + req.wait() + + # To protect against race condition when using batch_isend_irecv(). + # should take this out once the bug with batch_isend_irecv is resolved. 
+ torch.cuda.synchronize() + + recv_prev_shape = [0, 0, 0] + if recv_prev_shape_tensor is not None: + recv_prev_shape = recv_prev_shape_tensor.tolist() + + recv_next_shape = [0, 0, 0] + if recv_next_shape_tensor is not None: + recv_next_shape = recv_next_shape_tensor.tolist() + + return recv_prev_shape, recv_next_shape + + +def _batched_p2p_ops( + *, + tensor_send_prev: Optional[torch.Tensor], + tensor_recv_prev: Optional[torch.Tensor], + tensor_send_next: Optional[torch.Tensor], + tensor_recv_next: Optional[torch.Tensor], + group: torch.distributed.ProcessGroup +): + ops = [] + if tensor_send_prev is not None: + send_prev_op = torch.distributed.P2POp( + torch.distributed.isend, + tensor_send_prev, + get_pipeline_model_parallel_prev_rank(), + group, + ) + ops.append(send_prev_op) + if tensor_recv_prev is not None: + recv_prev_op = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_recv_prev, + get_pipeline_model_parallel_prev_rank(), + group, + ) + ops.append(recv_prev_op) + if tensor_send_next is not None: + send_next_op = torch.distributed.P2POp( + torch.distributed.isend, + tensor_send_next, + get_pipeline_model_parallel_next_rank(), + group, + ) + ops.append(send_next_op) + if tensor_recv_next is not None: + recv_next_op = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_recv_next, + get_pipeline_model_parallel_next_rank(), + group, + ) + ops.append(recv_next_op) + if len(ops) > 0: + reqs = torch.distributed.batch_isend_irecv(ops) + else: + reqs = [] + return reqs + + +def _p2p_ops( + *, + tensor_send_prev: Optional[torch.Tensor], + tensor_recv_prev: Optional[torch.Tensor], + tensor_send_next: Optional[torch.Tensor], + tensor_recv_next: Optional[torch.Tensor], + group: torch.distributed.ProcessGroup +): + reqs = [] + rank = get_pipeline_model_parallel_rank() + if get_pipeline_model_parallel_rank() % 2 == 0: + if tensor_send_next is not None: + send_next_req = torch.distributed.isend( + tensor=tensor_send_next, dst=get_pipeline_model_parallel_next_rank(), group=group, + ) + reqs.append(send_next_req) + + if tensor_recv_prev is not None: + recv_prev_req = torch.distributed.irecv( + tensor=tensor_recv_prev, src=get_pipeline_model_parallel_prev_rank(), group=group, + ) + reqs.append(recv_prev_req) + + if tensor_send_prev is not None: + send_prev_req = torch.distributed.isend( + tensor=tensor_send_prev, dst=get_pipeline_model_parallel_prev_rank(), group=group, + ) + reqs.append(send_prev_req) + + if tensor_recv_next is not None: + recv_next_req = torch.distributed.irecv( + tensor=tensor_recv_next, src=get_pipeline_model_parallel_next_rank(), group=group, + ) + reqs.append(recv_next_req) + + else: + if tensor_recv_prev is not None: + recv_prev_req = torch.distributed.irecv( + tensor=tensor_recv_prev, src=get_pipeline_model_parallel_prev_rank(), group=group, + ) + reqs.append(recv_prev_req) + + if tensor_send_next is not None: + send_next_req = torch.distributed.isend( + tensor=tensor_send_next, dst=get_pipeline_model_parallel_next_rank(), group=group, + ) + reqs.append(send_next_req) + + if tensor_recv_next is not None: + recv_next_req = torch.distributed.irecv( + tensor=tensor_recv_next, src=get_pipeline_model_parallel_next_rank(), group=group, + ) + reqs.append(recv_next_req) + + if tensor_send_prev is not None: + send_prev_req = torch.distributed.isend( + tensor=tensor_send_prev, dst=get_pipeline_model_parallel_prev_rank(), group=group, + ) + reqs.append(send_prev_req) + return reqs + + +def _communicate( + *, + tensor_send_next: Optional[torch.Tensor], + 
tensor_send_prev: Optional[torch.Tensor], + recv_prev: bool, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + wait_on_reqs: bool = True +) -> Tuple[torch.Tensor, torch.Tensor]: + """Communicate tensors between stages. Used as helper method in other + communication methods that are used in megatron/schedules.py. + + Arguments: + tensor_send_next (torch.Tensor, optional): + Tensor to send to next rank (no tensor sent if None) + + tensor_send_prev (torch.Tensor, optional): + Tensor to send to prev rank (no tensor sent if None) + + recv_prev (boolean, required): + whether tensor should be received from previous rank. + + recv_next (boolean, required): + whether tensor should be received from next rank. + + tensor_shape (List[int] or torch.Size, required): + shape of tensor to receive (this method assumes that all + tensors sent and received in a single function call are + the same shape). + + wait_on_reqs (boolean, optional, default=False): + For non-batched p2p communication, wait on each request + before returning. + + Returns: + tuple containing + + - tensor_recv_prev: torch.Tensor if recv_prev is True, None otherwise. + - tensor_recv_next: torch.Tensor if recv_next is True, None otherwise. + + """ + + # Create placeholder tensors for receive in forward and backward directions + # if needed. + tensor_recv_prev = None + tensor_recv_next = None + + if not config.variable_seq_lengths: + recv_prev_shape = tensor_shape + recv_next_shape = tensor_shape + else: + recv_prev_shape, recv_next_shape = _communicate_shapes( + tensor_send_next, tensor_send_prev, recv_prev, recv_next, config + ) + + if recv_prev: + if config.pipeline_dtype is None: + raise RuntimeError("pipeline_dtype must be provided if recv_prev is True") + if tensor_shape is None: + raise RuntimeError( + "tensor_shape must be specified if recv_prev is True. " + "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)" + ) + tensor_recv_prev = torch.empty( + recv_prev_shape, + requires_grad=True, + device=torch.cuda.current_device(), + dtype=config.pipeline_dtype, + ) + if recv_next: + if config.pipeline_dtype is None: + raise RuntimeError("dtype must be provided if recv_next is True") + if tensor_shape is None: + raise RuntimeError( + "tensor_shape must be specified if recv_next is True. " + "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)" + ) + tensor_recv_next = torch.empty( + recv_next_shape, + requires_grad=True, + device=torch.cuda.current_device(), + dtype=config.pipeline_dtype, + ) + + # Send tensors in both the forward and backward directions as appropriate. + if config.use_ring_exchange_p2p: + + def _ring_exchange_wrapper(**kwargs): + torch.distributed.ring_exchange(**kwargs) + return [] + + p2p_func = _ring_exchange_wrapper + elif config.batch_p2p_comm: + assert wait_on_reqs + p2p_func = _batched_p2p_ops + else: + p2p_func = _p2p_ops + + reqs = p2p_func( + tensor_send_prev=tensor_send_prev, + tensor_recv_prev=tensor_recv_prev, + tensor_send_next=tensor_send_next, + tensor_recv_next=tensor_recv_next, + group=get_pipeline_model_parallel_group(), + ) + + if wait_on_reqs and len(reqs) > 0: + for req in reqs: + req.wait() + reqs = None + + if config.batch_p2p_comm and config.batch_p2p_sync: + # To protect against race condition when using batch_isend_irecv(). 
+ # User should assert that we have a modern enough PyTorch to not need this + torch.cuda.synchronize() + + return tensor_recv_prev, tensor_recv_next, reqs + + +def recv_forward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: + """ Receive tensor from previous rank in pipeline (forward receive). + + + See _communicate for argument details. + """ + + if core.parallel_state.is_pipeline_first_stage(): + input_tensor = None + else: + if config.timers is not None: + config.timers('forward-recv', log_level=2).start() + input_tensor, _, _ = _communicate( + tensor_send_next=None, + tensor_send_prev=None, + recv_prev=True, + recv_next=False, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('forward-recv').stop() + return input_tensor + + +def recv_backward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: + """Receive tensor from next rank in pipeline (backward receive). + + See _communicate for argument details. + """ + if core.parallel_state.is_pipeline_last_stage(): + output_tensor_grad = None + else: + if config.timers is not None: + config.timers('backward-recv', log_level=2).start() + _, output_tensor_grad, _ = _communicate( + tensor_send_next=None, + tensor_send_prev=None, + recv_prev=False, + recv_next=True, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('backward-recv').stop() + return output_tensor_grad + + +def send_forward(output_tensor: torch.Tensor, config: ModelParallelConfig) -> None: + """Send tensor to next rank in pipeline (forward send). + + See _communicate for argument details. + """ + + if not core.parallel_state.is_pipeline_last_stage(): + if config.timers is not None: + config.timers('forward-send', log_level=2).start() + _communicate( + tensor_send_next=output_tensor, + tensor_send_prev=None, + recv_prev=False, + recv_next=False, + tensor_shape=None, + config=config, + ) + if config.timers is not None: + config.timers('forward-send').stop() + + +def send_backward(input_tensor_grad: torch.Tensor, config: ModelParallelConfig) -> None: + """Send tensor to previous rank in pipeline (backward send). + + See _communicate for argument details. + """ + if not core.parallel_state.is_pipeline_first_stage(): + if config.timers is not None: + config.timers('backward-send', log_level=2).start() + _communicate( + tensor_send_next=None, + tensor_send_prev=input_tensor_grad, + recv_prev=False, + recv_next=False, + tensor_shape=None, + config=config, + ) + if config.timers is not None: + config.timers('backward-send').stop() + + +def send_forward_recv_backward( + output_tensor: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig +) -> torch.Tensor: + """Batched send and recv with next rank in pipeline. + + See _communicate for argument details. + """ + if core.parallel_state.is_pipeline_last_stage(): + output_tensor_grad = None + else: + if config.timers is not None: + config.timers('forward-send-backward-recv', log_level=2).start() + _, output_tensor_grad, _ = _communicate( + tensor_send_next=output_tensor, + tensor_send_prev=None, + recv_prev=False, + recv_next=True, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('forward-send-backward-recv').stop() + return output_tensor_grad + + +def send_backward_recv_forward( + input_tensor_grad: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig +) -> torch.Tensor: + """Batched send and recv with previous rank in pipeline. 
+ + See _communicate for argument details. + """ + if core.parallel_state.is_pipeline_first_stage(): + input_tensor = None + else: + if config.timers is not None: + config.timers('backward-send-forward-recv', log_level=2).start() + input_tensor, _, _ = _communicate( + tensor_send_next=None, + tensor_send_prev=input_tensor_grad, + recv_prev=True, + recv_next=False, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('backward-send-forward-recv').stop() + return input_tensor + + +def send_forward_recv_forward( + output_tensor: torch.Tensor, + recv_prev: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + overlap_p2p_comm: bool = False, +) -> torch.Tensor: + """Batched recv from previous rank and send to next rank in pipeline. + + See _communicate for argument details. + """ + if config.timers is not None: + config.timers('forward-send-forward-recv', log_level=2).start() + input_tensor, _, wait_handles = _communicate( + tensor_send_next=output_tensor, + tensor_send_prev=None, + recv_prev=recv_prev, + recv_next=False, + tensor_shape=tensor_shape, + wait_on_reqs=(not overlap_p2p_comm), + config=config, + ) + if config.timers is not None: + config.timers('forward-send-forward-recv').stop() + if overlap_p2p_comm: + return input_tensor, wait_handles + return input_tensor + + +def send_backward_recv_backward( + input_tensor_grad: torch.Tensor, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + overlap_p2p_comm: bool = False, +) -> torch.Tensor: + """Batched recv from next rank and send to previous rank in pipeline. + + See _communicate for argument details. + """ + if config.timers is not None: + config.timers('backward-send-backward-recv', log_level=2).start() + _, output_tensor_grad, wait_handles = _communicate( + tensor_send_next=None, + tensor_send_prev=input_tensor_grad, + recv_prev=False, + recv_next=recv_next, + tensor_shape=tensor_shape, + wait_on_reqs=(not overlap_p2p_comm), + config=config, + ) + if config.timers is not None: + config.timers('backward-send-backward-recv').stop() + if overlap_p2p_comm: + return output_tensor_grad, wait_handles + return output_tensor_grad + + +def send_forward_backward_recv_forward_backward( + output_tensor: torch.Tensor, + input_tensor_grad: torch.Tensor, + recv_prev: bool, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, +) -> torch.Tensor: + """Batched send and recv with previous and next ranks in pipeline. + + See _communicate for argument details. + """ + if config.timers is not None: + config.timers('forward-backward-send-forward-backward-recv', log_level=2).start() + input_tensor, output_tensor_grad, _ = _communicate( + tensor_send_next=output_tensor, + tensor_send_prev=input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + ) + if config.timers is not None: + config.timers('forward-backward-send-forward-backward-recv').stop() + return input_tensor, output_tensor_grad diff --git a/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/schedules.py b/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/schedules.py new file mode 100644 index 000000000..6eeb15b5c --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/pipeline_parallel/schedules.py @@ -0,0 +1,1254 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
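+# Forward-backward schedules for pipeline-parallel training: no pipelining,
+# 1F1B without interleaving, and interleaved 1F1B over model chunks.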
+ +import contextlib +from typing import Callable, Iterator, List, Optional, Union + +import torch +from torch.autograd.variable import Variable +from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP + +from megatron import core +from megatron.core import parallel_state +from megatron.core.enums import ModelType +from megatron.core.pipeline_parallel import p2p_communication +from megatron.core.utils import get_attr_wrapped_model, get_model_config, get_model_type + +# Types +Shape = Union[List[int], torch.Size] + + +def get_forward_backward_func(): + """Retrieves the appropriate forward_backward function given the + configuration of parallel_state. + + Returns a function that will perform all of the forward and + backward passes of the model given the pipeline model parallel + world size and virtual pipeline model parallel world size in the + global parallel_state. + + Note that if using sequence parallelism, the sequence length component of + the tensor shape is updated to original_sequence_length / + tensor_model_parallel_world_size. + + The function returned takes the following arguments: + + forward_step_func (required): A function that takes a data + iterator and a model as its arguments and return the model's + forward output and the loss function. The loss function should + take one torch.Tensor and return a torch.Tensor of loss and a + dictionary of string -> torch.Tensor. + + A third argument, checkpoint_activations_microbatch, indicates + that the activations for this microbatch should be + checkpointed. A None value for this argument indicates that + the default from the configuration should be used. This is + used when the + num_microbatches_with_partial_activation_checkpoints is used. + + For example: + + def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + def forward_step(data_iterator, model): + data, loss_mask = next(data_iterator) + output = model(data) + return output, partial(loss_func, loss_mask) + + + forward_backward_func(forward_step_func=forward_step, ...) + + + data_iterator (required): an iterator over the data, will be + passed as is to forward_step_func. Expected to be a list of + iterators in the case of interleaved pipeline parallelism. + + model (required): the actual model. Expected to be a list of modules in the case of interleaved + pipeline parallelism. Must be a (potentially wrapped) megatron.core.models.MegatronModule. + + num_microbatches (int, required): + The number of microbatches to go through + + seq_length (int, required): Sequence length of the current global batch. If this is a dual-stack + transformer, this is the encoder's sequence length. This is ignored if variable_seq_lengths + in the config is True. Otherwise, each microbatch in the current global batch size must use + this sequence length. + + micro_batch_size (int, required): The number of sequences in a microbatch. + + decoder_seq_length (int, optional): The sequence length for the decoder in a dual-stack + transformer. This is ignored for a single-stack transformer. 
+ + forward_only (optional, default = False): Perform only the forward step + + collect_non_loss_data (optional, bool, default=False): TODO + + """ + pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() + if pipeline_model_parallel_size > 1: + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + forward_backward_func = forward_backward_pipelining_with_interleaving + else: + forward_backward_func = forward_backward_pipelining_without_interleaving + else: + forward_backward_func = forward_backward_no_pipelining + return forward_backward_func + + +def deallocate_output_tensor(out, deallocate_pipeline_outputs=False): + '''Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field. + + This method should be called right after the output tensor has been + sent to the next pipeline stage. At this point, the output tensor is + only useful for its '.grad_fn' field, and not its '.data'. + ''' + if (out is None) or (not deallocate_pipeline_outputs): + return + assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__ + assert out._base is None, "counter-productive to free a view of another tensor." + out.data = torch.empty((1,), device=out.device, dtype=out.dtype,) + + +def custom_backward(output, grad_output): + '''Directly call C++ autograd engine. + + To make the 'deallocate_output_tensor' (above) optimization work, the C++ + autograd engine must be called directly, bypassing Pytorch's + torch.autograd.backward. Pytorch's 'backward' checks that the output and + grad have the same shape, while C++'s 'backward' does not. + ''' + + assert output.numel() == 1, "output should be pseudo-'freed' in schedule, to optimize memory" + assert isinstance(output, torch.Tensor), "output == '%s'." % type(output).__name__ + assert isinstance(grad_output, (torch.Tensor, type(None))), ( + "grad_output == '%s'." % type(grad_output).__name__ + ) + + # Handle scalar output + if grad_output is None: + assert output.numel() == 1, "implicit grad requires scalar output." + grad_output = torch.ones_like(output, memory_format=torch.preserve_format,) + + # Call c++ engine [ see torch/csrc/autograd/python_engine.cpp ] + Variable._execution_engine.run_backward( + tensors=(output,), + grad_tensors=(grad_output,), + keep_graph=False, + create_graph=False, + inputs=tuple(), + allow_unreachable=True, + accumulate_grad=True, + ) + + +def forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data=False, + checkpoint_activations_microbatch=None, +): + """Forward step for passed-in model. + + If first stage, input tensor is obtained from data_iterator, otherwise + passed-in input_tensor is used. 
+ + Returns output tensor.""" + if config.timers is not None: + config.timers('forward-compute', log_level=2).start() + + unwrap_output_tensor = False + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + unwrap_output_tensor = True + + set_input_tensor = get_attr_wrapped_model(model, "set_input_tensor") + set_input_tensor(input_tensor) + + if config.enable_autocast: + context_manager = torch.autocast("cuda", dtype=config.autocast_dtype) + else: + context_manager = contextlib.nullcontext() + with context_manager: + if checkpoint_activations_microbatch is None: + output_tensor, loss_func = forward_step_func(data_iterator, model) + else: + output_tensor, loss_func = forward_step_func( + data_iterator, model, checkpoint_activations_microbatch + ) + + if parallel_state.is_pipeline_last_stage(): + if not collect_non_loss_data: + output_tensor = loss_func(output_tensor) + loss, loss_reduced = output_tensor + output_tensor = loss / num_microbatches + forward_data_store.append(loss_reduced) + else: + data = loss_func(output_tensor, non_loss_data=True) + forward_data_store.append(data) + + if config.timers is not None: + config.timers('forward-compute').stop() + + # If T5 model (or other model with encoder and decoder) + # and in decoder stack, then send encoder_hidden_state + # downstream as well. + model_type = get_model_type(model) + if ( + parallel_state.is_pipeline_stage_after_split() + and model_type == ModelType.encoder_and_decoder + ): + return [output_tensor, input_tensor[-1]] + if unwrap_output_tensor: + return output_tensor + return [output_tensor] + + +def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config): + """Backward step through passed-in output tensor. + + If last stage, output_tensor_grad is None, otherwise gradient of loss + with respect to stage's output tensor. + + Returns gradient of loss with respect to input tensor (None if first + stage).""" + + # NOTE: This code currently can handle at most one skip connection. It + # needs to be modified slightly to support arbitrary numbers of skip + # connections. + + if config.timers is not None: + config.timers('backward-compute', log_level=2).start() + + # Retain the grad on the input_tensor. + unwrap_input_tensor_grad = False + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + unwrap_input_tensor_grad = True + for x in input_tensor: + if x is not None: + x.retain_grad() + + if not isinstance(output_tensor, list): + output_tensor = [output_tensor] + if not isinstance(output_tensor_grad, list): + output_tensor_grad = [output_tensor_grad] + + # Backward pass. + if output_tensor_grad[0] is None and config.grad_scale_func is not None: + output_tensor[0] = config.grad_scale_func(output_tensor[0]) + + if config.deallocate_pipeline_outputs: + custom_backward(output_tensor[0], output_tensor_grad[0]) + else: + torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0]) + + # Collect the grad of the input_tensor. + input_tensor_grad = [None] + if input_tensor is not None: + input_tensor_grad = [] + for x in input_tensor: + if x is None: + input_tensor_grad.append(None) + else: + input_tensor_grad.append(x.grad) + + # Handle single skip connection if it exists (encoder_hidden_state in + # model with encoder and decoder). 
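+    # In that case input_tensor[-1] is the encoder hidden state, so its
+    # gradient also receives the contribution flowing back through the skip
+    # connection (output_tensor_grad[1]).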
+ if ( + parallel_state.get_pipeline_model_parallel_world_size() > 1 + and parallel_state.is_pipeline_stage_after_split() + and model_type == ModelType.encoder_and_decoder + ): + if output_tensor_grad[1] is not None: + input_tensor_grad[-1].add_(output_tensor_grad[1]) + if unwrap_input_tensor_grad: + input_tensor_grad = input_tensor_grad[0] + + if config.timers is not None: + config.timers('backward-compute').stop() + + return input_tensor_grad + + +def forward_backward_no_pipelining( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, # unused + micro_batch_size: int, # unused + decoder_seq_length: int = None, # unused + forward_only: bool = False, + collect_non_loss_data: bool = False, +): + """Run forward and backward passes with no pipeline parallelism + (no inter-stage communication). + + Returns dictionary with losses. + + + See get_forward_backward_func() for argument details + """ + + if isinstance(model, list): + assert len(model) == 1, "non-pipeline-parallel schedule does not support model chunking" + model = model[0] + if isinstance(data_iterator, list): + assert ( + len(data_iterator) == 1 + ), "non-pipeline-parallel schedule does not support model chunking" + data_iterator = data_iterator[0] + + config = get_model_config(model) + + no_sync_func = config.no_sync_func + if no_sync_func is None and isinstance(model, torchDDP): + no_sync_func = model.no_sync + if no_sync_func is None: + no_sync_func = contextlib.nullcontext + + model_type = get_model_type(model) + + forward_data_store = [] + input_tensor, output_tensor_grad = None, None + with no_sync_func(): + for i in range(num_microbatches - 1): + output_tensor = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + ) + if not forward_only: + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + # Run computation for last microbatch out of context handler (want to + # synchronize gradients). + output_tensor = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + ) + + if not forward_only: + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + return forward_data_store + + +def forward_backward_pipelining_with_interleaving( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int = None, + forward_only: bool = False, + collect_non_loss_data: bool = False, +): + """Run interleaved 1F1B schedule (model split into model chunks), with + communication between pipeline stages as needed. 
+ + Returns dictionary with losses if the last stage, empty dict otherwise.""" + assert isinstance(model, list), "interleaved pipeline parallelism expected model chunking" + assert all(isinstance(chunk, torch.nn.Module) for chunk in model), "invalid model chunking" + assert isinstance( + data_iterator, list + ), "interleaved pipeline parallelism expected each model chunk to have a data iterator" + + config = get_model_config(model[0]) + if config.overlap_p2p_comm and config.batch_p2p_comm: + raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm") + + # Disable async grad reductions + no_sync_func = config.no_sync_func + if no_sync_func is None and all(isinstance(chunk, torchDDP) for chunk in model): + + def multi_no_sync(): + stack = contextlib.ExitStack() + for chunk in model: + stack.enter_context(chunk.no_sync()) + return stack + + no_sync_func = multi_no_sync + if no_sync_func is None: + no_sync_func = contextlib.nullcontext + no_sync_context = None + + def disable_grad_sync(): + """Disable asynchronous grad reductions""" + nonlocal no_sync_context + if no_sync_context is None: + no_sync_context = no_sync_func() + no_sync_context.__enter__() + + def enable_grad_sync(): + """Enable asynchronous grad reductions""" + nonlocal no_sync_context + if no_sync_context is not None: + no_sync_context.__exit__(None, None, None) + no_sync_context = None + + disable_grad_sync() + + # Model chunk IDs with synchronized grads + synchronized_model_chunks = set() + + input_tensors = [[] for _ in range(len(model))] + output_tensors = [[] for _ in range(len(model))] + forward_data_store = [] + if not forward_only: + output_tensor_grads = [[] for _ in range(len(model))] + + pipeline_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() + pipeline_parallel_rank = parallel_state.get_pipeline_model_parallel_rank() + + if num_microbatches % pipeline_parallel_size != 0: + msg = f'number of microbatches ({num_microbatches}) is not divisible by ' + msg += f'pipeline-model-parallel-size ({pipeline_parallel_size}) ' + msg += 'when using interleaved schedule' + raise RuntimeError(msg) + + model_type = get_model_type(model[0]) + if model_type == ModelType.encoder_and_decoder: + raise RuntimeError("Interleaving is not supported with an encoder and decoder model.") + + if decoder_seq_length is not None and decoder_seq_length != seq_length: + raise RuntimeError( + "Interleaving is not supported with a different decoder sequence length." + ) + + tensor_shape = [seq_length, micro_batch_size, config.hidden_size] + if config.sequence_parallel: + tensor_shape[0] = tensor_shape[0] // parallel_state.get_tensor_model_parallel_world_size() + + # Compute number of warmup and remaining microbatches. + num_model_chunks = len(model) + total_num_microbatches = num_microbatches * num_model_chunks + all_warmup_microbatches = False + if forward_only: + num_warmup_microbatches = total_num_microbatches + else: + # Run all forward passes and then all backward passes if number of + # microbatches is just the number of pipeline stages. + # Otherwise, perform (num_model_chunks-1)*pipeline_parallel_size on + # all workers, followed by more microbatches after depending on + # stage ID (more forward passes for earlier stages, later stages can + # immediately start with 1F1B). 
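+        # For example, with pipeline_parallel_size=4, num_model_chunks=2 and
+        # num_microbatches=8 (16 total microbatches), rank 0 warms up with
+        # (4 - 0 - 1) * 2 + (2 - 1) * 4 = 10 forward passes and rank 3 with 4.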
+ if num_microbatches == pipeline_parallel_size: + num_warmup_microbatches = total_num_microbatches + all_warmup_microbatches = True + else: + num_warmup_microbatches = (pipeline_parallel_size - pipeline_parallel_rank - 1) * 2 + num_warmup_microbatches += (num_model_chunks - 1) * pipeline_parallel_size + num_warmup_microbatches = min(num_warmup_microbatches, total_num_microbatches) + num_microbatches_remaining = total_num_microbatches - num_warmup_microbatches + + # Checkpoint the activations of partial Transformer layers in a number of micro-batches + # within the maximum outstanding micro-batch backpropagations. + # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints' + # checkpoint partial Transformer layers (or skip checkpointing) and + # the rest of micro-batches within a window of micro-batches checkpoint + # all Transformer layers. The window of micro-batches is set by the maximum + # outstanding backpropagations and becomes smaller at later pipeline stages. + # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf + max_outstanding_backprops = None + if config.num_microbatches_with_partial_activation_checkpoints is not None: + max_outstanding_backprops = num_warmup_microbatches + 1 + + # Synchronize params for first two model chunks + if config.param_sync_func is not None: + config.param_sync_func(model[0].parameters()) + config.param_sync_func(model[1].parameters()) + + def get_model_chunk_id(microbatch_id, forward): + """Helper method to get the model chunk ID given the iteration number.""" + microbatch_id_in_group = microbatch_id % (pipeline_parallel_size * num_model_chunks) + model_chunk_id = microbatch_id_in_group // pipeline_parallel_size + if not forward: + model_chunk_id = num_model_chunks - model_chunk_id - 1 + return model_chunk_id + + def is_first_microbatch_for_model_chunk(microbatch_id: int) -> bool: + """Check if an iteration is the first for a model chunk.""" + microbatch_group_size = pipeline_parallel_size * num_model_chunks + num_microbatch_groups = total_num_microbatches // microbatch_group_size + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + if microbatch_group_id == 0: + return microbatch_id_in_group % pipeline_parallel_size == 0 + else: + return False + + def is_last_microbatch_for_model_chunk(microbatch_id: int) -> bool: + """Check if an iteration is the last for a model chunk.""" + microbatch_group_size = pipeline_parallel_size * num_model_chunks + num_microbatch_groups = total_num_microbatches // microbatch_group_size + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + if microbatch_group_id == num_microbatch_groups - 1: + return microbatch_id_in_group % pipeline_parallel_size == pipeline_parallel_size - 1 + else: + return False + + def forward_step_helper(microbatch_id, checkpoint_activations_microbatch): + """Helper method to run forward step with model split into chunks + (run set_virtual_pipeline_model_parallel_rank() before calling + forward_step()).""" + model_chunk_id = get_model_chunk_id(microbatch_id, forward=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(model_chunk_id) + + # launch param synchronization for next model chunk + # Note: Asynchronous communication tends to slow down compute. 
+ # To reduce idling from mismatched microbatch times, we launch + # asynchronous communication at the same time across the + # pipeline-parallel group. + if config.param_sync_func is not None: + param_sync_microbatch_id = microbatch_id + pipeline_parallel_rank + if ( + param_sync_microbatch_id < total_num_microbatches + and is_first_microbatch_for_model_chunk(param_sync_microbatch_id) + ): + param_sync_chunk_id = get_model_chunk_id(param_sync_microbatch_id, forward=True) + 1 + if 1 < param_sync_chunk_id < num_model_chunks: + config.param_sync_func(model[param_sync_chunk_id].parameters()) + + # forward step + if parallel_state.is_pipeline_first_stage(): + if len(input_tensors[model_chunk_id]) == len(output_tensors[model_chunk_id]): + input_tensors[model_chunk_id].append(None) + input_tensor = input_tensors[model_chunk_id][-1] + output_tensor = forward_step( + forward_step_func, + data_iterator[model_chunk_id], + model[model_chunk_id], + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + ) + output_tensors[model_chunk_id].append(output_tensor) + + # if forward-only, no need to save tensors for a backward pass + if forward_only: + input_tensors[model_chunk_id].pop() + output_tensors[model_chunk_id].pop() + + return output_tensor + + def backward_step_helper(microbatch_id): + """Helper method to run backward step with model split into chunks + (run set_virtual_pipeline_model_parallel_rank() before calling + backward_step()).""" + model_chunk_id = get_model_chunk_id(microbatch_id, forward=False) + parallel_state.set_virtual_pipeline_model_parallel_rank(model_chunk_id) + + # launch grad synchronization (default) + if config.grad_sync_func is None and is_last_microbatch_for_model_chunk(microbatch_id): + enable_grad_sync() + synchronized_model_chunks.add(model_chunk_id) + + if parallel_state.is_pipeline_last_stage(): + if len(output_tensor_grads[model_chunk_id]) == 0: + output_tensor_grads[model_chunk_id].append(None) + input_tensor = input_tensors[model_chunk_id].pop(0) + output_tensor = output_tensors[model_chunk_id].pop(0) + output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0) + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) + + # launch grad synchronization (custom grad sync) + # Note: Asynchronous communication tends to slow down compute. + # To reduce idling from mismatched microbatch times, we launch + # asynchronous communication at the same time across the + # pipeline-parallel group. + if config.grad_sync_func is not None: + grad_sync_microbatch_id = microbatch_id - pipeline_parallel_rank + if grad_sync_microbatch_id >= 0 and is_last_microbatch_for_model_chunk( + grad_sync_microbatch_id + ): + grad_sync_chunk_id = get_model_chunk_id(grad_sync_microbatch_id, forward=False) + enable_grad_sync() + config.grad_sync_func(model[grad_sync_chunk_id].parameters()) + synchronized_model_chunks.add(grad_sync_chunk_id) + disable_grad_sync() + + return input_tensor_grad + + # Run warmup forward passes. 
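+    # Each warmup step k below runs forward_step_helper(k) on the model chunk
+    # chosen by get_model_chunk_id(k, forward=True) and exchanges activations
+    # with the neighbouring pipeline ranks; backward work is deferred to the
+    # 1F1B steady state and cooldown phases.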
+ parallel_state.set_virtual_pipeline_model_parallel_rank(0) + input_tensors[0].append(p2p_communication.recv_forward(tensor_shape, config)) + + fwd_wait_handles = None + bwd_wait_handles = None + + for k in range(num_warmup_microbatches): + + if fwd_wait_handles is not None: + for req in fwd_wait_handles: + req.wait() + + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + k % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + output_tensor = forward_step_helper(k, checkpoint_activations_microbatch) + + # Determine if tensor should be received from previous stage. + next_forward_model_chunk_id = get_model_chunk_id(k + 1, forward=True) + recv_prev = True + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + if next_forward_model_chunk_id == 0: + recv_prev = False + if k == (total_num_microbatches - 1): + recv_prev = False + + # Don't send tensor downstream if on last stage. + if parallel_state.is_pipeline_last_stage(): + output_tensor = None + + # Send and receive tensors as appropriate (send tensors computed + # in this iteration; receive tensors for next iteration). + if not config.overlap_p2p_comm: + if ( + k == (num_warmup_microbatches - 1) + and not forward_only + and not all_warmup_microbatches + ): + input_tensor_grad = None + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + recv_next = False + ( + input_tensor, + output_tensor_grad, + ) = p2p_communication.send_forward_backward_recv_forward_backward( + output_tensor, + input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + ) + output_tensor_grads[num_model_chunks - 1].append(output_tensor_grad) + else: + input_tensor = p2p_communication.send_forward_recv_forward( + output_tensor, recv_prev=recv_prev, tensor_shape=tensor_shape, config=config + ) + input_tensors[next_forward_model_chunk_id].append(input_tensor) + else: + input_tensor, fwd_wait_handles = p2p_communication.send_forward_recv_forward( + output_tensor, + recv_prev=recv_prev, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + + if ( + k == (num_warmup_microbatches - 1) + and not forward_only + and not all_warmup_microbatches + ): + input_tensor_grad = None + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + recv_next = False + + ( + output_tensor_grad, + bwd_wait_handles, + ) = p2p_communication.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + + output_tensor_grads[num_model_chunks - 1].append(output_tensor_grad) + input_tensors[next_forward_model_chunk_id].append(input_tensor) + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + # Run 1F1B in steady state. + for k in range(num_microbatches_remaining): + # Forward pass. 
+ forward_k = k + num_warmup_microbatches + + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + forward_k % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + if config.overlap_p2p_comm: + if fwd_wait_handles is not None: + for req in fwd_wait_handles: + req.wait() + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + output_tensor = forward_step_helper(forward_k, checkpoint_activations_microbatch) + + # Determine if current stage has anything to send in either direction, + # otherwise set tensor to None. + forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id) + + # Last virtual stage no activation tensor to send + if parallel_state.is_pipeline_last_stage(): + output_tensor = None + + # Determine if peers are sending, and where in data structure to put + # received tensors. + recv_prev = True + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + # First stage is ahead of last stage by (pipeline_parallel_size - 1). + next_forward_model_chunk_id = get_model_chunk_id( + forward_k - (pipeline_parallel_size - 1), forward=True + ) + if next_forward_model_chunk_id == (num_model_chunks - 1): + recv_prev = False + next_forward_model_chunk_id += 1 + else: + next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, forward=True) + + # If last iteration, don't receive; we already received one extra + # before the start of the for loop. + if k == (num_microbatches_remaining - 1): + recv_prev = False + + # Send activation tensor to the next stage and receive activation tensor from the + # previous stage + input_tensor, fwd_wait_handles = p2p_communication.send_forward_recv_forward( + output_tensor, + recv_prev=recv_prev, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + # assert fwd_wait_handles is not None + + if bwd_wait_handles is not None: + for req in bwd_wait_handles: + req.wait() + + # Backward pass. + backward_k = k + input_tensor_grad = backward_step_helper(backward_k) + + backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False) + parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id) + + # First virtual stage no activation gradient tensor to send + if parallel_state.is_pipeline_first_stage(): + input_tensor_grad = None + + # Determine if the current virtual stage has an activation gradient tensor to receive + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + # Last stage is ahead of first stage by (pipeline_parallel_size - 1). + next_backward_model_chunk_id = get_model_chunk_id( + backward_k - (pipeline_parallel_size - 1), forward=False + ) + if next_backward_model_chunk_id == 0: + recv_next = False + next_backward_model_chunk_id -= 1 + else: + next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, forward=False) + + output_tensor_grad, bwd_wait_handles = p2p_communication.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + + else: # no p2p overlap + output_tensor = forward_step_helper(forward_k, checkpoint_activations_microbatch) + + # Backward pass. 
+ backward_k = k + input_tensor_grad = backward_step_helper(backward_k) + + # Send output_tensor and input_tensor_grad, receive input_tensor + # and output_tensor_grad. + + # Determine if current stage has anything to send in either direction, + # otherwise set tensor to None. + forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id) + if parallel_state.is_pipeline_last_stage(): + output_tensor = None + + backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False) + parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id) + if parallel_state.is_pipeline_first_stage(): + input_tensor_grad = None + + # Determine if peers are sending, and where in data structure to put + # received tensors. + recv_prev = True + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + # First stage is ahead of last stage by (pipeline_parallel_size - 1). + next_forward_model_chunk_id = get_model_chunk_id( + forward_k - (pipeline_parallel_size - 1), forward=True + ) + if next_forward_model_chunk_id == (num_model_chunks - 1): + recv_prev = False + next_forward_model_chunk_id += 1 + else: + next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, forward=True) + + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + # Last stage is ahead of first stage by (pipeline_parallel_size - 1). + next_backward_model_chunk_id = get_model_chunk_id( + backward_k - (pipeline_parallel_size - 1), forward=False + ) + if next_backward_model_chunk_id == 0: + recv_next = False + next_backward_model_chunk_id -= 1 + else: + next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, forward=False) + + # If last iteration, don't receive; we already received one extra + # before the start of the for loop. + if k == (num_microbatches_remaining - 1): + recv_prev = False + + # Communicate tensors. + ( + input_tensor, + output_tensor_grad, + ) = p2p_communication.send_forward_backward_recv_forward_backward( + output_tensor, + input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + ) + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + # Put input_tensor and output_tensor_grad in data structures in the + # right location. + if recv_prev: + input_tensors[next_forward_model_chunk_id].append(input_tensor) + if recv_next: + output_tensor_grads[next_backward_model_chunk_id].append(output_tensor_grad) + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + # Run cooldown backward passes (flush out pipeline). 
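+    # The loop below drains the backward passes whose forwards already ran
+    # (microbatch ids num_microbatches_remaining .. total_num_microbatches - 1)
+    # and then launches any grad reductions that were deferred while grad sync
+    # was disabled.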
+ if not forward_only: + if config.overlap_p2p_comm and bwd_wait_handles is not None: + for wait_handle in bwd_wait_handles: + wait_handle.wait() + + if all_warmup_microbatches: + output_tensor_grads[num_model_chunks - 1].append( + p2p_communication.recv_backward(tensor_shape, config=config) + ) + for k in range(num_microbatches_remaining, total_num_microbatches): + input_tensor_grad = backward_step_helper(k) + next_backward_model_chunk_id = get_model_chunk_id(k + 1, forward=False) + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + if next_backward_model_chunk_id == (num_model_chunks - 1): + recv_next = False + if k == (total_num_microbatches - 1): + recv_next = False + output_tensor_grads[next_backward_model_chunk_id].append( + p2p_communication.send_backward_recv_backward( + input_tensor_grad, recv_next=recv_next, tensor_shape=tensor_shape, config=config + ) + ) + + # Launch any remaining grad reductions + enable_grad_sync() + if config.grad_sync_func is not None: + params = [] + for model_chunk_id in range(num_model_chunks): + if model_chunk_id not in synchronized_model_chunks: + params.extend(model[model_chunk_id].parameters()) + synchronized_model_chunks.add(model_chunk_id) + if params: + config.grad_sync_func(params) + + return forward_data_store + + +def get_tensor_shapes( + *, + rank: int, + model_type: ModelType, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int, + config, +): + # Determine right tensor sizes (based on position of rank with respect to split + # rank) and model size. + # Send two tensors if model is T5 and rank is in decoder stage: + # first tensor is decoder (pre-transpose), + # second tensor is encoder (post-transpose). + # If model is T5 and rank is at the boundary: + # send one tensor (post-transpose from encoder). + # Otherwise, send one tensor (pre-transpose). 
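+    # Illustrative example: for a decoder-only model with seq_length=2048,
+    # micro_batch_size=4, hidden_size=4096 and sequence parallelism over a
+    # tensor-parallel group of size 2, this returns [(1024, 4, 4096)];
+    # encoder_and_decoder ranks after the split additionally receive the
+    # encoder shape as a second entry.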
+ tensor_shapes = [] + + if config.sequence_parallel: + seq_length = seq_length // parallel_state.get_tensor_model_parallel_world_size() + if model_type == ModelType.encoder_and_decoder: + decoder_seq_length = ( + decoder_seq_length // parallel_state.get_tensor_model_parallel_world_size() + ) + + if model_type == ModelType.encoder_and_decoder: + if parallel_state.is_pipeline_stage_before_split(rank): + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + else: + tensor_shapes.append((decoder_seq_length, micro_batch_size, config.hidden_size)) + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + else: + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + return tensor_shapes + + +def recv_forward(tensor_shapes, config): + input_tensors = [] + for tensor_shape in tensor_shapes: + if tensor_shape is None: + input_tensors.append(None) + else: + input_tensors.append(p2p_communication.recv_forward(tensor_shape, config)) + return input_tensors + + +def recv_backward(tensor_shapes, config): + output_tensor_grads = [] + for tensor_shape in tensor_shapes: + if tensor_shape is None: + output_tensor_grads.append(None) + else: + output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape, config)) + return output_tensor_grads + + +def send_forward(output_tensors, tensor_shapes, config): + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes): + if tensor_shape is None: + continue + p2p_communication.send_forward(output_tensor, config) + + +def send_backward(input_tensor_grads, tensor_shapes, config): + if not isinstance(input_tensor_grads, list): + input_tensor_grads = [input_tensor_grads] + for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes): + if tensor_shape is None: + continue + p2p_communication.send_backward(input_tensor_grad, config) + + +def send_forward_recv_backward(output_tensors, tensor_shapes, config): + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + output_tensor_grads = [] + for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes): + if tensor_shape is None: + output_tensor_grads.append(None) + continue + output_tensor_grad = p2p_communication.send_forward_recv_backward( + output_tensor, tensor_shape, config + ) + output_tensor_grads.append(output_tensor_grad) + return output_tensor_grads + + +def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config): + if not isinstance(input_tensor_grads, list): + input_tensor_grads = [input_tensor_grads] + input_tensors = [] + for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes): + if tensor_shape is None: + input_tensors.append(None) + continue + input_tensor = p2p_communication.send_backward_recv_forward( + input_tensor_grad, tensor_shape, config + ) + input_tensors.append(input_tensor) + return input_tensors + + +def forward_backward_pipelining_without_interleaving( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int = None, + forward_only: bool = False, + collect_non_loss_data: bool = False, +): + """Run non-interleaved 1F1B schedule, with communication between pipeline + stages. 
+ + Returns dictionary with losses if the last stage, empty dict otherwise.""" + + if isinstance(model, list): + assert ( + len(model) == 1 + ), "non-interleaved pipeline parallelism does not support model chunking" + model = model[0] + if isinstance(data_iterator, list): + assert ( + len(data_iterator) == 1 + ), "non-pipeline-parallel schedule does not support model chunking" + data_iterator = data_iterator[0] + + config = get_model_config(model) + if config.overlap_p2p_comm: + raise ValueError( + "Non-interleaved pipeline parallelism does not support overlapping p2p communication" + ) + + # Disable async grad reductions + no_sync_func = config.no_sync_func + if no_sync_func is None and isinstance(model, torchDDP): + no_sync_func = model.no_sync + if no_sync_func is None: + no_sync_func = contextlib.nullcontext + no_sync_context = None + + def disable_grad_sync(): + """Disable asynchronous grad reductions""" + nonlocal no_sync_context + if no_sync_context is None: + no_sync_context = no_sync_func() + no_sync_context.__enter__() + + def enable_grad_sync(): + """Enable asynchronous grad reductions""" + nonlocal no_sync_context + if no_sync_context is not None: + no_sync_context.__exit__(None, None, None) + no_sync_context = None + + disable_grad_sync() + + # Compute number of warmup microbatches. + num_warmup_microbatches = ( + parallel_state.get_pipeline_model_parallel_world_size() + - parallel_state.get_pipeline_model_parallel_rank() + - 1 + ) + num_warmup_microbatches = min(num_warmup_microbatches, num_microbatches) + num_microbatches_remaining = num_microbatches - num_warmup_microbatches + + # Checkpoint the activations of partial Transformer layers in a number of micro-batches + # within the maximum outstanding micro-batch backpropagations. + # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints' + # checkpoint partial Transformer layers (or skip checkpointing) and + # the rest of micro-batches within a window of micro-batches checkpoint + # all Transformer layers. The window of micro-batches is set by the maximum + # outstanding backpropagations and becomes smaller at later pipeline stages. + # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf + max_outstanding_backprops = None + if config.num_microbatches_with_partial_activation_checkpoints is not None: + max_outstanding_backprops = num_warmup_microbatches + 1 + + model_type = get_model_type(model) + + rank = parallel_state.get_pipeline_model_parallel_rank() + recv_tensor_shapes = get_tensor_shapes( + rank=rank - 1, + model_type=model_type, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=config, + ) + send_tensor_shapes = get_tensor_shapes( + rank=rank, + model_type=model_type, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=config, + ) + + # Input, output tensors only need to be saved when doing backward passes + input_tensors = None + output_tensors = None + if not forward_only: + input_tensors = [] + output_tensors = [] + forward_data_store = [] + + # Run warmup forward passes. 
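+    # Each rank performs num_warmup_microbatches forwards before entering the
+    # 1F1B steady state: the first pipeline stage runs up to
+    # pipeline_parallel_size - 1 of them (capped at num_microbatches), while
+    # the last stage runs none and starts 1F1B immediately.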
+ for i in range(num_warmup_microbatches): + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + i % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + input_tensor = recv_forward(recv_tensor_shapes, config) + output_tensor = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + ) + send_forward(output_tensor, send_tensor_shapes, config) + + if not forward_only: + input_tensors.append(input_tensor) + output_tensors.append(output_tensor) + deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs) + + # Before running 1F1B, need to receive first forward tensor. + # If all microbatches are run in warmup / cooldown phase, then no need to + # receive this tensor here. + if num_microbatches_remaining > 0: + input_tensor = recv_forward(recv_tensor_shapes, config) + + # Run 1F1B in steady state. + for i in range(num_microbatches_remaining): + last_iteration = i == (num_microbatches_remaining - 1) + + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + (i + num_warmup_microbatches) % max_outstanding_backprops + ) >= config.num_microbatches_with_partial_activation_checkpoints + else: + checkpoint_activations_microbatch = None + + output_tensor = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + ) + + if forward_only: + send_forward(output_tensor, send_tensor_shapes, config) + + if not last_iteration: + input_tensor = recv_forward(recv_tensor_shapes, config) + + else: + output_tensor_grad = send_forward_recv_backward( + output_tensor, send_tensor_shapes, config + ) + + # Add input_tensor and output_tensor to end of list. + input_tensors.append(input_tensor) + output_tensors.append(output_tensor) + deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs) + + # Pop input_tensor and output_tensor from the start of the list for + # the backward pass. + input_tensor = input_tensors.pop(0) + output_tensor = output_tensors.pop(0) + + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) + + if last_iteration: + input_tensor = None + send_backward(input_tensor_grad, recv_tensor_shapes, config) + else: + input_tensor = send_backward_recv_forward( + input_tensor_grad, recv_tensor_shapes, config + ) + + # Run cooldown backward passes. + if not forward_only: + for i in range(num_warmup_microbatches): + + # Enable async grad reduction in the last backward pass + # Note: If grad sync function is provided, only enable + # async grad reduction in first pipeline stage. Other + # pipeline stages do grad reduction during pipeline + # bubble. 
+ if i == num_warmup_microbatches - 1: + if config.grad_sync_func is None or rank == 0: + enable_grad_sync() + + input_tensor = input_tensors.pop(0) + output_tensor = output_tensors.pop(0) + + output_tensor_grad = recv_backward(send_tensor_shapes, config) + + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) + + send_backward(input_tensor_grad, recv_tensor_shapes, config) + + # Launch any remaining grad reductions + if no_sync_context is not None: + enable_grad_sync() + if config.grad_sync_func is not None: + config.grad_sync_func(model.parameters()) + + return forward_data_store diff --git a/training/DeepSpeed-Domino/megatron/core/requirements.txt b/training/DeepSpeed-Domino/megatron/core/requirements.txt new file mode 100644 index 000000000..08ed5eeb4 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/requirements.txt @@ -0,0 +1 @@ +torch \ No newline at end of file diff --git a/training/DeepSpeed-Domino/megatron/core/tensor_parallel/__init__.py b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/__init__.py new file mode 100644 index 000000000..dabda5213 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/__init__.py @@ -0,0 +1,56 @@ +from .cross_entropy import vocab_parallel_cross_entropy +from .data import broadcast_data +from .layers import ( + ColumnParallelLinear, + RowParallelLinear, + VocabParallelEmbedding, + copy_tensor_model_parallel_attributes, + linear_with_grad_accumulation_and_async_allreduce, + param_is_not_tensor_parallel_duplicate, + set_defaults_if_not_set_tensor_model_parallel_attributes, + set_tensor_model_parallel_attributes, +) +from .mappings import ( + copy_to_tensor_model_parallel_region, + gather_from_sequence_parallel_region, + gather_from_tensor_model_parallel_region, + scatter_to_sequence_parallel_region, + scatter_to_tensor_model_parallel_region, +) +from .random import checkpoint, get_cuda_rng_tracker, model_parallel_cuda_manual_seed +from .utils import ( + gather_split_1d_tensor, + split_tensor_along_last_dim, + split_tensor_into_1d_equal_chunks, +) + +__all__ = [ + # cross_entropy.py + "vocab_parallel_cross_entropy", + # data.py + "broadcast_data", + # layers.py + "ColumnParallelLinear", + "RowParallelLinear", + "VocabParallelEmbedding", + "set_tensor_model_parallel_attributes", + "set_defaults_if_not_set_tensor_model_parallel_attributes", + "copy_tensor_model_parallel_attributes", + "param_is_not_tensor_parallel_duplicate", + "linear_with_grad_accumulation_and_async_allreduce", + # mappings.py + "copy_to_tensor_model_parallel_region", + "gather_from_tensor_model_parallel_region", + "gather_from_sequence_parallel_region", + # "reduce_from_tensor_model_parallel_region", + "scatter_to_tensor_model_parallel_region", + "scatter_to_sequence_parallel_region", + # random.py + "checkpoint", + "get_cuda_rng_tracker", + "model_parallel_cuda_manual_seed", + # utils.py + "split_tensor_along_last_dim", + "split_tensor_into_1d_equal_chunks", + "gather_split_1d_tensor", +] diff --git a/training/DeepSpeed-Domino/megatron/core/tensor_parallel/cross_entropy.py b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/cross_entropy.py new file mode 100644 index 000000000..645fd1ea0 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/cross_entropy.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
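+
+# The autograd function in this file computes cross entropy over a vocabulary
+# dimension that is sharded across tensor-parallel ranks: each rank works on
+# its local logits shard, and a few all-reduces (a max for numerical stability,
+# sums for the target logit and the softmax normalizer) recover the
+# full-vocabulary loss. Typical use (shapes are illustrative): logits of
+# [seq_len, batch, vocab_size // tp_size] and integer targets of
+# [seq_len, batch] yield a per-token loss of the same [seq_len, batch] shape
+# via vocab_parallel_cross_entropy(logits, targets).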
+ +import torch + +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) + +from .utils import VocabUtility + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): + + # Maximum value along vocab dimension across all GPUs. + logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] + torch.distributed.all_reduce( + logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() + ) + # Subtract the maximum value. + vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1) + + # Get the partition's vocab indecies + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) + + # Create a mask of valid vocab ids (1 means it needs to be masked). + target_mask = (target < vocab_start_index) | (target >= vocab_end_index) + masked_target = target.clone() - vocab_start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. + logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + predicted_logits_1d = predicted_logits_1d.clone().contiguous() + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + torch.distributed.all_reduce( + predicted_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = vocab_parallel_logits + torch.exp(vocab_parallel_logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + torch.distributed.all_reduce( + sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Normalize and optionally smooth logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + vocab_size = exp_logits.size(-1) + if label_smoothing > 0: + """ + We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth. + = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt}) + = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i + = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i + = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i + = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K + From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py + """ + assert 1.0 > label_smoothing > 0.0 + smoothing = label_smoothing * vocab_size / (vocab_size - 1) + + # Exp logits at this point are normalized probabilities. So we can just take the log to get log-probs. 
+            log_probs = torch.log(exp_logits)
+            mean_log_probs = log_probs.mean(dim=-1)
+            loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs
+
+        ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size
+
+        # Store softmax, target-mask and masked-target for backward pass.
+        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
+
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+
+        # Retrieve tensors from the forward path.
+        softmax, target_mask, masked_target_1d = ctx.saved_tensors
+        label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size
+
+        # All the inputs have softmax as their gradient.
+        grad_input = softmax
+        # For simplicity, work with the 2D gradient.
+        partition_vocab_size = softmax.size()[-1]
+        grad_2d = grad_input.view(-1, partition_vocab_size)
+
+        # Add the gradient from matching classes.
+        arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
+
+        softmax_update = 1.0 - target_mask.view(-1).float()
+
+        if label_smoothing > 0:
+            smoothing = label_smoothing * vocab_size / (vocab_size - 1)
+            grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update
+            average_grad = 1 / vocab_size
+            grad_2d[arange_1d, :] -= smoothing * average_grad
+        else:
+            grad_2d[arange_1d, masked_target_1d] -= softmax_update
+
+        # Finally elementwise multiplication with the output gradients.
+        grad_input.mul_(grad_output.unsqueeze(dim=-1))
+
+        return grad_input, None, None
+
+
+def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0):
+    """
+    Performs cross entropy loss when logits are split across tensor parallel ranks
+
+    Arguments:
+        vocab_parallel_logits: logits split across tensor parallel ranks
+                               dimension is [sequence_length, batch_size, vocab_size // tensor_model_parallel_world_size]
+
+        target: correct vocab ids of dimension [sequence_length, micro_batch_size]
+
+        label_smoothing: smoothing factor, must be in range [0.0, 1.0)
+                         default is no smoothing (=0.0)
+    """
+    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing)
diff --git a/training/DeepSpeed-Domino/megatron/core/tensor_parallel/data.py b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/data.py
new file mode 100644
index 000000000..45c4fe7eb
--- /dev/null
+++ b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/data.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+from megatron.core.parallel_state import (
+    get_tensor_model_parallel_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_src_rank,
+)
+
+_MAX_DATA_DIM = 5
+
+
+def _check_data_types(keys, data, target_dtype):
+    """Check that all the keys have the same target data type."""
+    for key in keys:
+        assert data[key].dtype == target_dtype, (
+            '{} has data type {} which '
+            'is different than {}'.format(key, data[key].dtype, target_dtype)
+        )
+
+
+def _build_key_size_numel_dictionaries(keys, data):
+    """Build the size on rank 0 and broadcast."""
+    max_dim = _MAX_DATA_DIM
+    sizes = [0 for _ in range(max_dim) for _ in keys]
+
+    # Pack the sizes on rank zero.
+    if get_tensor_model_parallel_rank() == 0:
+        offset = 0
+        for key in keys:
+            assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
+            size = data[key].size()
+            for i, s in enumerate(size):
+                sizes[i + offset] = s
+            offset += max_dim
+
+    # Move to GPU and broadcast.
+ sizes_cuda = torch.cuda.LongTensor(sizes) + torch.distributed.broadcast( + sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group() + ) + + # Move back to cpu and unpack. + sizes_cpu = sizes_cuda.cpu() + key_size = {} + key_numel = {} + total_numel = 0 + offset = 0 + for key in keys: + i = 0 + size = [] + numel = 1 + while sizes_cpu[offset + i] > 0: + this_size = sizes_cpu[offset + i] + size.append(this_size) + numel *= this_size + i += 1 + key_size[key] = size + key_numel[key] = numel + total_numel += numel + offset += max_dim + + return key_size, key_numel, total_numel + + +def broadcast_data(keys, data, datatype): + """Broadcast data from rank zero of each model parallel group to the + members of the same model parallel group. + + Arguments: + keys: list of keys in the data disctionary to be broadcasted + data: data dictionary of string keys and cpu tensor values. + datatype: torch data type of all tensors in data associated + with keys. + """ + # Build (key, size) and (key, number of elements) dictionaries along + # with the total number of elements on all ranks. + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) + + # Pack on rank zero. + if get_tensor_model_parallel_rank() == 0: + # Check that all keys have the same data type. + _check_data_types(keys, data, datatype) + # Flatten the data associated with the keys + flatten_data = torch.cat([data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + else: + flatten_data = torch.empty(total_numel, device=torch.cuda.current_device(), dtype=datatype) + + # Broadcast + torch.distributed.broadcast( + flatten_data, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group() + ) + + # Unpack + output = {} + offset = 0 + for key in keys: + size = key_size[key] + numel = key_numel[key] + output[key] = flatten_data.narrow(0, offset, numel).view(size) + offset += numel + + return output diff --git a/training/DeepSpeed-Domino/megatron/core/tensor_parallel/layers.py b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/layers.py new file mode 100644 index 000000000..834f821e1 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/layers.py @@ -0,0 +1,899 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
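+
+# ColumnParallelLinear and RowParallelLinear below implement Y = XA + b with
+# the weight sharded across tensor-parallel ranks: column parallelism splits A
+# along its output dimension (each rank computes Y_i = X A_i, optionally
+# all-gathered into Y), while row parallelism splits A along its input
+# dimension and all-reduces the partial products. Illustrative shapes: with
+# tp=4 and A of size 1024 x 4096, each column-parallel rank holds a
+# 1024 x 1024 shard and each row-parallel rank a 256 x 4096 shard.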
+ +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import math +import os +import warnings +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.nn.parameter import Parameter + +from megatron.core.model_parallel_config import ModelParallelConfig +from megatron.core.parallel_state import ( + get_global_memory_buffer, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) + +from .mappings import ( + copy_to_tensor_model_parallel_region, + gather_from_sequence_parallel_region, + gather_from_tensor_model_parallel_region, + reduce_from_tensor_model_parallel_region, + reduce_scatter_to_sequence_parallel_region, + scatter_to_tensor_model_parallel_region, +) +from .random import get_cuda_rng_tracker +from .utils import VocabUtility, divide, split_tensor_along_last_dim + +_grad_accum_fusion_available = True +try: + import fused_weight_gradient_mlp_cuda +except ImportError: + _grad_accum_fusion_available = False + +_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = { + 'tensor_model_parallel': False, + 'partition_dim': -1, + 'partition_stride': 1, +} + + +def param_is_not_tensor_parallel_duplicate(param): + return (hasattr(param, 'tensor_model_parallel') and param.tensor_model_parallel) or ( + get_tensor_model_parallel_rank() == 0 + ) + + +def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): + # Make sure the attributes are not set. + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + assert not hasattr(tensor, attribute) + # Set the attributes. + setattr(tensor, 'tensor_model_parallel', is_parallel) + setattr(tensor, 'partition_dim', dim) + setattr(tensor, 'partition_stride', stride) + + +def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor): + def maybe_set(attribute, value): + if not hasattr(tensor, attribute): + setattr(tensor, attribute, value) + + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute]) + + +def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor): + def maybe_copy(attribute): + if hasattr(source_tensor, attribute): + setattr(destination_tensor, attribute, getattr(source_tensor, attribute)) + + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: + maybe_copy(attribute) + + +def _initialize_affine_weight_gpu(weight, init_method, partition_dim, stride=1): + """Initialize affine weight for model parallel on GPU.""" + + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) + + with get_cuda_rng_tracker().fork(): + init_method(weight) + + +def _initialize_affine_weight_cpu( + weight, + output_size, + input_size, + per_partition_size, + partition_dim, + init_method, + stride=1, + return_master_weight=False, + *, + params_dtype=torch.float32, +): + """Initialize affine weight for model parallel. 
+
+    Build the master weight on all processes and scatter
+    the relevant chunk."""
+
+    set_tensor_model_parallel_attributes(
+        tensor=weight, is_parallel=True, dim=partition_dim, stride=stride
+    )
+
+    # Initialize master weight
+    master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False)
+    init_method(master_weight)
+    master_weight = master_weight.to(dtype=params_dtype)
+
+    # Split and copy
+    per_partition_per_stride_size = divide(per_partition_size, stride)
+    weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim)
+    rank = get_tensor_model_parallel_rank()
+    world_size = get_tensor_model_parallel_world_size()
+    my_weight_list = weight_list[rank::world_size]
+
+    with torch.no_grad():
+        torch.cat(my_weight_list, dim=partition_dim, out=weight)
+    if return_master_weight:
+        return master_weight
+    return None
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+
+    Keyword Arguments:
+        config: A megatron.core.ModelParallelConfig object
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        *,
+        init_method: Callable,
+        config: ModelParallelConfig,
+    ):
+        super(VocabParallelEmbedding, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # Set the defaults for compatibility.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.0
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
+        # Divide the weight matrix along the vocabulary dimension.
+        (
+            self.vocab_start_index,
+            self.vocab_end_index,
+        ) = VocabUtility.vocab_range_from_global_vocab_size(
+            self.num_embeddings, get_tensor_model_parallel_rank(), self.tensor_model_parallel_size
+        )
+        self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
+
+        # Allocate weights and initialize.
+        if config.use_cpu_initialization:
+            self.weight = Parameter(
+                torch.empty(
+                    self.num_embeddings_per_partition, self.embedding_dim, dtype=config.params_dtype
+                )
+            )
+            if config.perform_initialization:
+                _initialize_affine_weight_cpu(
+                    self.weight,
+                    self.num_embeddings,
+                    self.embedding_dim,
+                    self.num_embeddings_per_partition,
+                    0,
+                    init_method,
+                    params_dtype=config.params_dtype,
+                )
+        else:
+            self.weight = Parameter(
+                torch.empty(
+                    self.num_embeddings_per_partition,
+                    self.embedding_dim,
+                    device=torch.cuda.current_device(),
+                    dtype=config.params_dtype,
+                )
+            )
+            if config.perform_initialization:
+                _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1)
+
+    def forward(self, input_):
+        if self.tensor_model_parallel_size > 1:
+            # Build the mask.
+            input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index)
+            # Mask the input.
+            masked_input = input_.clone() - self.vocab_start_index
+            masked_input[input_mask] = 0
+        else:
+            masked_input = input_
+        # Get the embeddings.
+        output_parallel = F.embedding(
+            masked_input,
+            self.weight,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+        # Mask the output embedding.
+ if self.tensor_model_parallel_size > 1: + output_parallel[input_mask, :] = 0.0 + # Reduce across all the model parallel GPUs. + output = reduce_from_tensor_model_parallel_region(output_parallel) + return output + + +class LinearWithFrozenWeight(torch.autograd.Function): + """Linear operator that does not calculate gradient for weight. + This op and LinearWithGradAccumulationAndAsyncCommunication performs + mathematically-identical forward and DGRAD. + + Conceptually this op is the same as torch.nn.functional.linear with + weight.requires_grad==False, but in experiments they are not identical + mathematically. """ + + @staticmethod + @custom_fwd + def forward( + ctx, input, weight, bias, + ): + ctx.save_for_backward(weight) + output = torch.matmul(input, weight.t()) + if bias is not None: + output = output + bias + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + (weight,) = ctx.saved_tensors + grad_input = grad_output.matmul(weight) + return grad_input, None, None + + +def linear_with_frozen_weight( + input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + gradient_accumulation_fusion: bool, + async_grad_allreduce: bool, + sequence_parallel: bool, +) -> torch.Tensor: + """Linear layer execution with weight.requires_grad == False. + + This function handles linear layers with weight frozen (untrainable). + In the forward, it only saves weight and does not save input activations. + In the backward, it does not perform weight gradient calculation, or + weight gradient allreduce. + + Arguments: + + input (torch.Tensor required): input like torch.nn.functional.linear + + weight (torch.Tensor required): weight like torch.nn.functional.linear + + bias (torch.Tensor optional): bias like torch.nn.functional.linear + + gradient_accumulation_fusion (bool required): dummy argument, used to + keep the API unified between all forward implementation functions. + + async_grad_allreduce (bool required): dummy argument, used to + keep the API unified between all forward implementation functions. + + sequence_parallel (bool required): Indicates that sequence + parallelism is used and thus in the forward pass the input is + all gathered, and the backward pass the input gradients are + reduce scattered. 
+ """ + + if sequence_parallel: + input = gather_from_sequence_parallel_region(input, tensor_parallel_output_grad=True) + else: + input = input + + args = [ + input, + weight, + bias, + ] + + return LinearWithFrozenWeight.apply(*args) + + +class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function): + """See linear_with_grad_accumulation_and_async_allreduce""" + + @staticmethod + @custom_fwd + def forward( + ctx, + input, + weight, + bias, + gradient_accumulation_fusion, + async_grad_allreduce, + sequence_parallel, + ): + ctx.save_for_backward(input, weight) + ctx.use_bias = bias is not None + ctx.gradient_accumulation_fusion = gradient_accumulation_fusion + ctx.async_grad_allreduce = async_grad_allreduce + ctx.sequence_parallel = sequence_parallel + + if sequence_parallel: + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu") + torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group() + ) + total_input = all_gather_buffer + else: + total_input = input + + output = torch.matmul(total_input, weight.t()) + if bias is not None: + output = output + bias + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + use_bias = ctx.use_bias + + if ctx.sequence_parallel: + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu") + handle = torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=True + ) + + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # gather is scheduled before the input gradient computation + total_input = all_gather_buffer + else: + total_input = input + grad_input = grad_output.matmul(weight) + + if ctx.sequence_parallel: + handle.wait() + + # Doing gather + slicing during the NeMo forward pass can make this tensor + # not be contiguous. 
PyTorch only checks if the tensor is contiguous, and only + # clones it if it's not contiguous: + # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761 + grad_output = grad_output.contiguous() + # Convert the tensor shapes to 2D for execution compatibility + grad_output = grad_output.view( + grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2] + ) + total_input = total_input.view( + total_input.shape[0] * total_input.shape[1], total_input.shape[2] + ) + + if ctx.async_grad_allreduce: + # Asynchronous all-reduce + handle = torch.distributed.all_reduce( + grad_input, group=get_tensor_model_parallel_group(), async_op=True + ) + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # all-reduce is scheduled before the weight gradient computation + + if ctx.sequence_parallel: + assert not ctx.async_grad_allreduce + dim_size = list(input.size()) + sub_grad_input = torch.empty( + dim_size, dtype=input.dtype, device=torch.cuda.current_device(), requires_grad=False + ) + # reduce_scatter + handle = torch.distributed._reduce_scatter_base( + sub_grad_input, grad_input, group=get_tensor_model_parallel_group(), async_op=True + ) + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # reduce scatter is scheduled before the weight gradient computation + + if ctx.gradient_accumulation_fusion: + if weight.main_grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32( + total_input, grad_output, weight.main_grad + ) + elif weight.main_grad.dtype in (torch.float16, torch.bfloat16): + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16( + total_input, grad_output, weight.main_grad + ) + else: + raise RuntimeError("Unsupported gradient type for gradient accumulation fusion") + grad_weight = None + else: + grad_weight = grad_output.t().matmul(total_input) + grad_bias = grad_output.sum(dim=0) if use_bias else None + + if ctx.sequence_parallel: + handle.wait() + return sub_grad_input, grad_weight, grad_bias, None, None, None + + if ctx.async_grad_allreduce: + handle.wait() + + return grad_input, grad_weight, grad_bias, None, None, None + + +def linear_with_grad_accumulation_and_async_allreduce( + input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + gradient_accumulation_fusion: bool, + async_grad_allreduce: bool, + sequence_parallel: bool, +) -> torch.Tensor: + """Linear layer execution with asynchronous communication and + gradient accumulation fusion in backprop. + + This has the option to accumulate the result of backprop + calculation into an existing gradient buffer, preventing the need + to do an additional addition kernel after the gradient + calculation. + + Additionally, the tensor parallel all reduce of the input + gradients can be done asynchronously with the calculation of + the weight gradients. + + In the case of sequence parallelism, the reduce scatter of the + input gradients is done asynchronously with the calcluation of the + weight gradients. + + Use of this module requires that the environment variable + CUDA_DEVICE_MAX_CONNECTIONS=1. There are a few collective + operations, noted in the code, that should be scheduled before + compute kernels to overlap the communication with the computation, + which is necessary for a speedup but not for correctness so that + ordering isn't imposed by the scheduler. Setting + CUDA_DEVICE_MAX_CONNECTIONS=1 forces the kernels to be scheduled + in the order they are called. 
+
+    Arguments:
+
+    input (torch.Tensor required): input like torch.nn.functional.linear
+
+    weight (torch.Tensor required): weight like torch.nn.functional.linear
+
+    bias (torch.Tensor optional): bias like torch.nn.functional.linear
+
+    gradient_accumulation_fusion (bool required): Perform the gradient
+        accumulation fusion, requires the custom CUDA extension
+        fused_weight_gradient_mlp_cuda module. To use
+        gradient_accumulation_fusion you must install APEX with
+        --cpp_ext and --cuda_ext. For example: "pip install
+        --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\""
+        Note that the extension requires CUDA>=11. Otherwise, you
+        must turn off gradient accumulation fusion.
+
+    async_grad_allreduce (bool required): Do the allreduce of input
+        gradients asynchronously with the computation of weight
+        gradients. If sequence_parallel is True, this must be
+        False, as no all reduce is performed.
+
+    sequence_parallel (bool required): Indicates that sequence
+        parallelism is used and thus in the forward pass the input is
+        all gathered, and the backward pass the input gradients are
+        reduce scattered.
+    """
+    args = [
+        input,
+        weight,
+        bias,
+        gradient_accumulation_fusion,
+        async_grad_allreduce,
+        sequence_parallel,
+    ]
+
+    if not linear_with_grad_accumulation_and_async_allreduce.warned:
+        if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+            if sequence_parallel:
+                warnings.warn(
+                    "When using sequence parallelism it is recommended to set the "
+                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
+                    "maximum speedup"
+                )
+                linear_with_grad_accumulation_and_async_allreduce.warned = True
+
+            if async_grad_allreduce:
+                warnings.warn(
+                    "When using async grad allreduce it is recommended to set the "
+                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
+                    "maximum speedup"
+                )
+                linear_with_grad_accumulation_and_async_allreduce.warned = True
+
+    return LinearWithGradAccumulationAndAsyncCommunication.apply(*args)
+
+
+linear_with_grad_accumulation_and_async_allreduce.warned = False
+
+
+class ColumnParallelLinear(torch.nn.Module):
+    """Linear layer with column parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its second dimension as A = [A_1, ..., A_p].
+
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+
+    Keyword Arguments:
+        bias: If true, add bias
+        gather_output: If true, call all-gather on output and make Y available
+                       to all GPUs, otherwise, every GPU will have its output
+                       which is Y_i = XA_i
+        init_method: method to initialize weights. Note that bias is always set
+                     to zero.
+        stride: For the strided linear layers.
+        keep_master_weight_for_test: This was added for testing and should be
+                                     set to False. It returns the master weights
+                                     used for initialization.
+        skip_bias_add: If True, do not add the bias term, instead
+                       return it to be added by the caller. This
+                       enables performance optimizations where bias can
+                       be fused with other elementwise operations.
+
+        skip_weight_param_allocation: If True, weight parameter is not allocated and must be passed
+                                      as a keyword argument `weight` during the forward pass. Note
+                                      that this does not affect bias, which will be allocated if
+                                      bias is True. Defaults to False.
+ + config: ModelParallelConfig object + + """ + + def __init__( + self, + input_size, + output_size, + *, + config: ModelParallelConfig, + init_method: Callable, + bias=True, + gather_output=False, + stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + skip_weight_param_allocation: bool = False, + ): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.gather_output = gather_output + # Divide the weight matrix along the last dimension. + world_size = get_tensor_model_parallel_world_size() + self.output_size_per_partition = divide(output_size, world_size) + self.skip_bias_add = skip_bias_add + self.config = config + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + if not skip_weight_param_allocation: + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, self.input_size, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + ) + else: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, + self.input_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight, init_method, partition_dim=0, stride=stride + ) + else: + self.weight = None + + if bias: + if config.use_cpu_initialization: + self.bias = Parameter( + torch.empty(self.output_size_per_partition, dtype=config.params_dtype) + ) + else: + self.bias = Parameter( + torch.empty( + self.output_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, stride) + if config.perform_initialization: + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + self.async_tensor_model_parallel_allreduce = ( + config.async_tensor_model_parallel_allreduce and world_size > 1 + ) + + self.sequence_parallel = config.sequence_parallel + if self.sequence_parallel and world_size <= 1: + warnings.warn( + f"`sequence_parallel` is set to `True`, but tensor model parallel size is {world_size}. " + f"Disabling sequence parallel." + ) + self.sequence_parallel = False + + if config.gradient_accumulation_fusion and not _grad_accum_fusion_available: + raise RuntimeError( + "ColumnParallelLinear was called with gradient_accumulation_fusion set " + "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda " + "module is not found. To use gradient_accumulation_fusion you must " + "install APEX with --cpp_ext and --cuda_ext. For example: " + "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" " + "Note that the extension requires CUDA>=11. Otherwise, you must turn off " + "gradient accumulation fusion." + ) + self.gradient_accumulation_fusion = config.gradient_accumulation_fusion + + if self.async_tensor_model_parallel_allreduce and self.sequence_parallel: + raise RuntimeError( + "`async_tensor_model_parallel_allreduce` and `sequence_parallel` " + "cannot be enabled at the same time." 
+ ) + + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): + """Forward of ColumnParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + weight (optional): weight tensor to use, compulsory when + skip_weight_param_allocation is True. + + Returns: + - output + - bias + + """ + if weight is None: + if self.weight is None: + raise RuntimeError( + "weight was not supplied to ColumnParallelLinear forward pass " + "and skip_weight_param_allocation is True." + ) + weight = self.weight + else: + # Check the weight passed in is the correct shape + expected_shape = (self.output_size_per_partition, self.input_size) + if weight.shape != expected_shape: + raise RuntimeError( + f"supplied weight's shape is {tuple(weight.shape)}, " + f"not {expected_shape} as expected" + ) + + bias = self.bias if not self.skip_bias_add else None + + if self.async_tensor_model_parallel_allreduce or self.sequence_parallel: + input_parallel = input_ + else: + input_parallel = copy_to_tensor_model_parallel_region(input_) + # Matrix multiply. + if not weight.requires_grad: + self._forward_impl = linear_with_frozen_weight + else: + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + output_parallel = self._forward_impl( + input=input_parallel, + weight=weight, + bias=bias, + gradient_accumulation_fusion=self.gradient_accumulation_fusion, + async_grad_allreduce=self.async_tensor_model_parallel_allreduce, + sequence_parallel=self.sequence_parallel, + ) + if self.gather_output: + # All-gather across the partitions. + assert not self.sequence_parallel + output = gather_from_tensor_model_parallel_region(output_parallel) + else: + output = output_parallel + output_bias = self.bias if self.skip_bias_add else None + return output, output_bias + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + + Keyword Arguments: + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: If True, do not add the bias term, instead + return it to be added by the caller. This + enables performance optimations where bias can + be fused with other elementwise operations. + config: ModelParallelConfig object + + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool = True, + input_is_parallel: bool = False, + stride: int = 1, + keep_master_weight_for_test: bool = False, + skip_bias_add: bool = False, + ): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.input_is_parallel = input_is_parallel + # Divide the weight matrix along the last dimension. 
+ world_size = get_tensor_model_parallel_world_size() + self.input_size_per_partition = divide(input_size, world_size) + self.skip_bias_add = skip_bias_add + self.config = config + self.gradient_accumulation_fusion = config.gradient_accumulation_fusion + self.sequence_parallel = config.sequence_parallel + if self.sequence_parallel and not self.input_is_parallel: + raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`") + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size, self.input_size_per_partition, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + self.output_size, + self.input_size, + self.input_size_per_partition, + 1, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + params_dtype=config.params_dtype, + ) + else: + self.weight = Parameter( + torch.empty( + self.output_size, + self.input_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight, init_method, partition_dim=1, stride=stride + ) + if bias: + if config.use_cpu_initialization: + self.bias = Parameter(torch.empty(self.output_size, dtype=config.params_dtype)) + else: + self.bias = Parameter( + torch.empty( + self.output_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) + + if config.perform_initialization: + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + def forward(self, input_): + """Forward of RowParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + Returns: + - output + - bias + """ + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + assert not self.sequence_parallel + input_parallel = scatter_to_tensor_model_parallel_region(input_) + # Matrix multiply. + if not self.weight.requires_grad: + self._forward_impl = linear_with_frozen_weight + else: + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + output_parallel = self._forward_impl( + input=input_parallel, + weight=self.weight, + bias=None, + gradient_accumulation_fusion=self.gradient_accumulation_fusion, + async_grad_allreduce=False, + sequence_parallel=False, + ) + + # All-reduce across all the partitions. + if self.sequence_parallel: + output_ = reduce_scatter_to_sequence_parallel_region(output_parallel) + else: + output_ = reduce_from_tensor_model_parallel_region(output_parallel) + if not self.skip_bias_add: + output = output_ + self.bias if self.bias is not None else output_ + output_bias = None + else: + output = output_ + output_bias = self.bias + return output, output_bias diff --git a/training/DeepSpeed-Domino/megatron/core/tensor_parallel/mappings.py b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/mappings.py new file mode 100644 index 000000000..9f753e732 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/mappings.py @@ -0,0 +1,283 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
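
The column/row factorization that ColumnParallelLinear and RowParallelLinear implement can be checked numerically on a single process. A minimal sketch with made-up sizes and no tensor-parallel groups, assuming only that chunking the weights reproduces the per-rank shards:

import torch

torch.manual_seed(0)
tp = 4                                   # pretend tensor-parallel world size
x = torch.randn(8, 16)                   # [tokens, hidden]
a = torch.randn(16, 32)                  # column-parallel weight, split along dim 1
b = torch.randn(32, 16)                  # row-parallel weight, split along dim 0

# ColumnParallelLinear: each rank holds A_i and produces Y_i = X A_i.
a_shards = torch.chunk(a, tp, dim=1)
y_cols = [x @ a_i for a_i in a_shards]
assert torch.allclose(torch.cat(y_cols, dim=-1), x @ a, atol=1e-5)

# RowParallelLinear: each rank holds B_i and a slice of the input; the partial
# products are summed, which is what the forward all-reduce implements.
b_shards = torch.chunk(b, tp, dim=0)
y_rows = sum(y_i @ b_i for y_i, b_i in zip(y_cols, b_shards))
assert torch.allclose(y_rows, (x @ a) @ b, atol=1e-4)
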
+ +import torch + +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) + +from .utils import split_tensor_along_last_dim + + +def _reduce(input_): + """All-reduce the input tensor across model parallel group.""" + + # Bypass the function if we are using only 1 GPU. + if get_tensor_model_parallel_world_size() == 1: + return input_ + + # All-reduce. + torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group()) + + return input_ + + +def _split_along_last_dim(input_): + """Split the tensor along its last dimension and keep the + corresponding slice.""" + + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # Split along last dimension. + input_list = split_tensor_along_last_dim(input_, world_size) + + # Note: torch.split does not create contiguous tensors by default. + rank = get_tensor_model_parallel_rank() + output = input_list[rank].contiguous() + + return output + + +def _split_along_first_dim(input_): + """Split the tensor along its first dimension and keep the + corresponding slice.""" + + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # Split along first dimension. + dim_size = input_.size()[0] + assert ( + dim_size % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" + local_dim_size = dim_size // world_size + rank = get_tensor_model_parallel_rank() + dim_offset = rank * local_dim_size + + output = input_[dim_offset : dim_offset + local_dim_size].contiguous() + + return output + + +def _gather_along_last_dim(input_): + """Gather tensors and concatinate along the last dimension.""" + + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # Size and dimension. + last_dim = input_.dim() - 1 + rank = get_tensor_model_parallel_rank() + + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group()) + + # Note: torch.cat already creates a contiguous tensor. + output = torch.cat(tensor_list, dim=last_dim).contiguous() + + return output + + +def _gather_along_first_dim(input_): + """Gather tensors and concatinate along the first dimension.""" + + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._all_gather_base( + output, input_.contiguous(), group=get_tensor_model_parallel_group() + ) + + return output + + +def _reduce_scatter_along_first_dim(input_): + """Reduce-scatter the input tensor across model parallel group.""" + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + dim_size = list(input_.size()) + assert ( + dim_size[0] % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" + + dim_size[0] = dim_size[0] // world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._reduce_scatter_base( + output, input_.contiguous(), group=get_tensor_model_parallel_group() + ) + return output + + +class _CopyToModelParallelRegion(torch.autograd.Function): + """Pass the input to the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return input_ + + @staticmethod + def forward(ctx, input_): + return input_ + + @staticmethod + def backward(ctx, grad_output): + return _reduce(grad_output) + + +class _ReduceFromModelParallelRegion(torch.autograd.Function): + """All-reduce the input from the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return _reduce(input_) + + @staticmethod + def forward(ctx, input_): + return _reduce(input_) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +class _ScatterToModelParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + + @staticmethod + def symbolic(graph, input_): + return _split_along_last_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _split_along_last_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather_along_last_dim(grad_output) + + +class _GatherFromModelParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatinate.""" + + @staticmethod + def symbolic(graph, input_): + return _gather_along_last_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _gather_along_last_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _split_along_last_dim(grad_output) + + +class _ScatterToSequenceParallelRegion(torch.autograd.Function): + """Split the input and keep only the corresponding chuck to the rank.""" + + @staticmethod + def symbolic(graph, input_): + return _split_along_first_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _split_along_first_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather_along_first_dim(grad_output) + + +class _GatherFromSequenceParallelRegion(torch.autograd.Function): + """Gather the input from sequence parallel region and concatinate.""" + + @staticmethod + def symbolic(graph, input_, tensor_parallel_output_grad=True): + return _gather_along_first_dim(input_) + + @staticmethod + def forward(ctx, input_, tensor_parallel_output_grad=True): + ctx.tensor_parallel_output_grad = tensor_parallel_output_grad + return _gather_along_first_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + tensor_parallel_output_grad = ctx.tensor_parallel_output_grad + + # If the computation graph after the gather operation is + # in the tensor parallel mode, output gradients need to reduce + # scattered and whereas if the computation is duplicated, + # output gradients need to be scattered. 
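
The autograd classes in this file all follow the same pattern: the forward op and the backward op are conjugates of each other (identity/all-reduce, split/gather, and so on). A minimal sketch of one such conjugate pair, assuming an initialized process group and using hypothetical class names:

import torch
import torch.distributed as dist

class _CopyToGroup(torch.autograd.Function):
    """Identity in forward, all-reduce in backward."""

    @staticmethod
    def forward(ctx, input_, group):
        ctx.group = group
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        dist.all_reduce(grad_output, group=ctx.group)
        return grad_output, None


class _ReduceFromGroup(torch.autograd.Function):
    """The conjugate: all-reduce in forward, identity in backward."""

    @staticmethod
    def forward(ctx, input_, group):
        dist.all_reduce(input_, group=group)
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output, None
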
+ if tensor_parallel_output_grad: + return _reduce_scatter_along_first_dim(grad_output), None + else: + return _split_along_first_dim(grad_output), None + + +class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): + """Reduce scatter the input from the model parallel region.""" + + @staticmethod + def symbolic(graph, input_): + return _reduce_scatter_along_first_dim(input_) + + @staticmethod + def forward(ctx, input_): + return _reduce_scatter_along_first_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather_along_first_dim(grad_output) + + +# ----------------- +# Helper functions. +# ----------------- + + +def copy_to_tensor_model_parallel_region(input_): + return _CopyToModelParallelRegion.apply(input_) + + +def reduce_from_tensor_model_parallel_region(input_): + return _ReduceFromModelParallelRegion.apply(input_) + + +def scatter_to_tensor_model_parallel_region(input_): + return _ScatterToModelParallelRegion.apply(input_) + + +def gather_from_tensor_model_parallel_region(input_): + return _GatherFromModelParallelRegion.apply(input_) + + +def scatter_to_sequence_parallel_region(input_): + return _ScatterToSequenceParallelRegion.apply(input_) + + +def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True): + return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad) + + +def reduce_scatter_to_sequence_parallel_region(input_): + return _ReduceScatterToSequenceParallelRegion.apply(input_) diff --git a/training/DeepSpeed-Domino/megatron/core/tensor_parallel/random.py b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/random.py new file mode 100644 index 000000000..c3a9e2b72 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/random.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import contextlib + +import torch +from torch import _C +from torch.cuda import _lazy_call +from torch.cuda import device as device_ctx_manager +from torch.utils.checkpoint import detach_variable + +from megatron.core.parallel_state import ( + get_data_parallel_rank, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.utils import safely_set_viewless_tensor_data + +from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks + +# Default name for the model parallel rng tracker. +_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' + + +def _set_cuda_rng_state(new_state, device=-1): + """Sets the random number generator state of the current GPU. + + Argumentss: + new_state (torch.ByteTensor): The desired state + This function is adapted from PyTorch repo (torch.cuda.set_rng_state) + with a single change: the input state is not cloned. Cloning caused + major performance issues for +4 GPU cases. 
+ """ + if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): + # older PyTorch + def cb(): + with device_ctx_manager(device): + _C._cuda_setRNGState(new_state) + + else: + # newer PyTorch + if device == -1: + device = torch.device('cuda') + elif isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device('cuda', device) + + def cb(): + idx = device.index + if idx is None: + idx = torch.cuda.current_device() + default_generator = torch.cuda.default_generators[idx] + default_generator.set_state(new_state) + + _lazy_call(cb) + + +class CudaRNGStatesTracker: + """Tracker for the cuda RNG states. + + Using the `add` method, a cuda rng state is initialized based on + the input `seed` and is assigned to `name`. Later, by forking the + rng state, we can perform operations and return to our starting + cuda state. + """ + + def __init__(self): + # Map from a string name to the cuda rng state. + self.states_ = {} + # Seeds are just for book keeping and ensure no seed is set twice. + self.seeds_ = set() + + def reset(self): + """Set to the initial state (no tracker).""" + self.states_ = {} + self.seeds_ = set() + + def get_states(self): + """Get rng states. Copy the dictionary so we have direct + pointers to the states, not just a pointer to the dictionary.""" + states = {} + for name in self.states_: + states[name] = self.states_[name] + return states + + def set_states(self, states): + """Set the rng states. For efficiency purposes, we do not check + the size of seed for compatibility.""" + self.states_ = states + + def add(self, name, seed): + """Track the rng state.""" + # Check seed is not already used. + if seed in self.seeds_: + raise Exception('seed {} already exists'.format(seed)) + self.seeds_.add(seed) + # Check that state is not already defined. + if name in self.states_: + raise Exception('cuda rng state {} already exists'.format(name)) + # Get the current rng state. + orig_rng_state = torch.cuda.get_rng_state() + # Set the new state and store it. + torch.cuda.manual_seed(seed) + self.states_[name] = torch.cuda.get_rng_state() + # Reset rng state to what it was. + _set_cuda_rng_state(orig_rng_state) + + @contextlib.contextmanager + def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): + """Fork the cuda rng state, perform operations, and exit with + the original state.""" + # Check if we have added the state + if name not in self.states_: + raise Exception('cuda rng state {} is not added'.format(name)) + # Store current rng state. + orig_cuda_rng_state = torch.cuda.get_rng_state() + # Set rng state to the desired one + _set_cuda_rng_state(self.states_[name]) + # Do the stuff we wanted to do. + try: + yield + finally: + # Update the current rng state for later use. + self.states_[name] = torch.cuda.get_rng_state() + # And set the state to the original state we started with. + _set_cuda_rng_state(orig_cuda_rng_state) + + +# RNG tracker object. +_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + + +def get_cuda_rng_tracker(): + """Get cuda rng tracker.""" + return _CUDA_RNG_STATE_TRACKER + + +def model_parallel_cuda_manual_seed(seed): + """Initialize model parallel cuda seed. + + This function should be called after the model parallel is + initialized. Also, no torch.cuda.manual_seed should be called + after this function. Basically, this is replacement for that + function. 
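
A small usage sketch for the tracker above (requires a CUDA device; the seed value is arbitrary): randomness drawn inside fork() comes from the named state, and the default CUDA RNG state is restored on exit, which is how tensor-parallel ranks can draw distinct dropout masks without perturbing the data-parallel stream.

import torch
import torch.nn.functional as F

tracker = CudaRNGStatesTracker()              # the class defined above
tracker.add('model-parallel-rng', 1234)       # a per-rank seed in real use

x = torch.ones(4, 4, device='cuda')
with tracker.fork('model-parallel-rng'):
    y = F.dropout(x, p=0.5, training=True)    # draws from the tracked RNG state
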
+ Two set of RNG states are tracked: + default state: This is for data parallelism and is the same among a + set of model parallel GPUs but different across + different model paralle groups. This is used for + example for dropout in the non-tensor-model-parallel regions. + tensor-model-parallel state: This state is different among a set of model + parallel GPUs, but the same across data parallel + groups. This is used for example for dropout in + model parallel regions. + """ + # 2718 is just for fun and any POSITIVE value will work. + offset = seed + 2718 + tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank() + # Data parallel gets the original seed. + data_parallel_seed = seed + + _CUDA_RNG_STATE_TRACKER.reset() + # Set the default state. + torch.cuda.manual_seed(data_parallel_seed) + # and model parallel state. + _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tensor_model_parallel_seed) + + +class CheckpointFunction(torch.autograd.Function): + """This function is adapted from torch.utils.checkpoint with + two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + """ + + @staticmethod + def forward(ctx, run_function, distribute_saved_activations, *args): + ctx.run_function = run_function + ctx.distribute_saved_activations = distribute_saved_activations + + # Copy the rng states. + ctx.fwd_cpu_rng_state = torch.get_rng_state() + ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + with torch.no_grad(): + outputs = run_function(*args) + + # Divide hidden states across model parallel group and only keep + # the chunk corresponding to the current rank. + if distribute_saved_activations: + ctx.input_0_shape = args[0].data.shape + safely_set_viewless_tensor_data( + args[0], split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True) + ) + + # Store everything. + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError( + "Checkpointing is not compatible with .grad(), " + "please use .backward() if possible" + ) + inputs = ctx.saved_tensors + if ctx.distribute_saved_activations: + safely_set_viewless_tensor_data( + inputs[0], gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape) + ) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. + torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + # Compute the forward pass. + detached_inputs = detach_variable(inputs) + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. 
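
The core idea of CheckpointFunction.backward is that the forward function is replayed under the RNG state captured during the original forward, and the caller's RNG state is restored afterwards. A CPU-only sketch of that replay, with hypothetical names:

import torch
import torch.nn.functional as F

def run_with_state(fn, x, state):
    keep = torch.get_rng_state()
    torch.set_rng_state(state)                 # rewind to the captured state
    try:
        return fn(x)
    finally:
        torch.set_rng_state(keep)              # restore the caller's state

x = torch.randn(4, 4)
state = torch.get_rng_state()                  # captured "at forward time"
y1 = F.dropout(x, p=0.5, training=True)        # original forward
y2 = run_with_state(lambda t: F.dropout(t, p=0.5, training=True), x, state)
assert torch.equal(y1, y2)                     # the recompute reproduces the forward exactly
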
+ torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, torch.Tensor): + outputs = (outputs,) + torch.autograd.backward(outputs, args) + grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in detached_inputs) + return (None, None) + grads + + +def checkpoint(function, distribute_saved_activations, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, distribute_saved_activations, *args) diff --git a/training/DeepSpeed-Domino/megatron/core/tensor_parallel/utils.py b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/utils.py new file mode 100644 index 000000000..a79ae1e87 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/tensor_parallel/utils.py @@ -0,0 +1,113 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from typing import List, Sequence + +import torch + +from megatron.core import parallel_state +from megatron.core.utils import divide + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, num_partitions: int, contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """ Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = divide(tensor.size()[last_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): + """ Break a tensor into equal 1D chunks across tensor parallel ranks. + + Returns a Tensor or View with this rank's portion of the data. + + Arguments: + tensor: The tensor to split + + Keyword Arguments: + new_buffer (bool): If True, returns a new Tensor. + If False, returns a view into the existing Tensor. + Default is False + + """ + partition_size = torch.numel(tensor) // parallel_state.get_tensor_model_parallel_world_size() + start_index = partition_size * parallel_state.get_tensor_model_parallel_rank() + end_index = start_index + partition_size + if new_buffer: + data = torch.empty( + partition_size, + dtype=tensor.dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + data.copy_(tensor.view(-1)[start_index:end_index]) + else: + data = tensor.view(-1)[start_index:end_index] + return data + + +def gather_split_1d_tensor(tensor): + """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor + model parallel ranks. + + Returns a new Tensor with the gathered data. + + Arguments: + tensor: A Tensor or view of this rank's portion of the data. + """ + numel_gathered = torch.numel(tensor) * parallel_state.get_tensor_model_parallel_world_size() + gathered = torch.empty( + numel_gathered, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False + ) + # TODO: This API is experimental in pytorch (as of Feb 2022) and + # this might break in future pytorch releases. We chose this API + # as opposed to torch.distributed.all_gather for efficiency reasons. 
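
split_tensor_into_1d_equal_chunks and gather_split_1d_tensor flatten a tensor, hand each tensor-parallel rank a contiguous 1D slice, and later all-gather the slices back. The index arithmetic can be illustrated without any distributed setup (world_size and rank below are made-up values):

import torch

world_size, rank = 4, 1                              # illustrative values
t = torch.arange(16.)                                # numel must be divisible by world_size
part = t.numel() // world_size
local = t.view(-1)[rank * part:(rank + 1) * part]    # this rank's chunk (a view, no copy)
# gather_split_1d_tensor would all-gather the per-rank chunks back into a flat copy of t.
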
+ # This API calls directly NCCL all-gather versus the former does + # internal copies and can potentially cause slow down. + torch.distributed._all_gather_base( + gathered, tensor, group=parallel_state.get_tensor_model_parallel_group() + ) + return gathered + + +class VocabUtility: + """ Split the vocabulary into `world_size` chunks and return the first + and last index of the vocabulary belonging to the `rank` + partition: Note that indices in [fist, last) + + """ + + @staticmethod + def vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size: int, rank, world_size: int + ) -> Sequence[int]: + index_f = rank * per_partition_vocab_size + index_l = index_f + per_partition_vocab_size + return index_f, index_l + + @staticmethod + def vocab_range_from_global_vocab_size( + global_vocab_size: int, rank: int, world_size: int + ) -> Sequence[int]: + per_partition_vocab_size = divide(global_vocab_size, world_size) + return VocabUtility.vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, world_size + ) diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/__init__.py b/training/DeepSpeed-Domino/megatron/core/transformer/__init__.py new file mode 100644 index 000000000..c4ae4739d --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from .transformer_config import TransformerConfig diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/attention.py b/training/DeepSpeed-Domino/megatron/core/transformer/attention.py new file mode 100644 index 000000000..913c7c051 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/attention.py @@ -0,0 +1,368 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from abc import ABC, abstractmethod + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.common.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import divide + +from .enums import AttnMaskType +from .transformer_config import TransformerConfig + + +class Attention(MegatronModule, ABC): + """Attention layer abstract class. + + This layer only contains common modules required for the "self attn" and + "cross attn" specializations. + """ + + def __init__( + self, config: TransformerConfig, layer_number: int = 1, attn_mask_type=AttnMaskType.padding, + ): + super().__init__(config=config) + + self.config = config + self.layer_number = layer_number + self.attn_mask_type = attn_mask_type + + # For normal attention without groups, num_query_groups == num_attention_heads, + # so these two will be the same + self.query_projection_size = self.config.kv_channels * self.config.num_attention_heads + self.kv_projection_size = self.config.kv_channels * self.config.num_query_groups + + # Per attention head and per partition values. 
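
A worked example of the partition arithmetic used by VocabUtility above and by the attention heads below, with illustrative sizes (32 heads, tensor-parallel size 8, a padded 50304-entry vocabulary):

num_attention_heads, tp = 32, 8
heads_per_partition = num_attention_heads // tp        # 4 query heads per rank

global_vocab_size = 50304
per_partition_vocab = global_vocab_size // tp          # 6288 vocab entries per rank
rank = 3
first, last = rank * per_partition_vocab, (rank + 1) * per_partition_vocab
assert (first, last) == (18864, 25152)                 # rank 3 owns indices [18864, 25152)
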
+ world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = divide( + self.query_projection_size, self.config.num_attention_heads + ) + self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size) + self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) + + self.dot_product_attention = TEDotProductAttention( + config=self.config, layer_number=self.layer_number, attn_mask_type=self.attn_mask_type + ) + + self.checkpoint_dot_product_attention = self.config.recompute_granularity == 'selective' + + # Output. + self.linear_proj = TERowParallelLinear( + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + skip_bias_add=True, + ) + + def _checkpointed_attention_forward( + self, query, key, value, attention_mask, rotary_pos_emb=None + ): + """Forward method with selective activation checkpointing.""" + + def custom_forward(*inputs): + query = inputs[0] + key = inputs[1] + value = inputs[2] + attention_mask = inputs[3] + output_ = self.dot_product_attention(query, key, value, attention_mask) + return output_ + + hidden_states = tensor_parallel.checkpoint( + custom_forward, False, query, key, value, attention_mask, rotary_pos_emb + ) + + return hidden_states + + def _allocate_memory(self, inference_max_sequence_length, batch_size, dtype): + """Allocate memory to store kv cache during inference.""" + + return torch.empty( + inference_max_sequence_length, + batch_size, + self.num_query_groups_per_partition, + self.hidden_size_per_attention_head, + dtype=dtype, + device=torch.cuda.current_device(), + ) + + def _adjust_key_value_for_inference(self, inference_params, key, value, rotary_pos_emb): + """ + Saves the generated key and value tensors to the end of the buffers in inference_params. + Returns the full size keys and values from the provided inference_params, as well as + adjusted rotary_pos_emb. + + Returns a tuple: (key, value, rotary_pos_emb) + + """ + if inference_params is None: + return key, value, rotary_pos_emb + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + is_first_step = False + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_length = inference_params.max_sequence_length + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_length, inf_max_batch_size, key.dtype + ) + inference_value_memory = self._allocate_memory( + inf_max_seq_length, inf_max_batch_size, value.dtype + ) + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, + inference_value_memory, + ) + is_first_step = True + else: + # Get the pre-allocated buffers for this layer + inference_key_memory, inference_value_memory = inference_params.key_value_memory_dict[ + self.layer_number + ] + + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, batch_start:batch_end, ...] 
= key + inference_value_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = value + key = inference_key_memory[:sequence_end, batch_start:batch_end, ...] + value = inference_value_memory[:sequence_end, batch_start:batch_end, ...] + + # adjust the key rotary positional embedding + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + # need to cross check this condition during inference + # if not set_inference_key_value_memory: + if not is_first_step: + # In inference, we compute one token at a time. + # Select the correct positional embedding + # (only the last token in the sequence) + q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end] + else: + # In the first forward pass of inference, + # we use the entire provided prefix. + # q_pos_emb here has the rope embeddings of the entire + # prefix + to-be-generated output so + # we slice to just the prefix. + q_pos_emb = q_pos_emb[:sequence_end, :, :, :] + k_pos_emb = k_pos_emb[:sequence_end, :, :, :] + rotary_pos_emb = (q_pos_emb, k_pos_emb) + + return key, value, rotary_pos_emb + + @abstractmethod + def get_query_key_value_tensors(self, hidden_states, key_value_states): + """ + This method needs to be implemented based on whether the derived class + is "self-attn" or "cross-attn". + """ + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + query = apply_rotary_pos_emb(query, q_pos_emb) + key = apply_rotary_pos_emb(key, k_pos_emb) + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn] + # This is a noop for normal attention where ng == np. When using group query attention this + # creates a view that has the keys and values virtually repeated along their dimension to + # match the number of queries. 
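
The grouped-query expansion described in this comment (and performed just below with repeat_interleave) can be sketched with made-up shapes; here 8 query heads share 2 KV groups, so each key/value head is virtually repeated 4 times:

import torch

sk, b, ng, np_, hn = 5, 2, 2, 8, 16             # 2 KV groups, 8 query heads
key = torch.randn(sk, b, ng, hn)
key_expanded = key.repeat_interleave(np_ // ng, dim=2)
assert key_expanded.shape == (sk, b, np_, hn)   # [sk, b, ng, hn] -> [sk, b, np, hn]
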
+ if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: + key = key.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + value = value.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + + if self.checkpoint_dot_product_attention: + core_attn_out = self._checkpointed_attention_forward(query, key, value, attention_mask) + else: + core_attn_out = self.dot_product_attention(query, key, value, attention_mask) + + # ================= + # Output. [sq, b, h] + # ================= + + output, bias = self.linear_proj(core_attn_out) + + return output, bias + + +class SelfAttention(Attention): + """Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, config: TransformerConfig, layer_number: int = 1, attn_mask_type=AttnMaskType.padding + ): + super().__init__(config=config, layer_number=layer_number, attn_mask_type=attn_mask_type) + + self.linear_qkv = TELayerNormColumnParallelLinear( + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + bias=self.config.add_bias_linear, + skip_bias_add=False, + ) + + def get_query_key_value_tensors(self, hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.linear_qkv(hidden_states) + + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_qkv.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_qkv = mixed_qkv.view(*new_tensor_shape) + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split( + mixed_qkv, + [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ], + dim=3, + ) + # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] + query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + + return query, key, value + + +class CrossAttention(Attention): + """Cross-attention layer class + + Cross-attention layer takes input with size [s, b, h] and context with size + [s, b, h] and returns output of the same size. + """ + + def __init__( + self, config: TransformerConfig, layer_number: int = 1, attn_mask_type=AttnMaskType.padding + ): + super().__init__(config=config, layer_number=layer_number, attn_mask_type=attn_mask_type) + + if self.config.num_query_groups != self.config.num_attention_heads: + raise ValueError( + f"Group query attention is not currently supported in cross attention." 
+ ) + assert self.query_projection_size == self.kv_projection_size + + self.linear_q = TELayerNormColumnParallelLinear( + self.config.hidden_size, + self.query_projection_size, + config=self.config, + init_method=self.config.init_method, + bias=self.config.add_bias_linear, + skip_bias_add=False, + ) + + self.linear_kv = TELayerNormColumnParallelLinear( + self.config.hidden_size, + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + bias=self.config.add_bias_linear, + skip_bias_add=False, + ) + + def get_query_key_value_tensors(self, hidden_states, key_value_states): + """ + Derives `query` tensor from `hidden_states`, and `key`/`value` tensors + from `key_value_states`. + """ + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + mixed_kv, _ = self.linear_kv(key_value_states) + + # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv.size()[:-1] + ( + self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head, + ) + mixed_kv = mixed_kv.view(*new_tensor_shape) + + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key, value) = tensor_parallel.split_tensor_along_last_dim(mixed_kv, 2) + + # Attention head [sq, b, h] --> [sq, b, hp] + query, _ = self.linear_q(hidden_states) + + # [sq, b, hp] --> [sq, b, np, hn] + new_tensor_shape = query.size()[:-1] + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + query = query.view(*new_tensor_shape) + + return query, key, value diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/custom_layers/__init__.py b/training/DeepSpeed-Domino/megatron/core/transformer/custom_layers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/custom_layers/transformer_engine.py b/training/DeepSpeed-Domino/megatron/core/transformer/custom_layers/transformer_engine.py new file mode 100644 index 000000000..0b66009cd --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/custom_layers/transformer_engine.py @@ -0,0 +1,249 @@ +from importlib.metadata import version +from typing import Callable + +import torch +import transformer_engine as te +from pkg_resources import packaging + +from megatron.core.parallel_state import get_tensor_model_parallel_group +from megatron.core.tensor_parallel import get_cuda_rng_tracker +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.transformer_config import TransformerConfig + + +def _get_extra_te_kwargs(config: TransformerConfig): + extra_transformer_engine_kwargs = {} + from importlib.metadata import version + + from pkg_resources import packaging + + te_version = packaging.version.Version(version("transformer-engine")) + if te_version >= packaging.version.Version("0.12.0"): + if config.use_cpu_initialization: + extra_transformer_engine_kwargs["device"] = 'cpu' + else: + extra_transformer_engine_kwargs["device"] = torch.cuda.current_device() + return extra_transformer_engine_kwargs + + +class TENorm: + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `LayerNorm` or `RMSNorm` based on input + """ + + def __new__( + cls, + config: TransformerConfig, + hidden_size: int, + eps: float = 1e-5, + sequence_parallel: bool = False, + normalization="LayerNorm", + **kwargs + ): + zero_centered_gamma = kwargs.get('zero_centered_gamma', False) + if normalization == "LayerNorm": + instance = te.pytorch.LayerNorm( + hidden_size=hidden_size, + eps=eps, + 
sequence_parallel=sequence_parallel, + zero_centered_gamma=zero_centered_gamma, + **_get_extra_te_kwargs(config), + ) + elif normalization == "RMSNorm": + assert hasattr( + te.pytorch, "RMSNorm" + ), "Transformer-Engine >= v0.11 required to use this feature" + instance = te.pytorch.RMSNorm( + hidden_size=hidden_size, + eps=eps, + sequence_parallel=sequence_parallel, + zero_centered_gamma=zero_centered_gamma, + **_get_extra_te_kwargs(config), + ) + else: + raise Exception('Only LayerNorm and RMSNorm are curently supported') + + return instance + + +class TELinear(te.pytorch.Linear): + """ + Wrapper for the Transformer-Engine's `Linear` layer. + + Note that if Megatron's parallel_state has not been initialized + yet, the tp_group passed to TE will be None and must be set later + via set_tensor_parallel_group(). + """ + + def __init__( + self, + input_size: int, + output_size: int, + config: TransformerConfig, + parallel_mode: str, + init_method: Callable, + *, + bias: bool = True, + skip_bias_add: bool = False, + **kwargs + ): + self.config = config + + # TE returns a zero length Tensor when bias=False and + # return_bias=True, but we prefer None. So in that case we + # tell TE to not return the bias, and return None + # ourselves. This way our forward always returns two values + # and we don't have to deal with the zero length Tensor. + self.te_return_bias = skip_bias_add and bias + + super().__init__( + in_features=input_size, + out_features=output_size, + sequence_parallel=self.config.sequence_parallel, + fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, + tp_group=get_tensor_model_parallel_group(check_initialized=False), + tp_size=self.config.tensor_model_parallel_size, + get_rng_state_tracker=get_cuda_rng_tracker, + init_method=init_method, + params_dtype=self.config.params_dtype, + parallel_mode=parallel_mode, + bias=bias, + return_bias=self.te_return_bias, + **_get_extra_te_kwargs(config), + **kwargs, + ) + + def forward(self, x): + out = super().forward(x) + + # TE only returns a tuple when return_bias is True, otherwise + # it returns a single Tensor, we always want to return two + # values regardless of the arguments. + if self.te_return_bias: + return out + return out, None + + +class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear): + """ + Wrapper for the Transformer-Engine's `LayerNormLinear` layer that combines + layernorm and linear layers + """ + + def __init__( + self, + input_size: int, + output_size: int, + config: TransformerConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + **kwargs + ): + self.config = config + # TE returns a zero length Tensor when bias=False and + # return_bias=True, but we prefer None. So in that case we + # tell TE to not return the bias, and return None + # ourselves. This way our forward always returns two values + # and we don't have to deal with the zero length Tensor. 
+ self.te_return_bias = skip_bias_add and bias + + # Only Transformer-Engine version >= 0.11.0 supports `RMSNorm` + te_version = packaging.version.Version(version("transformer-engine")) + if te_version >= packaging.version.Version("0.11.0"): + kwargs["normalization"] = self.config.normalization + + super().__init__( + in_features=input_size, + out_features=output_size, + bias=bias, + sequence_parallel=self.config.sequence_parallel, + fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, + tp_group=get_tensor_model_parallel_group(check_initialized=False), + tp_size=self.config.tensor_model_parallel_size, + get_rng_state_tracker=get_cuda_rng_tracker, + init_method=init_method, + params_dtype=self.config.params_dtype, + parallel_mode="column", + return_bias=self.te_return_bias, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + **_get_extra_te_kwargs(config), + **kwargs, + ) + + def forward(self, x): + out = super().forward(x) + + # TE only returns a tuple when return_bias is True, otherwise + # it returns a single Tensor, we always want to return two + # values regardless of the arguments. + if self.te_return_bias: + return out + return out, None + + +class TEColumnParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `ColumnParallelLinear` layer. + """ + + def __init__(self, input_size: int, output_size: int, config: TransformerConfig, **kwargs): + self.config = config + super().__init__( + input_size=input_size, + output_size=output_size, + config=self.config, + parallel_mode="column", + **kwargs, + ) + + +class TERowParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `RowParallelLinear` layer. + """ + + def __init__(self, input_size: int, output_size: int, config: TransformerConfig, **kwargs): + self.config = config + super().__init__( + input_size=input_size, + output_size=output_size, + config=self.config, + parallel_mode="row", + **kwargs, + ) + + +class TEDotProductAttention(te.pytorch.DotProductAttention): + """ + Wrapper for the Transformer-Engine's `DotProductAttention` layer that also + has "flash attention" enabled. + + Note that if Megatron's parallel_state has not been initialized + yet, the tp_group passed to TE will be None and must be set later + via set_tensor_parallel_group(). + """ + + def __init__( + self, + config: TransformerConfig, + layer_number: int = 1, + attn_mask_type: AttnMaskType = AttnMaskType.padding, + **kwargs + ): + self.config = config + super().__init__( + num_attention_heads=self.config.num_attention_heads, + kv_channels=self.config.kv_channels, + attention_dropout=self.config.attention_dropout, + layer_number=layer_number, + attn_mask_type=attn_mask_type.name, + sequence_parallel=self.config.sequence_parallel, + tp_size=self.config.tensor_model_parallel_size, + get_rng_state_tracker=get_cuda_rng_tracker, + tp_group=get_tensor_model_parallel_group(check_initialized=False), + **kwargs, + ) diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/dot_product_attention.py b/training/DeepSpeed-Domino/megatron/core/transformer/dot_product_attention.py new file mode 100644 index 000000000..d99adb4c3 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/dot_product_attention.py @@ -0,0 +1,165 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ + +import math + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import attention_mask_func +from megatron.core.utils import divide + + +class DotProductAttention(MegatronModule): + """ + Region where selective activation recomputation is applied. + This region is memory intensive but less compute intensive which + makes activation checkpointing more efficient for LLMs (20B+). + See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + + We use the following notation: + h: hidden size + n: number of attention heads + p: number of tensor model parallel partitions + b: batch size + s: sequence length + """ + + def __init__( + self, config: TransformerConfig, layer_number: int = 1, attn_mask_type=AttnMaskType.padding + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + + projection_size = self.config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = divide(projection_size, world_size) + self.hidden_size_per_attention_head = divide(projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = divide(config.num_attention_heads, world_size) + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.config.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + input_in_fp16=self.config.fp16, + input_in_bf16=self.config.bf16, + attn_mask_type=self.attn_mask_type, + scaled_masked_softmax_fusion=self.config.masked_softmax_fusion, + mask_func=attention_mask_func, + softmax_in_fp32=self.config.attention_softmax_in_fp32, + scale=coeff, + ) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(self.config.attention_dropout) + + def forward( + self, query_layer: Tensor, key_layer: Tensor, value_layer: Tensor, attention_mask: Tensor + ): + + # =================================== + # Raw attention scores. [b, n/p, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = ( + query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0), + ) + + # [sq, b, np, hn] -> [sq, b * np, hn] + # This will be a simple view when doing normal attention, but in group query attention + # the key and value tensors are repeated to match the queries so you can't use simple strides + # to extract the queries. 
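
The math in this forward pass is standard scaled dot-product attention, laid out in Megatron's [s, b, np, hn] convention and computed through a preallocated baddbmm buffer. A minimal standalone sketch with made-up shapes (attention mask and dropout omitted):

import math
import torch

sq, sk, b, np_, hn = 4, 6, 2, 8, 16
q = torch.randn(b * np_, sq, hn)
k = torch.randn(b * np_, sk, hn)
v = torch.randn(b * np_, sk, hn)

scores = torch.baddbmm(
    torch.empty(b * np_, sq, sk),               # ignored because beta=0.0
    q, k.transpose(1, 2),
    beta=0.0, alpha=1.0 / math.sqrt(hn),
)                                               # [b*np, sq, sk]
probs = torch.softmax(scores, dim=-1)
context = torch.bmm(probs, v)                   # [b*np, sq, hn]
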
+ query_layer = query_layer.reshape(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query_layer.dtype, + "mpu", + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.config.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = ( + value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3), + ) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/enums.py b/training/DeepSpeed-Domino/megatron/core/transformer/enums.py new file mode 100644 index 000000000..3583daa17 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/enums.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import enum + + +# can we get rid of this? 
+# it's being used in pipeline schedules +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + + +# class LayerType(enum.Enum): +# encoder = 1 +# decoder = 2 + + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/identity_op.py b/training/DeepSpeed-Domino/megatron/core/transformer/identity_op.py new file mode 100644 index 000000000..79dcddc1f --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/identity_op.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import torch + + +class IdentityOp(torch.nn.Module): + """ + This is a placeholder for IdentityOp (NoOp) + """ + + def __init__(self, *args, **kwargs): + super(IdentityOp, self).__init__() + + def forward(self, x, *args, **kwargs): + return x diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/mlp.py b/training/DeepSpeed-Domino/megatron/core/transformer/mlp.py new file mode 100644 index 000000000..16696ceaf --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/mlp.py @@ -0,0 +1,87 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import torch +import torch.nn.functional as F + +from megatron.core import tensor_parallel +from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl +from megatron.core.transformer.custom_layers.transformer_engine import ( + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig + + +class MLP(MegatronModule): + """ + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + + + Returns an output and a bias to be added to the output. + If config.add_bias_linear is False, the bias returned is None. 
+ + We use the following notation: + h: hidden size + p: number of tensor model parallel partitions + b: batch size + s: sequence length + """ + + def __init__(self, config: TransformerConfig): + super().__init__(config=config) + + self.config: TransformerConfig = config + + # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf + ffn_hidden_size = self.config.ffn_hidden_size + if self.config.gated_linear_unit: + ffn_hidden_size *= 2 + + self.linear_fc1 = TELayerNormColumnParallelLinear( + self.config.hidden_size, + ffn_hidden_size, + config=self.config, + init_method=self.config.init_method, + bias=self.config.add_bias_linear, + skip_bias_add=True, + ) + + if self.config.gated_linear_unit: + + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + self.activation_func = glu + else: + self.activation_func = self.config.activation_func + + self.linear_fc2 = TERowParallelLinear( + self.config.ffn_hidden_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + skip_bias_add=True, + ) + + def forward(self, hidden_states): + + # [s, b, 4 * h/p] + intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + + if self.config.bias_gelu_fusion: + assert self.config.add_bias_linear is True + assert self.activation_func == F.gelu + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + if bias_parallel is not None: + intermediate_parallel = intermediate_parallel + bias_parallel + intermediate_parallel = self.activation_func(intermediate_parallel) + + # [s, b, h] + output, output_bias = self.linear_fc2(intermediate_parallel) + return output, output_bias diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/module.py b/training/DeepSpeed-Domino/megatron/core/transformer/module.py new file mode 100644 index 000000000..c0f08fe11 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/module.py @@ -0,0 +1,132 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Megatron Module""" + +import torch +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.transformer.transformer_config import TransformerConfig + +_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + +class MegatronModule(torch.nn.Module): + """Megatron specific extensions of torch Module with support + for pipelining.""" + + # def __init__(self, config: TransformerConfig, share_word_embeddings=True): + def __init__(self, config: TransformerConfig): + super().__init__() + self.config = config + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Use this function to override the state dict for + saving checkpoints. + """ + + return self.state_dict(prefix=prefix, keep_vars=keep_vars) + + def sharded_state_dict(self, prefix=''): + """ Override sharded_state_dict when using distributed checkpointing. + keep_vars must always be set to True so that optimizer states + can be sharded. + """ + return self.state_dict(prefix=prefix, keep_vars=True) + + +def conversion_helper(val, conversion): + """Apply conversion to val. 
Recursively apply conversion if `val` + #is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_float16(val, float16_convertor): + """Convert fp32 `val` to fp16/bf16""" + + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, _FLOAT_TYPES): + val = float16_convertor(val) + return val + + return conversion_helper(val, half_conversion) + + +def float16_to_fp32(val): + """Convert fp16/bf16 `val` to fp32""" + + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): + val = val.float() + return val + + return conversion_helper(val, float_conversion) + + +class Float16Module(MegatronModule): + def __init__(self, config: TransformerConfig, module: torch.nn.Module): + super(Float16Module, self).__init__(config) + self.config = config + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + if self.fp16: + self.add_module('module', module.half()) + + def float16_convertor(val): + return val.half() + + elif self.bf16: + self.add_module('module', module.bfloat16()) + + def float16_convertor(val): + return val.bfloat16() + + else: + raise Exception('Either config.fp16 or config.bf16 should be True.') + + self.float16_convertor = float16_convertor + + def set_input_tensor(self, input_tensor): + return self.module.set_input_tensor(input_tensor) + + def forward(self, *inputs, **kwargs): + if parallel_state.is_pipeline_first_stage(): + inputs = fp32_to_float16(inputs, self.float16_convertor) + outputs = self.module(*inputs, **kwargs) + if parallel_state.is_pipeline_last_stage(): + outputs = float16_to_fp32(outputs) + return outputs + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """ Retrieve state_dict from the module being wrapped.""" + return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars) + + def sharded_state_dict(self, prefix=''): + """ Retrieve sharded_state_dict from the module being wrapped. + """ + return self.module.sharded_state_dict(prefix=prefix) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/transformer_block.py b/training/DeepSpeed-Domino/megatron/core/transformer/transformer_block.py new file mode 100644 index 000000000..2b9ba7908 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/transformer_block.py @@ -0,0 +1,286 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
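+# TransformerBlock stacks the TransformerLayer modules owned by one pipeline
+# stage, with optional activation recomputation, FP8 autocast, and an optional
+# final layer norm.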
+ +import re +from contextlib import nullcontext + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer +from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor + + +class TransformerBlock(MegatronModule): + """Transformer class.""" + + def __init__( + self, + config: TransformerConfig, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, + post_process=True, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + self.self_attn_mask_type = self_attn_mask_type + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + + # required for pipeline parallel schedules + self.input_tensor = None + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + self.num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + self._build_layers() + + def _build_layers(self): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_number): + layer = TransformerLayer( + config=self.config, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) + return layer + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # Interleaved pipeline parallelism: + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. + # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size + + num_layers_to_build = num_layers_per_virtual_rank + + else: + # Non-interleaved pipeline parallelism: + # Each stage gets a contiguous set of layers. + + num_layers_to_build = self.num_layers_per_pipeline_rank + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(num_layers_to_build)]) + + # # TODO: add back standalone_embedding_stage + # if self.num_layers == 0: + # # When a standalone embedding stage is used (e.g., + # # args.standalone_embedding_stage == True), virtual pipeline ranks + # # on pipeline rank 0 will have zero transformer layers assigned to + # # them. This results in the model's input and output tensors to be + # # the same, which will cause failure for certain output tensor + # # optimizations (e.g., pipeline output deallocation). 
To remedy + # # this, we assign a 'no-op' layer on these ranks, which will + # # disconnect the input tensor from the output tensor. + # self.num_layers = 1 + # self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + # else: + # self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def _checkpointed_forward(self, hidden_states, attention_mask, rotary_pos_emb): + """Forward method with activation checkpointing.""" + + def custom(start, end): + def custom_forward(*args, **kwargs): + x_, *args = args + for index in range(start, end): + layer = self._get_layer(index) + x_ = layer(x_, *args, **kwargs) + return x_ + + return custom_forward + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + l = 0 + while l < self.num_layers_per_pipeline_rank: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + self.config.recompute_num_layers), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + + l += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + for l in range(self.num_layers_per_pipeline_rank): + if l < self.config.recompute_num_layers: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + 1), + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + rotary_pos_emb, + ) + else: + hidden_states = custom(l, l + 1)(hidden_states, attention_mask, rotary_pos_emb) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, inference_params=None, rotary_pos_emb=None): + # hidden_states (float): [s, b, h] + # attention_mask (bool): [1, 1, s, s] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. 
+ # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True, + ) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + if self.config.fp8: + import transformer_engine # To keep out TE dependency when not training in fp8 + + if self.config.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif self.config.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") + + fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=self.config.fp8_margin, + interval=self.config.fp8_interval, + fp8_format=fp8_format, + amax_compute_algo=self.config.fp8_amax_compute_algo, + amax_history_len=self.config.fp8_amax_history_len, + override_linear_precision=(False, False, not self.config.fp8_wgrad), + ) + fp8_group = None + if parallel_state.model_parallel_is_initialized(): + fp8_group = parallel_state.get_amax_reduction_group() + fp8_context = transformer_engine.pytorch.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group + ) + else: + fp8_context = nullcontext() + + with rng_context and fp8_context: + # Forward pass. + if self.config.recompute_granularity == 'full': + hidden_states = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + ) + else: + for layer in self.layers: + hidden_states = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=inference_params, + ) + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + def sharded_state_dict(self, prefix=''): + + sharded_state_dict = {} + + layer_prefix = f'{prefix}layers.' + for layer in self.layers: + sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix)) + + if self.post_process and self.post_layer_norm: + state_dict = self.state_dict(keep_vars=True) + + tensor = state_dict['final_layernorm.weight'] + layer_name = f'{prefix}final_layernorm.weight' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name) + + # RMSNorm doesn't have bias. + if 'final_layernorm.bias' in state_dict.keys(): + tensor = state_dict['final_layernorm.bias'] + layer_name = f'{prefix}final_layernorm.bias' + sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint( + tensor, layer_name + ) + + return sharded_state_dict diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/transformer_config.py b/training/DeepSpeed-Domino/megatron/core/transformer/transformer_config.py new file mode 100644 index 000000000..532c89b00 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/transformer_config.py @@ -0,0 +1,273 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
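+# TransformerConfig groups the architecture, precision, fusion, activation
+# recomputation, and FP8 options consumed by the megatron-core transformer
+# modules. Illustrative (hypothetical) usage:
+#     config = TransformerConfig(num_layers=24, hidden_size=1024,
+#                                num_attention_heads=16)
+# __post_init__ below then derives ffn_hidden_size=4096 and kv_channels=64.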
+ +from dataclasses import dataclass +from typing import Callable + +import torch +import torch.nn.functional as F + +from megatron.core import ModelParallelConfig +from megatron.core.utils import init_method_normal, scaled_init_method_normal + + +@dataclass +class TransformerConfig(ModelParallelConfig): + """Configuration object for megatron-core transformers. + + Attributes: + + # model architecture + num_layers (int): Number of transformer layers in a transformer block. + hidden_size (int): Transformer hidden size. + ffn_hidden_size (int): Transformer Feed-Forward Network hidden size. + This is set to 4*hidden_size if not provided. Defaults to None.') + num_attention_heads (int): Number of transformer attention heads. + kv_channels (int): Projection weights dimension in multi-head attention. + This is set to hidden_size // num_attention_heads if not provided. + Defaults to None. + num_query_groups (int): Number of query groups for group query attention. If None, normal attention is used. + + hidden_dropout (float): Dropout probability for transformer hidden state. Defaults to 0.1. + attention_dropout (float): Post attention dropout probability. Defaults to 0.1. + fp32_residual_connection (bool): If true, move residual connections to fp32. + apply_residual_connection_post_layernorm (bool): If true, uses the original BERT residule connection ordering. + Defaults to False. + layernorm_epsilon (float): Layernorm epsilon. Defaults to 1e-5. + + layernorm_zero_centered_gamma (bool): if set to 'True', the LayerNorm is adjusted to center the gamma values + around 0. This improves numerical stability. Defaults to False. + + add_bias_linear (bool): Include a bias term in all linear layers (QKV projections, after core attention, and two + in MLP layer). Default is True. + + gated_linear_unit (bool): Use a gated linear unit for the first linear layer in the MLP. Defaults to False. + + activation_func (Callable): Activation function to use for the non-linearity in the MLP. Defaults to F.gelu. + + # initialization + init_method (Callable): Method to initialize weights. Note that bias is always set to + zero. Should be a function that takes a single Tensor and + initializes it. Defaults to + megatron.core.utils.init_method_normal(init_method_std) which is + torch.nn.init.normal_ with mean=0.0 and std=init_method_Std. + + output_layer_init_method (Callable): Method to initialize weights of the output layer of + both attention and MLP blocks. Defaults to + megatron.core.utils.scaled_init_method_normal(init_method_std) + which is torch.nn.init.normal_ with mean=0.0 and + std=init_method_std / math.sqrt(2.0 * num_layers). + + init_method_std (float): Standard deviation of the zero mean normal for the default + initialization method, not used if init_method and + output_layer_init_method are provided. Defaults to 0.02. + + # mixed-precision + apply_query_key_layer_scaling (bool): If true, scale Q * K^T by 1 / layer-number. Defaults to True. + attention_softmax_in_fp32 (bool): If true, run attention masking and softmax in fp32. + This should be true if apply_query_key_layer_scaling is true. + + # fusion + bias_gelu_fustion (bool): If true, fuses bias and gelu. Defaults to False. + masked_softmax_fusion (bool): If true, uses softmax fusion. + persist_layer_norm (bool): If true, uses the persistent fused layer norm kernel. + This kernel only supports a fixed set of hidden sizes. + Defaults to False. + bias_dropout_fusion (bool): If true, uses bias dropout fusion. 
+ + # activation recomputation + + recompute_granularity (str): megatron-core supports 'selective' activation checkpointing where only the memory + intensive part of attention is checkpointed. These memory intensive activations + are also less compute intensive which makes activation checkpointing more efficient + for LLMs (20B+). See Reducing Activation Recomputation in Large Transformer + Models: https://arxiv.org/abs/2205.05198 for more details. 'full' will checkpoint + the entire transformer layer. Must be 'selective' or 'full'. 'selective' always uses all layers. + Defaults to None. + + recompute_method (str): uniform will uniformly divide the total number of transformer layers in a transformer + block and recompute the input activation of each divided chunk at the specified + granularity. block will recompute the input activations for only a set number of + transformer layers per pipeline stage. The rest of the layers in the pipeline stage + will not have any activations recomputed. Must be 'uniform' or 'block'. Defaults to + None. + + recompute_num_layers (int): When recompute_method is uniform, recompute_num_layers is the number of transformer + layers in each uniformly divided recompute unit. When recompute_method is block, + recompute_num_layers is the number of transformer layers to recompute within each + pipeline stage. Must be None for 'selective' activation checkpointing. Defaults to None. + + distribute_saved_activations (bool): If true, distribute recomputed activations across the model parallel + group. Defaults to None. + + # fp8 related (via Transformer Engine). For detailed info, refer the the Transformer Engine docs at + # https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html + + fp8 (str): If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined choices: (1) 'e4m3' + uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 activation and weight tensors and + e5m2 for all FP8 output activation gradient tensors. Defaults to None. + + fp8_margin (int): Margin for the scaling factor computation. + + fp8_interval (int): Controls how often the scaling factor is recomputed. + + fp8_amax_history_len (int): The length of the amax history window used for scaling factor computation. + + fp8_amax_compute_algo (str): Algorithm used for choosing the `amax` value for the scaling factor computation. + There are 2 predefined choices: `max` chooses the largest `amax` in the history + window, while `most_recent` always chooses the most recently seen value. + + fp8_wgrad (bool): When set to False, override FP8 config options and do the wgrad computation in higher precision. + Defaults to True. + + # Experimental + normalization (str): Swtich b/w `LayerNorm` and `RMSNorm` as normalization layers. For now, these are primarily + used by Transformer-Engine's layers like `LayerNormLinear`. Default value is `LayerNorm`. + + + """ + + # model architecture + num_layers: int = 0 + hidden_size: int = 0 + num_attention_heads: int = 0 + num_query_groups: int = None + + ffn_hidden_size: int = None + kv_channels: int = None + hidden_dropout: float = 0.1 + attention_dropout: float = 0.1 + fp32_residual_connection: bool = False + # @jcasper should we keep this option? 
+ apply_residual_connection_post_layernorm: bool = False + layernorm_epsilon: float = 1e-5 + layernorm_zero_centered_gamma: bool = False + add_bias_linear: bool = True + gated_linear_unit: bool = False + activation_func: Callable = F.gelu + + # initialization + init_method: Callable = None + output_layer_init_method: Callable = None + init_method_std: float = 0.02 + + # mixed-precision + apply_query_key_layer_scaling: bool = True + attention_softmax_in_fp32: bool = True + + # communication + + # fusion + bias_gelu_fusion: bool = False # TODO: this should be bias_activation_fusion ? + masked_softmax_fusion: bool = False + persist_layer_norm: bool = False + bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion? + + # activation recomputation + recompute_granularity: str = None + recompute_method: str = None + recompute_num_layers: int = None + distribute_saved_activations: bool = None + + # fp8 related + fp8: str = None + fp8_margin: int = 0 + fp8_interval: int = 1 + fp8_amax_history_len: int = 1 + fp8_amax_compute_algo: str = "most_recent" + fp8_wgrad: bool = True + + # experimental section (TODO: move to apt. section above once stable) + normalization: bool = "LayerNorm" # alt value supported by TE: "RMSNorm" + + def __post_init__(self): + """ Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details. + """ + super().__post_init__() + if self.fp16 and self.bf16: + raise ValueError( + f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.' + ) + + if self.num_attention_heads % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_attention_heads ({self.num_attention_heads}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.ffn_hidden_size is None: + self.ffn_hidden_size = 4 * self.hidden_size + + if self.kv_channels is None: + self.kv_channels = self.hidden_size // self.num_attention_heads + + if self.num_query_groups is None: + self.num_query_groups = self.num_attention_heads + + if self.num_query_groups % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_query_groups ({self.num_query_groups}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.recompute_granularity is not None: + if not self.recompute_granularity in ['full', 'selective']: + raise ValueError( + f'When using recompute_granuarlity: {self.recompute_granularity} must be "full" or "selective".' + ) + + if self.recompute_method is not None: + if not self.recompute_method in ['block', 'uniform']: + raise ValueError( + f'recompute_method: {self.recompute_method} must be "block" or "uniform".' 
+ ) + elif self.recompute_granularity != 'selective': + raise ValueError( + f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"' + ) + + if self.recompute_granularity != 'selective' and self.recompute_num_layers is None: + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be between ' + f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}' + ) + elif ( + self.recompute_granularity == 'selective' and self.recompute_num_layers is not None + ): + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.' + ) + + if self.distribute_saved_activations and self.sequence_parallel: + raise ValueError( + f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}' + ) + + if self.virtual_pipeline_model_parallel_size is not None: + if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0: + raise ValueError( + f'num_layers: {self.num_layers} must be divisible by virtual_model_parallel_size {self.virtual_pipeline_model_parallel_size}' + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.bias_gelu_fusion: + if not self.add_bias_linear: + raise ValueError( + "When bias_gelu_fusion is True, add_bias_linear must also be True." + ) + + if self.activation_func != F.gelu: + raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.') + + if self.init_method is None: + self.init_method = init_method_normal(self.init_method_std) + + if self.output_layer_init_method is None: + self.output_layer_init_method = scaled_init_method_normal( + self.init_method_std, self.num_layers + ) diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/transformer_layer.py b/training/DeepSpeed-Domino/megatron/core/transformer/transformer_layer.py new file mode 100644 index 000000000..73b9aadc6 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/transformer_layer.py @@ -0,0 +1,270 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import re +from functools import partial + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ( + ShardedObject, + ShardedTensor, + ShardedTensorFactory, +) +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.transformer.attention import SelfAttention +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + + +class TransformerLayer(MegatronModule): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. 
+ """ + + def __init__( + self, + config: TransformerConfig, + layer_number: int = 1, + self_attn_mask_type=AttnMaskType.padding, + ): + super().__init__(config=config) + self.config: TransformerConfig = config + + self.layer_number = layer_number + self._get_layer_offset() + + self.self_attn_mask_type = self_attn_mask_type + + # Layernorm on the input data. + # TODO: add pytorch only layernorm + self.input_layernorm = IdentityOp( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + # Self attention. + self.self_attention = SelfAttention( + config=self.config, layer_number=layer_number, attn_mask_type=self_attn_mask_type, + ) + + # Layernorm on the attention output + self.post_self_attn_layernorm = IdentityOp( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + persist_layer_norm=self.config.persist_layer_norm, + sequence_parallel=self.config.sequence_parallel, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + normalization=self.config.normalization, + ) + + # MLP + self.mlp = MLP(config=self.config) + + # @jcasper how should we handle nvfuser? + # Set bias+dropout+add fusion grad_enable execution handler. + # TORCH_MAJOR = int(torch.__version__.split('.')[0]) + # TORCH_MINOR = int(torch.__version__.split('.')[1]) + # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = torch.enable_grad + + def _get_layer_offset(self): + + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + + num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + total_num_layers = self.config.num_layers + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + total_virtual_chunks = total_num_layers // vp_size + offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) + + else: + # Each stage gets a contiguous set of layers. + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + offset = pipeline_rank * num_layers_per_pipeline_rank + else: + offset = 0 + + return offset + + def forward( + self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None, + rotary_pos_emb=None, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output_with_bias = self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # Residual connection. 
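+        # With apply_residual_connection_post_layernorm=True the residual comes
+        # from the layernorm output (the original-BERT ordering described in
+        # TransformerConfig); with the default False it is the pre-layernorm
+        # hidden_states.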
+ if self.config.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + bias_dropout_add_func = get_bias_dropout_add(self.training, self.config.bias_dropout_fusion) + + # bias_dropout_add fusion returning fp32 instead of bf16 + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output_with_bias, residual, self.config.hidden_dropout + ) + + # Layer norm post the self attention. + layernorm_output = self.post_self_attn_layernorm(layernorm_input) + + # MLP. + mlp_output_with_bias = self.mlp(layernorm_output) + + # Second residual connection. + if self.config.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func( + mlp_output_with_bias, residual, self.config.hidden_dropout + ) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor( + inp=output, requires_grad=output.requires_grad, keep_graph=True + ) + + return output + + def sharded_state_dict(self, prefix=''): + + # state_dict = self.state_dict(prefix=prefix, keep_vars=True) + state_dict = self.state_dict(keep_vars=True) + + tensor_parallel_layers_axis_map = { + 'self_attention.linear_qkv.weight': 0, + 'self_attention.linear_qkv.bias': 0, + 'self_attention.linear_proj.weight': 1, + 'mlp.linear_fc1.weight': 0, + 'mlp.linear_fc1.bias': 0, + 'mlp.linear_fc2.weight': 1, + } + + offset = self._get_layer_offset() + num_layers = self.config.num_layers + + sharded_state_dict = {} + + for layer_name in state_dict.keys(): + tensor = state_dict[layer_name] + global_layer_offset = self.layer_number - 1 # self.layer_number starts at 1 + layer_key = f'{prefix}{global_layer_offset - offset}.{layer_name}' # module list index in TransformerBlock + sharded_offsets = [(0, global_layer_offset, num_layers)] # PP sharding + + # TODO: move it to MLP after merging the "sharded_state_dict modularization" MR + is_glu_weight = ( + layer_name == 'mlp.linear_fc1.weight' and self.mlp.config.gated_linear_unit + ) + + if layer_name in tensor_parallel_layers_axis_map: + tp_axis = tensor_parallel_layers_axis_map[layer_name] + # TP sharding + if not is_glu_weight: + sharded_offsets.append( + [ + tp_axis + 1, # +1 for PP dimension + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ] + ) + replica_id = parallel_state.get_data_parallel_rank() + else: + replica_id = ( + parallel_state.get_data_parallel_rank() + * parallel_state.get_data_parallel_world_size() + + parallel_state.get_tensor_model_parallel_rank() + ) + + if layer_name.endswith('._extra_state'): + sharded_state_dict[layer_key] = ShardedObject( + f'{prefix}{layer_name}', + tensor, + (num_layers,), + (global_layer_offset,), + replica_id, + ) + elif is_glu_weight: + # We must split the tensor into 2 parts, each sharded separately. 
+ # This requires a ShardedTensorFactory which `chunk`s during saving + # and `cat`s during loading + assert tp_axis == 0, f'TP axis for GLU weight should be 0, got: {tp_axis}' + tp_rank = parallel_state.get_tensor_model_parallel_rank() + tp_size = parallel_state.get_tensor_model_parallel_world_size() + + sh_ten_builder = partial( + ShardedTensor.from_rank_offsets, replica_id=replica_id, prepend_axis_num=1 + ) # for PP sharding + + # NOTE: passing `tp_axis` as argument due to late binding in closures + def sh_ten_build_fn(key: str, t: torch.Tensor, tp_axis=tp_axis): + offset_w = (tp_axis + 1, tp_rank, tp_size * 2) + offset_v = (tp_axis + 1, tp_size + tp_rank, tp_size * 2) + with torch.no_grad(): + tensor_w, tensor_v = torch.chunk(t, 2, dim=tp_axis) + return [ + sh_ten_builder(key, tensor_w, *sharded_offsets, offset_w), + sh_ten_builder(key, tensor_v, *sharded_offsets, offset_v), + ] + + def sh_ten_merge_fn(sub_state_dict): + with torch.no_grad(): + return torch.cat(sub_state_dict) + + sharded_state_dict[layer_key] = ShardedTensorFactory( + f'{prefix}{layer_name}', tensor, sh_ten_build_fn, sh_ten_merge_fn + ) + else: + sharded_state_dict[layer_key] = ShardedTensor.from_rank_offsets( + f'{prefix}{layer_name}', + tensor, + *sharded_offsets, + replica_id=replica_id, + prepend_axis_num=1, # for PP sharding + ) + + return sharded_state_dict diff --git a/training/DeepSpeed-Domino/megatron/core/transformer/utils.py b/training/DeepSpeed-Domino/megatron/core/transformer/utils.py new file mode 100644 index 000000000..c3740f848 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/transformer/utils.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for transformer layers.""" + +import torch + +from megatron import get_args + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +def get_linear_layer(rows, columns, init_method): + """Simple linear layer with weight initialization.""" + layer = torch.nn.Linear(rows, columns) + if get_args().perform_initialization: + init_method(layer.weight) + with torch.no_grad(): + layer.bias.zero_() + return layer + + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) + + +def openai_gelu(x): + return gelu_impl(x) + + +# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter +@torch.jit.script +def erf_gelu(x): + return ( + x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype)) + ) diff --git a/training/DeepSpeed-Domino/megatron/core/utils.py b/training/DeepSpeed-Domino/megatron/core/utils.py new file mode 100644 index 000000000..7a0bc385c --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/core/utils.py @@ -0,0 +1,207 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Utility functions used throughout Megatron core""" +import math +import operator +from functools import reduce + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedTensor + + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator) + + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator + + +def get_attr_wrapped_model(model, attr, allow_none=True): + """Get an attribute from a wrapped model""" + if isinstance(model, list): + raise RuntimeError("_get_attr_wrapped_model given a list of models") + + if allow_none: + + def condition(model, attr): + return not hasattr(model, attr) + + else: + + def condition(model, attr): + return getattr(model, attr, None) is None + + while condition(model, attr): + if not hasattr(model, "module"): + raise RuntimeError(f"_get_attr_wrapped_model couldn't find attribute {attr}") + + model = model.module + return getattr(model, attr) + + +def get_model_type(model): + return get_attr_wrapped_model(model, 'model_type') + + +def get_model_config(model): + return get_attr_wrapped_model(model, 'config', allow_none=False) + + +class GlobalMemoryBuffer: + """Global buffer to avoid dynamic memory allocations. + Caller should ensure that buffers of the same name + are not used concurrently.""" + + def __init__(self): + self.buffer = {} + + def get_tensor(self, tensor_shape, dtype, name): + required_len = reduce(operator.mul, tensor_shape, 1) + if ( + self.buffer.get((name, dtype), None) is None + or self.buffer[(name, dtype)].numel() < required_len + ): + self.buffer[(name, dtype)] = torch.empty( + required_len, dtype=dtype, device=torch.cuda.current_device(), requires_grad=False + ) + + return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape) + + +def _kernel_make_viewless_tensor(inp, requires_grad): + '''Make a viewless tensor. + + View tensors have the undesirable side-affect of retaining a reference + to the originally-viewed tensor, even after manually setting the '.data' + field. This method creates a new tensor that links to the old tensor's + data, without linking the viewed tensor, referenced via the '._base' + field. + ''' + out = torch.empty((1,), dtype=inp.dtype, device=inp.device, requires_grad=requires_grad,) + out.data = inp.data + return out + + +class MakeViewlessTensor(torch.autograd.Function): + ''' + Autograd function to make a viewless tensor. + + This function should be used in cases where the computation graph needs + to be propagated, but we only want a viewless tensor (e.g., + ParallelTransformer's hidden_states). Call this function by passing + 'keep_graph = True' to 'make_viewless_tensor()'. + ''' + + @staticmethod + def forward(ctx, inp, requires_grad): + return _kernel_make_viewless_tensor(inp, requires_grad) + + @staticmethod + def backward(ctx, grad_output): + return grad_output, None + + +def make_viewless_tensor(inp, requires_grad, keep_graph): + ''' + Entry-point for creating viewless tensors. + + This method should be used, rather than calling 'MakeViewlessTensor' + or '_kernel_make_viewless_tensor' directly. This method acts as a + switch for determining if an autograd function or a regular method + should be used to create the tensor. 
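+
+    Passing keep_graph=True routes through the MakeViewlessTensor autograd
+    function so gradients still flow; keep_graph=False uses the plain
+    _kernel_make_viewless_tensor helper.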
+ ''' + + # return tensor as-is, if not a 'view' + if inp._base is None: + return inp + + # create viewless tensor + if keep_graph: + return MakeViewlessTensor.apply(inp, requires_grad) + else: + return _kernel_make_viewless_tensor(inp, requires_grad) + + +def assert_viewless_tensor(tensor, extra_msg=None): + '''Assert that a tensor is not a view (i.e., its '._base' field is + not set).''' + if isinstance(tensor, list): + [assert_viewless_tensor(t) for t in tensor] + return tensor + if not isinstance(tensor, torch.Tensor): + return tensor + assert tensor._base is None, ( + "Ensure tensor._base is None before setting tensor.data or storing " + "tensor to memory buffer. Otherwise, a memory leak will occur (and " + "likely accumulate over iterations). %s" + ) % extra_msg + return tensor + + +def safely_set_viewless_tensor_data(tensor, new_data_tensor): + '''Safely set tensor's '.data' field. + + Check first that the tensor is viewless (i.e., '._base' not set). If not, + raise an exception. + ''' + assert_viewless_tensor( + tensor, + extra_msg="FYI, tensor._base has shape %s, and new_data_tensor has shape %s." + % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape), + ) + tensor.data = new_data_tensor + + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def make_tp_sharded_tensor_for_checkpoint(tensor, key, tp_axis=0, replica_id=None, **kwargs): + """ Helper for instantiating a ShardedTensor where the `tp_axis` dimension is sharded across TP group. """ + + return ShardedTensor.from_rank_offsets( + key, + tensor, + ( + tp_axis, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ), + replica_id=parallel_state.get_data_parallel_rank() if replica_id is None else replica_id, + **kwargs, + ) + + +def make_sharded_tensor_for_checkpoint(tensor, key, **kwargs): + """ Helper for instantiating a non-sharded ShardedTensor (replicated across TP and DP group). """ + + return ShardedTensor.from_rank_offsets( + key, + tensor, + replica_id=parallel_state.get_data_parallel_rank() + * parallel_state.get_data_parallel_world_size() + + parallel_state.get_tensor_model_parallel_rank(), + **kwargs, + ) diff --git a/training/DeepSpeed-Domino/megatron/data/Makefile b/training/DeepSpeed-Domino/megatron/data/Makefile new file mode 100644 index 000000000..8f9db7686 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/Makefile @@ -0,0 +1,9 @@ +CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color +CPPFLAGS += $(shell python3 -m pybind11 --includes) +LIBNAME = helpers +LIBEXT = $(shell python3-config --extension-suffix) + +default: $(LIBNAME)$(LIBEXT) + +%$(LIBEXT): %.cpp + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/training/DeepSpeed-Domino/megatron/data/__init__.py b/training/DeepSpeed-Domino/megatron/data/__init__.py new file mode 100644 index 000000000..cd5f898c6 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/__init__.py @@ -0,0 +1 @@ +from . 
import indexed_dataset diff --git a/training/DeepSpeed-Domino/megatron/data/autoaugment.py b/training/DeepSpeed-Domino/megatron/data/autoaugment.py new file mode 100644 index 000000000..585a4fa6a --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/autoaugment.py @@ -0,0 +1,320 @@ +"""AutoAugment data augmentation policy for ImageNet. + +-- Begin license text. + +MIT License + +Copyright (c) 2018 Philip Popien + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-- End license text. + +Code adapted from https://github.com/DeepVoltaire/AutoAugment. + +This module implements the fixed AutoAugment data augmentation policy for ImageNet provided in +Appendix A, Table 9 of reference [1]. It does not include any of the search code for augmentation +policies. + +Reference: +[1] https://arxiv.org/abs/1805.09501 +""" + +import random + +import numpy as np +from PIL import Image +from PIL import ImageEnhance +from PIL import ImageOps + +_MAX_LEVEL = 10 # Maximum integer strength of an augmentation, if applicable. + + +class ImageNetPolicy: + """Definition of an ImageNetPolicy. + + Implements a fixed AutoAugment data augmentation policy targeted at + ImageNet training by randomly applying at runtime one of the 25 pre-defined + data augmentation sub-policies provided in Reference [1]. + + Usage example as a Pytorch Transform: + >>> transform=transforms.Compose([transforms.Resize(256), + >>> ImageNetPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + """Initialize an ImageNetPolicy. + + Args: + fillcolor (tuple): RGB color components of the color to be used for + filling when needed (default: (128, 128, 128), which + corresponds to gray). + """ + # Instantiate a list of sub-policies. + # Each entry of the list is a SubPolicy which consists of + # two augmentation operations, + # each of those parametrized as operation, probability, magnitude. + # Those two operations are applied sequentially on the image upon call. 
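+        # The 25 sub-policies below follow the fixed ImageNet policy from
+        # Appendix A, Table 9 of https://arxiv.org/abs/1805.09501 (reference [1]
+        # in the module docstring).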
+ self.policies = [ + SubPolicy("posterize", 0.4, 8, "rotate", 0.6, 9, fillcolor), + SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor), + SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor), + SubPolicy("posterize", 0.6, 7, "posterize", 0.6, 6, fillcolor), + SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor), + SubPolicy("equalize", 0.4, 4, "rotate", 0.8, 8, fillcolor), + SubPolicy("solarize", 0.6, 3, "equalize", 0.6, 7, fillcolor), + SubPolicy("posterize", 0.8, 5, "equalize", 1.0, 2, fillcolor), + SubPolicy("rotate", 0.2, 3, "solarize", 0.6, 8, fillcolor), + SubPolicy("equalize", 0.6, 8, "posterize", 0.4, 6, fillcolor), + SubPolicy("rotate", 0.8, 8, "color", 0.4, 0, fillcolor), + SubPolicy("rotate", 0.4, 9, "equalize", 0.6, 2, fillcolor), + SubPolicy("equalize", 0.0, 7, "equalize", 0.8, 8, fillcolor), + SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor), + SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor), + SubPolicy("rotate", 0.8, 8, "color", 1.0, 2, fillcolor), + SubPolicy("color", 0.8, 8, "solarize", 0.8, 7, fillcolor), + SubPolicy("sharpness", 0.4, 7, "invert", 0.6, 8, fillcolor), + SubPolicy("shearX", 0.6, 5, "equalize", 1.0, 9, fillcolor), + SubPolicy("color", 0.4, 0, "equalize", 0.6, 3, fillcolor), + SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor), + SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor), + SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor), + SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor), + SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor), + ] + + def __call__(self, img): + """Define call method for ImageNetPolicy class.""" + policy_idx = random.randint(0, len(self.policies) - 1) + return self.policies[policy_idx](img) + + def __repr__(self): + """Define repr method for ImageNetPolicy class.""" + return "ImageNetPolicy" + + +class SubPolicy: + """Definition of a SubPolicy. + + A SubPolicy consists of two augmentation operations, + each of those parametrized as operation, probability, magnitude. + The two operations are applied sequentially on the image upon call. + """ + + def __init__( + self, + operation1, + probability1, + magnitude_idx1, + operation2, + probability2, + magnitude_idx2, + fillcolor, + ): + """Initialize a SubPolicy. + + Args: + operation1 (str): Key specifying the first augmentation operation. + There are fourteen key values altogether (see supported_ops below + listing supported operations). probability1 (float): Probability + within [0., 1.] of applying the first augmentation operation. + magnitude_idx1 (int): Integer specifiying the strength of the first + operation as an index further used to derive the magnitude from a + range of possible values. + operation2 (str): Key specifying the second augmentation operation. + probability2 (float): Probability within [0., 1.] of applying the + second augmentation operation. + magnitude_idx2 (int): Integer specifiying the strength of the + second operation as an index further used to derive the magnitude + from a range of possible values. + fillcolor (tuple): RGB color components of the color to be used for + filling. + Returns: + """ + # List of supported operations for operation1 and operation2. 
+ supported_ops = [ + "shearX", + "shearY", + "translateX", + "translateY", + "rotate", + "color", + "posterize", + "solarize", + "contrast", + "sharpness", + "brightness", + "autocontrast", + "equalize", + "invert", + ] + assert (operation1 in supported_ops) and ( + operation2 in supported_ops + ), "SubPolicy:one of oper1 or oper2 refers to an unsupported operation." + + assert ( + 0.0 <= probability1 <= 1.0 and 0.0 <= probability2 <= 1.0 + ), "SubPolicy: prob1 and prob2 should be within [0., 1.]." + + assert ( + isinstance(magnitude_idx1, int) and 0 <= magnitude_idx1 <= 10 + ), "SubPolicy: idx1 should be specified as an integer within [0, 10]." + + assert ( + isinstance(magnitude_idx2, int) and 0 <= magnitude_idx2 <= 10 + ), "SubPolicy: idx2 should be specified as an integer within [0, 10]." + + # Define a dictionary where each key refers to a specific type of + # augmentation and the corresponding value is a range of ten possible + # magnitude values for that augmentation. + num_levels = _MAX_LEVEL + 1 + ranges = { + "shearX": np.linspace(0, 0.3, num_levels), + "shearY": np.linspace(0, 0.3, num_levels), + "translateX": np.linspace(0, 150 / 331, num_levels), + "translateY": np.linspace(0, 150 / 331, num_levels), + "rotate": np.linspace(0, 30, num_levels), + "color": np.linspace(0.0, 0.9, num_levels), + "posterize": np.round(np.linspace(8, 4, num_levels), 0).astype( + np.int + ), + "solarize": np.linspace(256, 0, num_levels), # range [0, 256] + "contrast": np.linspace(0.0, 0.9, num_levels), + "sharpness": np.linspace(0.0, 0.9, num_levels), + "brightness": np.linspace(0.0, 0.9, num_levels), + "autocontrast": [0] + * num_levels, # This augmentation doesn't use magnitude parameter. + "equalize": [0] + * num_levels, # This augmentation doesn't use magnitude parameter. + "invert": [0] + * num_levels, # This augmentation doesn't use magnitude parameter. + } + + def rotate_with_fill(img, magnitude): + """Define rotation transformation with fill. + + The input image is first rotated, then it is blended together with + a gray mask of the same size. Note that fillcolor as defined + elsewhere in this module doesn't apply here. + + Args: + magnitude (float): rotation angle in degrees. + Returns: + rotated_filled (PIL Image): rotated image with gray filling for + disoccluded areas unveiled by the rotation. + """ + rotated = img.convert("RGBA").rotate(magnitude) + rotated_filled = Image.composite( + rotated, Image.new("RGBA", rotated.size, (128,) * 4), rotated + ) + return rotated_filled.convert(img.mode) + + # Define a dictionary of augmentation functions where each key refers + # to a specific type of augmentation and the corresponding value defines + # the augmentation itself using a lambda function. 
+ # pylint: disable=unnecessary-lambda + func_dict = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, + fillcolor=fillcolor, + ), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0), + Image.BICUBIC, + fillcolor=fillcolor, + ), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + ( + 1, + 0, + magnitude * img.size[0] * random.choice([-1, 1]), + 0, + 1, + 0, + ), + fillcolor=fillcolor, + ), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + ( + 1, + 0, + 0, + 0, + 1, + magnitude * img.size[1] * random.choice([-1, 1]), + ), + fillcolor=fillcolor, + ), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance( + 1 + magnitude * random.choice([-1, 1]) + ), + "posterize": lambda img, magnitude: ImageOps.posterize( + img, magnitude + ), + "solarize": lambda img, magnitude: ImageOps.solarize( + img, magnitude + ), + "contrast": lambda img, magnitude: ImageEnhance.Contrast( + img + ).enhance(1 + magnitude * random.choice([-1, 1])), + "sharpness": lambda img, magnitude: ImageEnhance.Sharpness( + img + ).enhance(1 + magnitude * random.choice([-1, 1])), + "brightness": lambda img, magnitude: ImageEnhance.Brightness( + img + ).enhance(1 + magnitude * random.choice([-1, 1])), + "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img), + } + + # Store probability, function and magnitude of the first augmentation + # for the sub-policy. + self.probability1 = probability1 + self.operation1 = func_dict[operation1] + self.magnitude1 = ranges[operation1][magnitude_idx1] + + # Store probability, function and magnitude of the second augmentation + # for the sub-policy. + self.probability2 = probability2 + self.operation2 = func_dict[operation2] + self.magnitude2 = ranges[operation2][magnitude_idx2] + + def __call__(self, img): + """Define call method for SubPolicy class.""" + # Randomly apply operation 1. + if random.random() < self.probability1: + img = self.operation1(img, self.magnitude1) + + # Randomly apply operation 2. + if random.random() < self.probability2: + img = self.operation2(img, self.magnitude2) + + return img diff --git a/training/DeepSpeed-Domino/megatron/data/bert_dataset.py b/training/DeepSpeed-Domino/megatron/data/bert_dataset.py new file mode 100644 index 000000000..036e6bccc --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/bert_dataset.py @@ -0,0 +1,183 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""BERT Style dataset.""" + +import numpy as np +import torch + +from megatron import ( + get_args, + get_tokenizer, + mpu, + print_rank_0 +) +from megatron.data.dataset_utils import ( + get_samples_mapping, + get_a_and_b_segments, + truncate_segments, + create_tokens_and_tokentypes, + create_masked_lm_predictions +) + +class BertDataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, short_seq_prob, seed, binary_head): + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + self.binary_head = binary_head + + # Dataset. 
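# Editor's aside (illustrative sketch, not part of this patch): one way the
# ImageNetPolicy defined above is commonly plugged into a torchvision
# preprocessing pipeline. The module path, the constructor's default fillcolor,
# and the torchvision/PIL dependencies are assumptions, not taken from the diff.
from PIL import Image
from torchvision import transforms
from megatron.data.autoaugment import ImageNetPolicy  # assumed module path

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    ImageNetPolicy(),  # randomly applies one of the 25 sub-policies listed above
    transforms.ToTensor(),
])
augmented = train_transform(Image.open("example.jpg").convert("RGB"))  # hypothetical image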
+        self.indexed_dataset = indexed_dataset
+
+        # Build the samples mapping.
+        self.samples_mapping = get_samples_mapping(self.indexed_dataset,
+                                                   data_prefix,
+                                                   num_epochs,
+                                                   max_num_samples,
+                                                   self.max_seq_length - 3,  # account for added tokens
+                                                   short_seq_prob,
+                                                   self.seed,
+                                                   self.name,
+                                                   self.binary_head)
+
+        # Vocab stuff.
+        tokenizer = get_tokenizer()
+        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
+        self.vocab_id_to_token_dict = tokenizer.inv_vocab
+        self.cls_id = tokenizer.cls
+        self.sep_id = tokenizer.sep
+        self.mask_id = tokenizer.mask
+        self.pad_id = tokenizer.pad
+
+    def __len__(self):
+        return self.samples_mapping.shape[0]
+
+    def __getitem__(self, idx):
+        start_idx, end_idx, seq_length = self.samples_mapping[idx]
+        sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
+        # Note that this rng state should be numpy and not python since
+        # python randint is inclusive whereas the numpy one is exclusive.
+        # We % 2**32 since numpy requires the seed to be between 0 and 2**32 - 1
+        np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32))
+        return build_training_sample(sample, seq_length,
+                                     self.max_seq_length,  # needed for padding
+                                     self.vocab_id_list,
+                                     self.vocab_id_to_token_dict,
+                                     self.cls_id, self.sep_id,
+                                     self.mask_id, self.pad_id,
+                                     self.masked_lm_prob, np_rng,
+                                     self.binary_head)
+
+
+
+def build_training_sample(sample,
+                          target_seq_length, max_seq_length,
+                          vocab_id_list, vocab_id_to_token_dict,
+                          cls_id, sep_id, mask_id, pad_id,
+                          masked_lm_prob, np_rng, binary_head):
+    """Build training sample.
+
+    Arguments:
+        sample: A list of sentences in which each sentence is a list of token ids.
+        target_seq_length: Desired sequence length.
+        max_seq_length: Maximum length of the sequence. All values are padded to
+            this length.
+        vocab_id_list: List of vocabulary ids. Used to pick a random id.
+        vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
+        cls_id: Start of example id.
+        sep_id: Separator id.
+        mask_id: Mask token id.
+        pad_id: Padding token id.
+        masked_lm_prob: Probability to mask tokens.
+        np_rng: Random number generator. Note that this rng state should be
+            numpy and not python since python randint is inclusive for
+            the upper bound whereas the numpy one is exclusive.
+    """
+
+    if binary_head:
+        # We assume that we have at least two sentences in the sample
+        assert len(sample) > 1
+    assert target_seq_length <= max_seq_length
+
+    # Divide sample into two segments (A and B).
+    if binary_head:
+        tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample,
+                                                                  np_rng)
+    else:
+        tokens_a = []
+        for j in range(len(sample)):
+            tokens_a.extend(sample[j])
+        tokens_b = []
+        is_next_random = False
+
+    # Truncate to `target_sequence_length`.
+    max_num_tokens = target_seq_length
+    truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a),
+                                  len(tokens_b), max_num_tokens, np_rng)
+
+    # Build tokens and tokentypes.
+    tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b,
+                                                      cls_id, sep_id)
+
+    # Masking.
+    max_predictions_per_seq = masked_lm_prob * max_num_tokens
+    (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions(
+        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
+
+    # Padding.
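# Editor's aside (illustrative, not part of this patch): what the padding step
# below returns for a toy input, using pad_and_convert_to_numpy defined further
# down in this file. All token ids here are made up.
toy = pad_and_convert_to_numpy(tokens=[101, 7, 8, 9, 102],      # [CLS] a b c [SEP]
                               tokentypes=[0, 0, 0, 0, 0],
                               masked_positions=[2],
                               masked_labels=[8],
                               pad_id=0,
                               max_seq_length=8)
tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np = toy
# tokens_np       -> [101, 7, 8, 9, 102, 0, 0, 0]
# padding_mask_np -> [  1, 1, 1, 1,   1, 0, 0, 0]
# labels_np       -> [ -1, -1, 8, -1, -1, -1, -1, -1]  (label only at masked position 2)
# loss_mask_np    -> [  0,  0, 1,  0,  0,  0,  0,  0]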
+    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
+        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
+                                   masked_labels, pad_id, max_seq_length)
+
+    train_sample = {
+        'text': tokens_np,
+        'types': tokentypes_np,
+        'labels': labels_np,
+        'is_random': int(is_next_random),
+        'loss_mask': loss_mask_np,
+        'padding_mask': padding_mask_np,
+        'truncated': int(truncated)}
+    return train_sample
+
+
+def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
+                             masked_labels, pad_id, max_seq_length):
+    """Pad sequences and convert them to numpy."""
+
+    # Some checks.
+    num_tokens = len(tokens)
+    padding_length = max_seq_length - num_tokens
+    assert padding_length >= 0, \
+        f"num_tokens ({num_tokens}) is greater than " \
+        f"max_seq_length ({max_seq_length})."
+    assert len(tokentypes) == num_tokens
+    assert len(masked_positions) == len(masked_labels)
+
+    # Tokens and token types.
+    filler = [pad_id] * padding_length
+    tokens_np = np.array(tokens + filler, dtype=np.int64)
+    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)
+
+    # Padding mask.
+    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
+                               dtype=np.int64)
+
+    # Labels and loss mask.
+    labels = [-1] * max_seq_length
+    loss_mask = [0] * max_seq_length
+    for i in range(len(masked_positions)):
+        assert masked_positions[i] < num_tokens
+        labels[masked_positions[i]] = masked_labels[i]
+        loss_mask[masked_positions[i]] = 1
+    labels_np = np.array(labels, dtype=np.int64)
+    loss_mask_np = np.array(loss_mask, dtype=np.int64)
+
+    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
diff --git a/training/DeepSpeed-Domino/megatron/data/biencoder_dataset_utils.py b/training/DeepSpeed-Domino/megatron/data/biencoder_dataset_utils.py
new file mode 100644
index 000000000..c08f06792
--- /dev/null
+++ b/training/DeepSpeed-Domino/megatron/data/biencoder_dataset_utils.py
@@ -0,0 +1,209 @@
+import os
+import time
+
+import numpy as np
+import torch
+
+from megatron import get_args, get_tokenizer, print_rank_0
+from megatron.core import mpu, tensor_parallel
+from megatron.data.dataset_utils import create_masked_lm_predictions, \
+    pad_and_convert_to_numpy
+from megatron.data.data_samplers import MegatronPretrainingSampler
+
+def make_attention_mask(source_block, target_block):
+    """
+    Returns a 2-dimensional (2-D) attention mask
+    :param source_block: 1-D array
+    :param target_block: 1-D array
+    """
+    mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1)
+    mask = mask.astype(np.int64)
+    # (source_length, target_length)
+    return mask
+
+def get_one_epoch_dataloader(dataset, micro_batch_size=None):
+    """Specifically one epoch to be used in an indexing job."""
+    args = get_args()
+
+    if micro_batch_size is None:
+        micro_batch_size = args.micro_batch_size
+    num_workers = args.num_workers
+
+    # Use megatron's sampler with consumed samples set to 0 as
+    # this is only for evaluation and we don't intend to resume halfway.
+    # Also, set drop_last to False as we don't intend to drop
+    # the last (partial) batch.
+    batch_sampler = MegatronPretrainingSampler(
+        total_samples=len(dataset),
+        consumed_samples=0,
+        micro_batch_size=args.micro_batch_size,
+        data_parallel_rank=mpu.get_data_parallel_rank(),
+        data_parallel_size=mpu.get_data_parallel_world_size(),
+        drop_last=False)
+
+    return torch.utils.data.DataLoader(dataset,
+                                       batch_sampler=batch_sampler,
+                                       num_workers=num_workers,
+                                       pin_memory=True)
+
+
+def get_ict_batch(data_iterator):
+    # Items and their type.
+ keys = ['query_tokens', 'query_mask', + 'context_tokens', 'context_mask', 'block_data'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is None: + data = None + else: + data = next(data_iterator) + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + query_tokens = data_b['query_tokens'].long() + query_mask = data_b['query_mask'] < 0.5 + context_tokens = data_b['context_tokens'].long() + context_mask = data_b['context_mask'] < 0.5 + block_indices = data_b['block_data'].long() + + return query_tokens, query_mask,\ + context_tokens, context_mask, block_indices + + +def join_str_list(str_list): + """Join a list of strings, handling spaces appropriately""" + result = "" + for s in str_list: + if s.startswith("##"): + result += s[2:] + else: + result += " " + s + return result + + +class BlockSampleData(object): + """A struct for fully describing a fixed-size block of data as used in REALM + + :param start_idx: for first sentence of the block + :param end_idx: for last sentence of the block (may be partially truncated in sample construction) + :param doc_idx: the index of the document from which the block comes in the original indexed dataset + :param block_idx: a unique integer identifier given to every block. + """ + def __init__(self, start_idx, end_idx, doc_idx, block_idx): + self.start_idx = start_idx + self.end_idx = end_idx + self.doc_idx = doc_idx + self.block_idx = block_idx + + def as_array(self): + return np.array([self.start_idx, self.end_idx, self.doc_idx, self.block_idx]).astype(np.int64) + + def as_tuple(self): + return self.start_idx, self.end_idx, self.doc_idx, self.block_idx + + +class BlockSamplesMapping(object): + def __init__(self, mapping_array): + # make sure that the array is compatible with BlockSampleData + assert mapping_array.shape[1] == 4 + self.mapping_array = mapping_array + + def __len__(self): + return self.mapping_array.shape[0] + + def __getitem__(self, idx): + """Get the data associated with an indexed sample.""" + sample_data = BlockSampleData(*self.mapping_array[idx]) + return sample_data + + +def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs, + max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False): + """Get samples mapping for a dataset over fixed size blocks. This function also requires + a dataset of the titles for the source documents since their lengths must be taken into account. + + :return: samples_mapping (BlockSamplesMapping) + """ + + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) + indexmap_filename += '_{}msl'.format(max_seq_length) + indexmap_filename += '_{}s'.format(seed) + if use_one_sent_docs: + indexmap_filename += '_1sentok' + indexmap_filename += '.npy' + + # Build the indexed mapping if not exist. 
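# Editor's aside (illustrative, not part of this patch): the cache filename
# assembled just above, spelled out for made-up parameter values. With
# name="train", num_epochs left unset, max_num_samples=1_000_000,
# max_seq_length=512, seed=1234 and use_one_sent_docs=False, the mapping is
# looked up at:
example_filename = (
    "/data/wiki/wiki_sentence"   # data_prefix (hypothetical)
    + "_train_indexmap"          # name
    + "_1000000mns"              # max_num_samples
    + "_512msl"                  # max_seq_length
    + "_1234s"                   # seed
    + ".npy"
)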
+ if mpu.get_data_parallel_rank() == 0 and \ + not os.path.isfile(indexmap_filename): + print(' > WARNING: could not find index map file {}, building ' + 'the indices on rank 0 ...'.format(indexmap_filename)) + + # Make sure the types match the helpers input types. + assert block_dataset.doc_idx.dtype == np.int64 + assert block_dataset.sizes.dtype == np.int32 + + # Build samples mapping + verbose = torch.distributed.get_rank() == 0 + start_time = time.time() + print_rank_0(' > building samples index mapping for {} ...'.format( + name)) + + from megatron.data import helpers + mapping_array = helpers.build_blocks_mapping( + block_dataset.doc_idx, + block_dataset.sizes, + title_dataset.sizes, + num_epochs, + max_num_samples, + max_seq_length - 3, # account for added tokens + seed, + verbose, + use_one_sent_docs) + + + print_rank_0(' > done building samples index mapping') + np.save(indexmap_filename, mapping_array, allow_pickle=True) + print_rank_0(' > saved the index mapping in {}'.format( + indexmap_filename)) + # Make sure all the ranks have built the mapping + print_rank_0(' > elapsed time to build and save samples mapping ' + '(seconds): {:4f}'.format( + time.time() - start_time)) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + + mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r') + samples_mapping = BlockSamplesMapping(mapping_array) + + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + mapping_array.shape[0])) + + return samples_mapping diff --git a/training/DeepSpeed-Domino/megatron/data/blendable_dataset.py b/training/DeepSpeed-Domino/megatron/data/blendable_dataset.py new file mode 100644 index 000000000..8ff5ce3da --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/blendable_dataset.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Blendable dataset.""" + +import hashlib +import os +import time + +import numpy as np +import torch + +from megatron import print_rank_0 +from megatron.core import mpu + +class BlendableDataset(torch.utils.data.Dataset): + + + def __init__(self, datasets, weights, size, *, + data_cache_path=None): + + self.datasets = datasets + num_datasets = len(datasets) + assert num_datasets == len(weights) + + self.size = size + + # Normalize weights. + weights = np.array(weights, dtype=np.float64) + sum_weights = np.sum(weights) + assert sum_weights > 0.0 + weights /= sum_weights + + # Build indicies. 
+ def _build_indices(): + start_time = time.time() + assert num_datasets < 255 + dataset_index = np.zeros(self.size, dtype=np.uint8) + dataset_sample_index = np.zeros(self.size, dtype=np.int64) + + from megatron.data import helpers + helpers.build_blending_indices(dataset_index, dataset_sample_index, + weights, num_datasets, self.size, + torch.distributed.get_rank() == 0) + print_rank_0('> elapsed time for building blendable dataset indices: ' + '{:.2f} (sec)'.format(time.time() - start_time)) + return dataset_index, dataset_sample_index + + desc = "Blendable dataset\n\n" + desc += "Datasets:\n" + for dataset in datasets: + desc += dataset.desc + "\n\n" + desc += f"Weights: {weights}\n" + desc += f"Size: {size}\n" + self.desc = desc + + if data_cache_path: + desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest() + desc_path = os.path.join(data_cache_path, desc_hash + ".dsc") + index_path = os.path.join(data_cache_path, desc_hash + "_index.npy") + sample_index_path = os.path.join(data_cache_path, desc_hash + "_sample_index.npy") + cache_hit = os.path.isfile(index_path) and os.path.isfile(sample_index_path) + cache_success = True + if torch.distributed.get_rank() == 0 and not cache_hit: + print(' > WARNING: could not find index map files for blendable' + ' dataset, building indices on rank 0 ...', flush=True) + dataset_index, dataset_sample_index = _build_indices() + try: + os.makedirs(os.path.dirname(index_path), exist_ok=True) + with open(desc_path, 'wt') as fd: + fd.write(desc) + np.save(index_path, dataset_index, allow_pickle=True) + np.save(sample_index_path, dataset_sample_index, + allow_pickle=True) + except OSError: + print(f'There was an error trying to create the data cache directory ({data_cache_path})') + print('or a file in it. This is set with the --data-cache-path argument. Please') + print('ensure you have write access to this directory or specify one that you do have') + print('write access to.') + cache_success = False + + + counts = torch.cuda.LongTensor([cache_success]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + if counts[0].item() != ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())): + print_rank_0("Data index creation unsuccessful, exiting.") + exit() + + # Load on all ranks. 
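# Editor's aside (illustrative, not part of this patch): the cache entries being
# loaded just below are all keyed by the md5 of the description string built
# above. With a hypothetical --data-cache-path of /cache and a desc hash of
# "ab12...", the on-disk layout is:
#
#   /cache/ab12....dsc               # human-readable description (desc)
#   /cache/ab12..._index.npy         # dataset_index, one uint8 per blended sample
#   /cache/ab12..._sample_index.npy  # dataset_sample_index, one int64 per sample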
+ print_rank_0(f'> loading blendable dataset index: {index_path}') + self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r') + assert self.dataset_index.size == self.size + + print_rank_0(f'> loading blendable dataset sample index: {sample_index_path}') + self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r') + assert self.dataset_sample_index.size == self.size + else: + self.dataset_index, self.dataset_sample_index = _build_indices() + + + # Check size + _ = self.__getitem__(self.size - 1) + try: + _ = self.__getitem__(self.size) + raise RuntimeError('BlendedDataset size is improperly bounded') + except IndexError: + pass + print_rank_0('> size of blendable dataset: ' + '{} samples'.format(self.size)) + + + def __len__(self): + return self.size + + + def __getitem__(self, idx): + dataset_idx = self.dataset_index[idx] + sample_idx = self.dataset_sample_index[idx] + return { + "dataset_idx" : dataset_idx, + **self.datasets[dataset_idx][sample_idx], + } diff --git a/training/DeepSpeed-Domino/megatron/data/data_samplers.py b/training/DeepSpeed-Domino/megatron/data/data_samplers.py new file mode 100644 index 000000000..8dec2c192 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/data_samplers.py @@ -0,0 +1,186 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Dataloaders.""" + + +import random +import torch +import numpy as np +from torch.utils.data import Dataset +from megatron import get_args +from megatron.core import mpu + + +def build_pretraining_data_loader(dataset, consumed_samples): + """Buld dataloader given an input dataset.""" + + if dataset is None: + return None + args = get_args() + + # Megatron sampler + if args.dataloader_type == 'single': + batch_sampler = MegatronPretrainingSampler( + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size()) + elif args.dataloader_type == 'cyclic': + batch_sampler = MegatronPretrainingRandomSampler( + dataset, + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size(), + data_sharding=args.data_sharding) + else: + raise Exception('{} dataloader type is not supported.'.format( + args.dataloader_type)) + + # Torch dataloader. + return torch.utils.data.DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True) + +class MegatronPretrainingSampler: + + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, drop_last=True): + # Keep a copy of input params for later use. + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.drop_last = drop_last + + # Sanity checks. 
+ assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.consumed_samples < self.total_samples, \ + 'no samples left to consume: {}, {}'.format(self.consumed_samples, + self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + + def get_start_end_idx(self): + start_idx = self.data_parallel_rank * self.micro_batch_size + end_idx = start_idx + self.micro_batch_size + return start_idx, end_idx + + def __iter__(self): + batch = [] + # Last batch will be dropped if drop_last is not set False + for idx in range(self.consumed_samples, self.total_samples): + batch.append(idx) + if len(batch) == self.micro_batch_times_data_parallel_size: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + batch = [] + + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + + +class RandomSeedDataset(Dataset): + + def __init__(self, dataset): + args = get_args() + self.base_seed = args.seed + self.curr_seed = args.seed + self.dataset = dataset + + def __len__(self): + return len(self.dataset) + + def set_epoch(self, epoch): + self.curr_seed = self.base_seed + epoch + + def __getitem__(self, idx): + seed = idx + self.curr_seed + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + return self.dataset[idx] + + +class MegatronPretrainingRandomSampler: + + def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, data_sharding): + # Keep a copy of input params for later use. + self.dataset = dataset + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.data_parallel_size = data_parallel_size + self.data_sharding = data_sharding + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.last_batch_size = \ + self.total_samples % self.micro_batch_times_data_parallel_size + + # Sanity checks. 
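# Editor's aside (illustrative, not part of this patch): how the plain
# MegatronPretrainingSampler defined above carves each global batch into
# per-rank micro-batches. The values are made up; the class has no distributed
# dependencies, so this snippet runs as-is.
rank0 = MegatronPretrainingSampler(total_samples=8, consumed_samples=0,
                                   micro_batch_size=2,
                                   data_parallel_rank=0, data_parallel_size=2)
rank1 = MegatronPretrainingSampler(total_samples=8, consumed_samples=0,
                                   micro_batch_size=2,
                                   data_parallel_rank=1, data_parallel_size=2)
print(list(rank0))  # [[0, 1], [4, 5]]
print(list(rank1))  # [[2, 3], [6, 7]]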
+ assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + + def __iter__(self): + active_total_samples = self.total_samples - self.last_batch_size + self.epoch = self.consumed_samples // active_total_samples + current_epoch_samples = self.consumed_samples % active_total_samples + assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 + + if isinstance(self.dataset, RandomSeedDataset): + self.dataset.set_epoch(self.epoch) + + # data sharding and random sampling + if self.data_sharding: + bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ + * self.micro_batch_size + bucket_offset = current_epoch_samples // self.data_parallel_size + start_idx = self.data_parallel_rank * bucket_size + + g = torch.Generator() + g.manual_seed(self.epoch) + random_idx = torch.randperm(bucket_size, generator=g).tolist() + idx_range = [start_idx + x for x in random_idx[bucket_offset:]] + else: + full_bucket_size = (self.total_samples // self.micro_batch_size) \ + * self.micro_batch_size + full_bucket_offset = current_epoch_samples + g = torch.Generator() + g.manual_seed(self.epoch) + idx_range_total = \ + torch.randperm(full_bucket_size, generator=g).tolist() + idx_range_active = idx_range_total[full_bucket_offset:] + idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] + + batch = [] + # Last batch if not complete will be dropped. + for idx in idx_range: + batch.append(idx) + if len(batch) == self.micro_batch_size: + self.consumed_samples += self.micro_batch_times_data_parallel_size + yield batch + batch = [] diff --git a/training/DeepSpeed-Domino/megatron/data/dataset_utils.py b/training/DeepSpeed-Domino/megatron/data/dataset_utils.py new file mode 100644 index 000000000..571d3141e --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/dataset_utils.py @@ -0,0 +1,806 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, and NVIDIA. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Most of the code here has been copied from: +# https://github.com/google-research/albert/blob/master/create_pretraining_data.py +# with some modifications. 
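# Editor's aside (illustrative, not part of this patch): the weighted
# data-prefix format parsed by get_datasets_weights_and_num_samples below is an
# alternating list of weight and prefix. The paths and counts here are made up.
prefixes, weights, per_dataset_samples = get_datasets_weights_and_num_samples(
    data_prefix=["0.3", "/data/books/books_text_sentence",
                 "0.7", "/data/wiki/wiki_text_sentence"],
    train_valid_test_num_samples=[1000, 100, 10])
# weights             -> [0.3, 0.7]
# per_dataset_samples -> [[302, 31, 4], [704, 71, 8]]  (each count carries the 1.005 margin)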
+ +import math +import os +import time +import collections + +import numpy as np +import torch + +from megatron import ( + get_args, + print_rank_0 +) +from megatron.core import mpu +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + +DSET_TYPE_BERT = 'standard_bert' +DSET_TYPE_ICT = 'ict' +DSET_TYPE_T5 = 't5' +DSET_TYPE_MULTIMODAL = 'multimodal' + +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_MULTIMODAL] + + +def get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples): + + # The data prefix should be in the format of: + # weight-1, data-prefix-1, weight-2, data-prefix-2, .. + assert len(data_prefix) % 2 == 0 + num_datasets = len(data_prefix) // 2 + weights = [0]*num_datasets + prefixes = [0]*num_datasets + for i in range(num_datasets): + weights[i] = float(data_prefix[2*i]) + prefixes[i] = (data_prefix[2*i+1]).strip() + # Normalize weights + weight_sum = 0.0 + for weight in weights: + weight_sum += weight + assert weight_sum > 0.0 + weights = [weight / weight_sum for weight in weights] + + # Add 0.5% (the 1.005 factor) so in case the bleding dataset does + # not uniformly distribute the number of samples, we still have + # samples left to feed to the network. + if isinstance(train_valid_test_num_samples, list): + datasets_train_valid_test_num_samples = [] + for weight in weights: + datasets_train_valid_test_num_samples.append( + [int(math.ceil(val * weight * 1.005)) + for val in train_valid_test_num_samples]) + else: + # Used when separate dataset files are provided for train, + # valid and test + datasets_train_valid_test_num_samples = [ + int(math.ceil(train_valid_test_num_samples * weight * 1.005)) + for weight in weights] + + return prefixes, weights, datasets_train_valid_test_num_samples + + +def compile_helper(): + """Compile helper function ar runtime. Make sure this + is invoked on a single process.""" + import os + import subprocess + path = os.path.abspath(os.path.dirname(__file__)) + ret = subprocess.run(['make', '-C', path]) + if ret.returncode != 0: + print("Making C++ dataset helpers module failed, exiting.") + import sys + sys.exit(1) + + +def get_a_and_b_segments(sample, np_rng): + """Divide sample into a and b segments.""" + + # Number of sentences in the sample. + n_sentences = len(sample) + # Make sure we always have two sentences. + assert n_sentences > 1, 'make sure each sample has at least two sentences.' + + # First part: + # `a_end` is how many sentences go into the `A`. + a_end = 1 + if n_sentences >= 3: + # Note that randin in numpy is exclusive. 
+ a_end = np_rng.randint(1, n_sentences) + tokens_a = [] + for j in range(a_end): + tokens_a.extend(sample[j]) + + # Second part: + tokens_b = [] + for j in range(a_end, n_sentences): + tokens_b.extend(sample[j]) + + # Random next: + is_next_random = False + if np_rng.random() < 0.5: + is_next_random = True + tokens_a, tokens_b = tokens_b, tokens_a + + return tokens_a, tokens_b, is_next_random + + +def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): + """Truncates a pair of sequences to a maximum sequence length.""" + #print(len_a, len_b, max_num_tokens) + assert len_a > 0 + if len_a + len_b <= max_num_tokens: + return False + while len_a + len_b > max_num_tokens: + if len_a > len_b: + len_a -= 1 + tokens = tokens_a + else: + len_b -= 1 + tokens = tokens_b + if np_rng.random() < 0.5: + del tokens[0] + else: + tokens.pop() + return True + + +def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): + """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" + + tokens = [] + tokentypes = [] + # [CLS]. + tokens.append(cls_id) + tokentypes.append(0) + # Segment A. + for token in tokens_a: + tokens.append(token) + tokentypes.append(0) + # [SEP]. + tokens.append(sep_id) + tokentypes.append(0) + # Segment B. + for token in tokens_b: + tokens.append(token) + tokentypes.append(1) + if tokens_b: + # [SEP]. + tokens.append(sep_id) + tokentypes.append(1) + + return tokens, tokentypes + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def is_start_piece(piece): + """Check if the current word piece is the starting piece (BERT).""" + # When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + return not piece.startswith("##") + + +def create_masked_lm_predictions(tokens, + vocab_id_list, vocab_id_to_token_dict, + masked_lm_prob, + cls_id, sep_id, mask_id, + max_predictions_per_seq, + np_rng, + max_ngrams=3, + do_whole_word_mask=True, + favor_longer_ngram=False, + do_permutation=False, + geometric_dist=False, + masking_style="bert"): + """Creates the predictions for the masked LM objective. + Note: Tokens here are vocab ids and not text tokens.""" + + cand_indexes = [] + # Note(mingdachen): We create a list for recording if the piece is + # the starting piece of current token, where 1 means true, so that + # on-the-fly whole word masking is possible. + token_boundary = [0] * len(tokens) + + for (i, token) in enumerate(tokens): + if token == cls_id or token == sep_id: + token_boundary[i] = 1 + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. 
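# Editor's aside (illustrative, not part of this patch): the [CLS]/[SEP] layout
# produced by create_tokens_and_tokentypes defined above, for made-up ids.
tokens, tokentypes = create_tokens_and_tokentypes(tokens_a=[11, 12],
                                                  tokens_b=[21, 22, 23],
                                                  cls_id=101, sep_id=102)
# tokens     -> [101, 11, 12, 102, 21, 22, 23, 102]
# tokentypes -> [  0,  0,  0,   0,  1,  1,  1,   1]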
+ if (do_whole_word_mask and len(cand_indexes) >= 1 and + not is_start_piece(vocab_id_to_token_dict[token])): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + if is_start_piece(vocab_id_to_token_dict[token]): + token_boundary[i] = 1 + + output_tokens = list(tokens) + + masked_lm_positions = [] + masked_lm_labels = [] + + if masked_lm_prob == 0: + return (output_tokens, masked_lm_positions, + masked_lm_labels, token_boundary) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) + if not geometric_dist: + # Note(mingdachen): + # By default, we set the probilities to favor shorter ngram sequences. + pvals = 1. / np.arange(1, max_ngrams + 1) + pvals /= pvals.sum(keepdims=True) + if favor_longer_ngram: + pvals = pvals[::-1] + + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) + + (masked_lms, masked_spans) = ([], []) + covered_indexes = set() + for cand_index_set in ngram_indexes: + if len(masked_lms) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes: + continue + + if not geometric_dist: + n = np_rng.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + else: + # Sampling "n" from the geometric distribution and clipping it to + # the max_ngrams. Using p=0.2 default from the SpanBERT paper + # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) + n = min(np_rng.geometric(0.2), max_ngrams) + + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # Note(mingdachen): + # Repeatedly looking for a candidate that does not exceed the + # maximum number of predictions by trying shorter ngrams. + while len(masked_lms) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_token = None + if masking_style == "bert": + # 80% of the time, replace with [MASK] + if np_rng.random() < 0.8: + masked_token = mask_id + else: + # 10% of the time, keep original + if np_rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] + elif masking_style == "t5": + masked_token = mask_id + else: + raise ValueError("invalid value of masking style") + + output_tokens[index] = masked_token + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + + masked_spans.append(MaskedLmInstance( + index=index_set, + label=[tokens[index] for index in index_set])) + + assert len(masked_lms) <= num_to_predict + np_rng.shuffle(ngram_indexes) + + select_indexes = set() + if do_permutation: + for cand_index_set in ngram_indexes: + if len(select_indexes) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes or index in select_indexes: + continue + + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + + while len(select_indexes) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(select_indexes) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes or index in select_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + select_indexes.add(index) + assert len(select_indexes) <= num_to_predict + + select_indexes = sorted(select_indexes) + permute_indexes = list(select_indexes) + np_rng.shuffle(permute_indexes) + orig_token = list(output_tokens) + + for src_i, tgt_i in zip(select_indexes, permute_indexes): + output_tokens[src_i] = orig_token[tgt_i] + masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + # Sort the spans by the index of the first span + masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) + + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) + + +def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, + masked_labels, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. 
+ filler = [pad_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-1] * max_seq_length + loss_mask = [0] * max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + loss_mask[masked_positions[i]] = 1 + labels_np = np.array(labels, dtype=np.int64) + loss_mask_np = np.array(loss_mask, dtype=np.int64) + + return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np + + +def build_train_valid_test_datasets_with_prefixes(data_impl, + train_valid_test_num_samples, + max_seq_length, + seed, + skip_warmup, + train_data_prefix=None, + valid_data_prefix=None, + test_data_prefix=None, + binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert'): + print_rank_0("Separate data paths provided for train, valid & test.") + + train_dataset, valid_dataset, test_dataset = None, None, None + # Single dataset. + if train_data_prefix is not None: + train_dataset = build_dataset("train", train_data_prefix, data_impl, + train_valid_test_num_samples[0], + max_seq_length, seed, skip_warmup, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + if valid_data_prefix is not None: + valid_dataset = build_dataset("valid", valid_data_prefix, data_impl, + train_valid_test_num_samples[1], + max_seq_length, seed, False, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + if test_data_prefix is not None: + test_dataset = build_dataset("test", test_data_prefix, data_impl, + train_valid_test_num_samples[2], + max_seq_length, seed, False, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + return (train_dataset, valid_dataset, test_dataset) + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, seed, + skip_warmup, binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert'): + + if len(data_prefix) == 1: + return _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, seed, + skip_warmup, + binary_head, + max_seq_length_dec, + dataset_type=dataset_type) + # Blending dataset. + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + train_num_samples, valid_num_samples, test_num_samples = map( + sum, + zip(*datasets_train_valid_test_num_samples) + ) + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, seed, skip_warmup, binary_head, + max_seq_length_dec, dataset_type=dataset_type) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. 
+ blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights, train_num_samples) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_num_samples) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights, test_num_samples) + + return (blending_train_dataset, blending_valid_dataset, + blending_test_dataset) + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, seed, + skip_warmup, binary_head, + max_seq_length_dec, + dataset_type='standard_bert'): + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + dataset_type, + skip_warmup) + + # Get start and end indices of train/valid/train into doc-idx + # Note that doc-idx is desinged to be num-docs + 1 so we can + # easily iterate over it. + total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1 + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + start_index = indexed_dataset.doc_idx[splits[index]] + end_index = indexed_dataset.doc_idx[splits[index + 1]] + print_rank_0(' sentence indices in [{}, {}) total of {} ' + 'sentences'.format(start_index, end_index, + end_index - start_index)) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_split_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + # Get the pointer to the original doc-idx so we can set it later. + doc_idx_ptr = indexed_dataset.get_doc_idx() + # Slice the doc-idx + start_index = splits[index] + # Add +1 so we can index into the dataset to get the upper bound. + end_index = splits[index + 1] + 1 + # New doc_idx view. + indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) + + dataset = build_dataset( + name, data_prefix, data_impl, + train_valid_test_num_samples[index], max_seq_length, + seed, skip_warmup, binary_head, max_seq_length_dec, + dataset_type, indexed_dataset) + + # Set the original pointer so dataset remains the main dataset. + indexed_dataset.set_doc_idx(doc_idx_ptr) + # Checks. 
+ assert indexed_dataset.doc_idx[0] == 0 + assert indexed_dataset.doc_idx.shape[0] == \ + (total_num_of_documents + 1) + return dataset + + train_dataset = build_split_dataset(0, 'train') + valid_dataset = build_split_dataset(1, 'valid') + test_dataset = build_split_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def build_dataset(name, data_prefix, data_impl, max_num_samples, + max_seq_length, seed, skip_warmup, binary_head, + max_seq_length_dec, dataset_type='standard_bert', + indexed_dataset=None): + + from megatron.data.bert_dataset import BertDataset + from megatron.data.ict_dataset import ICTDataset + from megatron.data.t5_dataset import T5Dataset + from megatron.data.multimodal_dataset import MultiModalDataset + + if dataset_type not in DSET_TYPES: + raise ValueError("Invalid dataset_type: ", dataset_type) + + if indexed_dataset is None: + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + dataset_type, + skip_warmup) + + kwargs = dict( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=max_num_samples, + max_seq_length=max_seq_length, + seed=seed, + ) + + if dataset_type == DSET_TYPE_ICT: + args = get_args() + + title_dataset = get_indexed_dataset_( + args.titles_data_path, + data_impl, + dataset_type, + skip_warmup) + + dataset = ICTDataset( + block_dataset=indexed_dataset, + title_dataset=title_dataset, + query_in_block_prob=args.query_in_block_prob, + use_one_sent_docs=args.use_one_sent_docs, + binary_head=binary_head, + **kwargs + ) + elif dataset_type == DSET_TYPE_T5: + args = get_args() + dataset = T5Dataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=args.mask_prob, + max_seq_length_dec=max_seq_length_dec, + short_seq_prob=args.short_seq_prob, + **kwargs + ) + elif dataset_type == DSET_TYPE_BERT: + args = get_args() + dataset = BertDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + binary_head=binary_head, + **kwargs + ) + elif dataset_type == DSET_TYPE_MULTIMODAL: + args = get_args() + dataset = MultiModalDataset( + name=name, + data_prefix=data_prefix, + indexed_dataset=indexed_dataset, + num_samples=max_num_samples, + seq_length=max_seq_length, + seed=seed, + img_h=args.img_h, + img_w=args.img_w, + ) + else: + raise NotImplementedError("Dataset type not fully implemented.") + + return dataset + + +def get_indexed_dataset_(data_prefix, data_impl, dataset_type, skip_warmup): + + print_rank_0(' > building dataset index ...') + + start_time = time.time() + multimodal = dataset_type == DSET_TYPE_MULTIMODAL + indexed_dataset = make_indexed_dataset(data_prefix, + data_impl, + skip_warmup, + multimodal) + assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + + print_rank_0(' > indexed dataset stats:') + print_rank_0(' number of documents: {}'.format( + indexed_dataset.doc_idx.shape[0] - 1)) + print_rank_0(' number of sentences: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +def get_train_valid_test_split_(splits_string, size): + """ Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(',') != -1: + splits = [float(s) for s in splits_string.split(',')] + elif splits_string.find('/') != -1: + splits = [float(s) for s in splits_string.split('/')] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.) 
+ splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index + +def get_samples_mapping(indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + max_seq_length, + short_seq_prob, + seed, + name, + binary_head): + """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" + + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) + indexmap_filename += '_{}msl'.format(max_seq_length) + indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) + indexmap_filename += '_{}s'.format(seed) + indexmap_filename += '.npy' + + # Build the indexed mapping if not exist. + if torch.distributed.get_rank() == 0 and \ + not os.path.isfile(indexmap_filename): + print(' > WARNING: could not find index map file {}, building ' + 'the indices on rank 0 ...'.format(indexmap_filename)) + + # Make sure the types match the helpers input types. + assert indexed_dataset.doc_idx.dtype == np.int64 + assert indexed_dataset.sizes.dtype == np.int32 + + # Build samples mapping + verbose = torch.distributed.get_rank() == 0 + start_time = time.time() + print_rank_0(' > building samples index mapping for {} ...'.format( + name)) + # First compile and then import. + from megatron.data import helpers + samples_mapping = helpers.build_mapping( + indexed_dataset.doc_idx, + indexed_dataset.sizes, + num_epochs, + max_num_samples, + max_seq_length, + short_seq_prob, + seed, + verbose, + 2 if binary_head else 1) + print_rank_0(' > done building samples index maping') + np.save(indexmap_filename, samples_mapping, allow_pickle=True) + print_rank_0(' > saved the index mapping in {}'.format( + indexmap_filename)) + # Make sure all the ranks have built the mapping + print_rank_0(' > elasped time to build and save samples mapping ' + '(seconds): {:4f}'.format( + time.time() - start_time)) + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + + # Load indexed dataset. 
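# Editor's aside (illustrative, not part of this patch): a worked example of
# get_train_valid_test_split_ defined earlier in this file. A "949,50,1" split
# over 10000 documents becomes cumulative document boundaries:
print(get_train_valid_test_split_("949,50,1", 10000))
# -> [0, 9490, 9990, 10000]   (train / valid / test document index ranges)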
+ print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + samples_mapping.shape[0])) + + return samples_mapping diff --git a/training/DeepSpeed-Domino/megatron/data/gpt_dataset.py b/training/DeepSpeed-Domino/megatron/data/gpt_dataset.py new file mode 100644 index 000000000..088748bc9 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/gpt_dataset.py @@ -0,0 +1,590 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""GPT style dataset.""" + +import hashlib +import os +import time + +import numpy as np +import torch + +from megatron import print_rank_0 +from megatron.core import mpu +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples +from megatron.data.dataset_utils import get_train_valid_test_split_ +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + train_data_prefix=None, + valid_data_prefix=None, + test_data_prefix=None, + return_doc_ids=False, *, + data_cache_path=None): + """Build train, valid, and test datasets.""" + + if data_prefix: + print_rank_0("Single data path provided for train, valid & test") + + # Single dataset. + if len(data_prefix) == 1: + return _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + data_cache_path=data_cache_path) + + # Blending dataset. + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + train_num_samples, valid_num_samples, test_num_samples = map( + sum, + zip(*datasets_train_valid_test_num_samples) + ) + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup, + return_doc_ids, + data_cache_path=data_cache_path) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. + blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights, train_num_samples, + data_cache_path=data_cache_path) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_num_samples, + data_cache_path=data_cache_path) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights, test_num_samples, + data_cache_path=data_cache_path) + + return (blending_train_dataset, blending_valid_dataset, + blending_test_dataset) + + else: + print_rank_0("Separate data paths provided for train, valid & test. Split string will be ignored.") + + train_dataset, valid_dataset, test_dataset = None, None, None + # Single dataset. 
+ if train_data_prefix is not None: + train_dataset = build_dataset("train", train_data_prefix, data_impl, + splits_string, + train_valid_test_num_samples[0], + seq_length, seed, skip_warmup, + data_cache_path=data_cache_path) + + if valid_data_prefix is not None: + valid_dataset = build_dataset("valid", valid_data_prefix, data_impl, + splits_string, + train_valid_test_num_samples[1], + seq_length, seed, False, + data_cache_path=data_cache_path) + + + if test_data_prefix is not None: + test_dataset = build_dataset("test", test_data_prefix, data_impl, + splits_string, + train_valid_test_num_samples[2], + seq_length, seed, False, + data_cache_path=data_cache_path) + + return (train_dataset, valid_dataset, test_dataset) + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + return_doc_ids=False, *, + data_cache_path=None): + """Build train, valid, and test datasets.""" + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], + step=1, dtype=np.int32) + dataset = GPTDataset(name, data_prefix, documents, indexed_dataset, + splits_string, + train_valid_test_num_samples[index], + seq_length, seed, + return_doc_ids, + data_cache_path=data_cache_path) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def build_dataset(dataset_name, data_prefix, data_impl, + splits_string, num_samples, + seq_length, seed, skip_warmup, + *, + data_cache_path=None): + dataset = None + if len(data_prefix) == 1: + dataset = _build_dataset(dataset_name, data_prefix[0], data_impl, + splits_string, num_samples, seq_length, + seed, skip_warmup, + data_cache_path=data_cache_path) + else: + # Blending dataset. + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, num_samples) + prefixes, weights, dataset_num_samples = output + num_samples = sum(dataset_num_samples) + + # Build individual datasets. + datasets = [] + for i in range(len(prefixes)): + ds = _build_dataset(dataset_name, prefixes[i], data_impl, + splits_string, dataset_num_samples[i], + seq_length, seed, skip_warmup, + data_cache_path=data_cache_path) + if ds: + datasets.append(ds) + + if datasets: + dataset = BlendableDataset(datasets, weights, num_samples, + data_cache_path=data_cache_path) + + return dataset + + +def _build_dataset(dataset_name, data_prefix, data_impl, splits_string, + num_samples, seq_length, seed, skip_warmup, + *, + data_cache_path=None): + """ + Build dataset. This method is called when individual + train, valid, test datasets are provided + """ + + # Indexed dataset. 
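# Editor's aside (illustrative, not part of this patch): a minimal sketch of how
# the GPT builder above is typically invoked from a pretraining script. The
# path, sample counts and sequence length are made up, and "mmap" is assumed to
# be the usual indexed-dataset backend name.
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
    data_prefix=["/data/wiki/wiki_text_document"],  # hypothetical preprocessed prefix
    data_impl="mmap",
    splits_string="949,50,1",
    train_valid_test_num_samples=[1_000_000, 10_000, 1_000],
    seq_length=2048,
    seed=1234,
    skip_warmup=True)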
+ indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + + print_rank_0(' {}:'.format(dataset_name)) + print_rank_0(' document indices in [0, {}) total of {} ' + 'documents'.format(total_num_of_documents, total_num_of_documents)) + + documents = np.arange(start=0, stop=total_num_of_documents, + step=1, dtype=np.int32) + + dataset = GPTDataset(dataset_name, data_prefix, documents, indexed_dataset, + splits_string, num_samples, seq_length, seed, + data_cache_path=data_cache_path) + + return dataset + + +def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + + start_time = time.time() + indexed_dataset = make_indexed_dataset(data_prefix, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +class GPTDataset(torch.utils.data.Dataset): + + def __init__(self, name, data_prefix, documents, indexed_dataset, + splits_string, num_samples, seq_length, seed, + return_doc_ids=False, *, + data_cache_path=None): + + self.name = name + self.indexed_dataset = indexed_dataset + self.return_doc_ids = return_doc_ids + + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = \ + _build_index_mappings(self.name, data_prefix, + documents, self.indexed_dataset.sizes, + splits_string, num_samples, seq_length, seed, + data_cache_path=data_cache_path) + + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def __getitem__(self, idx): + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + doc_ids = [] + if doc_index_f == doc_index_l: + doc_ids.append(self.doc_idx[doc_index_f]) + sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1) + else: + # Otherwise, get the rest of the initial document. + doc_ids.append(self.doc_idx[doc_index_f]) + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f)] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + doc_ids.append(self.doc_idx[i]) + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. 
+ doc_ids.append(self.doc_idx[doc_index_l]) + sample_list.append(self.indexed_dataset.get( + self.doc_idx[doc_index_l], + length=offset_l + 1)) + sample = np.concatenate(sample_list) + + if self.return_doc_ids: # for retro preprocessing + return {'text': np.array(sample, dtype=np.int64), + 'doc_ids': np.array(doc_ids, dtype=np.int64)} + else: + return {'text': np.array(sample, dtype=np.int64)} + + +def _build_index_mappings(name, data_prefix, documents, sizes, + splits_string, num_samples, seq_length, seed, + *, + data_cache_path): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. + desc = "GPT Dataset\n\n" + desc += f"Data prefix {data_prefix}\n" + desc += f"Dataset name {name}\n" + desc += f"Number of samples {num_samples}\n" + desc += f"Sequence length {seq_length}\n" + desc += f"Random seed {seed}\n" + desc += f"Split {splits_string}\n" + desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest() + desc_filename = desc_hash + ".dsc" + doc_idx_filename = desc_hash + '_doc_idx.npy' + sample_idx_filename = desc_hash + '_sample_idx.npy' + shuffle_idx_filename = desc_hash + '_shuffle_idx.npy' + + # Look for cache in main data dir first to avoid unnecessary + # duplication, then look in data-cache-path if specified, + # If nothing is found, use the last path looked in + build_indices = True + prefixes = [os.path.join(os.path.dirname(data_prefix), 'index-cache')] + if data_cache_path is not None: + prefixes.append(data_cache_path) + for prefix in prefixes: + idx_path = { + 'desc': os.path.join(prefix, desc_filename), + 'doc': os.path.join(prefix, doc_idx_filename), + 'sample': os.path.join(prefix, sample_idx_filename), + 'shuffle': os.path.join(prefix, shuffle_idx_filename) + } + for f in idx_path.values(): + if not os.path.isfile(f): + break + else: + # Found our files! + build_indices = False + break + data_cache_dir = os.path.dirname(idx_path['desc']) + data_cache_success = True + + # Build the indexed mapping if not exist. + if build_indices and torch.distributed.get_rank() == 0: + print_rank_0(' > WARNING: could not find index map files, building ' + 'the indices on rank 0 ...') + + # For the last epoch, decide whether include the entire epoch + # in the global shuffle or not. + + # If we need only one epoch, then separating last epoch does + # not mean anything. + if num_epochs == 1: + separate_last_epoch = False + print(' > only one epoch required, setting ' + 'separate_last_epoch to False', flush=True) + + else: + # Get the number of samples for the last epoch + num_samples_from_epochs_minus_one = ( + (num_epochs - 1) * tokens_per_epoch - 1) // seq_length + last_epoch_num_samples = num_samples - \ + num_samples_from_epochs_minus_one + assert last_epoch_num_samples >= 0, \ + 'last epoch number of samples should be non-negative.' + num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length + assert last_epoch_num_samples <= (num_samples_per_epoch + 1), \ + 'last epoch number of samples exceeded max value.' 
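
The index cache above is keyed purely by the md5 of the human-readable description string, so changing the data prefix, split, sequence length, seed, or requested sample count selects a different set of cache files. A small illustration with hypothetical values:

    import hashlib

    desc = "GPT Dataset\n\n"
    desc += "Data prefix /data/my-corpus_text_document\n"   # hypothetical prefix
    desc += "Dataset name train\n"
    desc += "Number of samples 1000000\n"
    desc += "Sequence length 2048\n"
    desc += "Random seed 1234\n"
    desc += "Split 969,30,1\n"

    desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest()
    print(desc_hash + ".dsc")              # description file
    print(desc_hash + "_doc_idx.npy")      # document ordering
    print(desc_hash + "_sample_idx.npy")   # (document index, offset) pairs
    print(desc_hash + "_shuffle_idx.npy")  # sample shuffle
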
+ # If we have less than 80% of the samples for the last epoch, + # seperate out the epoch and treat it differently. + # Note: the 80% number is just based on common sense and can + # be adjusted if needed. + separate_last_epoch = (last_epoch_num_samples < + int(0.80 * num_samples_per_epoch)) + if separate_last_epoch: + string = ' > last epoch number of samples ({}) is smaller '\ + 'than 80% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to True' + else: + string = ' > last epoch number of samples ({}) is larger '\ + 'than 80% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to False' + print(string.format(last_epoch_num_samples, + num_samples_per_epoch), flush=True) + + + try: + os.makedirs(data_cache_dir, exist_ok=True) + + # description + with open(idx_path['desc'], 'wt') as fd: + fd.write(desc) + + # doc-idx. + start_time = time.time() + doc_idx = _build_doc_idx(documents, num_epochs, np_rng, + separate_last_epoch) + np.save(idx_path['doc'], doc_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save doc-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + # First compile and then import. + from megatron.data import helpers + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, + num_epochs, tokens_per_epoch) + np.save(idx_path['sample'], sample_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save sample-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + if separate_last_epoch: + num_samples_ = num_samples_from_epochs_minus_one + else: + num_samples_ = sample_idx.shape[0] - 1 + shuffle_idx = _build_shuffle_idx(num_samples_, + sample_idx.shape[0] - 1, np_rng) + np.save(idx_path['shuffle'], shuffle_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save shuffle-idx mapping' + ' (seconds): {:4f}'.format(time.time() - start_time)) + except OSError: + print(f'There was an error trying to create the data cache directory ({data_cache_dir})') + print('or a file in it. This defaults to a directory "index-cache" within the directory') + print('the data files are in and can be set with the --data-cache-path argument. Please') + print('ensure you have write access to this directory or specify one that you do have') + print('write access to.') + data_cache_success = False + + counts = torch.cuda.LongTensor([data_cache_success]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + if counts[0].item() != ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())): + print_rank_0("Data index creation unsuccessful, exiting.") + exit() + + # Load mappings. 
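
The `separate_last_epoch` decision is easiest to see with concrete numbers. The values below are illustrative only; the formulas are the ones used above, and the epoch count is computed with the same rule as `_num_epochs`, which is defined further down in this file:

    # Illustrative numbers only.
    tokens_per_epoch = 1_000_000
    seq_length = 1024
    num_samples = 2_500            # requested training samples

    num_epochs = 0
    total_tokens = 0
    while True:                    # same rule as _num_epochs
        num_epochs += 1
        total_tokens += tokens_per_epoch
        if (total_tokens - 1) // seq_length >= num_samples:
            break

    num_samples_from_epochs_minus_one = (
        (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
    last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one
    num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length

    separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch)
    print(num_epochs, last_epoch_num_samples, num_samples_per_epoch,
          separate_last_epoch)
    # -> 3 547 976 True
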
+ start_time = time.time() + print_rank_0(f" > loading doc-idx mapping from {idx_path['doc']}") + doc_idx = np.load(idx_path['doc'], allow_pickle=True, mmap_mode='r') + + print_rank_0(f" > loading sample-idx mapping from {idx_path['sample']}") + sample_idx = np.load(idx_path['sample'], allow_pickle=True, mmap_mode='r') + + print_rank_0(f" > loading shuffle-idx mapping from {idx_path['shuffle']}") + shuffle_idx = np.load(idx_path['shuffle'], allow_pickle=True, mmap_mode='r') + + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + sample_idx.shape[0])) + print_rank_0(' total number of epochs: {}'.format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx, desc, desc_hash + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence lenght, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): + """Build an array with length = number-of-epochs * number-of-dcuments. + Each index is mapped to a corresponding document.""" + if not separate_last_epoch or num_epochs == 1: + doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) + doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) + return np.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_sample_idx(sizes, doc_idx, seq_length, + num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) + + # Index into sample_idx. + sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Begining offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. 
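
To make the doc-idx layout concrete, the snippet below restates the `_build_doc_idx` construction above for the simple (non-separated) case on a toy document set:

    import numpy as np

    documents = np.arange(4, dtype=np.int32)   # 4 documents
    num_epochs = 3
    np_rng = np.random.RandomState(seed=1234)

    # Same construction as _build_doc_idx with separate_last_epoch=False.
    doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
    doc_idx[:] = documents
    doc_idx = doc_idx.reshape(-1).astype(np.int32)
    np_rng.shuffle(doc_idx)

    print(doc_idx)   # 12 entries: every document id appears exactly 3 times,
                     # in one globally shuffled order
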
+ if remaining_seq_length <= 0: + doc_offset += (remaining_seq_length + doc_length - 1) + remaining_seq_length = 0 + else: + # Otherwise, start from the begining of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + print(' > building shuffle index with split [0, {}) and [{}, {}) ' + '...'.format(num_samples, num_samples, total_size), flush=True) + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + diff --git a/training/DeepSpeed-Domino/megatron/data/helpers.cpp b/training/DeepSpeed-Domino/megatron/data/helpers.cpp new file mode 100644 index 000000000..09f5f9762 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/helpers.cpp @@ -0,0 +1,701 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ + +/* Helper methods for fast index mapping builds */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +const int32_t LONG_SENTENCE_LEN = 512; + + +void build_blending_indices(py::array_t& dataset_index, + py::array_t& dataset_sample_index, + const py::array_t& weights, + const int32_t num_datasets, + const int64_t size, const bool verbose) { + /* Given multiple datasets and a weighting array, build samples + such that it follows those wieghts.*/ + + if (verbose) { + std::cout << "> building indices for blendable datasets ..." << std::endl; + } + + // Get the pointer access without the checks. + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto weights_ptr = weights.unchecked<1>(); + + // Initialize buffer for number of samples used for each dataset. + int64_t current_samples[num_datasets]; + for(int64_t i = 0; i < num_datasets; ++i) { + current_samples[i] = 0; + } + + // For each sample: + for(int64_t sample_idx = 0; sample_idx < size; ++sample_idx) { + + // Determine where the max error in sampling is happening. + auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); + int64_t max_error_index = 0; + double max_error = weights_ptr[0] * sample_idx_double - + static_cast(current_samples[0]); + for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) { + double error = weights_ptr[dataset_idx] * sample_idx_double - + static_cast(current_samples[dataset_idx]); + if (error > max_error) { + max_error = error; + max_error_index = dataset_idx; + } + } + + // Populate the indices. + dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; + + // Update the total samples. 
+ current_samples[max_error_index] += 1; + + } + + // print info + if (verbose) { + std::cout << " > sample ratios:" << std::endl; + for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) { + auto ratio = static_cast(current_samples[dataset_idx]) / + static_cast(size); + std::cout << " dataset " << dataset_idx << ", input: " << + weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; + } + } + +} + + +py::array build_sample_idx(const py::array_t& sizes_, + const py::array_t& doc_idx_, + const int32_t seq_length, + const int32_t num_epochs, + const int64_t tokens_per_epoch) { + /* Sample index (sample_idx) is used for gpt2 like dataset for which + the documents are flattened and the samples are built based on this + 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] + where [..., 0] contains the index into `doc_idx` and [..., 1] is the + starting offset in that document.*/ + + // Consistency checks. + assert(seq_length > 1); + assert(num_epochs > 0); + assert(tokens_per_epoch > 1); + + // Remove bound checks. + auto sizes = sizes_.unchecked<1>(); + auto doc_idx = doc_idx_.unchecked<1>(); + + // Mapping and it's length (1D). + int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; + int32_t* sample_idx = new int32_t[2*(num_samples+1)]; + + cout << " using:" << endl << std::flush; + cout << " number of documents: " << + doc_idx_.shape(0) / num_epochs << endl << std::flush; + cout << " number of epochs: " << num_epochs << + endl << std::flush; + cout << " sequence length: " << seq_length << + endl << std::flush; + cout << " total number of samples: " << num_samples << + endl << std::flush; + + // Index into sample_idx. + int64_t sample_index = 0; + // Index into doc_idx. + int64_t doc_idx_index = 0; + // Begining offset for each document. + int32_t doc_offset = 0; + // Start with first document and no offset. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + + while (sample_index <= num_samples) { + // Start with a fresh sequence. + int32_t remaining_seq_length = seq_length + 1; + while (remaining_seq_length != 0) { + // Get the document length. + auto doc_id = doc_idx[doc_idx_index]; + auto doc_length = sizes[doc_id] - doc_offset; + // And add it to the current sequence. + remaining_seq_length -= doc_length; + // If we have more than a full sequence, adjust offset and set + // remaining length to zero so we return from the while loop. + // Note that -1 here is for the same reason we have -1 in + // `_num_epochs` calculations. + if (remaining_seq_length <= 0) { + doc_offset += (remaining_seq_length + doc_length - 1); + remaining_seq_length = 0; + } else { + // Otherwise, start from the begining of the next document. + ++doc_idx_index; + doc_offset = 0; + } + } + // Record the sequence. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + } + + // Method to deallocate memory. + py::capsule free_when_done(sample_idx, [](void *mem_) { + int32_t *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. 
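
Both the pure-Python `_build_sample_idx` above and the C++ `build_sample_idx` in this file produce the same `[num_samples + 1, 2]` table, where consecutive rows bracket one training sample of `seq_length + 1` tokens. A condensed, standalone restatement of that loop on a toy three-document corpus:

    import numpy as np

    sizes = np.array([5, 3, 4], dtype=np.int32)     # tokens per document
    doc_idx = np.array([0, 1, 2], dtype=np.int32)   # one epoch, unshuffled for clarity
    seq_length = 4
    tokens_per_epoch = int(sizes.sum())                       # 12
    num_samples = (1 * tokens_per_epoch - 1) // seq_length    # 2

    sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)
    sample_index, doc_idx_index, doc_offset = 1, 0, 0
    while sample_index <= num_samples:
        remaining = seq_length + 1
        while remaining != 0:
            doc_length = sizes[doc_idx[doc_idx_index]] - doc_offset
            remaining -= doc_length
            if remaining <= 0:
                doc_offset += remaining + doc_length - 1
                remaining = 0
            else:
                doc_idx_index += 1
                doc_offset = 0
        sample_idx[sample_index] = (doc_idx_index, doc_offset)
        sample_index += 1

    print(sample_idx)
    # [[0 0]
    #  [0 4]
    #  [2 0]]
    # rows 0-1 bracket sample 0: tokens 0..4 of document 0 (seq_length + 1 = 5 tokens)
    # rows 1-2 bracket sample 1: the last token of doc 0, all of doc 1,
    #                            and the first token of doc 2
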
+ const auto byte_size = sizeof(int32_t); + return py::array(std::vector{num_samples+1, 2}, // shape + {2*byte_size, byte_size}, // C-style contiguous strides + sample_idx, // the data pointer + free_when_done); // numpy array references + +} + + +inline int32_t get_target_sample_len(const int32_t short_seq_ratio, + const int32_t max_length, + std::mt19937& rand32_gen) { + /* Training sample length. */ + if (short_seq_ratio == 0) { + return max_length; + } + const auto random_number = rand32_gen(); + if ((random_number % short_seq_ratio) == 0) { + return 2 + random_number % (max_length - 1); + } + return max_length; +} + + +template +py::array build_mapping_impl(const py::array_t& docs_, + const py::array_t& sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const double short_seq_prob, + const int32_t seed, + const bool verbose, + const int32_t min_num_sent) { + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(short_seq_prob >= 0.0); + assert(short_seq_prob <= 1.0); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + + // For efficiency, convert probability to ratio. Note: rand() generates int. + int32_t short_seq_ratio = 0; + if (short_seq_prob > 0) { + short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); + } + + if (verbose) { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << + endl << std::flush; + cout << " sentences range: [" << sent_start_index << + ", " << sent_end_index << ")" << endl << std::flush; + cout << " total number of sentences: " << num_sentences << + endl << std::flush; + cout << " number of epochs: " << num_epochs << + endl << std::flush; + cout << " maximum number of samples: " << max_num_samples << + endl << std::flush; + cout << " maximum sequence length: " << max_seq_length << + endl << std::flush; + cout << " short sequence probability: " << short_seq_prob << + endl << std::flush; + cout << " short sequence ration (1/prob): " << short_seq_ratio << + endl << std::flush; + cout << " seed: " << seed << endl << + std::flush; + } + + // Mapping and it's length (1D). + int64_t num_samples = -1; + DocIdx* maps = NULL; + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration=0; iteration<2; ++iteration) { + + // Set the seed so both iterations produce the same results. + std::mt19937 rand32_gen(seed); + + // Set the flag on second iteration. + second = (iteration == 1); + + // Counters: + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + + // Current map index. + uint64_t map_index = 0; + + // For each epoch: + for (int32_t epoch=0; epoch= max_num_samples) { + if (verbose && (!second)) { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." 
<< endl << std::flush; + } + break; + } + // For each document: + for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) { + if (num_remain_sent == 0) { + ++empty_docs; + } + if (num_remain_sent == 1) { + ++one_sent_docs; + } + } + + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent > 1) { + for (auto sent_index=sent_index_first; + sent_index < sent_index_last; ++sent_index) { + if (sizes[sent_index] > LONG_SENTENCE_LEN){ + if ((epoch == 0) && (!second)) { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + + // If we have more than two sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + auto target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + + // Loop through sentences. + for (auto sent_index=sent_index_first; + sent_index < sent_index_last; ++sent_index) { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and if not only one sentence is left in the document. + // and if we have at least two sentneces. + // and if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent > 1) && + (num_sent >= min_num_sent) ) || (num_remain_sent == 0)) { + + // Check for overflow. + if ((3 * map_index + 2) > + std::numeric_limits::max()) { + cout << "number of samples exceeded maximum " + << "allowed by type int64: " + << std::numeric_limits::max() + << endl; + throw std::overflow_error("Number of samples"); + } + + // Populate the map. + if (second) { + const auto map_index_0 = 3 * map_index; + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(target_seq_len); + } + + // Update indices / counters. + ++map_index; + prev_start_index = sent_index + 1; + target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + seq_len = 0; + num_sent = 0; + } + + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) { + if (verbose) { + cout << " number of empty documents: " << empty_docs << + endl << std::flush; + cout << " number of documents with one sentence: " << + one_sent_docs << endl << std::flush; + cout << " number of documents with long sentences: " << + long_sent_docs << endl << std::flush; + cout << " will create mapping for " << map_index << + " samples" << endl << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[3*map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. 
+ std::mt19937_64 rand64_gen(seed + 1); + for (auto i=(num_samples - 1); i > 0; --i) { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 3 * i; + const auto j0 = 3 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 3}, // shape + {3*byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references + +} + + +py::array build_mapping(const py::array_t& docs_, + const py::array_t& sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const double short_seq_prob, + const int seed, + const bool verbose, + const int32_t min_num_sent) { + + if (sizes_.size() > std::numeric_limits::max()) { + if (verbose) { + cout << " using uint64 for data mapping..." << endl << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } else { + if (verbose) { + cout << " using uint32 for data mapping..." << endl << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } +} + +template +py::array build_blocks_mapping_impl(const py::array_t& docs_, + const py::array_t& sizes_, + const py::array_t& titles_sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const int32_t seed, + const bool verbose, + const bool use_one_sent_blocks) { + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + auto titles_sizes = titles_sizes_.unchecked<1>(); + + if (verbose) { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << + endl << std::flush; + cout << " sentences range: [" << sent_start_index << + ", " << sent_end_index << ")" << endl << std::flush; + cout << " total number of sentences: " << num_sentences << + endl << std::flush; + cout << " number of epochs: " << num_epochs << + endl << std::flush; + cout << " maximum number of samples: " << max_num_samples << + endl << std::flush; + cout << " maximum sequence length: " << max_seq_length << + endl << std::flush; + cout << " seed: " << seed << endl << + std::flush; + } + + // Mapping and its length (1D). + int64_t num_samples = -1; + DocIdx* maps = NULL; + + // Acceptable number of sentences per block. + int min_num_sent = 2; + if (use_one_sent_blocks) { + min_num_sent = 1; + } + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. 
+ bool second = false; + for (int32_t iteration=0; iteration<2; ++iteration) { + + // Set the flag on second iteration. + second = (iteration == 1); + + // Current map index. + uint64_t map_index = 0; + + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + // For each epoch: + for (int32_t epoch=0; epoch= max_num_samples) { + if (verbose && (!second)) { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl << std::flush; + } + break; + } + // For each document: + for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + const auto target_seq_len = max_seq_length - titles_sizes[doc]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) { + if (num_remain_sent == 0) { + ++empty_docs; + } + if (num_remain_sent == 1) { + ++one_sent_docs; + } + } + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent >= min_num_sent) { + for (auto sent_index=sent_index_first; + sent_index < sent_index_last; ++sent_index) { + if (sizes[sent_index] > LONG_SENTENCE_LEN){ + if ((epoch == 0) && (!second)) { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + // If we have enough sentences and no long sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + + // Loop through sentences. + for (auto sent_index=sent_index_first; + sent_index < sent_index_last; ++sent_index) { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and there are an acceptable number of sentences left + // and if we have at least the minimum number of sentences. + // or if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent >= min_num_sent) && + (num_sent >= min_num_sent) ) || (num_remain_sent == 0)) { + + // Populate the map. + if (second) { + const auto map_index_0 = 4 * map_index; + // Each sample has 4 items: the starting sentence index, ending sentence index, + // the index of the document from which the block comes (used for fetching titles) + // and the unique id of the block (used for creating block indexes) + + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(doc); + maps[map_index_0 + 3] = static_cast(block_id); + } + + // Update indices / counters. + ++map_index; + ++block_id; + prev_start_index = sent_index + 1; + seq_len = 0; + num_sent = 0; + } + } // for (auto sent_index=sent_index_first; ... 
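
Each row of the mapping built here packs the four values described in the comment above; the ICT/REALM datasets later unpack them as (start sentence, end sentence, document, block id). Reading one made-up row:

    import numpy as np

    # Made-up row from a blocks mapping, following the 4-column layout above.
    row = np.array([128, 135, 17, 20425], dtype=np.uint32)
    start_sentence, end_sentence, doc, block_id = row
    print(f"block {block_id}: sentences [{start_sentence}, {end_sentence}) "
          f"of document {doc}")
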
+ } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) { + if (verbose) { + cout << " number of empty documents: " << empty_docs << + endl << std::flush; + cout << " number of documents with one sentence: " << + one_sent_docs << endl << std::flush; + cout << " number of documents with long sentences: " << + long_sent_docs << endl << std::flush; + cout << " will create mapping for " << map_index << + " samples" << endl << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[4*map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i=(num_samples - 1); i > 0; --i) { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 4 * i; + const auto j0 = 4 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + swap(maps[i0 + 3], maps[j0 + 3]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 4}, // shape + {4*byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references + +} + +py::array build_blocks_mapping(const py::array_t& docs_, + const py::array_t& sizes_, + const py::array_t& titles_sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const int seed, + const bool verbose, + const bool use_one_sent_blocks) { + + if (sizes_.size() > std::numeric_limits::max()) { + if (verbose) { + cout << " using uint64 for data mapping..." << endl << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } else { + if (verbose) { + cout << " using uint32 for data mapping..." 
<< endl << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } +} + +PYBIND11_MODULE(helpers, m) { + m.def("build_mapping", &build_mapping); + m.def("build_blocks_mapping", &build_blocks_mapping); + m.def("build_sample_idx", &build_sample_idx); + m.def("build_blending_indices", &build_blending_indices); +} diff --git a/training/DeepSpeed-Domino/megatron/data/ict_dataset.py b/training/DeepSpeed-Domino/megatron/data/ict_dataset.py new file mode 100644 index 000000000..6dac35ff9 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/ict_dataset.py @@ -0,0 +1,156 @@ +import itertools +import random + +import numpy as np +from torch.utils.data import Dataset + +from megatron import get_tokenizer +from megatron import get_args +from megatron.data.dataset_utils import get_indexed_dataset_ +from megatron.data.realm_dataset_utils import get_block_samples_mapping + +def make_attention_mask(source_block, target_block): + """ + Returns a 2-dimensional (2-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) + mask = mask.astype(np.int64) + # (source_length, target_length) + return mask + +def get_ict_dataset(use_titles=True, query_in_block_prob=1): + """Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block()) + rather than for training, since it is only built with a single epoch sample mapping. + """ + args = get_args() + block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True) + titles_dataset = get_indexed_dataset_(args.titles_data_path, 'mmap', True) + + kwargs = dict( + name='full', + block_dataset=block_dataset, + title_dataset=titles_dataset, + data_prefix=args.data_path, + num_epochs=1, + max_num_samples=None, + max_seq_length=args.seq_length, + seed=1, + query_in_block_prob=query_in_block_prob, + use_titles=use_titles, + use_one_sent_docs=args.use_one_sent_docs + ) + dataset = ICTDataset(**kwargs) + return dataset + + +class ICTDataset(Dataset): + """Dataset containing sentences and their blocks for an inverse cloze task.""" + def __init__(self, name, block_dataset, title_dataset, data_prefix, + num_epochs, max_num_samples, max_seq_length, query_in_block_prob, + seed, use_titles=True, use_one_sent_docs=False, binary_head=False): + self.name = name + self.seed = seed + self.max_seq_length = max_seq_length + self.query_in_block_prob = query_in_block_prob + self.block_dataset = block_dataset + self.title_dataset = title_dataset + self.rng = random.Random(self.seed) + self.use_titles = use_titles + self.use_one_sent_docs = use_one_sent_docs + + self.samples_mapping = get_block_samples_mapping( + block_dataset, title_dataset, data_prefix, num_epochs, + max_num_samples, max_seq_length, seed, name, use_one_sent_docs) + self.tokenizer = get_tokenizer() + self.vocab_id_list = list(self.tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_list = self.tokenizer.inv_vocab + self.cls_id = self.tokenizer.cls + self.sep_id = self.tokenizer.sep + self.mask_id = self.tokenizer.mask + self.pad_id = self.tokenizer.pad + + def __len__(self): + return len(self.samples_mapping) + + def __getitem__(self, idx): + """Get an ICT example of a pseudo-query and the block of text from which it was extracted""" + sample_data = self.samples_mapping[idx] + start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple() + + if self.use_titles: + title = 
self.title_dataset[int(doc_idx)] + title_pad_offset = 3 + len(title) + else: + title = None + title_pad_offset = 2 + block = [self.block_dataset[i] for i in range(start_idx, end_idx)] + assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1 + + # randint() is inclusive for Python rng + rand_sent_idx = self.rng.randint(0, len(block) - 1) + + # keep the query in the context query_in_block_prob fraction of the time. + if self.rng.random() < self.query_in_block_prob: + query = block[rand_sent_idx].copy() + else: + query = block.pop(rand_sent_idx) + + # still need to truncate because blocks are concluded when + # the sentence lengths have exceeded max_seq_length. + query = query[:self.max_seq_length - 2] + block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset] + + query_tokens, query_pad_mask = self.concat_and_pad_tokens(query) + context_tokens, context_pad_mask = self.concat_and_pad_tokens(block, title) + + query_mask = make_attention_mask(query_tokens, query_tokens) + context_mask = make_attention_mask(context_tokens, context_tokens) + + block_data = sample_data.as_array() + + sample = { + 'query_tokens': query_tokens, + 'query_mask': query_mask, + 'query_pad_mask': query_pad_mask, + 'context_tokens': context_tokens, + 'context_mask': context_mask, + 'context_pad_mask': context_pad_mask, + 'block_data': block_data, + } + + return sample + + def get_block(self, start_idx, end_idx, doc_idx): + """Get the IDs for an evidence block plus the title of the corresponding document""" + block = [self.block_dataset[i] for i in range(start_idx, end_idx)] + title = self.title_dataset[int(doc_idx)] + + block = list(itertools.chain(*block))[:self.max_seq_length - (3 + len(title))] + block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) + + return block_tokens, block_pad_mask + + def get_null_block(self): + """Get empty block and title - used in REALM pretraining""" + block, title = [], [] + block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) + + return block_tokens, block_pad_mask + + def concat_and_pad_tokens(self, tokens, title=None): + """Concat with special tokens and pad sequence to self.max_seq_length""" + tokens = list(tokens) + if title is None: + tokens = [self.cls_id] + tokens + [self.sep_id] + else: + title = list(title) + tokens = [self.cls_id] + title + [self.sep_id] + tokens + [self.sep_id] + assert len(tokens) <= self.max_seq_length + + num_pad = self.max_seq_length - len(tokens) + pad_mask = [1] * len(tokens) + [0] * num_pad + tokens += [self.pad_id] * num_pad + + return np.array(tokens), np.array(pad_mask) diff --git a/training/DeepSpeed-Domino/megatron/data/image_folder.py b/training/DeepSpeed-Domino/megatron/data/image_folder.py new file mode 100644 index 000000000..de15b29bf --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/image_folder.py @@ -0,0 +1,302 @@ +# BSD 3-Clause License +# +# Copyright (c) Soumith Chintala 2016, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# code taken from +# https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py +# added support for classes_fraction and data_per_class_fraction + +from torchvision.datasets import VisionDataset +from PIL import Image + +import os +import os.path +from typing import Any, Callable, cast, Dict, List, Optional, Tuple +import numpy as np + +def has_file_allowed_extension(filename: str, extensions: Tuple[str, ...]) -> bool: + """Checks if a file is an allowed extension. + Args: + filename (string): path to a file + extensions (tuple of strings): extensions to consider (lowercase) + Returns: + bool: True if the filename ends with one of given extensions + """ + return filename.lower().endswith(extensions) + + +def is_image_file(filename: str) -> bool: + """Checks if a file is an allowed image extension. + Args: + filename (string): path to a file + Returns: + bool: True if the filename ends with a known image extension + """ + return has_file_allowed_extension(filename, IMG_EXTENSIONS) + + +def make_dataset( + directory: str, + class_to_idx: Dict[str, int], + data_per_class_fraction: float, + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, +) -> List[Tuple[str, int]]: + """Generates a list of samples of a form (path_to_sample, class). + Args: + directory (str): root dataset directory + class_to_idx (Dict[str, int]): dictionary mapping class name to class index + extensions (optional): A list of allowed extensions. + Either extensions or is_valid_file should be passed. Defaults to None. + is_valid_file (optional): A function that takes path of a file + and checks if the file is a valid file + (used to check of corrupt files) both extensions and + is_valid_file should not be passed. Defaults to None. + Raises: + ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None. 
+ Returns: + List[Tuple[str, int]]: samples of a form (path_to_sample, class) + """ + instances = [] + directory = os.path.expanduser(directory) + both_none = extensions is None and is_valid_file is None + both_something = extensions is not None and is_valid_file is not None + if both_none or both_something: + raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time") + if extensions is not None: + def is_valid_file(x: str) -> bool: + return has_file_allowed_extension(x, cast(Tuple[str, ...], extensions)) + is_valid_file = cast(Callable[[str], bool], is_valid_file) + for target_class in sorted(class_to_idx.keys()): + class_index = class_to_idx[target_class] + target_dir = os.path.join(directory, target_class) + if not os.path.isdir(target_dir): + continue + local_instances = [] + for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)): + for fname in sorted(fnames): + path = os.path.join(root, fname) + if is_valid_file(path): + item = path, class_index + local_instances.append(item) + + instances.extend(local_instances[0:int(len(local_instances) * data_per_class_fraction)]) + + return instances + + +class DatasetFolder(VisionDataset): + """A generic data loader where the samples are arranged in this way: :: + root/class_x/xxx.ext + root/class_x/xxy.ext + root/class_x/[...]/xxz.ext + root/class_y/123.ext + root/class_y/nsdf3.ext + root/class_y/[...]/asd932_.ext + Args: + root (string): Root directory path. + loader (callable): A function to load a sample given its path. + extensions (tuple[string]): A list of allowed extensions. + both extensions and is_valid_file should not be passed. + transform (callable, optional): A function/transform that takes in + a sample and returns a transformed version. + E.g, ``transforms.RandomCrop`` for images. + target_transform (callable, optional): A function/transform that takes + in the target and transforms it. + is_valid_file (callable, optional): A function that takes path of a file + and check if the file is a valid file (used to check of corrupt files) + both extensions and is_valid_file should not be passed. + Attributes: + classes (list): List of the class names sorted alphabetically. + class_to_idx (dict): Dict with items (class_name, class_index). 
+ samples (list): List of (sample path, class_index) tuples + targets (list): The class_index value for each image in the dataset + """ + + def __init__( + self, + root: str, + loader: Callable[[str], Any], + extensions: Optional[Tuple[str, ...]] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + classes_fraction=1.0, + data_per_class_fraction=1.0, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> None: + super(DatasetFolder, self).__init__(root, transform=transform, + target_transform=target_transform) + self.classes_fraction = classes_fraction + self.data_per_class_fraction = data_per_class_fraction + classes, class_to_idx = self._find_classes(self.root) + samples = self.make_dataset(self.root, + class_to_idx, + self.data_per_class_fraction, + extensions, + is_valid_file) + if len(samples) == 0: + msg = "Found 0 files in subfolders of: {}\n".format(self.root) + if extensions is not None: + msg += "Supported extensions are: {}".format(",".join(extensions)) + raise RuntimeError(msg) + + self.loader = loader + self.extensions = extensions + self.total = len(samples) + self.classes = classes + self.class_to_idx = class_to_idx + self.samples = samples + self.targets = [s[1] for s in samples] + + @staticmethod + def make_dataset( + directory: str, + class_to_idx: Dict[str, int], + data_per_class_fraction: float, + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> List[Tuple[str, int]]: + return make_dataset(directory, + class_to_idx, + data_per_class_fraction, + extensions=extensions, + is_valid_file=is_valid_file) + + def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]: + """ + Finds the class folders in a dataset. + Args: + dir (string): Root directory path. + Returns: + tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary. + Ensures: + No class is a subdirectory of another. + """ + all_classes = [d.name for d in os.scandir(dir) if d.is_dir()] + classes = all_classes[0:int(len(all_classes) * self.classes_fraction)] + classes.sort() + class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} + return classes, class_to_idx + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Args: + index (int): Index + Returns: + tuple: (sample, target) where target is class_index of the target class. 
+ """ + curr_index = index + for x in range(self.total): + try: + path, target = self.samples[curr_index] + sample = self.loader(path) + break + except Exception as e: + curr_index = np.random.randint(0, self.total) + + if self.transform is not None: + sample = self.transform(sample) + if self.target_transform is not None: + target = self.target_transform(target) + + return sample, target + + def __len__(self) -> int: + return len(self.samples) + + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp') + + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') + + +# TODO: specify the return type +def accimage_loader(path: str) -> Any: + import accimage + try: + return accimage.Image(path) + except IOError: + # Potentially a decoding problem, fall back to PIL.Image + return pil_loader(path) + + +def default_loader(path: str) -> Any: + from torchvision import get_image_backend + if get_image_backend() == 'accimage': + return accimage_loader(path) + else: + return pil_loader(path) + + +class ImageFolder(DatasetFolder): + """A generic data loader where the images are arranged in this way: :: + root/dog/xxx.png + root/dog/xxy.png + root/dog/[...]/xxz.png + root/cat/123.png + root/cat/nsdf3.png + root/cat/[...]/asd932_.png + Args: + root (string): Root directory path. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + is_valid_file (callable, optional): A function that takes path of an Image file + and check if the file is a valid file (used to check of corrupt files) + Attributes: + classes (list): List of the class names sorted alphabetically. + class_to_idx (dict): Dict with items (class_name, class_index). + imgs (list): List of (image path, class_index) tuples + """ + + def __init__( + self, + root: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + classes_fraction=1.0, + data_per_class_fraction=1.0, + loader: Callable[[str], Any] = default_loader, + is_valid_file: Optional[Callable[[str], bool]] = None, + ): + super(ImageFolder, self).__init__(root, loader, IMG_EXTENSIONS if is_valid_file is None else None, + transform=transform, + target_transform=target_transform, + classes_fraction=classes_fraction, + data_per_class_fraction=data_per_class_fraction, + is_valid_file=is_valid_file) + self.imgs = self.samples + diff --git a/training/DeepSpeed-Domino/megatron/data/indexed_dataset.py b/training/DeepSpeed-Domino/megatron/data/indexed_dataset.py new file mode 100644 index 000000000..05ef5c4b2 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/indexed_dataset.py @@ -0,0 +1,623 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. 
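
The `classes_fraction` and `data_per_class_fraction` arguments added in image_folder.py above subsample at the class level (the first fraction of the scanned class directories) and within each kept class, respectively. A usage sketch; the path is hypothetical and the import assumes the script runs from the DeepSpeed-Domino tree:

    from torchvision import transforms
    from megatron.data.image_folder import ImageFolder

    # Hypothetical example: keep the first half of the scanned class
    # directories and roughly 10% of the images inside each kept class.
    train_data = ImageFolder(
        root='/data/imagenet/train',          # hypothetical path
        transform=transforms.ToTensor(),
        classes_fraction=0.5,
        data_per_class_fraction=0.1,
    )
    print(len(train_data), len(train_data.classes))
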
+ +from functools import lru_cache +import os +import shutil +import struct +from itertools import accumulate + +import numpy as np +import torch +from megatron import print_rank_0 + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ['lazy', 'cached', 'mmap'] + + +def infer_dataset_impl(path): + if IndexedDataset.exists(path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + if magic == IndexedDataset._HDR_MAGIC: + return 'cached' + elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: + return 'mmap' + else: + return None + else: + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + + +def make_builder(out_file, impl, vocab_size=None): + if impl == 'mmap': + return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) + else: + return IndexedDatasetBuilder(out_file) + + +def make_dataset(path, impl, skip_warmup=False, multimodal=False): + if not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + if impl == 'infer': + impl = infer_dataset_impl(path) + if impl == 'lazy' and IndexedDataset.exists(path): + return IndexedDataset(path) + elif impl == 'cached' and IndexedDataset.exists(path): + return IndexedCachedDataset(path) + elif impl == 'mmap' and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup, multimodal) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == 'mmap': + return MMapIndexedDataset.exists(path) + else: + return IndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float64, + 7: np.float32, + 8: np.uint16, +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + '.idx' + + +def data_file_path(prefix_path): + return prefix_path + '.bin' + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +class IndexedDataset(torch.utils.data.Dataset): + """Loader for IndexedDataset""" + _HDR_MAGIC = b'TNTIDX\x00\x00' + + def __init__(self, path): + super().__init__() + self.path = path + self.data_file = None + self.read_index(path) + + def read_index(self, path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + assert magic == self._HDR_MAGIC, ( + 'Index file doesn\'t match expected format. ' + 'Make sure that --dataset-impl is configured properly.' 
+ ) + version = f.read(8) + assert struct.unpack('= self._len: + raise IndexError('index out of range') + + def __del__(self): + if self.data_file: + self.data_file.close() + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if not self.data_file: + self.read_data(self.path) + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + return a + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] + size = sum(sizes) + a = np.empty(size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[start] * self.element_size) + self.data_file.readinto(a) + offsets = list(accumulate(sizes)) + sents = np.split(a, offsets[:-1]) + return sents + + def __len__(self): + return self._len + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + @staticmethod + def exists(path): + return ( + os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + ) + + @property + def supports_prefetch(self): + return False # avoid prefetching to save memory + + +class IndexedCachedDataset(IndexedDataset): + + def __init__(self, path): + super().__init__(path) + self.cache = None + self.cache_index = {} + + @property + def supports_prefetch(self): + return True + + def prefetch(self, indices): + if all(i in self.cache_index for i in indices): + return + if not self.data_file: + self.read_data(self.path) + indices = sorted(set(indices)) + total_size = 0 + for i in indices: + total_size += self.data_offsets[i + 1] - self.data_offsets[i] + self.cache = np.empty(total_size, dtype=self.dtype) + ptx = 0 + self.cache_index.clear() + for i in indices: + self.cache_index[i] = ptx + size = self.data_offsets[i + 1] - self.data_offsets[i] + a = self.cache[ptx: ptx + size] + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + ptx += size + if self.data_file: + # close and delete data file after prefetch so we can pickle + self.data_file.close() + self.data_file = None + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + ptx = self.cache_index[i] + np.copyto(a, self.cache[ptx: ptx + a.size]) + return a + elif isinstance(idx, slice): + # Hack just to make this work, can optimizer later if necessary + sents = [] + for i in range(*idx.indices(len(self))): + sents.append(self[i]) + return sents + + +class IndexedDatasetBuilder(object): + element_sizes = { + np.uint8: 1, + np.int8: 1, + np.int16: 2, + np.int32: 4, + np.int64: 8, + np.float32: 4, + np.float64: 8, + } + + def __init__(self, out_file, dtype=np.int32): + self.out_file = open(out_file, 'wb') + self.dtype = dtype + self.data_offsets = [0] + self.dim_offsets = [0] + self.sizes = [] + self.element_size = self.element_sizes[self.dtype] + self.doc_idx = [0] + + def add_item(self, tensor): + bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) + self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) + for s in tensor.size(): + self.sizes.append(s) + 
self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) + + def end_document(self): + self.doc_idx.append(len(self.sizes)) + + def merge_file_(self, another_file): + index = IndexedDataset(another_file) + assert index.dtype == self.dtype + + doc_offset = len(self.sizes) + + begin = self.data_offsets[-1] + for data_offset in index.data_offsets[1:]: + self.data_offsets.append(begin + data_offset) + self.sizes.extend(index.sizes) + + begin = self.dim_offsets[-1] + for dim_offset in index.dim_offsets[1:]: + self.dim_offsets.append(begin + dim_offset) + + self.doc_idx.extend((doc_offset + index.doc_idx)[1:]) + + with open(data_file_path(another_file), 'rb') as f: + while True: + data = f.read(1024) + if data: + self.out_file.write(data) + else: + break + + def finalize(self, index_file): + self.out_file.close() + index = open(index_file, 'wb') + index.write(b'TNTIDX\x00\x00') + index.write(struct.pack(' max_seq_length - 1: + enc_ids = enc_ids[0: max_seq_length - 1] + tokentypes_enc = tokentypes_enc[0: max_seq_length - 1] + + # [SEP]. + enc_ids.append(sep_id) + tokentypes_enc.append(0) + + num_tokens_enc = len(enc_ids) + # Padding. + padding_length = max_seq_length - len(enc_ids) + if padding_length > 0: + enc_ids.extend([pad_id] * padding_length) + tokentypes_enc.extend([pad_id] * padding_length) + + pad_mask = ([1] * num_tokens_enc) + ([0] * padding_length) + pad_mask = np.array(pad_mask, dtype=np.int64) + + return enc_ids, tokentypes_enc, pad_mask + + +def build_sample(row_id, context_ids, context_types, context_pad_mask): + """Convert to numpy and return a sample consumed by the batch producer.""" + + context_ids = np.array(context_ids, dtype=np.int64) + context_types = np.array(context_types, dtype=np.int64) + context_mask = make_attention_mask(context_ids, context_ids) + + sample = ({ + 'row_id': row_id, + 'context': context_ids, + 'context_mask': context_mask, + 'context_types': context_types, + 'context_pad_mask': context_pad_mask + }) + return sample + + +class OpenRetrievalEvidenceDataset(ABC, Dataset): + """Open Retrieval Evidence dataset class.""" + + def __init__(self, task_name, dataset_name, datapath, tokenizer, + max_seq_length): + # Store inputs. + self.task_name = task_name + self.dataset_name = dataset_name + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + print_rank_0(' > building {} dataset for {}:'.format(self.task_name, + self.dataset_name)) + # Process the files. 
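+        # The evidence file is a TSV with a header row and columns
+        # (doc_id, doc_text, title); see process_samples_from_single_path below.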
+ print_rank_0(datapath) + self.samples, self.id2text = self.process_samples_from_single_path( + datapath) + + args = get_args() + if args.sample_rate < 1: # subsample + k = int(len(self.samples) * args.sample_rate) + self.samples = random.sample(self.samples, k) + + print_rank_0(' >> total number of samples: {}'.format( + len(self.samples))) + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + row = self.samples[idx] + + context_ids, context_types, context_pad_mask = \ + build_tokens_types_paddings_from_text(row, self.tokenizer, + self.max_seq_length) + + sample = build_sample(row['doc_id'], + context_ids, + context_types, + context_pad_mask) + return sample + + @staticmethod + def process_samples_from_single_path(filename): + print_rank_0(' > Processing {} ...'.format(filename)) + total = 0 + + rows = [] + id2text = {} + + with open(filename) as tsvfile: + reader = csv.reader(tsvfile, delimiter='\t') + next(reader, None) # skip the headers + for row in reader: + # file format: doc_id, doc_text, title + doc_id = int(row[0]) + text = row[1] + title = row[2] + + rows.append({'doc_id': doc_id, + 'text': text, + 'title': title}) + + assert doc_id not in id2text + id2text[doc_id] = (text, title) + + total += 1 + if total % 100000 == 0: + print_rank_0(' > processed {} rows so far ...'.format( + total)) + + print_rank_0(' >> processed {} samples.'.format(len(rows))) + return rows, id2text diff --git a/training/DeepSpeed-Domino/megatron/data/realm_dataset_utils.py b/training/DeepSpeed-Domino/megatron/data/realm_dataset_utils.py new file mode 100644 index 000000000..21445573e --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/realm_dataset_utils.py @@ -0,0 +1,199 @@ +import os +import time + +import numpy as np +import torch + +from megatron import print_rank_0 +from megatron.core import mpu, tensor_parallel +from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy +from megatron import get_args, get_tokenizer, print_rank_0 + + +def get_one_epoch_dataloader(dataset, micro_batch_size=None): + """Specifically one epoch to be used in an indexing job.""" + args = get_args() + + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + if micro_batch_size is None: + micro_batch_size = args.micro_batch_size + global_batch_size = micro_batch_size * world_size + num_workers = args.num_workers + + sampler = torch.utils.data.SequentialSampler(dataset) + # importantly, drop_last must be False to get all the data. + assert False, 'DistributedBatchSampler deprecated, change the implementation' + from megatron.data.samplers import DistributedBatchSampler + batch_sampler = DistributedBatchSampler(sampler, + batch_size=global_batch_size, + drop_last=False, + rank=rank, + world_size=world_size) + + return torch.utils.data.DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + pin_memory=True) + + +def get_ict_batch(data_iterator): + # Items and their type. + keys = ['query_tokens', 'query_pad_mask', + 'block_tokens', 'block_pad_mask', 'block_data'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is None: + data = None + else: + data = next(data_iterator) + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. 
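+    # (broadcast_data has already replicated the batch from tensor-parallel
+    # rank 0 to its peers, so every rank unpacks identical tensors here.)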
+ query_tokens = data_b['query_tokens'].long() + query_pad_mask = data_b['query_pad_mask'].long() + block_tokens = data_b['block_tokens'].long() + block_pad_mask = data_b['block_pad_mask'].long() + block_indices = data_b['block_data'].long() + + return query_tokens, query_pad_mask,\ + block_tokens, block_pad_mask, block_indices + + +def join_str_list(str_list): + """Join a list of strings, handling spaces appropriately""" + result = "" + for s in str_list: + if s.startswith("##"): + result += s[2:] + else: + result += " " + s + return result + + +class BlockSampleData(object): + """A struct for fully describing a fixed-size block of data as used in REALM + + :param start_idx: for first sentence of the block + :param end_idx: for last sentence of the block (may be partially truncated in sample construction) + :param doc_idx: the index of the document from which the block comes in the original indexed dataset + :param block_idx: a unique integer identifier given to every block. + """ + def __init__(self, start_idx, end_idx, doc_idx, block_idx): + self.start_idx = start_idx + self.end_idx = end_idx + self.doc_idx = doc_idx + self.block_idx = block_idx + + def as_array(self): + return np.array([self.start_idx, self.end_idx, self.doc_idx, self.block_idx]).astype(np.int64) + + def as_tuple(self): + return self.start_idx, self.end_idx, self.doc_idx, self.block_idx + + +class BlockSamplesMapping(object): + def __init__(self, mapping_array): + # make sure that the array is compatible with BlockSampleData + assert mapping_array.shape[1] == 4 + self.mapping_array = mapping_array + + def __len__(self): + return self.mapping_array.shape[0] + + def __getitem__(self, idx): + """Get the data associated with an indexed sample.""" + sample_data = BlockSampleData(*self.mapping_array[idx]) + return sample_data + + +def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs, + max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False): + """Get samples mapping for a dataset over fixed size blocks. This function also requires + a dataset of the titles for the source documents since their lengths must be taken into account. + + :return: samples_mapping (BlockSamplesMapping) + """ + + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) + indexmap_filename += '_{}msl'.format(max_seq_length) + indexmap_filename += '_{}s'.format(seed) + if use_one_sent_docs: + indexmap_filename += '_1sentok' + indexmap_filename += '.npy' + + # Build the indexed mapping if not exist. + if mpu.get_data_parallel_rank() == 0 and \ + not os.path.isfile(indexmap_filename): + print(' > WARNING: could not find index map file {}, building ' + 'the indices on rank 0 ...'.format(indexmap_filename)) + + # Make sure the types match the helpers input types. 
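+        # (megatron.data.helpers is a compiled C++ extension and expects
+        # exactly these dtypes, hence the explicit asserts.)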
+ assert block_dataset.doc_idx.dtype == np.int64 + assert block_dataset.sizes.dtype == np.int32 + + # Build samples mapping + verbose = torch.distributed.get_rank() == 0 + start_time = time.time() + print_rank_0(' > building samples index mapping for {} ...'.format( + name)) + + from megatron.data import helpers + mapping_array = helpers.build_blocks_mapping( + block_dataset.doc_idx, + block_dataset.sizes, + title_dataset.sizes, + num_epochs, + max_num_samples, + max_seq_length - 3, # account for added tokens + seed, + verbose, + use_one_sent_docs) + + + print_rank_0(' > done building samples index mapping') + np.save(indexmap_filename, mapping_array, allow_pickle=True) + print_rank_0(' > saved the index mapping in {}'.format( + indexmap_filename)) + # Make sure all the ranks have built the mapping + print_rank_0(' > elapsed time to build and save samples mapping ' + '(seconds): {:4f}'.format( + time.time() - start_time)) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_data_parallel_group()) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + + mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r') + samples_mapping = BlockSamplesMapping(mapping_array) + + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + mapping_array.shape[0])) + + return samples_mapping diff --git a/training/DeepSpeed-Domino/megatron/data/realm_index.py b/training/DeepSpeed-Domino/megatron/data/realm_index.py new file mode 100644 index 000000000..1fa4a309e --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/realm_index.py @@ -0,0 +1,224 @@ +import itertools +import os +import pickle +import shutil + +import numpy as np +import torch + +from megatron import get_args +from megatron.core import mpu + + +def detach(tensor): + return tensor.detach().cpu().numpy() + + +class OpenRetreivalDataStore(object): + """ + Serializable data structure for holding data for blocks -- + embeddings and necessary metadata for Retriever + """ + def __init__(self, embedding_path=None, load_from_path=True, rank=None): + self.embed_data = dict() + if embedding_path is None: + args = get_args() + embedding_path = args.embedding_path + rank = args.rank + self.embedding_path = embedding_path + self.rank = rank + + if load_from_path: + self.load_from_file() + + block_data_name = os.path.splitext(self.embedding_path)[0] + self.temp_dir_name = block_data_name + '_tmp' + + def state(self): + return { + 'embed_data': self.embed_data, + } + + def clear(self): + """ + Clear the embedding data structures to save memory. + The metadata ends up getting used, and is also much smaller in + dimensionality so it isn't really worth clearing. 
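+        FaissMIPSIndex.add_embed_data() calls this once the embeddings have
+        been copied into the index, so only the index keeps them resident.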
+ """ + self.embed_data = dict() + + def load_from_file(self): + """Populate members from instance saved to file""" + + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print("\n> Unpickling BlockData", flush=True) + state_dict = pickle.load(open(self.embedding_path, 'rb')) + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print(">> Finished unpickling BlockData\n", flush=True) + + self.embed_data = state_dict['embed_data'] + + def add_block_data(self, row_id, block_embeds, allow_overwrite=False): + """ + Add data for set of blocks + :param row_id: 1D array of unique int ids for the blocks + :param block_embeds: 2D array of embeddings of the blocks + In the case of retriever this will be [start_idx, end_idx, doc_idx] + """ + for idx, embed in zip(row_id, block_embeds): + if not allow_overwrite and idx in self.embed_data: + raise ValueError("Unexpectedly tried to overwrite block data") + + self.embed_data[idx] = np.float16(embed) + + def save_shard(self): + """ + Save the block data that was created this in this process + """ + if not os.path.isdir(self.temp_dir_name): + os.makedirs(self.temp_dir_name, exist_ok=True) + + # save the data for each shard + with open('{}/{}.pkl'.format(self.temp_dir_name, self.rank), 'wb') \ + as writer: + pickle.dump(self.state(), writer) + + def merge_shards_and_save(self): + #Combine all the shards made using save_shard + shard_names = os.listdir(self.temp_dir_name) + seen_own_shard = False + + for fname in os.listdir(self.temp_dir_name): + shard_rank = int(os.path.splitext(fname)[0]) + if shard_rank == self.rank: + seen_own_shard = True + continue + + with open('{}/{}'.format(self.temp_dir_name, fname), 'rb') as f: + data = pickle.load(f) + old_size = len(self.embed_data) + shard_size = len(data['embed_data']) + + # add the shard's data and check to make sure there + # is no overlap + self.embed_data.update(data['embed_data']) + assert len(self.embed_data) == old_size + shard_size + + assert seen_own_shard + + # save the consolidated shards and remove temporary directory + with open(self.embedding_path, 'wb') as final_file: + pickle.dump(self.state(), final_file) + shutil.rmtree(self.temp_dir_name, ignore_errors=True) + + print("Finished merging {} shards for a total of {} embeds".format( + len(shard_names), len(self.embed_data)), flush=True) + + +class FaissMIPSIndex(object): + """ + Wrapper object for a BlockData which similarity search via FAISS under the hood + """ + def __init__(self, embed_size, embed_data=None, use_gpu=False): + self.embed_size = embed_size + self.embed_data = embed_data + self.use_gpu = use_gpu + + self.mips_index = None + self._set_mips_index() + + def _set_mips_index(self): + """ + Create a Faiss Flat index with inner product as the metric + to search against + """ + try: + import faiss + except ImportError: + raise Exception("Error: Please install faiss to use FaissMIPSIndex") + + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print("\n> Building index", flush=True) + + cpu_index = faiss.IndexFlatIP(self.embed_size) + + if self.use_gpu: + # create resources and config for GpuIndex + config = faiss.GpuMultipleClonerOptions() + config.shard = True + config.useFloat16 = True + gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config) + self.mips_index = faiss.IndexIDMap(gpu_index) + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print(">> Initialized index on GPU", flush=True) + else: + # 
CPU index supports IDs so wrap with IDMap + self.mips_index = faiss.IndexIDMap(cpu_index) + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print(">> Initialized index on CPU", flush=True) + + # if we were constructed with a BlockData, then automatically load it + # when the FAISS structure is built + if self.embed_data is not None: + self.add_embed_data(self.embed_data) + + def reset_index(self): + """Delete existing index and create a new""" + del self.mips_index + + # reset the block data so that _set_block_index will reload it as well + if self.embed_data is not None: + embed_data_path = self.embed_data.embedding_path + del self.embed_data + self.embed_data = OpenRetreivalDataStore(embed_data_path) + + self._set_mips_index() + + def update_index(self): + """Delete existing index and create a new""" + del self.mips_index + + # reset the block data so that _set_mips_index will reload it as well + if self.embed_data is not None: + self.embed_data.load_from_file() + self._set_mips_index() + + def add_embed_data(self, all_embed_data): + """Add the embedding of each block to the underlying FAISS index""" + + # this assumes the embed_data is a dict : {int: np.array} + block_indices, block_embeds = zip(*all_embed_data.embed_data.items()) + + # the embeddings have to be entered in as float32 even though the math + # internally is done with float16. + embeds_arr = np.float32(np.array(block_embeds)) + indices_arr = np.array(block_indices) + + # we no longer need the embedding data since it's in the index now + all_embed_data.clear() + + self.mips_index.add_with_ids(embeds_arr, indices_arr) + + if not mpu.model_parallel_is_initialized() or mpu.get_data_parallel_rank() == 0: + print(">>> Finished adding block data to index", flush=True) + + def search_mips_index(self, query_embeds, top_k, reconstruct=True): + """ + Get the top-k blocks by the index distance metric. + + :param reconstruct: if True: return a [num_queries x k x embed_dim] + array of blocks + if False: return [num_queries x k] array of + distances, and another for indices + """ + query_embeds = np.float32(detach(query_embeds)) + + if reconstruct: + # get the vectors themselves + top_k_block_embeds = self.mips_index.search_and_reconstruct(\ + query_embeds, top_k) + return top_k_block_embeds + else: + # get distances and indices of closest vectors + distances, block_indices = self.mips_index.search(query_embeds, top_k) + return distances, block_indices diff --git a/training/DeepSpeed-Domino/megatron/data/t5_dataset.py b/training/DeepSpeed-Domino/megatron/data/t5_dataset.py new file mode 100644 index 000000000..e60681490 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/t5_dataset.py @@ -0,0 +1,257 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""T5 Style dataset.""" + +import collections + +import numpy as np +import torch + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping +) + +class T5Dataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, max_seq_length_dec, + short_seq_prob, seed): + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + self.max_seq_length_dec = max_seq_length_dec + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. 
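+        # get_samples_mapping returns (and caches on disk as .npy) an array of
+        # (start_sentence_idx, end_sentence_idx, seq_length) triples, one per
+        # training sample; __getitem__ below indexes into it.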
+ self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. + tokenizer = get_tokenizer() + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_dict = tokenizer.inv_vocab + self.cls_id = tokenizer.cls + self.sep_id = tokenizer.sep + self.mask_id = tokenizer.mask + self.pad_id = tokenizer.pad + self.bos_id = tokenizer.bos_token_id + self.eos_id = tokenizer.eos_token_id + self.sentinel_tokens = tokenizer.additional_special_tokens_ids + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + + def __len__(self): + return self.samples_mapping.shape[0] + + def __getitem__(self, idx): + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + np_rng = np.random.RandomState(seed=(self.seed + idx)) + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_id, self.sep_id, + self.mask_id, self.pad_id, + self.masked_lm_prob, np_rng, + self.bos_id, self.eos_id, + self.sentinel_tokens) + + +def build_training_sample(sample, target_seq_length, + max_seq_length, max_seq_length_dec, + vocab_id_list, vocab_id_to_token_dict, + cls_id, sep_id, mask_id, pad_id, + masked_lm_prob, np_rng, bos_id=None, + eos_id=None, sentinel_tokens=None): + """Build training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_id: Start of example id. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + masked_lm_prob: Probability to mask tokens. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. + bos_id: start of decoder example id + eos_id: end of generation id + sentinel_tokens: unique value to be substituted for every replaced span + """ + + assert target_seq_length <= max_seq_length + + # flatten sentences into one list + tokens = [token for sentence in sample for token in sentence] + + # Truncate to `target_sequence_length`. + max_num_tokens = target_seq_length + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] + + # Masking. + max_predictions_per_seq = masked_lm_prob * max_num_tokens + (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( + tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + max_ngrams=10, geometric_dist=True, masking_style="t5") + + # Padding. 
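+    # (pad_and_convert_to_numpy also assembles the decoder input/output by
+    # replacing each masked span with a sentinel token, T5-style.)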
+ tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens) + + train_sample = { + 'text_enc': tokens_enc, + 'text_dec': tokens_dec_in, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'enc_mask': enc_mask, + 'dec_mask': dec_mask, + 'enc_dec_mask': enc_dec_mask, + } + return train_sample + + +def pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, + max_seq_length, max_seq_length_dec, + masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): + """Pad sequences and convert them to numpy.""" + + sentinel_tokens = collections.deque(sentinel_tokens) + t5_input = [] + (t5_decoder_in, t5_decoder_out) = ([bos_id], []) + (start_index, end_index) = (0, None) + for span in masked_spans: + flag = sentinel_tokens.popleft() + + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_in.extend(span.label) + t5_decoder_out.append(flag) + t5_decoder_out.extend(span.label) + + end_index = span.index[0] + t5_input.extend(tokens[start_index: end_index]) + t5_input.append(flag) + + # the next start index is the token after the last span token + start_index = span.index[-1] + 1 + + # Add token to the t5_decoder_out + t5_decoder_out.append(eos_id) + + # Add the remaining tokens to the t5 input + t5_input.extend(tokens[start_index:]) + + # assert (len(t5_input) - len(masked_spans)) + \ + # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) + + # Some checks. + + # Encoder-side padding mask. + num_tokens = len(t5_input) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + assert len(masked_positions) == len(masked_labels) + + # Tokens.. + filler = [pad_id] * padding_length + tokens_enc = np.array(t5_input + filler, dtype=np.int64) + + # Decoder-side padding mask. + num_tokens_dec = len(t5_decoder_in) + padding_length_dec = max_seq_length_dec - num_tokens_dec + assert padding_length_dec >= 0 + filler_dec = [pad_id] * padding_length_dec + tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) + + # Create attention masks + enc_mask = make_attention_mask(tokens_enc, tokens_enc) + enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) + dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) + dec_mask = dec_mask * make_history_mask(tokens_dec_in) + + # Labels mask. 
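+    # Padded label positions are filled with -1 and are also zeroed in
+    # loss_mask below, so they can be ignored by the loss.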
+ labels = t5_decoder_out + ([-1] * padding_length_dec) + labels = np.array(labels, dtype=np.int64) + + # Loss mask + loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) + loss_mask = np.array(loss_mask, dtype=np.int64) + + return tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask + + +def make_attention_mask(source_block, target_block): + """ + Returns a 2-dimensional (2-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) + mask = mask.astype(np.int64) + # (source_length, target_length) + return mask + + +def make_attention_mask_3d(source_block, target_block): + """ + Returns a 3-dimensional (3-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[:, None, :] >= 1) * (source_block[:, :, None] >= 1) + # (batch, source_length, target_length) + # mask = mask.astype(np.int64) + return mask + + +def make_history_mask(block): + length = block.shape[0] + arange = np.arange(length) + history_mask = (arange[None, ] <= arange[:, None]) + history_mask = history_mask.astype(np.int64) + return history_mask + + +def make_history_mask_3d(block): + batch, length = block.shape + arange = torch.arange(length, device=block.device) + history_mask = (arange[None, ] <= arange[:, None])[None, ] + history_mask = history_mask.expand(batch, length, length) + return history_mask diff --git a/training/DeepSpeed-Domino/megatron/data/test/test_indexed_dataset.py b/training/DeepSpeed-Domino/megatron/data/test/test_indexed_dataset.py new file mode 100644 index 000000000..12fec8d81 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/test/test_indexed_dataset.py @@ -0,0 +1,125 @@ +# This file isn't really a formal automated test, it's just a place to +# put some code used during development and manual testing of +# indexed_dataset. 
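+#
+# Example invocation (paths and file names here are hypothetical):
+#   python test_indexed_dataset.py --data my-corpus_text_document \
+#       --dataset-impl mmap --count 5 \
+#       --tokenizer-type GPT2BPETokenizer \
+#       --vocab-file gpt2-vocab.json --merge-file gpt2-merges.txt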
+ +from megatron.data import indexed_dataset +from megatron.tokenizer import build_tokenizer +import argparse +import os +import sys + +import torch + +script_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.join(script_dir, "../../../")) + + +def test_indexed_dataset(args): + ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) + tokenizer = build_tokenizer(args) + print(len(ds.doc_idx)) + print(len(ds)) + print(ds.doc_idx[-1]) + if ds.supports_prefetch: + # just prefetch the whole thing in test (so assume it is small) + ds.prefetch(range(len(ds))) + if args.count > len(ds.doc_idx) - 1: + args.count = len(ds.doc_idx) - 1 + + for i in range(args.count): + start = ds.doc_idx[i] + end = ds.doc_idx[i + 1] + ids = ds[start:end] + print(f"Document {i}:") + print("--------------") + for s in ids: + assert len(s) > 0 + l = s.data.tolist() + text = tokenizer.detokenize(l) + print(text) + print("---") + + +def test_indexed_dataset_get(args): + ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) + tokenizer = build_tokenizer(args) + size = ds.sizes[0] + print(f"size: {size}") + full = ds.get(0) + print(full) + # print(tokenizer.detokenize(full.data.tolist())) + print("---") + end = ds.get(0, offset=size - 10) + print(end) + # print(tokenizer.detokenize(end.data.tolist())) + + start = ds.get(0, length=10) + print(start) + # print(tokenizer.detokenize(start.data.tolist())) + + part = ds.get(0, offset=2, length=8) + print(part) + # print(tokenizer.detokenize(part.data.tolist())) + +# def test_albert_dataset(args): +# # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True) +# # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl) +# # ds = AlbertDataset(idataset, tokenizer) +# ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl, +# args.epochs, args.max_num_samples, +# args.masked_lm_prob, args.seq_length, +# args.short_seq_prob, args.seed) +# truncated = 0 +# total = 0 +# for i, s in enumerate(ds): +# ids = s['text'] +# tokens = ds.tokenizer.convert_ids_to_tokens(ids) +# print(tokens) +# if i >= args.count-1: +# exit() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--data', type=str, help='prefix to data files') + parser.add_argument('--dataset-impl', type=str, default='infer', + choices=['lazy', 'cached', 'mmap', 'infer']) + parser.add_argument('--count', type=int, default=10, + help='Number of samples/documents to print') + + group = parser.add_argument_group(title='tokenizer') + group.add_argument('--tokenizer-type', type=str, required=True, + choices=['BertWordPieceLowerCase', + 'GPT2BPETokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file (if necessary).') + + parser.add_argument('--epochs', type=int, default=5, + help='Number of epochs to plan for') + parser.add_argument('--max-num-samples', type=int, default=None, + help='Maximum number of samples to plan for') + parser.add_argument('--masked-lm-prob', type=float, default=0.15, + help='probability of masking tokens') + parser.add_argument('--seq-length', type=int, default=512, + help='maximum sequence length') + parser.add_argument('--short-seq-prob', type=float, default=0.1, + help='probability of creating a short sequence') + parser.add_argument('--seed', type=int, default=1234, + help='random seed') + args = parser.parse_args() + 
args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.tensor_model_parallel_size = 1 + + if args.dataset_impl == "infer": + args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data) + +# test_albert_dataset(args) + test_indexed_dataset_get(args) + + +if __name__ == "__main__": + main() diff --git a/training/DeepSpeed-Domino/megatron/data/test/test_preprocess_data.sh b/training/DeepSpeed-Domino/megatron/data/test/test_preprocess_data.sh new file mode 100755 index 000000000..d121c8595 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/test/test_preprocess_data.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +IMPL=cached +python ../preprocess_data.py \ + --input test_samples.json \ + --vocab vocab.txt \ + --dataset-impl ${IMPL} \ + --output-prefix test_samples_${IMPL} \ + --workers 1 \ + --log-interval 2 diff --git a/training/DeepSpeed-Domino/megatron/data/vit_dataset.py b/training/DeepSpeed-Domino/megatron/data/vit_dataset.py new file mode 100644 index 000000000..82391e915 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/data/vit_dataset.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +import os +import random +import numpy as np +import torch +import torchvision.transforms as T +from torchvision import datasets +from megatron import get_args +from megatron.data.image_folder import ImageFolder +from megatron.data.autoaugment import ImageNetPolicy +from megatron.data.data_samplers import RandomSeedDataset +from PIL import Image, ImageFilter, ImageOps + + +class GaussianBlur(object): + """ + Apply Gaussian Blur to the PIL image. + """ + def __init__(self, p=0.5, radius_min=0.1, radius_max=2.): + self.prob = p + self.radius_min = radius_min + self.radius_max = radius_max + + def __call__(self, img): + do_it = random.random() <= self.prob + if not do_it: + return img + + return img.filter( + ImageFilter.GaussianBlur( + radius=random.uniform(self.radius_min, self.radius_max) + ) + ) + + +class Solarization(object): + """ + Apply Solarization to the PIL image. 
+ """ + def __init__(self, p): + self.p = p + + def __call__(self, img): + if random.random() < self.p: + return ImageOps.solarize(img) + else: + return img + + +class ClassificationTransform(): + def __init__(self, image_size, train=True): + args = get_args() + assert args.fp16 or args.bf16 + self.data_type = torch.half if args.fp16 else torch.bfloat16 + if train: + self.transform = T.Compose([ + T.RandomResizedCrop(image_size), + T.RandomHorizontalFlip(), + T.ColorJitter(0.4, 0.4, 0.4, 0.1), + ImageNetPolicy(), + T.ToTensor(), + T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + T.ConvertImageDtype(self.data_type) + ]) + else: + self.transform = T.Compose([ + T.Resize(image_size), + T.CenterCrop(image_size), + T.ToTensor(), + T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + T.ConvertImageDtype(self.data_type) + ]) + + def __call__(self, input): + output = self.transform(input) + return output + + +class InpaintingTransform(): + def __init__(self, image_size, train=True): + + args = get_args() + self.mask_factor = args.mask_factor + self.mask_type = args.mask_type + self.image_size = image_size + self.patch_size = args.patch_dim + self.mask_size = int(self.mask_factor*(image_size[0]/self.patch_size)*(image_size[1]/self.patch_size)) + self.train = train + assert args.fp16 or args.bf16 + self.data_type = torch.half if args.fp16 else torch.bfloat16 + + if self.train: + self.transform = T.Compose([ + T.RandomResizedCrop(self.image_size), + T.RandomHorizontalFlip(), + T.ColorJitter(0.4, 0.4, 0.4, 0.1), + ImageNetPolicy(), + T.ToTensor(), + T.ConvertImageDtype(self.data_type) + ]) + else: + self.transform = T.Compose([ + T.Resize(self.image_size, interpolation=2), + T.CenterCrop(self.image_size), + T.ToTensor(), + T.ConvertImageDtype(self.data_type) + ]) + + def gen_mask(self, image_size, mask_size, mask_type, patch_size): + # output: mask as a list with indices for missing patches + action_list = [[0, 1], [0, -1], [1, 0], [-1, 0]] + assert image_size[0] == image_size[1] + img_size_patch = image_size[0] // patch_size + + # drop masked patches + mask = torch.zeros((image_size[0], image_size[1]), dtype=torch.float) + + if mask_type == 'random': + x = torch.randint(0, img_size_patch, ()) + y = torch.randint(0, img_size_patch, ()) + for i in range(mask_size): + r = torch.randint(0, len(action_list), ()) + x = torch.clamp(x + action_list[r][0], min=0, max=img_size_patch - 1) + y = torch.clamp(y + action_list[r][1], min=0, max=img_size_patch - 1) + x_offset = x * patch_size + y_offset = y * patch_size + mask[x_offset:x_offset+patch_size, y_offset:y_offset+patch_size] = 1 + else: + assert mask_type == 'row' + count = 0 + for x in reversed(range(img_size_patch)): + for y in reversed(range(img_size_patch)): + if (count < mask_size): + count += 1 + x_offset = x * patch_size + y_offset = y * patch_size + mask[x_offset:x_offset+patch_size, y_offset:y_offset+patch_size] = 1 + return mask + + def __call__(self, input): + trans_input = self.transform(input) + mask = self.gen_mask(self.image_size, self.mask_size, + self.mask_type, self.patch_size) + mask = mask.unsqueeze(dim=0) + return trans_input, mask + + +class DinoTransform(object): + def __init__(self, image_size, train=True): + args = get_args() + self.data_type = torch.half if args.fp16 else torch.bfloat16 + + flip_and_color_jitter = T.Compose([ + T.RandomHorizontalFlip(p=0.5), + T.RandomApply( + [T.ColorJitter(brightness=0.4, contrast=0.4, + saturation=0.2, hue=0.1)], + p=0.8 + ), + T.RandomGrayscale(p=0.2), + ]) + + if args.fp16 
or args.bf16: + normalize = T.Compose([ + T.ToTensor(), + T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + T.ConvertImageDtype(self.data_type) + ]) + else: + normalize = T.Compose([ + T.ToTensor(), + T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ]) + + # first global crop + scale_const = 0.4 + self.global_transform1 = T.Compose([ + T.RandomResizedCrop(image_size, + scale=(scale_const, 1), + interpolation=Image.BICUBIC), + flip_and_color_jitter, + GaussianBlur(1.0), + normalize + ]) + # second global crop + self.global_transform2 = T.Compose([ + T.RandomResizedCrop(image_size, + scale=(scale_const, 1), + interpolation=Image.BICUBIC), + flip_and_color_jitter, + GaussianBlur(0.1), + Solarization(0.2), + normalize + ]) + # transformation for the local small crops + self.local_crops_number = args.dino_local_crops_number + self.local_transform = T.Compose([ + T.RandomResizedCrop(args.dino_local_img_size, + scale=(0.05, scale_const), + interpolation=Image.BICUBIC), + flip_and_color_jitter, + GaussianBlur(p=0.5), + normalize + ]) + + def __call__(self, image): + crops = [] + crops.append(self.global_transform1(image)) + crops.append(self.global_transform2(image)) + for _ in range(self.local_crops_number): + crops.append(self.local_transform(image)) + return crops + + +def build_train_valid_datasets(data_path, image_size=224): + args = get_args() + + if args.vision_pretraining_type == 'classify': + train_transform = ClassificationTransform(image_size) + val_transform = ClassificationTransform(image_size, train=False) + elif args.vision_pretraining_type == 'inpaint': + train_transform = InpaintingTransform(image_size, train=False) + val_transform = InpaintingTransform(image_size, train=False) + elif args.vision_pretraining_type == 'dino': + train_transform = DinoTransform(image_size, train=True) + val_transform = ClassificationTransform(image_size, train=False) + else: + raise Exception('{} vit pretraining type is not supported.'.format( + args.vit_pretraining_type)) + + # training dataset + train_data_path = data_path[0] if len(data_path) <= 2 else data_path[2] + train_data = ImageFolder( + root=train_data_path, + transform=train_transform, + classes_fraction=args.classes_fraction, + data_per_class_fraction=args.data_per_class_fraction + ) + train_data = RandomSeedDataset(train_data) + + # validation dataset + val_data_path = data_path[1] + val_data = ImageFolder( + root=val_data_path, + transform=val_transform + ) + val_data = RandomSeedDataset(val_data) + + return train_data, val_data diff --git a/training/DeepSpeed-Domino/megatron/dist_signal_handler.py b/training/DeepSpeed-Domino/megatron/dist_signal_handler.py new file mode 100644 index 000000000..a60204f00 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/dist_signal_handler.py @@ -0,0 +1,81 @@ +import signal + +import torch + + +def get_world_size(): + if torch.distributed.is_available() and torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + else: + world_size = 1 + return world_size + + +def get_device(local_rank=None): + backend = torch.distributed.get_backend() + if backend == 'nccl': + if local_rank is None: + device = torch.device('cuda') + else: + device = torch.device(f'cuda:{local_rank}') + elif backend == 'gloo': + device = torch.device('cpu') + else: + raise RuntimeError + return device + + +def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): + if not torch.distributed.is_available() or \ + not torch.distributed.is_initialized(): + 
return [item] + + device = get_device(local_rank) + + if group is not None: + group_size = group.size() + else: + group_size = get_world_size() + + tensor = torch.tensor([item], device=device, dtype=dtype) + output_tensors = [ + torch.zeros(1, dtype=tensor.dtype, device=tensor.device) + for _ in range(group_size) + ] + torch.distributed.all_gather(output_tensors, tensor, group, async_op) + output = [elem.item() for elem in output_tensors] + return output + + +class DistributedSignalHandler: + def __init__(self, sig=signal.SIGTERM): + self.sig = sig + + def signals_received(self): + all_received = all_gather_item( + self._signal_received, dtype=torch.int32 + ) + return all_received + + def __enter__(self): + self._signal_received = False + self.released = False + self.original_handler = signal.getsignal(self.sig) + + def handler(signum, frame): + self._signal_received = True + + signal.signal(self.sig, handler) + + return self + + def __exit__(self, type, value, tb): + self.release() + + def release(self): + if self.released: + return False + + signal.signal(self.sig, self.original_handler) + self.released = True + return True diff --git a/training/DeepSpeed-Domino/megatron/fused_kernels/__init__.py b/training/DeepSpeed-Domino/megatron/fused_kernels/__init__.py new file mode 100644 index 000000000..87cceac3e --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/fused_kernels/__init__.py @@ -0,0 +1,75 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import os +import pathlib +import subprocess + +from torch.utils import cpp_extension + +# Setting this param to a list has a problem of generating different +# compilation commands (with diferent order of architectures) and +# leading to recompilation of fused kernels. Set it to empty string +# to avoid recompilation and assign arch flags explicity in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + + +def load(args): + + # Check if cuda 11 is installed for compute capability 8.0 + cc_flag = [] + _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( + cpp_extension.CUDA_HOME + ) + if int(bare_metal_major) >= 11: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_80,code=sm_80') + if int(bare_metal_minor) >= 8: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_90,code=sm_90') + + # Build path + srcpath = pathlib.Path(__file__).parent.absolute() + buildpath = srcpath / "build" + _create_build_dir(buildpath) + + # Helper function to build the kernels. 
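+    # (compute_70 is always targeted; compute_80/compute_90 are appended via
+    # cc_flag above when the installed CUDA toolkit is new enough.)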
+ def _cpp_extention_load_helper(name, sources, extra_cuda_flags): + return cpp_extension.load( + name=name, + sources=sources, + build_directory=buildpath, + extra_cflags=[ + "-O3", + ], + extra_cuda_cflags=[ + "-O3", + "-gencode", + "arch=compute_70,code=sm_70", + "--use_fast_math", + ] + + extra_cuda_flags + + cc_flag, + verbose=(args.rank == 0), + ) + + +def _get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True + ) + output = raw_output.split() + release_idx = output.index("release") + 1 + release = output[release_idx].split(".") + bare_metal_major = release[0] + bare_metal_minor = release[1][0] + + return raw_output, bare_metal_major, bare_metal_minor + + +def _create_build_dir(buildpath): + try: + os.mkdir(buildpath) + except OSError: + if not os.path.isdir(buildpath): + print(f"Creation of the build directory {buildpath} failed") diff --git a/training/DeepSpeed-Domino/megatron/fused_kernels/compat.h b/training/DeepSpeed-Domino/megatron/fused_kernels/compat.h new file mode 100644 index 000000000..5495d7807 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/fused_kernels/compat.h @@ -0,0 +1,17 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ + +/*This code is copied fron NVIDIA apex: + * https://github.com/NVIDIA/apex + * with minor changes. */ + + + +#ifndef TORCH_CHECK +#define TORCH_CHECK AT_CHECK +#endif + +#ifdef VERSION_GE_1_3 +#define DATA_PTR data_ptr +#else +#define DATA_PTR data +#endif diff --git a/training/DeepSpeed-Domino/megatron/fused_kernels/tests/__init__.py b/training/DeepSpeed-Domino/megatron/fused_kernels/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/DeepSpeed-Domino/megatron/fused_kernels/tests/test_fused_kernels.py b/training/DeepSpeed-Domino/megatron/fused_kernels/tests/test_fused_kernels.py new file mode 100644 index 000000000..74024c502 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/fused_kernels/tests/test_fused_kernels.py @@ -0,0 +1,388 @@ +import math + +import torch +from torch.nn import LayerNorm + +from megatron.model.enums import AttnMaskType +from megatron.model.fused_layer_norm import MixedFusedLayerNorm +from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.model.utils import attention_mask_func +from megatron.fused_kernels import load + +def test_load_fused_kernels(): + try: + import fused_layer_norm_cuda + import scaled_masked_softmax_cuda + import scaled_upper_triang_masked_softmax_cuda + import torch + + print("[Success] load_fused_kernels") + except ImportError as e: + print("[Fail] load_fused_kernels") + raise e + +def test_fused_softmax(): + bert = BertModel.from_pretrained("bert-base-cased").cuda().half() + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. 
" + "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + embedding_output = bert.embeddings( + input_ids=tokens["input_ids"].cuda(), + position_ids=None, + token_type_ids=tokens["token_type_ids"].cuda(), + inputs_embeds=None, + past_key_values_length=0, + ) + + # (bsz, 1, 1, seq_len) + mask = bert.get_extended_attention_mask( + attention_mask=tokens["attention_mask"].cuda(), + input_shape=tokens["input_ids"].shape, + device=bert.device, + ) + # (bsz, 1, seq_len, seq_len) + mask = mask.repeat(1, 1, mask.size()[-1], 1) + + attention = bert.encoder.layer[0].attention.self + key_layer = attention.transpose_for_scores(attention.key(embedding_output)) + query_layer = attention.transpose_for_scores(attention.query(embedding_output)) + + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores /= math.sqrt(key_layer.size()[-1]) + + fused_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.padding, + scaled_masked_softmax_fusion=True, + ) + .cuda() + .half() + ) + + fused_softmax_output = fused_softmax( + attention_scores, + (mask != 0), + ) + + torch_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.padding, + scaled_masked_softmax_fusion=False, + ) + .cuda() + .half() + ) + + torch_softmax_output = torch_softmax( + attention_scores, + (mask != 0), + ) + + test_result = (fused_softmax_output - torch_softmax_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_fused_softmax" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}" + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_fused_softmax" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + + +def test_fused_upper_triangle_mask_softmax(): + gpt = GPT2Model.from_pretrained("gpt2").cuda().half() + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. 
" + "hi hi hi hi hi hi hi" # 24 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + attention_mask = tokens["attention_mask"].cuda() + attention_mask = attention_mask.view(attention_mask.size(0), -1) + attention_mask = attention_mask[:, None, None, :] + attention_mask = (1.0 - attention_mask) * -10000.0 + attention_mask = attention_mask.repeat(1, 1, attention_mask.size()[-1], 1) + attn = gpt.h[0] + + hidden_states = gpt.wte(tokens["input_ids"].cuda()) + q, k, v = attn.attn.c_attn(hidden_states).split(768, dim=-1) + q = attn.attn._split_heads(q, attn.attn.num_heads, attn.attn.head_dim) + k = attn.attn._split_heads(k, attn.attn.num_heads, attn.attn.head_dim) + attn_weights = torch.matmul(q, k.transpose(-1, -2)) + + sq, sk = q.size(-2), k.size(-2) + causal_mask = attn.attn.bias[:, :, sk - sq : sk, :sk].bool() + total_mask = ~(causal_mask & (attention_mask == 0)) + """ + tensor([[[[False, True, True, ..., True, True, True], + [False, False, True, ..., True, True, True], + [False, False, False, ..., True, True, True], + ..., + [False, False, False, ..., False, True, True], + [False, False, False, ..., False, False, True], + [False, False, False, ..., False, False, False]]] + """ + + fused_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.causal, + scaled_masked_softmax_fusion=True, + ) + .cuda() + .half() + ) + + fused_softmax_output = fused_softmax( + attn_weights, + total_mask, + ) + + torch_softmax = ( + FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + mask_func=attention_mask_func, + scale=None, + softmax_in_fp32=False, + attn_mask_type=AttnMaskType.causal, + scaled_masked_softmax_fusion=False, + ) + .cuda() + .half() + ) + + torch_softmax_output = torch_softmax( + attn_weights, + total_mask, + ) + + test_result = (fused_softmax_output - torch_softmax_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_fused_upper_triangle_mask_softmax" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}" + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_fused_upper_triangle_mask_softmax" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" + ) + + +def test_layer_norm(): + bert = BertModel.from_pretrained("bert-base-cased").cuda().half() + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + test_text = ( + "Hello. How are you? I am fine thank you and you? yes Good. 
" + "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 + ) + + tokens = tokenizer( + [test_text] * 4, + return_tensors="pt", + ) + + # [bsz, seq_len, d_model] + embedding_output = ( + bert.embeddings( + input_ids=tokens["input_ids"].cuda(), + position_ids=None, + token_type_ids=tokens["token_type_ids"].cuda(), + inputs_embeds=None, + past_key_values_length=0, + ) + .cuda() + .half() + ) + + fused_layernorm_layer = ( + MixedFusedLayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half() + ) + + torch_layernorm_layer = ( + LayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half() + ) + + fused_output = fused_layernorm_layer(embedding_output) + torch_output = torch_layernorm_layer(embedding_output) + test_result = (fused_output - torch_output).abs() + + while test_result.dim() != 1: + test_result = test_result.mean(dim=-1) + + diff = test_result.mean(dim=-1) + + if diff <= 1e-3: + print( + f"\n[Success] test_layer_norm" + f"\n > mean_difference={diff}" + f"\n > fused_values={fused_output[-1][-1][:5].tolist()}" + f"\n > torch_values={torch_output[-1][-1][:5].tolist()}" + ) + else: + print( + f"\n[Fail] test_layer_norm" + f"\n > mean_difference={diff}, " + f"\n > fused_values={fused_output[-1][-1][:5].tolist()}, " + f"\n > torch_values={torch_output[-1][-1][:5].tolist()}" + ) + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +def forward_torch_softmax(input, mask, scale): + input = input * scale + mask_output = attention_mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + return probs + + +def test_masked_softmax_forward(): + import scaled_masked_softmax_cuda + + batch = 2 + attn = 16 + scale_t = torch.tensor([1.0]) + for qlen in [128, 256, 1024, 2048, 4096]: + for klen in [128, 256, 1024, 2048]: + inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') + masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') + softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) + softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item()) + error = (softmax_results_torch - softmax_results).abs().max() + assert error < 1e-3 + +def test_masked_softmax_backward(): + import scaled_masked_softmax_cuda + + batch = 2 + attn = 16 + scale_t = torch.tensor([1.0]) + for qlen in [128, 256, 1024, 2048, 4096]: + for klen in [128, 256, 1024, 2048]: + inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') + backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0') + masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') + softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) + back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item()) + + inputs.requires_grad = True + softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item()) + softmax_results_torch.backward(backward) + error = (back_grad - inputs.grad).abs().max() + assert error < 1e-3 + + +def test_allmasked_softmax_forward(): + import scaled_masked_softmax_cuda + + batch = 2 + attn = 16 + scale_t = torch.tensor([1.0]) + for qlen in [128, 256, 1024, 2048, 4096]: + for klen in [128, 256, 1024, 2048]: + inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') + masks = 
torch.ones((batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') + softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) + softmax_results_torch = torch.zeros_like(inputs) + error = (softmax_results_torch - softmax_results).abs().max() + assert error == 0.0 + + +def test_allmasked_softmax_backward(): + import scaled_masked_softmax_cuda + + batch = 2 + attn = 16 + scale_t = torch.tensor([1.0]) + for qlen in [128, 256, 1024, 2048, 4096]: + for klen in [128, 256, 1024, 2048]: + inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') + backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0') + masks = torch.ones((batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') + softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) + back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item()) + inputs.requires_grad = True + softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item()) + softmax_results_torch.backward(backward) + error = (back_grad - inputs.grad).abs().max() + assert error < 1e-3 + + +if __name__ == "__main__": + try: + from transformers import BertTokenizer, GPT2Tokenizer + from transformers.models.bert.modeling_bert import BertModel + from transformers.models.gpt2.modeling_gpt2 import GPT2Model + import transformers + + transformers.logging.set_verbosity( + transformers.logging.FATAL, + ) + + except: + print("\n[Fail] Please install `transformers` package to test fused kernels\n") + exit(-1) + + load() + test_masked_softmax_forward() + test_masked_softmax_backward() + test_allmasked_softmax_forward() + test_allmasked_softmax_backward() + test_load_fused_kernels() + test_fused_softmax() + test_fused_upper_triangle_mask_softmax() + test_layer_norm() diff --git a/training/DeepSpeed-Domino/megatron/fused_kernels/type_shim.h b/training/DeepSpeed-Domino/megatron/fused_kernels/type_shim.h new file mode 100644 index 000000000..d60a6f8c6 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/fused_kernels/type_shim.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ + + +#include +#include "compat.h" + + +#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ + switch(TYPE) \ + { \ + case at::ScalarType::Half: \ + { \ + using scalar_t = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + + +#define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...) \ + switch(TYPE) \ + { \ + case at::ScalarType::Half: \ + { \ + using scalar_t = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Float: \ + { \ + using scalar_t = float; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + + + +#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) 
\ + switch(TYPEIN) \ + { \ + case at::ScalarType::Float: \ + { \ + using scalar_t_in = float; \ + switch(TYPEOUT) \ + { \ + case at::ScalarType::Float: \ + { \ + using scalar_t_out = float; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Half: \ + { \ + using scalar_t_out = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t_out = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ + } \ + break; \ + } \ + case at::ScalarType::Half: \ + { \ + using scalar_t_in = at::Half; \ + using scalar_t_out = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t_in = at::BFloat16; \ + using scalar_t_out = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ + } + diff --git a/training/DeepSpeed-Domino/megatron/global_vars.py b/training/DeepSpeed-Domino/megatron/global_vars.py new file mode 100644 index 000000000..4e0118e10 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/global_vars.py @@ -0,0 +1,191 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron global variables.""" + +import os +import sys +import torch + +from megatron import dist_signal_handler +from megatron.tokenizer import build_tokenizer +from .microbatches import build_num_microbatches_calculator +from .timers import Timers + +_GLOBAL_ARGS = None +_GLOBAL_RETRO_ARGS = None +_GLOBAL_NUM_MICROBATCHES_CALCULATOR = None +_GLOBAL_TOKENIZER = None +_GLOBAL_TENSORBOARD_WRITER = None +_GLOBAL_ADLR_AUTORESUME = None +_GLOBAL_TIMERS = None +_GLOBAL_SIGNAL_HANDLER = None + +def get_args(): + """Return arguments.""" + _ensure_var_is_initialized(_GLOBAL_ARGS, 'args') + return _GLOBAL_ARGS + + +def get_retro_args(): + """Return retro arguments.""" + return _GLOBAL_RETRO_ARGS + + +def get_num_microbatches(): + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get() + + +def get_current_global_batch_size(): + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size() + + +def update_num_microbatches(consumed_samples, consistency_check=True): + _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples, + consistency_check) + + +def get_tokenizer(): + """Return tokenizer.""" + _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer') + return _GLOBAL_TOKENIZER + + +def get_tensorboard_writer(): + """Return tensorboard writer. It can be None so no need + to check if it is initialized.""" + return _GLOBAL_TENSORBOARD_WRITER + + +def get_adlr_autoresume(): + """ADLR autoresume object. 
It can be None so no need + to check if it is initialized.""" + return _GLOBAL_ADLR_AUTORESUME + + +def get_timers(): + """Return timers.""" + _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers') + return _GLOBAL_TIMERS + + +def get_signal_handler(): + _ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler') + return _GLOBAL_SIGNAL_HANDLER + + +def _set_signal_handler(): + global _GLOBAL_SIGNAL_HANDLER + _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler') + _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__() + + + +def set_global_variables(args, build_tokenizer=True): + """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers.""" + + assert args is not None + + _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args') + set_args(args) + + _build_num_microbatches_calculator(args) + if build_tokenizer: + _ = _build_tokenizer(args) + _set_tensorboard_writer(args) + _set_adlr_autoresume(args) + _set_timers(args) + + if args.exit_signal_handler: + _set_signal_handler() + + +def set_args(args): + global _GLOBAL_ARGS + _GLOBAL_ARGS = args + + +def set_retro_args(retro_args): + global _GLOBAL_RETRO_ARGS + _GLOBAL_RETRO_ARGS = retro_args + + +def _build_num_microbatches_calculator(args): + + global _GLOBAL_NUM_MICROBATCHES_CALCULATOR + _ensure_var_is_not_initialized(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, + 'num microbatches calculator') + + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( + args) + + +def _build_tokenizer(args): + """Initialize tokenizer.""" + global _GLOBAL_TOKENIZER + _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer') + _GLOBAL_TOKENIZER = build_tokenizer(args) + return _GLOBAL_TOKENIZER + + +def rebuild_tokenizer(args): + global _GLOBAL_TOKENIZER + _GLOBAL_TOKENIZER = None + return _build_tokenizer(args) + + +def _set_tensorboard_writer(args): + """Set tensorboard writer.""" + global _GLOBAL_TENSORBOARD_WRITER + _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER, + 'tensorboard writer') + + if hasattr(args, 'tensorboard_dir') and \ + args.tensorboard_dir and args.rank == (args.world_size - 1): + try: + from torch.utils.tensorboard import SummaryWriter + print('> setting tensorboard ...') + _GLOBAL_TENSORBOARD_WRITER = SummaryWriter( + log_dir=args.tensorboard_dir, + max_queue=args.tensorboard_queue_size) + except ModuleNotFoundError: + print('WARNING: TensorBoard writing requested but is not ' + 'available (are you using PyTorch 1.1.0 or later?), ' + 'no TensorBoard logs will be written.', flush=True) + + +def _set_adlr_autoresume(args): + """Initialize ADLR autoresume.""" + global _GLOBAL_ADLR_AUTORESUME + _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume') + + if args.adlr_autoresume: + if args.rank == 0: + print('enabling autoresume ...', flush=True) + sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.')) + try: + from userlib.auto_resume import AutoResume + except BaseException: + print('ADLR autoresume is not available, exiting ...') + sys.exit() + + _GLOBAL_ADLR_AUTORESUME = AutoResume + + +def _set_timers(args): + """Initialize timers.""" + global _GLOBAL_TIMERS + _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers') + _GLOBAL_TIMERS = Timers(args.timing_log_level, args.timing_log_option) + + +def _ensure_var_is_initialized(var, name): + """Make sure the input variable is not None.""" + assert var is not None, '{} is not initialized.'.format(name) + + +def _ensure_var_is_not_initialized(var, name): + """Make sure the input variable 
is not None.""" + assert var is None, '{} is already initialized.'.format(name) + + + diff --git a/training/DeepSpeed-Domino/megatron/indexer.py b/training/DeepSpeed-Domino/megatron/indexer.py new file mode 100644 index 000000000..45f530a7d --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/indexer.py @@ -0,0 +1,129 @@ +import sys +import time +import torch +import torch.distributed as dist + +from megatron import get_args, print_rank_0 +from megatron.core import mpu +from megatron.checkpointing import load_biencoder_checkpoint +from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset +from megatron.data.orqa_wiki_dataset import get_open_retrieval_batch +from megatron.data.biencoder_dataset_utils import get_one_epoch_dataloader +from megatron.data.realm_index import detach, OpenRetreivalDataStore +from megatron.model.biencoder_model import get_model_provider +from megatron.training import get_model + + +class IndexBuilder(object): + """ + Object for taking one pass over a dataset and creating a BlockData of its + embeddings + """ + def __init__(self): + args = get_args() + self.model = None + self.dataloader = None + self.evidence_embedder_obj = None + self.biencoder_shared_query_context_model = \ + args.biencoder_shared_query_context_model + + # need to know whether we're using a REALM checkpoint (args.load) + # or ICT checkpoint + assert not (args.load and args.ict_load) + + self.log_interval = args.indexer_log_interval + self.batch_size = args.indexer_batch_size + + self.load_attributes() + self.is_main_builder = mpu.get_data_parallel_rank() == 0 + self.num_total_builders = mpu.get_data_parallel_world_size() + self.iteration = self.total_processed = 0 + + def load_attributes(self): + """ + Load the necessary attributes: model, dataloader and empty BlockData + """ + only_context_model = True + if self.biencoder_shared_query_context_model: + only_context_model = False + + model = get_model(get_model_provider(only_context_model=\ + only_context_model, biencoder_shared_query_context_model=\ + self.biencoder_shared_query_context_model)) + + self.model = load_biencoder_checkpoint(model, + only_context_model=only_context_model) + + assert len(self.model) == 1 + self.model[0].eval() + + self.dataset = get_open_retrieval_wiki_dataset() + self.dataloader = iter(get_one_epoch_dataloader(self.dataset, \ + self.batch_size)) + + self.evidence_embedder_obj = OpenRetreivalDataStore( \ + load_from_path=False) + + def track_and_report_progress(self, batch_size): + """ + Utility function for tracking progress + """ + self.iteration += 1 + self.total_processed += batch_size * self.num_total_builders + if self.is_main_builder and self.iteration % self.log_interval == 0: + print('Batch {:10d} | Total {:10d}'.format(self.iteration, + self.total_processed), flush=True) + + def build_and_save_index(self): + """ + Goes through one epoch of the dataloader and adds all data to this + instance's BlockData. + + The copy of BlockData is saved as a shard, which when run in a + distributed setting will be consolidated by the rank 0 process + and saved as a final pickled BlockData. 
+ """ + assert len(self.model) == 1 + unwrapped_model = self.model[0] + + while not hasattr(unwrapped_model, 'embed_text'): + unwrapped_model = unwrapped_model.module + + while True: + try: + # batch also has query_tokens and query_pad_data + row_id, context_tokens, context_mask, context_types, \ + context_pad_mask = get_open_retrieval_batch( \ + self.dataloader) + except (StopIteration, IndexError): + break + + # TODO: can we add with torch.no_grad() to reduce memory usage + # detach, separate fields and add to BlockData + assert context_mask.dtype == torch.bool + context_logits = unwrapped_model.embed_text( + unwrapped_model.context_model, context_tokens, context_mask, + context_types) + + context_logits = detach(context_logits) + row_id = detach(row_id) + + self.evidence_embedder_obj.add_block_data(row_id, context_logits) + self.track_and_report_progress(batch_size=len(row_id)) + + # This process signals to finalize its shard and then synchronize with + # the other processes + self.evidence_embedder_obj.save_shard() + torch.distributed.barrier() + del self.model + + # rank 0 process builds the final copy + if self.is_main_builder: + self.evidence_embedder_obj.merge_shards_and_save() + # make sure that every single piece of data was embedded + assert len(self.evidence_embedder_obj.embed_data) == \ + len(self.dataset) + self.evidence_embedder_obj.clear() + + # complete building the final copy + torch.distributed.barrier() diff --git a/training/DeepSpeed-Domino/megatron/initialize.py b/training/DeepSpeed-Domino/megatron/initialize.py new file mode 100644 index 000000000..367ba85cb --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/initialize.py @@ -0,0 +1,349 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron initialization.""" + +import random +import os +import time + +import numpy as np +import torch +from datetime import timedelta + +from megatron import fused_kernels +from megatron import get_adlr_autoresume +from megatron import get_args +from megatron import get_tensorboard_writer +from megatron.core import mpu, tensor_parallel +from megatron.arguments import parse_args, validate_args +from megatron.checkpointing import load_args_from_checkpoint +from megatron.global_vars import set_global_variables +from megatron.model.transformer import bias_dropout_add_fused_train +from megatron.model.fused_bias_gelu import bias_gelu + + +def initialize_megatron( + extra_args_provider=None, + args_defaults={}, + ignore_unknown_args=False, + allow_no_cuda=False, +): + """Set global variables, initialize distributed, and + set autoresume and random seeds. + `allow_no_cuda` should not be set unless using megatron for cpu only + data processing. In general this arg should not be set unless you know + what you are doing. + Returns a function to finalize distributed env initialization + (optionally, only when args.lazy_mpu_init == True) + """ + if not allow_no_cuda: + # Make sure cuda is available. + assert torch.cuda.is_available(), "Megatron requires CUDA." + + # Parse arguments + args = parse_args(extra_args_provider, ignore_unknown_args) + + if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False): + assert args.load is not None, "--use-checkpoints-args requires --load argument" + load_args_from_checkpoint(args) + + validate_args(args, args_defaults) + + # set global args, build tokenizer, and set adlr-autoresume, + # tensorboard-writer, and timers. 
+ set_global_variables(args) + + # torch.distributed initialization + def finish_mpu_init(): + args = get_args() + # Pytorch distributed. + _initialize_distributed() + + # Random seeds for reproducibility. + if args.rank == 0: + print("> setting random seeds to {} ...".format(args.seed)) + _set_random_seed(args.seed, args.data_parallel_random_init) + + args = get_args() + if args.lazy_mpu_init: + # TODO is this still a necessary option? + args.use_cpu_initialization = True + # delayed initialization of DDP-related stuff + # We only set basic DDP globals + mpu.set_tensor_model_parallel_world_size(args.tensor_model_parallel_size) + # and return function for external DDP manager + # to call when it has DDP initialized + mpu.set_tensor_model_parallel_rank(args.rank) + return finish_mpu_init + else: + # Megatron's MPU is the master. Complete initialization right away. + finish_mpu_init() + + # Autoresume. + _init_autoresume() + + # Compile dependencies. + _compile_dependencies() + + # No continuation function + return None + + +def _compile_dependencies(): + + args = get_args() + + # ========================= + # Compile dataset C++ code. + # ========================= + # TODO: move this to ninja + if torch.distributed.get_rank() == 0: + start_time = time.time() + print("> compiling dataset index builder ...") + from megatron.data.dataset_utils import compile_helper + + compile_helper() + print( + ">>> done with dataset index builder. Compilation time: {:.3f} " + "seconds".format(time.time() - start_time), + flush=True, + ) + + # ================== + # Load fused kernels + # ================== + + # Custom kernel constraints check. + seq_len = args.seq_length + attn_batch_size = ( + args.num_attention_heads / args.tensor_model_parallel_size + ) * args.micro_batch_size + # Constraints on sequence length and attn_batch_size to enable warp based + # optimization and upper triangular optimization (for causal mask) + custom_kernel_constraint = ( + seq_len > 16 + and seq_len <= 16384 + and seq_len % 4 == 0 + and attn_batch_size % 4 == 0 + ) + # Print a warning. + if not ( + (args.fp16 or args.bf16) + and custom_kernel_constraint + and args.masked_softmax_fusion + ): + if args.rank == 0: + print( + "WARNING: constraints for invoking optimized" + " fused softmax kernel are not met. We default" + " back to unfused kernel invocations.", + flush=True, + ) + + # Always build on rank zero first. + if torch.distributed.get_rank() == 0: + start_time = time.time() + print("> compiling and loading fused kernels ...", flush=True) + fused_kernels.load(args) + torch.distributed.barrier() + else: + torch.distributed.barrier() + fused_kernels.load(args) + # Simple barrier to make sure all ranks have passed the + # compilation phase successfully before moving on to the + # rest of the program. We think this might ensure that + # the lock is released. + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print( + ">>> done with compiling and loading fused kernels. 
" + "Compilation time: {:.3f} seconds".format(time.time() - start_time), + flush=True, + ) + + +def _initialize_distributed(): + """Initialize torch.distributed and core model parallel.""" + args = get_args() + + device_count = torch.cuda.device_count() + if torch.distributed.is_initialized(): + + if args.rank == 0: + print( + "torch distributed is already initialized, " + "skipping initialization ...", + flush=True, + ) + args.rank = torch.distributed.get_rank() + args.world_size = torch.distributed.get_world_size() + + else: + + if args.rank == 0: + print("> initializing torch distributed ...", flush=True) + # Manually set the device ids. + if device_count > 0: + device = args.rank % device_count + if args.local_rank is not None: + assert ( + args.local_rank == device + ), "expected local-rank to be the same as rank % device-count." + else: + args.local_rank = device + torch.cuda.set_device(device) + # Call the init process + torch.distributed.init_process_group( + backend=args.distributed_backend, + world_size=args.world_size, + rank=args.rank, + timeout=timedelta(minutes=args.distributed_timeout_minutes), + ) + + # Set the tensor model-parallel, pipeline model-parallel, and + # data-parallel communicators. + if device_count > 0: + if mpu.model_parallel_is_initialized(): + print("model parallel is already initialized") + else: + mpu.initialize_model_parallel( + args.tensor_model_parallel_size, + args.pipeline_model_parallel_size, + args.virtual_pipeline_model_parallel_size, + args.pipeline_model_parallel_split_rank, + args.fp8 is not None, + ) + if args.rank == 0: + print( + f"> initialized tensor model parallel with size " + f"{mpu.get_tensor_model_parallel_world_size()}" + ) + print( + f"> initialized pipeline model parallel with size " + f"{mpu.get_pipeline_model_parallel_world_size()}" + ) + + +def _init_autoresume(): + """Set autoresume start time.""" + autoresume = get_adlr_autoresume() + if autoresume: + torch.distributed.barrier() + autoresume.init() + torch.distributed.barrier() + + +def _set_random_seed(seed_, data_parallel_random_init=False): + """Set random seed for reproducability.""" + if seed_ is not None and seed_ > 0: + # Ensure that different pipeline MP stages get different seeds. 
+ seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) + # Ensure different data parallel ranks get different seeds + if data_parallel_random_init: + seed = seed + (10 * mpu.get_data_parallel_rank()) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.device_count() > 0: + tensor_parallel.model_parallel_cuda_manual_seed(seed) + else: + raise ValueError("Seed ({}) should be a positive integer.".format(seed)) + + +def write_args_to_tensorboard(): + """Write arguments to tensorboard.""" + args = get_args() + writer = get_tensorboard_writer() + if writer: + for arg in vars(args): + writer.add_text(arg, str(getattr(args, arg)), global_step=args.iteration) + + +def set_jit_fusion_options(): + """Set PyTorch JIT layer fusion options.""" + # flags required to enable jit fusion kernels + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10): + # nvfuser + torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_profiling_mode(True) + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(True) + torch._C._debug_set_autodiff_subgraph_inlining(False) + else: + # legacy pytorch fuser + torch._C._jit_set_profiling_mode(False) + torch._C._jit_set_profiling_executor(False) + torch._C._jit_override_can_fuse_on_cpu(True) + torch._C._jit_override_can_fuse_on_gpu(True) + + _warmup_jit_function() + + +def _warmup_jit_function(): + """Compilie JIT functions before the main training steps""" + args = get_args() + if args.bf16: + dtype = torch.bfloat16 + elif args.fp16: + dtype = torch.float16 + else: + dtype = torch.float32 + + # Warmup fused bias+gelu + bias = torch.rand( + args.ffn_hidden_size // args.tensor_model_parallel_size, + dtype=dtype, + device="cuda", + ) + input = torch.rand( + ( + args.seq_length, + args.micro_batch_size, + args.ffn_hidden_size // args.tensor_model_parallel_size, + ), + dtype=dtype, + device="cuda", + ) + # Warmup JIT fusions with the input grad_enable state of both forward + # prop and recomputation + for bias_grad, input_grad in zip([True, True], [False, True]): + bias.requires_grad, input.requires_grad = bias_grad, input_grad + for _ in range(5): + output = bias_gelu(bias, input) + del bias, input, output + + # Warmup fused bias+dropout+add + if args.sequence_parallel: + seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size() + else: + seq_length = args.seq_length + input = torch.rand( + (seq_length, args.micro_batch_size, args.hidden_size), + dtype=dtype, + device="cuda", + ) + residual = torch.rand( + (seq_length, args.micro_batch_size, args.hidden_size), + dtype=dtype, + device="cuda", + ) + bias = torch.rand((args.hidden_size), dtype=dtype, device="cuda").expand_as( + residual + ) + dropout_rate = 0.1 + # Warmup JIT fusions with the input grad_enable state of both forward + # prop and recomputation + for input_grad, bias_grad, residual_grad in zip( + [False, True], [True, True], [True, True] + ): + input.requires_grad = input_grad + bias.requires_grad = bias_grad + residual.requires_grad = residual_grad + for _ in range(5): + output = bias_dropout_add_fused_train(input, bias, residual, dropout_rate) + del bias, input, residual, output + torch.cuda.empty_cache() diff --git a/training/DeepSpeed-Domino/megatron/memory.py b/training/DeepSpeed-Domino/megatron/memory.py new file 
mode 100644 index 000000000..a5fef75ba --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/memory.py @@ -0,0 +1,132 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +import torch + + +# A dictionary of all the memory buffers allocated. +_MEM_BUFFS = dict() + + +def allocate_mem_buff(name, numel, dtype, track_usage): + """Allocate a memory buffer.""" + assert name not in _MEM_BUFFS, \ + 'memory buffer {} already allocated.'.format(name) + _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) + return _MEM_BUFFS[name] + + +def get_mem_buff(name): + """Get the memory buffer.""" + return _MEM_BUFFS[name] + + +class MemoryBuffer: + """Contiguous memory buffer. + Allocate a contiguous memory of type `dtype` and size `numel`. It is + used to reduce memory fragmentation. + + Usage: After the allocation, the `_start` index is set to the first + index of the memory. A memory chunk starting from the `_start` index + can be `allocated` for an input tensor, with the elements of the + tensor being copied. The buffer can be reused by resetting the + `_start` index. + + """ + def __init__(self, name, numel, dtype, track_usage): + if torch.distributed.get_rank() == 0: + element_size = torch.tensor([], dtype=dtype).element_size() + print('> building the {} memory buffer with {} num elements ' + 'and {} dtype ({:.1f} MB)...'.format( + name, numel, dtype, numel*element_size/1024/1024), + flush=True) + self.name = name + self.numel = numel + self.dtype = dtype + self.data = torch.empty(self.numel, + dtype=self.dtype, + device=torch.cuda.current_device(), + requires_grad=False) + + # Index tracking the start of the free memory. + self._start = 0 + + # Values used for tracking usage. + self.track_usage = track_usage + if self.track_usage: + self.in_use_value = 0.0 + self.total_value = 0.0 + + + def reset(self): + """Reset the buffer start index to the beginning of the buffer.""" + self._start = 0 + + + def is_in_use(self): + """Whether the current buffer holds on to any memory.""" + return self._start > 0 + + + def numel_in_use(self): + """Return number of elements in use.""" + return self._start + + + def add(self, tensor): + """Allocate a chunk of memory from the buffer to the tensor and copy + the values.""" + assert tensor.dtype == self.dtype, \ + 'Input tensor type {} different from buffer type {}'.format( + tensor.dtype, self.dtype) + # Number of elements of the input tensor. + tensor_numel = torch.numel(tensor) + new_start = self._start + tensor_numel + assert new_start <= self.numel, \ + 'Not enough memory left in the buffer ({} > {})'.format( + tensor_numel, self.numel - self._start) + # New tensor is a view into the memory. + new_tensor = self.data[self._start:new_start] + self._start = new_start + new_tensor = new_tensor.view(tensor.shape) + new_tensor.copy_(tensor) + # Return a pointer to the new tensor. + return new_tensor + + + def get_data(self): + """Return the data currently in use.""" + if self.track_usage: + self.in_use_value += float(self._start) + self.total_value += float(self.numel) + return self.data[:self._start] + + + def print_average_usage(self): + """Print memory usage average over time. We would like this value + to be as high as possible.""" + assert self.track_usage, 'You need to enable track usage.'
+ if torch.distributed.get_rank() == 0: + print(' > usage of {} memory buffer: {:.2f} %'.format( + self.name, self.in_use_value * 100.0 / self.total_value), + flush=True) + + + +class RingMemBuffer: + """A ring of memory buffers.""" + + def __init__(self, name, num_buffers, numel, dtype, track_usage): + self.num_buffers = num_buffers + self.buffers = [ + allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage) + for i in range(num_buffers)] + self._index = -1 + + + def get_next_buffer(self): + self._index += 1 + self._index = self._index % self.num_buffers + buff = self.buffers[self._index] + assert not buff.is_in_use(), 'buffer is already in use.' + return buff diff --git a/training/DeepSpeed-Domino/megatron/microbatches.py b/training/DeepSpeed-Domino/megatron/microbatches.py new file mode 100644 index 000000000..6449d7479 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/microbatches.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron number of micro-batches calculators.""" + +from abc import ABC +from abc import abstractmethod + + +def build_num_microbatches_calculator(args): + + # Constant num micro-batches. + if args.rampup_batch_size is None: + num_microbatches_calculator = ConstantNumMicroBatches( + args.global_batch_size, args.micro_batch_size, + args.data_parallel_size) + if args.rank == 0: + print('setting number of micro-batches to constant {}'.format( + num_microbatches_calculator.get()), flush=True) + + else: + assert len(args.rampup_batch_size) == 3, 'expected the following ' \ + 'format: --rampup-batch-size ' \ + ' ' + start_batch_size = int(args.rampup_batch_size[0]) + batch_size_increment = int(args.rampup_batch_size[1]) + ramup_samples = int(args.rampup_batch_size[2]) + if args.rank == 0: + print('will use batch size rampup starting from global batch ' + 'size {} to global batch size {} with batch size increments ' + '{} over {} samples.'.format(start_batch_size, + args.global_batch_size, + batch_size_increment, + ramup_samples), flush=True) + num_microbatches_calculator = RampupBatchsizeNumMicroBatches( + start_batch_size, batch_size_increment, ramup_samples, + args.global_batch_size, args.micro_batch_size, + args.data_parallel_size) + + return num_microbatches_calculator + + +class NumMicroBatchesCalculator(ABC): + + def __init__(self): + self.num_micro_batches = None + self.current_global_batch_size = None + + def get(self): + return self.num_micro_batches + + def get_current_global_batch_size(self): + return self.current_global_batch_size + + @abstractmethod + def update(self, consumed_samples, consistency_check): + pass + + +class ConstantNumMicroBatches(NumMicroBatchesCalculator): + + def __init__(self, global_batch_size, micro_batch_size, data_parallel_size): + micro_batch_times_data_parallel = micro_batch_size * \ + data_parallel_size + assert global_batch_size % micro_batch_times_data_parallel == 0, \ + 'global batch size ({}) is not divisible by micro batch size ({})' \ + ' times data parallel size ({})'.format(global_batch_size, + micro_batch_size, + data_parallel_size) + self.num_micro_batches = global_batch_size // \ + micro_batch_times_data_parallel + assert self.num_micro_batches >= 1 + self.current_global_batch_size = global_batch_size + + def update(self, consumed_samples, consistency_check): + pass + + +class RampupBatchsizeNumMicroBatches(NumMicroBatchesCalculator): + + def __init__(self, start_batch_size, batch_size_increment, ramup_samples, + global_batch_size, micro_batch_size, 
data_parallel_size): + """Batch size ramp up. + Over + steps = (global-batch-size - start-batch-size) / batch_size_increment + increment batch size from start-batch-size to global-batch-size using + rampup-samples / steps + samples. + Arguments: + start_batch_size: global batch size to start with + batch_size_increment: global batch size increments + ramup_samples: number of samples to use ramp up global + batch size from `start_batch_size` to `global_batch_size` + global_batch_size: global batch size post rampup + micro_batch_size: micro batch size + data_parallel_size: data parallel size. + """ + + self.micro_batch_size = micro_batch_size + self.data_parallel_size = data_parallel_size + self.micro_batch_times_data_parallel_size = self.micro_batch_size * \ + self.data_parallel_size + assert self.micro_batch_times_data_parallel_size > 0 + + assert start_batch_size > 0 + self.start_batch_size = start_batch_size + + assert global_batch_size > 0 + self.global_batch_size = global_batch_size + diff_batch_size = self.global_batch_size - self.start_batch_size + assert diff_batch_size >= 0 + assert batch_size_increment > 0 + self.batch_size_increment = batch_size_increment + assert diff_batch_size % batch_size_increment == 0, 'expected ' \ + 'global batch size interval ({}) to be divisible by global batch ' \ + 'size increment ({})'.format(diff_batch_size, batch_size_increment) + + num_increments = diff_batch_size // self.batch_size_increment + self.ramup_samples = ramup_samples + assert self.ramup_samples >= 0 + self.rampup_samples_per_increment = self.ramup_samples / num_increments + + # Initialize number of microbatches. + self.update(0, False) + + + def update(self, consumed_samples, consistency_check): + + if consumed_samples > self.ramup_samples: + self.current_global_batch_size = self.global_batch_size + else: + steps = int(consumed_samples / self.rampup_samples_per_increment) + self.current_global_batch_size = self.start_batch_size + \ + steps * self.batch_size_increment + assert self.current_global_batch_size <= self.global_batch_size + + if consistency_check: + assert self.current_global_batch_size % \ + self.micro_batch_times_data_parallel_size == 0, 'current global ' \ + 'batch size ({}) is not divisible by micro-batch-size ({}) times' \ + 'data parallel size ({})'.format(self.current_global_batch_size, + self.micro_batch_size, + self.data_parallel_size) + self.num_micro_batches = self.current_global_batch_size // \ + self.micro_batch_times_data_parallel_size diff --git a/training/DeepSpeed-Domino/megatron/model/__init__.py b/training/DeepSpeed-Domino/megatron/model/__init__.py new file mode 100644 index 000000000..f5025bf25 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm + +from .distributed import DistributedDataParallel +from .bert_model import BertModel +from .gpt_model import GPTModel +from .t5_model import T5Model +from .language_model import get_language_model +from .module import Float16Module diff --git a/training/DeepSpeed-Domino/megatron/model/bert_model.py b/training/DeepSpeed-Domino/megatron/model/bert_model.py new file mode 100644 index 000000000..0750d7e6c --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/bert_model.py @@ -0,0 +1,250 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ +"""BERT model.""" + +import torch + +from megatron import get_args +from megatron.core import tensor_parallel +from megatron.model.enums import AttnMaskType +from megatron.model.language_model import parallel_lm_logits +from megatron.model.language_model import get_language_model +from megatron.model import LayerNorm +from megatron.model.utils import openai_gelu, erf_gelu +from megatron.model.utils import get_linear_layer +from megatron.model.utils import init_method_normal +from megatron.model.utils import scaled_init_method_normal +from .module import MegatronModule + + +def bert_extended_attention_mask(attention_mask): + # We create a 3D attention mask from a 2D tensor mask. + # [b, 1, s] + attention_mask_b1s = attention_mask.unsqueeze(1) + # [b, s, 1] + attention_mask_bs1 = attention_mask.unsqueeze(2) + # [b, s, s] + attention_mask_bss = attention_mask_b1s * attention_mask_bs1 + # [b, 1, s, s] + extended_attention_mask = attention_mask_bss.unsqueeze(1) + + # Convert attention mask to binary: + extended_attention_mask = (extended_attention_mask < 0.5) + + return extended_attention_mask + +def bert_position_ids(token_ids): + # Create position ids + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, + device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids + + +class BertLMHead(MegatronModule): + """Masked LM head for Bert + + Arguments: + config: TransformerConfig object + mpu_vocab_size: model parallel size of vocabulary. + hidden_size: hidden size + parallel_output: whether output logits being distributed or not. + """ + + def __init__(self, mpu_vocab_size, hidden_size, config, parallel_output): + super().__init__(config=config) + + args = get_args() + self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) + tensor_parallel.set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + self.parallel_output = parallel_output + + self.dense = get_linear_layer(hidden_size, hidden_size, config.init_method) + setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) + setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) + + self.layernorm = LayerNorm(hidden_size, + eps=config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel) + self.gelu = torch.nn.functional.gelu + if args.openai_gelu: + self.gelu = openai_gelu + elif args.onnx_safe: + self.gelu = erf_gelu + + def forward(self, hidden_states, word_embeddings_weight): + hidden_states = self.dense(hidden_states) + hidden_states = self.gelu(hidden_states) + hidden_states = self.layernorm(hidden_states) + output = parallel_lm_logits(hidden_states, + word_embeddings_weight, + self.parallel_output, + bias=self.bias) + return output + + +def post_language_model_processing(lm_output, pooled_output, + lm_head, binary_head, + lm_labels, + logit_weights, + fp16_lm_cross_entropy): + # Output. 
+ lm_logits = lm_head( + lm_output, logit_weights) + + binary_logits = None + if binary_head is not None: + binary_logits = binary_head(pooled_output) + + if lm_labels is None: + # [s b h] => [b s h] + return lm_logits.transpose(0,1).contiguous(), binary_logits + else: + # [b s] => [s b] + lm_labels = lm_labels.transpose(0,1).contiguous() + # lm_logits : [s, b, h] and lm_labels: [s, b] + if fp16_lm_cross_entropy: + assert lm_logits.dtype == torch.half + lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits, lm_labels) + else: + lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits.float(), + lm_labels) + # [s, b] => [b s] + lm_loss = lm_loss.transpose(0,1).contiguous() + return lm_loss, binary_logits + + +class BertModel(MegatronModule): + """Bert Language model.""" + + def __init__(self, + config, + num_tokentypes=2, + add_binary_head=True, + parallel_output=True, + pre_process=True, + post_process=True): + super().__init__(config=config) + args = get_args() + + # TODO this option is not yet implemented in BERT + assert args.untie_embeddings_and_output_weights is False + + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + self.add_binary_head = add_binary_head + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + + self.return_embeddings = args.output_bert_embeddings + if self.return_embeddings: + assert self.post_process and self.add_binary_head + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=self.add_binary_head, + encoder_attn_mask_type=AttnMaskType.padding, + pre_process=self.pre_process, + post_process=self.post_process) + + self.initialize_word_embeddings() + if self.post_process: + self.lm_head = BertLMHead(self.shared_embedding_or_output_weight().size(0), config.hidden_size, + config, parallel_output) + self._lm_head_key = 'lm_head' + self.binary_head = None + if self.add_binary_head: + self.binary_head = get_linear_layer(config.hidden_size, 2, + config.init_method) + self._binary_head_key = 'binary_head' + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, bert_model_input, attention_mask, + tokentype_ids=None, lm_labels=None): + + extended_attention_mask = bert_extended_attention_mask(attention_mask) + input_ids = bert_model_input + position_ids = bert_position_ids(input_ids) + + lm_output = self.language_model( + input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids + ) + + if self.post_process and self.add_binary_head: + lm_output, pooled_output = lm_output + + # Return pooled output (e.g., when computing Bert embeddings). + if self.return_embeddings: + + # Sum attention mask. + embeddings = torch.transpose(lm_output, 0, 1) + masks = torch.sum(attention_mask, dim=1) + + # Collect masked embeddings. 
+ output = torch.zeros( + size=(embeddings.shape[0], embeddings.shape[2]), + dtype=torch.float32, + device=torch.cuda.current_device()) + for i, (embedding, mask) in enumerate(zip(embeddings, masks)): + output[i, :] = torch.mean(embedding[1: mask - 1], dim=0) + + return output + + else: + pooled_output = None + + if self.post_process: + return post_language_model_processing(lm_output, pooled_output, + self.lm_head, self.binary_head, + lm_labels, + self.shared_embedding_or_output_weight(), + self.fp16_lm_cross_entropy) + else: + return lm_output + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + state_dict_[self._lm_head_key] \ + = self.lm_head.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process and self.add_binary_head: + state_dict_[self._binary_head_key] \ + = self.binary_head.state_dict(prefix=prefix, keep_vars=keep_vars) + # Save word_embeddings. + if self.post_process and not self.pre_process: + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + if self.post_process: + self.lm_head.load_state_dict( + state_dict[self._lm_head_key], strict=strict) + if self.post_process and self.add_binary_head: + self.binary_head.load_state_dict( + state_dict[self._binary_head_key], strict=strict) + # Load word_embeddings. 
+ if self.post_process and not self.pre_process: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) diff --git a/training/DeepSpeed-Domino/megatron/model/biencoder_model.py b/training/DeepSpeed-Domino/megatron/model/biencoder_model.py new file mode 100644 index 000000000..c910879dc --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/biencoder_model.py @@ -0,0 +1,328 @@ +import os +import torch +import sys + +from megatron import get_args, print_rank_0, get_tokenizer +from megatron.core import mpu +from megatron.checkpointing import fix_query_key_value_ordering +from megatron.checkpointing import get_checkpoint_tracker_filename +from megatron.checkpointing import get_checkpoint_name +from megatron.model.bert_model import bert_position_ids +from megatron.model.enums import AttnMaskType +from megatron.model.language_model import get_language_model +from megatron.model.utils import get_linear_layer +from megatron.model.utils import init_method_normal +from megatron.model.utils import scaled_init_method_normal +from .module import MegatronModule + +def get_model_provider(only_query_model=False, only_context_model=False, + biencoder_shared_query_context_model=False): + + def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building Bienoder model ...') + model = biencoder_model_provider(only_query_model=only_query_model, + only_context_model = only_context_model, + biencoder_shared_query_context_model = \ + biencoder_shared_query_context_model, + pre_process=pre_process, post_process=post_process) + + return model + + return model_provider + + +def biencoder_model_provider(only_query_model=False, + only_context_model=False, + biencoder_shared_query_context_model=False, + pre_process=True, + post_process=True): + """Build the model.""" + + assert mpu.get_tensor_model_parallel_world_size() == 1 and \ + mpu.get_pipeline_model_parallel_world_size() == 1, \ + "Model parallel size > 1 not supported for ICT" + + print_rank_0('building BiEncoderModel...') + + # simpler to just keep using 2 tokentypes since + # the LM we initialize with has 2 tokentypes + model = BiEncoderModel( + num_tokentypes=2, + parallel_output=False, + only_query_model=only_query_model, + only_context_model=only_context_model, + biencoder_shared_query_context_model=\ + biencoder_shared_query_context_model, + pre_process=pre_process, + post_process=post_process) + + return model + + +class BiEncoderModel(MegatronModule): + """Bert-based module for Biencoder model.""" + + def __init__(self, + num_tokentypes=1, + parallel_output=True, + only_query_model=False, + only_context_model=False, + biencoder_shared_query_context_model=False, + pre_process=True, + post_process=True): + super(BiEncoderModel, self).__init__() + args = get_args() + + bert_kwargs = dict( + num_tokentypes=num_tokentypes, + parallel_output=parallel_output, + pre_process=pre_process, + post_process=post_process) + + self.biencoder_shared_query_context_model = \ + biencoder_shared_query_context_model + assert not (only_context_model and only_query_model) + self.use_context_model = not only_query_model + self.use_query_model = not only_context_model + self.biencoder_projection_dim = args.biencoder_projection_dim + + if self.biencoder_shared_query_context_model: + self.model = PretrainedBertModel(**bert_kwargs) + self._model_key = 'shared_model' + self.query_model, self.context_model = self.model, self.model + else: + if self.use_query_model: + # this model 
embeds (pseudo-)queries - Embed_input in the paper + self.query_model = PretrainedBertModel(**bert_kwargs) + self._query_key = 'query_model' + + if self.use_context_model: + # this model embeds evidence blocks - Embed_doc in the paper + self.context_model = PretrainedBertModel(**bert_kwargs) + self._context_key = 'context_model' + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + # this is just a placeholder and will be needed when model + # parallelism will be used + # self.language_model.set_input_tensor(input_tensor) + return + + def forward(self, query_tokens, query_attention_mask, query_types, + context_tokens, context_attention_mask, context_types): + """Run a forward pass for each of the models and + return the respective embeddings.""" + + if self.use_query_model: + query_logits = self.embed_text(self.query_model, + query_tokens, + query_attention_mask, + query_types) + else: + raise ValueError("Cannot embed query without the query model.") + if self.use_context_model: + context_logits = self.embed_text(self.context_model, + context_tokens, + context_attention_mask, + context_types) + else: + raise ValueError("Cannot embed block without the block model.") + return query_logits, context_logits + + @staticmethod + def embed_text(model, tokens, attention_mask, token_types): + """Embed a batch of tokens using the model""" + logits = model(tokens, + attention_mask, + token_types) + return logits + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Save dict with state dicts of each of the models.""" + state_dict_ = {} + if self.biencoder_shared_query_context_model: + state_dict_[self._model_key] = \ + self.model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + else: + if self.use_query_model: + state_dict_[self._query_key] = \ + self.query_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + if self.use_context_model: + state_dict_[self._context_key] = \ + self.context_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Load the state dicts of each of the models""" + if self.biencoder_shared_query_context_model: + print_rank_0("Loading shared query-context model") + self.model.load_state_dict(state_dict[self._model_key], \ + strict=strict) + else: + if self.use_query_model: + print_rank_0("Loading query model") + self.query_model.load_state_dict( \ + state_dict[self._query_key], strict=strict) + + if self.use_context_model: + print_rank_0("Loading context model") + self.context_model.load_state_dict( \ + state_dict[self._context_key], strict=strict) + + def init_state_dict_from_bert(self): + """Initialize the state from a pretrained BERT model + on iteration zero of ICT pretraining""" + args = get_args() + + if args.bert_load is None: + print_rank_0("bert-load argument is None") + return + + tracker_filename = get_checkpoint_tracker_filename(args.bert_load) + if not os.path.isfile(tracker_filename): + raise FileNotFoundError("Could not find BERT checkpoint") + with open(tracker_filename, 'r') as f: + iteration = int(f.read().strip()) + assert iteration > 0 + + checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False) + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading BERT checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + # Load the checkpoint. 
+ try: + state_dict = torch.load(checkpoint_name, map_location='cpu') + except ModuleNotFoundError: + from megatron.fp16_deprecated import loss_scaler + # For backward compatibility. + print_rank_0(' > deserializing using the old code structure ...') + sys.modules['fp16.loss_scaler'] = sys.modules[ + 'megatron.fp16_deprecated.loss_scaler'] + sys.modules['megatron.fp16.loss_scaler'] = sys.modules[ + 'megatron.fp16_deprecated.loss_scaler'] + state_dict = torch.load(checkpoint_name, map_location='cpu') + sys.modules.pop('fp16.loss_scaler', None) + sys.modules.pop('megatron.fp16.loss_scaler', None) + except BaseException: + print_rank_0('could not load the BERT checkpoint') + sys.exit() + + checkpoint_version = state_dict.get('checkpoint_version', 0) + + # load the LM state dict into each model + model_dict = state_dict['model']['language_model'] + + if self.biencoder_shared_query_context_model: + self.model.language_model.load_state_dict(model_dict) + fix_query_key_value_ordering(self.model, checkpoint_version) + else: + if self.use_query_model: + self.query_model.language_model.load_state_dict(model_dict) + # give each model the same ict_head to begin with as well + if self.biencoder_projection_dim > 0: + query_proj_state_dict = \ + self.state_dict_for_save_checkpoint()\ + [self._query_key]['projection_enc'] + fix_query_key_value_ordering(self.query_model, checkpoint_version) + + if self.use_context_model: + self.context_model.language_model.load_state_dict(model_dict) + if self.query_model is not None and \ + self.biencoder_projection_dim > 0: + self.context_model.projection_enc.load_state_dict\ + (query_proj_state_dict) + fix_query_key_value_ordering(self.context_model, checkpoint_version) + + +class PretrainedBertModel(MegatronModule): + """BERT-based encoder for queries or contexts used for + learned information retrieval.""" + + def __init__(self, num_tokentypes=2, + parallel_output=True, pre_process=True, post_process=True): + super(PretrainedBertModel, self).__init__() + + args = get_args() + tokenizer = get_tokenizer() + self.pad_id = tokenizer.pad + self.biencoder_projection_dim = args.biencoder_projection_dim + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + init_method = init_method_normal(args.init_method_std) + scaled_init_method = scaled_init_method_normal( + args.init_method_std, args.num_layers) + + self.language_model, self._language_model_key = get_language_model( + num_tokentypes=num_tokentypes, + add_pooler=False, + encoder_attn_mask_type=AttnMaskType.padding, + init_method=init_method, + scaled_init_method=scaled_init_method, + pre_process=self.pre_process, + post_process=self.post_process) + + if args.biencoder_projection_dim > 0: + self.projection_enc = get_linear_layer(args.hidden_size, + args.biencoder_projection_dim, + init_method) + self._projection_enc_key = 'projection_enc' + + def forward(self, input_ids, attention_mask, tokentype_ids=None): + extended_attention_mask = attention_mask.unsqueeze(1) + #extended_attention_mask = bert_extended_attention_mask(attention_mask) + position_ids = bert_position_ids(input_ids) + + lm_output = self.language_model(input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids) + # This mask will be used in average-pooling and max-pooling + pool_mask = (input_ids == self.pad_id).unsqueeze(2) + + # Taking the representation of the [CLS] token of BERT + pooled_output = lm_output[0, :, :] + + # Converting to float16 dtype + pooled_output = 
pooled_output.to(lm_output.dtype) + + # Output. + if self.biencoder_projection_dim: + pooled_output = self.projection_enc(pooled_output) + + return pooled_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + if self.biencoder_projection_dim > 0: + state_dict_[self._projection_enc_key] = \ + self.projection_enc.state_dict(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + print_rank_0("loading pretrained weights") + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + + if self.biencoder_projection_dim > 0: + print_rank_0("loading projection head weights") + self.projection_enc.load_state_dict( + state_dict[self._projection_enc_key], strict=strict) diff --git a/training/DeepSpeed-Domino/megatron/model/classification.py b/training/DeepSpeed-Domino/megatron/model/classification.py new file mode 100644 index 000000000..bac50c54c --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/classification.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Classification model.""" + +import torch + +from megatron import get_args, print_rank_last +from megatron.model.enums import AttnMaskType +from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.model.language_model import get_language_model +from megatron.model.utils import get_linear_layer +from megatron.model.utils import init_method_normal +from megatron.model.utils import scaled_init_method_normal +from .module import MegatronModule + + +class Classification(MegatronModule): + + def __init__(self, + config, + num_classes, + num_tokentypes=2, + pre_process=True, + post_process=True): + super().__init__(config=config, share_embeddings_and_output_weights=False) + args = get_args() + + self.num_classes = num_classes + self.pre_process = pre_process + self.post_process = post_process + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=True, + encoder_attn_mask_type=AttnMaskType.padding, + pre_process=self.pre_process, + post_process=self.post_process) + + # Multi-choice head. + if self.post_process: + self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) + self.classification_head = get_linear_layer(args.hidden_size, + self.num_classes, + init_method) + self._classification_head_key = 'classification_head' + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, model_input, attention_mask, tokentype_ids=None): + + extended_attention_mask = bert_extended_attention_mask(attention_mask) + input_ids = model_input + position_ids = bert_position_ids(input_ids) + + lm_output = self.language_model( + input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids + ) + + if self.post_process: + _, pooled_output = lm_output + classification_output = self.classification_dropout(pooled_output) + classification_logits = self.classification_head(classification_output) + + # Reshape back to separate choices. 
+ classification_logits = classification_logits.view(-1, self.num_classes) + + return classification_logits + return lm_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + state_dict_[self._classification_head_key] \ + = self.classification_head.state_dict(prefix=prefix, keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + if self.post_process: + if self._classification_head_key in state_dict: + self.classification_head.load_state_dict( + state_dict[self._classification_head_key], strict=strict) + else: + print_rank_last('***WARNING*** could not find {} in the checkpoint, ' + 'initializing to random'.format( + self._classification_head_key)) diff --git a/training/DeepSpeed-Domino/megatron/model/distributed.py b/training/DeepSpeed-Domino/megatron/model/distributed.py new file mode 100644 index 000000000..87d5f258d --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/distributed.py @@ -0,0 +1,232 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from abc import ABC +from abc import abstractmethod +import math + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from megatron import get_args +from megatron.core import mpu +from .module import MegatronModule + + +class MemoryBuffer: + + def __init__(self, numel, numel_padded, dtype): + self.numel = numel + self.numel_padded = numel_padded + self.dtype = dtype + self.data = torch.zeros(self.numel_padded, + dtype=self.dtype, + device=torch.cuda.current_device(), + requires_grad=False) + + def zero(self): + """Reset the buffer to zero.""" + self.data.zero_() + + + def get(self, shape, start_index): + """Return a tensor with the input `shape` as a view into the + 1-D data starting at `start_index`.""" + end_index = start_index + shape.numel() + assert end_index <= self.numel, \ + 'requested tensor is out of the buffer range.' + buffer_tensor = self.data[start_index:end_index] + buffer_tensor = buffer_tensor.view(shape) + return buffer_tensor + + + +class DistributedDataParallelBase(MegatronModule, ABC): + """Abstract class for DDP.""" + + def __init__(self, module): + super(DistributedDataParallelBase, self).__init__() + # Keep a pointer to the model. + self.module = module + + + @abstractmethod + def allreduce_gradients(self): + pass + + + def forward(self, *inputs, **kwargs): + return self.module(*inputs, **kwargs) + + + def state_dict(self, prefix='', keep_vars=False): + return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + return self.module.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + + + +class DistributedDataParallel(DistributedDataParallelBase): + """DDP with contiguous buffers options to store and accumulate gradients. + This class: + - has the potential to reduce memory fragmentation. 
+ - provides the option to do the gradient accumulation
+ in a type other than the params type (for example fp32)
+
+ Arguments:
+ module: input model.
+ accumulate_allreduce_grads_in_fp32: if true do the gradient accumulation
+ and the gradient all-reduce all in float32. If this option is
+ true, we require `use_contiguous_buffers` to be true too.
+ use_contiguous_buffers: if true, use a contiguous buffer to store the
+ gradients.
+ """
+
+ def __init__(self, module,
+ accumulate_allreduce_grads_in_fp32,
+ use_contiguous_buffers):
+
+ super(DistributedDataParallel, self).__init__(module)
+
+ self.accumulate_allreduce_grads_in_fp32 \
+ = accumulate_allreduce_grads_in_fp32
+ self.use_contiguous_buffers = use_contiguous_buffers
+ # If we are using fp32-accumulate-allreduce explicitly
+ # this means we need main grads in a contiguous buffer.
+ if self.accumulate_allreduce_grads_in_fp32:
+ assert self.use_contiguous_buffers
+
+ # ===================================
+ # Rest of this part applies only to
+ # the case we use contiguous buffers.
+ # ===================================
+ self._grad_buffers = None
+ self._grad_buffer_param_index_map = None
+ if self.use_contiguous_buffers:
+ self._grad_buffers = {}
+ self._grad_buffer_param_index_map = {}
+ data_parallel_world_size = mpu.get_data_parallel_world_size()
+
+ # Simple function to define buffer type.
+ def _get_buffer_type(param):
+ return torch.float if \
+ self.accumulate_allreduce_grads_in_fp32 else param.dtype
+
+ # First calculate total number of elements per type.
+ type_num_elements = {}
+ for param in self.module.parameters():
+ if param.requires_grad:
+ dtype = _get_buffer_type(param)
+ type_num_elements[dtype] = type_num_elements.get(dtype, 0) \
+ + param.data.nelement()
+
+ # Allocate the buffer.
+ for dtype, num_elements in type_num_elements.items():
+
+ # If using distributed optimizer, pad memory buffer to be
+ # multiple of data_parallel_world_size. (This padding is done
+ # due to a constraint with the reduce_scatter op, which requires
+ # all tensors have equal size. See: optimizer.py.)
+ num_elements_padded = data_parallel_world_size * \
+ int(math.ceil(num_elements / data_parallel_world_size))
+
+ # Allocate grad buffer.
+ self._grad_buffers[dtype] = MemoryBuffer(num_elements,
+ num_elements_padded,
+ dtype)
+
+ # Assume the back prop order is the reverse of the params order,
+ # store the start index for the gradients.
+ for param in self.module.parameters():
+ if param.requires_grad:
+ dtype = _get_buffer_type(param)
+ type_num_elements[dtype] -= param.data.nelement()
+ param.main_grad = self._grad_buffers[dtype].get(
+ param.data.shape, type_num_elements[dtype])
+ if dtype not in self._grad_buffer_param_index_map:
+ self._grad_buffer_param_index_map[dtype] = {}
+ self._grad_buffer_param_index_map[dtype][param] = (
+ type_num_elements[dtype],
+ type_num_elements[dtype] + param.data.nelement(),
+ )
+
+ # Backward hook.
+ # Accumulation function for the gradients. We need
+ # to store them so they don't go out of scope.
+ self.grad_accs = []
+ # Loop over all the parameters in the model.
+ for param in self.module.parameters():
+ if param.requires_grad:
+ # Expand so we get access to grad_fn.
+ param_tmp = param.expand_as(param)
+ # Get the gradient accumulator function.
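+ # `param.expand_as(param)` is a no-op view of the parameter; its grad_fn's
+ # next_functions[0][0] is the parameter's AccumulateGrad node, so hooking
+ # that node fires once per parameter as soon as its gradient is ready.
+ # Conceptually (illustrative sketch only, not executed here):
+ #   acc = p.expand_as(p).grad_fn.next_functions[0][0]
+ #   acc.register_hook(lambda *unused: p.main_grad.add_(p.grad))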
+ grad_acc = param_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(self._make_param_hook(param)) + self.grad_accs.append(grad_acc) + + + def _make_param_hook(self, param): + """Create the all-reduce hook for backprop.""" + # Hook used for back-prop. + def param_hook(*unused): + # Add the gradient to the buffer. + if param.grad is not None: + # The gradient function of linear layers is fused with GEMMs + param.main_grad.add_(param.grad.data) + # Now we can deallocate grad memory. + param.grad = None + return param_hook + + + def zero_grad_buffer(self): + """Set the grad buffer data to zero. Needs to be called at the + begining of each iteration.""" + assert self._grad_buffers is not None, 'buffers are not initialized.' + for _, buffer_ in self._grad_buffers.items(): + buffer_.zero() + + + def broadcast_params(self): + for param in self.module.parameters(): + torch.distributed.broadcast(param.data, + src=mpu.get_data_parallel_src_rank(), + group=mpu.get_data_parallel_group()) + + + def allreduce_gradients(self): + """Reduce gradients across data parallel ranks.""" + # If we have buffers, simply reduce the data in the buffer. + if self._grad_buffers is not None: + for _, buffer_ in self._grad_buffers.items(): + buffer_.data /= mpu.get_data_parallel_world_size() + torch.distributed.all_reduce( + buffer_.data, group=mpu.get_data_parallel_group()) + else: + # Otherwise, bucketize and all-reduce + buckets = {} + # Pack the buckets. + for param in self.module.parameters(): + if param.requires_grad and param.grad is not None: + tp = param.data.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + param.main_grad = param.grad + + # For each bucket, all-reduce and copy all-reduced grads. + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + coalesced /= mpu.get_data_parallel_world_size() + torch.distributed.all_reduce( + coalesced, group=mpu.get_data_parallel_group()) + for buf, synced in zip(grads, _unflatten_dense_tensors( + coalesced, grads)): + buf.copy_(synced) diff --git a/training/DeepSpeed-Domino/megatron/model/enums.py b/training/DeepSpeed-Domino/megatron/model/enums.py new file mode 100644 index 000000000..bc4e4aa29 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/enums.py @@ -0,0 +1,21 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import enum + +class LayerType(enum.Enum): + encoder = 1 + decoder = 2 + retro_encoder = 3 + retro_decoder = 4 + retro_decoder_with_retriever = 5 + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + +# For backward compatibility with old model checkpoints +from megatron.core.enums import ModelType diff --git a/training/DeepSpeed-Domino/megatron/model/fused_bias_gelu.py b/training/DeepSpeed-Domino/megatron/model/fused_bias_gelu.py new file mode 100644 index 000000000..29222db02 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/fused_bias_gelu.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ +import torch + + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + +@torch.jit.script +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff*g + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + +bias_gelu_impl = GeLUFunction.apply diff --git a/training/DeepSpeed-Domino/megatron/model/fused_layer_norm.py b/training/DeepSpeed-Domino/megatron/model/fused_layer_norm.py new file mode 100644 index 000000000..fd8591e4a --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/fused_layer_norm.py @@ -0,0 +1,92 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""This code is copied fron NVIDIA apex: + https://github.com/NVIDIA/apex + with some changes. """ + +import numbers +import torch +from torch.nn.parameter import Parameter +from torch.nn import init +import importlib + +from megatron.core.utils import make_viewless_tensor + +try: + from apex.contrib.layer_norm.layer_norm import FastLayerNormFN + HAVE_PERSIST_LAYER_NORM = True +except: + HAVE_PERSIST_LAYER_NORM = False + +from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction + + +global fused_layer_norm_cuda +fused_layer_norm_cuda = None + + +class MixedFusedLayerNorm(torch.nn.Module): + + def __init__(self, normalized_shape, eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_layernorm_1p=False): + super(MixedFusedLayerNorm, self).__init__() + + self.apply_layernorm_1p = apply_layernorm_1p + + global fused_layer_norm_cuda + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. 
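+ # For example, a model with hidden_size=4096 can use apex's persistent
+ # FastLayerNormFN (when requested and importable), while an unlisted size
+ # such as 5000 always falls back to FusedLayerNormAffineFunction in
+ # forward() below.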
+ persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096, + 5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480, + 24576, 25600, 30720, 32768, 40960, 49152, 65536] + if normalized_shape not in persist_ln_hidden_sizes or \ + not HAVE_PERSIST_LAYER_NORM: + no_persist_layer_norm = True + + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + self.normalized_shape = torch.Size(normalized_shape) + self.eps = eps + self.weight = Parameter(torch.Tensor(*normalized_shape)) + self.bias = Parameter(torch.Tensor(*normalized_shape)) + self.reset_parameters() + self.no_persist_layer_norm = no_persist_layer_norm + self.sequence_parallel = sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.weight, 'sequence_parallel', self.sequence_parallel) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) + + + def reset_parameters(self): + + if self.apply_layernorm_1p: + init.zeros_(self.weight) + init.zeros_(self.bias) + else: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input): + + weight = self.weight + 1 if self.apply_layernorm_1p else self.weight + + if self.no_persist_layer_norm: + return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps) + else: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + output = make_viewless_tensor(inp = output, + requires_grad = input.requires_grad, + keep_graph = True) + + return output diff --git a/training/DeepSpeed-Domino/megatron/model/fused_softmax.py b/training/DeepSpeed-Domino/megatron/model/fused_softmax.py new file mode 100644 index 000000000..9bacf3374 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/fused_softmax.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +import torch +import torch.nn as nn +from megatron.model.enums import AttnMaskType + + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_upper_triang_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( + inputs, scale_t[0] + ) + + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_upper_triang_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + + return input_grads, None + + +class ScaledMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. 
+ """ + + @staticmethod + def forward(ctx, inputs, mask, scale): + import scaled_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class ScaledSoftmax(torch.autograd.Function): + """ + Fused operation which performs following two operations in sequence + 1. Scale the tensor. + 2. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_softmax_cuda.forward( + inputs, scale_t[0] + ) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class FusedScaleMaskSoftmax(nn.Module): + """ + fused operation: scaling + mask + softmax + + Arguments: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. + """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super(FusedScaleMaskSoftmax, self).__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + assert not ( + self.input_in_fp16 and self.input_in_bf16 + ), "both fp16 and bf16 flags cannot be active at the same time." 
+ self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + assert ( + self.scale is None or softmax_in_fp32 + ), "softmax should be in fp32 when scaled" + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + + if self.is_kernel_available(mask, *input.size()): + return self.forward_fused_softmax(input, mask) + else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and 16 < sk <= 16384 # sk must be 16 ~ 16384 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 16384: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + b, np, sq, sk = input.size() + scale = self.scale if self.scale is not None else 1.0 + + if self.attn_mask_type == AttnMaskType.causal: + assert sq == sk, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, sq, sk) + input = input.view(-1, sq, sk) + probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) + return probs.view(b, np, sq, sk) + else: + # input is 4D tensor (b, np, sq, sk) + if mask is not None: + return ScaledMaskedSoftmax.apply(input, mask, scale) + else: + return ScaledSoftmax.apply(input, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + import scaled_masked_softmax_cuda + + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/training/DeepSpeed-Domino/megatron/model/gpt_model.py b/training/DeepSpeed-Domino/megatron/model/gpt_model.py new file mode 100644 index 000000000..dd47188da --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/gpt_model.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""GPT-2 model.""" + +import torch + +from megatron import get_args +from megatron.core import tensor_parallel +from .module import MegatronModule + +from .enums import AttnMaskType +from .language_model import parallel_lm_logits +from .language_model import get_language_model + + +def post_language_model_processing(lm_output, labels, logit_weights, + parallel_output, + fp16_lm_cross_entropy): + + # Output. 
Format [s b h] + output = parallel_lm_logits( + lm_output, + logit_weights, + parallel_output) + + if labels is None: + # [s b h] => [b s h] + return output.transpose(0,1).contiguous() + else: + # [b s] => [s b] + labels = labels.transpose(0,1).contiguous() + if fp16_lm_cross_entropy: + assert output.dtype == torch.half + loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels) + else: + loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels) + + # [s b] => [b, s] + loss = loss.transpose(0,1).contiguous() + return loss + + +class GPTModel(MegatronModule): + """GPT-2 Language model.""" + + def __init__(self, + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True): + args = get_args() + super().__init__(config=config, share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) + + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=False, + encoder_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process) + + if not args.untie_embeddings_and_output_weights: + self.initialize_word_embeddings() + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, input_ids, position_ids, attention_mask, + retriever_input_ids=None, + retriever_position_ids=None, + retriever_attn_mask=None, + labels=None, tokentype_ids=None, inference_params=None): + + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + retriever_input_ids=retriever_input_ids, + retriever_position_ids=retriever_position_ids, + retriever_attn_mask=retriever_attn_mask, + inference_params=inference_params) + + if self.post_process: + return post_language_model_processing( + lm_output, labels, + self.language_model.output_layer.weight if self.untie_embeddings_and_output_weights else self.shared_embedding_or_output_weight(), + self.parallel_output, + self.fp16_lm_cross_entropy) + else: + return lm_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + # Save word_embeddings. + if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights: + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Load word_embeddings. 
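+ # Only a last pipeline stage that does not also hold the input embedding
+ # keeps this separate `word_embeddings` copy for the tied output head (see
+ # initialize_word_embeddings in module.py), so the key is present only for
+ # that layout.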
+ if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) + if self._language_model_key in state_dict: + state_dict = state_dict[self._language_model_key] + self.language_model.load_state_dict(state_dict, strict=strict) diff --git a/training/DeepSpeed-Domino/megatron/model/language_model.py b/training/DeepSpeed-Domino/megatron/model/language_model.py new file mode 100644 index 000000000..85b5dc5cb --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/language_model.py @@ -0,0 +1,634 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Transformer based language model.""" + +import torch +import torch.nn.functional as F + +from megatron import get_args +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.core.models.common.rotary_pos_embedding import RotaryEmbedding + +from .enums import AttnMaskType, LayerType +from .module import MegatronModule +from .transformer import ParallelTransformer +from .utils import get_linear_layer +from .utils import init_method_normal, scaled_init_method_normal + + +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, + bias=None): + """LM logits using word embedding weights.""" + args = get_args() + # Parallel logits. + if args.async_tensor_model_parallel_allreduce or\ + args.sequence_parallel: + input_parallel = input_ + model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 + async_grad_allreduce = args.async_tensor_model_parallel_allreduce and \ + model_parallel and not args.sequence_parallel + else: + input_parallel = tensor_parallel.copy_to_tensor_model_parallel_region(input_) + async_grad_allreduce = False + + # Matrix multiply. + logits_parallel = tensor_parallel.linear_with_grad_accumulation_and_async_allreduce( + input=input_parallel, + weight=word_embeddings_weight, + bias=bias, + gradient_accumulation_fusion=args.gradient_accumulation_fusion, + async_grad_allreduce=async_grad_allreduce, + sequence_parallel=args.sequence_parallel) + # Gather if needed. + + if parallel_output: + return logits_parallel + + return tensor_parallel.gather_from_tensor_model_parallel_region(logits_parallel) + + +def get_language_model(config, num_tokentypes, add_pooler, + encoder_attn_mask_type, + add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + pre_process=True, post_process=True): + """Build language model and return along with the key to save.""" + args = get_args() + if config.init_method is None: + config.init_method = init_method_normal(config.init_method_std) + + if config.output_layer_init_method is None: + config.output_layer_init_method = scaled_init_method_normal(config.init_method_std, + config.num_layers) + + # Language model. + language_model = TransformerLanguageModel( + config, + encoder_attn_mask_type, + num_tokentypes=num_tokentypes, + add_encoder=add_encoder, + add_decoder=add_decoder, + decoder_attn_mask_type=decoder_attn_mask_type, + add_pooler=add_pooler, + pre_process=pre_process, + post_process=post_process + ) + # key used for checkpoints. + language_model_key = 'language_model' + + return language_model, language_model_key + + +class Pooler(MegatronModule): + """Pooler layer. + + Pool hidden states of a specific token (for example start of the + sequence) and add a linear transformation followed by a tanh. 
+ + Arguments: + hidden_size: hidden size + init_method: weight initialization method for the linear layer. + bias is set to zero. + """ + + def __init__(self, hidden_size, init_method): + super(Pooler, self).__init__() + args = get_args() + self.dense = get_linear_layer(hidden_size, hidden_size, init_method) + self.sequence_parallel = args.sequence_parallel + + + def forward(self, hidden_states, sequence_index=0): + # hidden_states: [s, b, h] + # sequence_index: index of the token to pool. + + # gather data along sequence dimensions + # same pooler is run on all tensor parallel nodes + if self.sequence_parallel: + hidden_states = tensor_parallel.gather_from_sequence_parallel_region( + hidden_states, + tensor_parallel_output_grad=False) + + pooled = hidden_states[sequence_index, :, :] + pooled = self.dense(pooled) + pooled = torch.tanh(pooled) + return pooled + + +class Embedding(MegatronModule): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + embedding_weights_in_fp32: casts word embedding weights to + fp32 before sampling. Required to + maintain reproducibility when + training in bf16. + """ + + def __init__(self, + hidden_size, + vocab_size, + max_sequence_length, + embedding_dropout_prob, + config, + num_tokentypes=0, + embedding_weights_in_fp32=False): + super(Embedding, self).__init__() + + self.hidden_size = hidden_size + self.init_method = config.init_method + self.num_tokentypes = num_tokentypes + + args = get_args() + + # Word embeddings (parallel). + self.embedding_weights_in_fp32 = embedding_weights_in_fp32 + self.params_dtype = args.params_dtype + self.word_embeddings = tensor_parallel.VocabParallelEmbedding( + vocab_size, self.hidden_size, config=config, init_method=config.init_method) + self._word_embeddings_key = 'word_embeddings' + + # Position embedding (serial). + self.add_position_embedding = args.position_embedding_type == 'learned_absolute' + if self.add_position_embedding: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, self.hidden_size) + self._position_embeddings_key = 'position_embeddings' + # Initialize the position embeddings. + if args.perform_initialization: + self.init_method(self.position_embeddings.weight) + + # Token type embedding. + # Add this as an optional field that can be added through + # method call so we can load a pretrain model without + # token types and add them as needed. + self._tokentype_embeddings_key = 'tokentype_embeddings' + if self.num_tokentypes > 0: + self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. 
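+ # Token-type embeddings are the BERT-style segment embeddings; GPT-style
+ # models construct Embedding with num_tokentypes=0 and never reach this
+ # branch, leaving tokentype_embeddings as None.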
+ if args.perform_initialization:
+ self.init_method(self.tokentype_embeddings.weight)
+ else:
+ self.tokentype_embeddings = None
+
+ self.fp32_residual_connection = args.fp32_residual_connection
+ self.sequence_parallel = args.sequence_parallel
+ # Embeddings dropout
+ self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+
+ def zero_parameters(self):
+ """Zero out all parameters in embedding."""
+ self.word_embeddings.weight.data.fill_(0)
+ self.word_embeddings.weight.shared = True
+ if self.add_position_embedding:
+ self.position_embeddings.weight.data.fill_(0)
+ self.position_embeddings.weight.shared = True
+ if self.num_tokentypes > 0:
+ self.tokentype_embeddings.weight.data.fill_(0)
+ self.tokentype_embeddings.weight.shared = True
+
+ def add_tokentype_embeddings(self, num_tokentypes):
+ """Add token-type embedding. This function is provided so we can add
+ token-type embeddings in case the pretrained model does not have it.
+ This allows us to load the model normally and then add this embedding.
+ """
+ if self.tokentype_embeddings is not None:
+ raise Exception('tokentype embeddings are already initialized')
+ if torch.distributed.get_rank() == 0:
+ print('adding embedding for {} tokentypes'.format(num_tokentypes),
+ flush=True)
+ self.num_tokentypes = num_tokentypes
+ self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
+ self.hidden_size)
+ # Initialize the token-type embeddings.
+ args = get_args()
+ self.init_method(self.tokentype_embeddings.weight)
+
+ def forward(self, input_ids, position_ids, tokentype_ids=None):
+ # Embeddings.
+ if self.embedding_weights_in_fp32:
+ self.word_embeddings = self.word_embeddings.to(torch.float32)
+ words_embeddings = self.word_embeddings(input_ids)
+ if self.embedding_weights_in_fp32:
+ words_embeddings = words_embeddings.to(self.params_dtype)
+ self.word_embeddings = self.word_embeddings.to(self.params_dtype)
+ if self.add_position_embedding:
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings = words_embeddings + position_embeddings
+ else:
+ embeddings = words_embeddings
+
+ if tokentype_ids is not None:
+ assert self.tokentype_embeddings is not None
+ embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
+ else:
+ assert self.tokentype_embeddings is None
+
+ # Data format change to avoid explicit transposes : [b s h] --> [s b h].
+ embeddings = embeddings.transpose(0, 1).contiguous()
+
+ # If the input flag for fp32 residual connection is set, convert to float.
+ if self.fp32_residual_connection:
+ embeddings = embeddings.float()
+
+ # Dropout.
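+ # With sequence parallelism the activations are first scattered along the
+ # sequence dimension, and dropout runs under the tensor-parallel RNG
+ # tracker so each rank draws an independent but reproducible mask for its
+ # own sequence shard.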
+ if self.sequence_parallel: + embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) + with tensor_parallel.get_cuda_rng_tracker().fork(): + embeddings = self.embedding_dropout(embeddings) + else: + embeddings = self.embedding_dropout(embeddings) + + return embeddings + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._word_embeddings_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.add_position_embedding: + state_dict_[self._position_embeddings_key] \ + = self.position_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.num_tokentypes > 0: + state_dict_[self._tokentype_embeddings_key] \ + = self.tokentype_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Word embedding. + if self._word_embeddings_key in state_dict: + state_dict_ = state_dict[self._word_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'word_embeddings' in key: + state_dict_[key.split('word_embeddings.')[1]] \ + = state_dict[key] + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + + # Position embedding. + if self.add_position_embedding: + if self._position_embeddings_key in state_dict: + state_dict_ = state_dict[self._position_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'position_embeddings' in key: + state_dict_[key.split('position_embeddings.')[1]] \ + = state_dict[key] + self.position_embeddings.load_state_dict(state_dict_, strict=strict) + + # Tokentype embedding. + if self.num_tokentypes > 0: + state_dict_ = {} + if self._tokentype_embeddings_key in state_dict: + state_dict_ = state_dict[self._tokentype_embeddings_key] + else: + # for backward compatibility. + for key in state_dict.keys(): + if 'tokentype_embeddings' in key: + state_dict_[key.split('tokentype_embeddings.')[1]] \ + = state_dict[key] + if len(state_dict_.keys()) > 0: + self.tokentype_embeddings.load_state_dict(state_dict_, + strict=strict) + else: + print('***WARNING*** expected tokentype embeddings in the ' + 'checkpoint but could not find it', flush=True) + + +class TransformerLanguageModel(MegatronModule): + """Transformer language model. + + Arguments: + transformer_hparams: transformer hyperparameters + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + config, + encoder_attn_mask_type, + num_tokentypes=0, + add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + add_pooler=False, + pre_process=True, + post_process=True): + args = get_args() + # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5. 
+ if args.untie_embeddings_and_output_weights: assert not add_decoder + super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) + + self.pre_process = pre_process + self.post_process = post_process + self.hidden_size = config.hidden_size + self.num_tokentypes = num_tokentypes + self.init_method = config.init_method + self.add_encoder = add_encoder + self.encoder_attn_mask_type = encoder_attn_mask_type + self.add_decoder = add_decoder + self.decoder_attn_mask_type = decoder_attn_mask_type + self.add_pooler = add_pooler + self.encoder_hidden_state = None + self.add_retriever = args.retro_add_retriever + self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + + # Embeddings. + if self.pre_process: + self.embedding = Embedding(self.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + self.num_tokentypes, + args.embedding_weights_in_fp32) + self._embedding_key = 'embedding' + + # Rotary positional embeddings + self.use_rotary_position_embeddings = \ + args.position_embedding_type == 'rope' + if self.use_rotary_position_embeddings: + self.seq_length = args.seq_length + rotary_dim = args.hidden_size // args.num_attention_heads \ + if args.kv_channels is None else args.kv_channels + + if args.rotary_percent < 1.0: + rotary_dim = int(rotary_dim * args.rotary_percent) + + # partial rotary embeddings, which is better than full rotary + # Wang and Komatsuzaki et al + # https://github.com/kingoflolz/mesh-transformer-jax/ + self.rotary_pos_emb = RotaryEmbedding( + rotary_dim, + seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor + ) + + # Encoder (usually set to True, False if part of an encoder-decoder + # architecture and in encoder-only stage). + if self.add_encoder: + self.encoder = ParallelTransformer( + config, + model_type=args.model_type if not args.retro_add_retriever \ + else ModelType.retro_decoder, + self_attn_mask_type=self.encoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process, + ) + self._encoder_key = 'encoder' + else: + self.encoder = None + + # Decoder (usually set to False, True if part of an encoder-decoder + # architecture and in decoder-only stage). + if self.add_decoder: + self.decoder = ParallelTransformer( + config, + model_type=args.model_type, + layer_type=LayerType.decoder, + self_attn_mask_type=self.decoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process) + self._decoder_key = 'decoder' + else: + self.decoder = None + + if self.post_process: + # Pooler. + if self.add_pooler: + self.pooler = Pooler(self.hidden_size, self.init_method) + self._pooler_key = 'pooler' + + if self.untie_embeddings_and_output_weights: + self.output_layer = tensor_parallel.ColumnParallelLinear( + args.hidden_size, + args.padded_vocab_size, + config=config, + init_method=self.init_method, + bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias. 
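+ # With untied weights, parallel_lm_logits in gpt_model.py is given this
+ # layer's weight instead of the shared word-embedding matrix when
+ # projecting hidden states onto the vocabulary.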
+ self._output_layer_key = 'output_layer' + + def set_input_tensor(self, input_tensor): + """ See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + if self.add_encoder and self.add_decoder: + assert len(input_tensor) == 1, \ + 'input_tensor should only be length 1 for stage with both encoder and decoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + assert len(input_tensor) == 1, \ + 'input_tensor should only be length 1 for stage with only encoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_decoder: + if len(input_tensor) == 2: + self.decoder.set_input_tensor(input_tensor[0]) + self.encoder_hidden_state = input_tensor[1] + elif len(input_tensor) == 1: + self.decoder.set_input_tensor(None) + self.encoder_hidden_state = input_tensor[0] + else: + raise Exception('input_tensor must have either length 1 or 2') + else: + raise Exception('Stage must have at least either encoder or decoder') + + def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, + dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, + retriever_input_ids=None, + retriever_position_ids=None, + retriever_attn_mask=None, + enc_dec_attn_mask=None, tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, output_enc_hidden=False): + + # Encoder embedding. + if self.pre_process: + encoder_input = self.embedding(enc_input_ids, enc_position_ids, + tokentype_ids=tokentype_ids) + else: + encoder_input = None + + # Retriever embedding. + if self.add_retriever and self.pre_process: + retriever_input = self.embedding(retriever_input_ids, + retriever_position_ids, + tokentype_ids=tokentype_ids) + else: + retriever_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.use_rotary_position_embeddings: + if inference_params is not None: + rotary_pos_emb = \ + self.rotary_pos_emb(inference_params.max_sequence_length) + else: + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + + # Run encoder. + if enc_hidden_states is None: + if self.encoder is not None: + encoder_output = self.encoder( + encoder_input, + enc_attn_mask, + retriever_input=retriever_input, + retriever_attn_mask=retriever_attn_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb) + else: + encoder_output = self.encoder_hidden_state + else: + encoder_output = enc_hidden_states.to(encoder_input.dtype) + + if self.post_process: + if self.add_pooler: + pooled_output = self.pooler(encoder_output, + pooling_sequence_index) + + # output_enc_hidden refers to when we just need the encoder's + # output. For example, it is helpful to compute + # similarity between two sequences by average pooling + if not self.add_decoder or output_enc_hidden: + if self.add_pooler and self.post_process: + return encoder_output, pooled_output + else: + return encoder_output + + # Decoder embedding. + if self.pre_process: + decoder_input = self.embedding(dec_input_ids, + dec_position_ids) + else: + decoder_input = None + + # Run decoder. 
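+ # This decoder path is only reached for encoder-decoder configurations
+ # (add_decoder=True, e.g. T5); decoder-only GPT models have already
+ # returned from the encoder branch above.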
+ decoder_output = self.decoder( + decoder_input, + dec_attn_mask, + encoder_output=encoder_output, + enc_dec_attn_mask=enc_dec_attn_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb) + + if self.add_pooler and self.post_process: + return decoder_output, encoder_output, pooled_output + else: + return decoder_output, encoder_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + if self.pre_process: + state_dict_[self._embedding_key] \ + = self.embedding.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.add_encoder: + state_dict_[self._encoder_key] \ + = self.encoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + if self.add_pooler: + state_dict_[self._pooler_key] \ + = self.pooler.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.untie_embeddings_and_output_weights: + state_dict_[self._output_layer_key] \ + = self.output_layer.state_dict(prefix=prefix, keep_vars=keep_vars) + + if self.add_decoder: + state_dict_[self._decoder_key] \ + = self.decoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Embedding. + if self.pre_process: + if self._embedding_key in state_dict: + state_dict_ = state_dict[self._embedding_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.embedding.load_state_dict(state_dict_, strict=strict) + + # Encoder. + if self.add_encoder: + if self._encoder_key in state_dict: + state_dict_ = state_dict[self._encoder_key] + # For backward compatibility. + elif 'transformer' in state_dict: + state_dict_ = state_dict['transformer'] + else: + # For backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'transformer.' in key: + state_dict_[key.split('transformer.')[1]] = state_dict[key] + + # For backward compatibility. + state_dict_self_attention = {} + for key in state_dict_.keys(): + if '.attention.' in key: + state_dict_self_attention[key.replace(".attention.", + ".self_attention.")] = state_dict_[key] + else: + state_dict_self_attention[key] = state_dict_[key] + state_dict_ = state_dict_self_attention + + self.encoder.load_state_dict(state_dict_, strict=strict) + + # Pooler. + if self.post_process: + if self.add_pooler: + assert 'pooler' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.pooler.load_state_dict(state_dict[self._pooler_key], + strict=strict) + if self.untie_embeddings_and_output_weights: + assert 'output_layer' in state_dict, \ + 'could not find data for output_layer in the checkpoint' + self.output_layer.load_state_dict(state_dict[self._output_layer_key], + strict=strict) + # Decoder. + if self.add_decoder: + assert 'decoder' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.decoder.load_state_dict(state_dict[self._decoder_key], + strict=strict) diff --git a/training/DeepSpeed-Domino/megatron/model/module.py b/training/DeepSpeed-Domino/megatron/model/module.py new file mode 100644 index 000000000..c2887315a --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/module.py @@ -0,0 +1,197 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ +"""Megatron Module""" + +import torch +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from megatron import get_args +from megatron.core import mpu, tensor_parallel + + +_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) + + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + + +class MegatronModule(torch.nn.Module): + """Megatron specific extensions of torch Module with support + for pipelining.""" + + def __init__(self, config=None, share_embeddings_and_output_weights=True): + super(MegatronModule, self).__init__() + self.config = config + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Use this function to override the state dict for + saving checkpoints.""" + return self.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def shared_embedding_or_output_weight(self): + if self.pre_process: + return self.language_model.embedding.word_embeddings.weight + else: + if not self.share_embeddings_and_output_weights: + raise Exception('shared_embedding_or_output_weight() called for last ' + 'stage, but share_embeddings_and_output_weights is false') + return self.word_embeddings.weight + + + def initialize_word_embeddings(self): + args = get_args() + if not self.share_embeddings_and_output_weights: + raise Exception('initialize_word_embeddings() was called but ' + 'share_embeddings_and_output_weights is false') + + # This function just initializes the word embeddings in the final stage + # when we are using pipeline parallelism. Nothing to do if we aren't + # using pipeline parallelism. + if args.pipeline_model_parallel_size == 1: + return + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + if mpu.is_pipeline_last_stage() and not self.pre_process: + assert not mpu.is_pipeline_first_stage() + self._word_embeddings_for_head_key = 'word_embeddings_for_head' + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.word_embeddings = tensor_parallel.VocabParallelEmbedding( + args.padded_vocab_size, self.config.hidden_size, + config=self.config, init_method=self.config.init_method) + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + + # Zero out initial weights for decoder embedding. + # NOTE: We don't currently support T5 with the interleaved schedule. + if not mpu.is_pipeline_first_stage(ignore_virtual=True) and \ + self.pre_process: + self.language_model.embedding.zero_parameters() + + if not torch.distributed.is_initialized(): + if not getattr(MegatronModule, "embedding_warning_printed", False): + print("WARNING! 
Distributed processes aren't initialized, so " + "word embeddings in the last layer are not initialized. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong.") + MegatronModule.embedding_warning_printed = True + return + + # Ensure that first and last stages have the same initial parameter + # values. + if mpu.is_rank_in_embedding_group(): + torch.distributed.all_reduce(self.shared_embedding_or_output_weight().data, + group=mpu.get_embedding_group()) + + # Ensure that encoder(first stage) and decoder(split stage) position + # embeddings have the same initial parameter values + # NOTE: We don't currently support T5 with the interleaved schedule. + if mpu.is_rank_in_position_embedding_group() and \ + args.pipeline_model_parallel_split_rank is not None: + # TODO: Support tokentype embedding. + self.language_model.embedding.cuda() + position_embeddings = self.language_model.embedding.position_embeddings + torch.distributed.all_reduce(position_embeddings.weight.data, + group=mpu.get_position_embedding_group()) + + +def conversion_helper(val, conversion): + """Apply conversion to val. Recursively apply conversion if `val` + #is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_float16(val, float16_convertor): + """Convert fp32 `val` to fp16/bf16""" + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, _FLOAT_TYPES): + val = float16_convertor(val) + return val + return conversion_helper(val, half_conversion) + + +def float16_to_fp32(val): + """Convert fp16/bf16 `val` to fp32""" + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): + val = val.float() + return val + return conversion_helper(val, float_conversion) + + + +class Float16Module(MegatronModule): + + def __init__(self, module, args): + super(Float16Module, self).__init__() + + if args.fp16: + self.add_module('module', module.half()) + def float16_convertor(val): + return val.half() + elif args.bf16: + self.add_module('module', module.bfloat16()) + def float16_convertor(val): + return val.bfloat16() + else: + raise Exception('should not be here') + + self.float16_convertor = float16_convertor + + + def set_input_tensor(self, input_tensor): + return self.module.set_input_tensor(input_tensor) + + + def forward(self, *inputs, **kwargs): + if mpu.is_pipeline_first_stage(): + inputs = fp32_to_float16(inputs, self.float16_convertor) + outputs = self.module(*inputs, **kwargs) + if mpu.is_pipeline_last_stage(): + outputs = float16_to_fp32(outputs) + return outputs + + + def state_dict(self, prefix='', keep_vars=False): + return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + return self.module.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) diff --git a/training/DeepSpeed-Domino/megatron/model/multiple_choice.py b/training/DeepSpeed-Domino/megatron/model/multiple_choice.py new file mode 100644 index 
000000000..41f8bb49f
--- /dev/null
+++ b/training/DeepSpeed-Domino/megatron/model/multiple_choice.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+"""Multiple choice model."""
+
+import torch
+
+from megatron import get_args, print_rank_last
+from megatron.model.enums import AttnMaskType
+from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
+from megatron.model.language_model import get_language_model
+from megatron.model.utils import get_linear_layer
+from megatron.model.utils import init_method_normal
+from megatron.model.utils import scaled_init_method_normal
+from .module import MegatronModule
+
+
+class MultipleChoice(MegatronModule):
+
+ def __init__(self,
+ config,
+ num_tokentypes=2,
+ pre_process=True,
+ post_process=True):
+ super(MultipleChoice, self).__init__(share_embeddings_and_output_weights=False)
+ args = get_args()
+
+ self.pre_process = pre_process
+ self.post_process = post_process
+
+ self.language_model, self._language_model_key = get_language_model(
+ config=config,
+ num_tokentypes=num_tokentypes,
+ add_pooler=True,
+ encoder_attn_mask_type=AttnMaskType.padding,
+ pre_process=self.pre_process,
+ post_process=self.post_process)
+
+ # Multi-choice head.
+ if self.post_process:
+ self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
+ self.multichoice_head = get_linear_layer(args.hidden_size, 1,
+ init_method_normal(args.init_method_std))
+ self._multichoice_head_key = 'multichoice_head'
+
+ def set_input_tensor(self, input_tensor):
+ """See megatron.model.transformer.set_input_tensor()"""
+ self.language_model.set_input_tensor(input_tensor)
+
+ def forward(self, model_input, attention_mask, tokentype_ids=None):
+
+ # [batch, choices, sequence] --> [batch * choices, sequence] -->
+ # transformer --> [batch, choices] --> softmax
+
+ # Ensure the shape is [batch-size, choices, sequence]
+ assert len(attention_mask.shape) == 3
+ num_choices = attention_mask.shape[1]
+
+ # Reshape and treat choice dimension the same as batch.
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1))
+ extended_attention_mask = bert_extended_attention_mask(attention_mask)
+
+ input_ids = model_input
+ # Do the same as attention_mask for input_ids, tokentype_ids
+ assert len(input_ids.shape) == 3
+ assert len(tokentype_ids.shape) == 3
+ input_ids = input_ids.view(-1, input_ids.size(-1))
+ tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))
+ position_ids = bert_position_ids(input_ids)
+
+ lm_output = self.language_model(
+ input_ids,
+ position_ids,
+ extended_attention_mask,
+ tokentype_ids=tokentype_ids
+ )
+ if self.post_process:
+ _, pooled_output = lm_output
+ multichoice_output = self.multichoice_dropout(pooled_output)
+ multichoice_logits = self.multichoice_head(multichoice_output)
+
+ # Reshape back to separate choices.
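+ # Illustrative shapes, assuming a hypothetical batch of 8 questions with
+ # 4 answer choices each: attention_mask [8, 4, seq] is flattened to
+ # [32, seq] above, the head produces [32, 1], and the view below yields
+ # [8, 4] logits.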
+ multichoice_logits = multichoice_logits.view(-1, num_choices) + + return multichoice_logits + return lm_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + state_dict_[self._multichoice_head_key] \ + = self.multichoice_head.state_dict(prefix=prefix, keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + if self.post_process: + if self._multichoice_head_key in state_dict: + self.multichoice_head.load_state_dict( + state_dict[self._multichoice_head_key], strict=strict) + else: + print_rank_last('***WARNING*** could not find {} in the checkpoint, ' + 'initializing to random'.format( + self._multichoice_head_key)) diff --git a/training/DeepSpeed-Domino/megatron/model/realm_model.py b/training/DeepSpeed-Domino/megatron/model/realm_model.py new file mode 100644 index 000000000..654f2992f --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/realm_model.py @@ -0,0 +1,204 @@ +import os +import torch + +from megatron import get_args, print_rank_0 +from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name +from megatron.model import BertModel +from .module import MegatronModule +from megatron.core import mpu +from megatron.model.enums import AttnMaskType +from megatron.model.utils import get_linear_layer +from megatron.model.utils import init_method_normal +from megatron.model.language_model import get_language_model +from megatron.model.utils import scaled_init_method_normal +from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids + + +def general_ict_model_provider(only_query_model=False, only_block_model=False): + """Build the model.""" + args = get_args() + assert args.ict_head_size is not None, \ + "Need to specify --ict-head-size to provide an ICTBertModel" + assert mpu.get_tensor_model_parallel_world_size() == 1 and mpu.get_pipeline_model_parallel_world_size() == 1, \ + "Model parallel size > 1 not supported for ICT" + + print_rank_0('building ICTBertModel...') + + # simpler to just keep using 2 tokentypes since the LM we initialize with has 2 tokentypes + model = ICTBertModel( + ict_head_size=args.ict_head_size, + num_tokentypes=2, + parallel_output=True, + only_query_model=only_query_model, + only_block_model=only_block_model) + + return model + + +class ICTBertModel(MegatronModule): + """Bert-based module for Inverse Cloze task.""" + def __init__(self, + ict_head_size, + num_tokentypes=1, + parallel_output=True, + only_query_model=False, + only_block_model=False): + super(ICTBertModel, self).__init__() + bert_kwargs = dict( + ict_head_size=ict_head_size, + num_tokentypes=num_tokentypes, + parallel_output=parallel_output + ) + assert not (only_block_model and only_query_model) + self.use_block_model = not only_query_model + self.use_query_model = not only_block_model + + if self.use_query_model: + # this model embeds (pseudo-)queries - Embed_input in the paper + self.query_model = IREncoderBertModel(**bert_kwargs) + self._query_key = 'question_model' + + if self.use_block_model: + # this model embeds evidence blocks - Embed_doc in the paper + self.block_model = 
IREncoderBertModel(**bert_kwargs) + self._block_key = 'context_model' + + def forward(self, query_tokens, query_attention_mask, block_tokens, block_attention_mask): + """Run a forward pass for each of the models and return the respective embeddings.""" + query_logits = self.embed_query(query_tokens, query_attention_mask) + block_logits = self.embed_block(block_tokens, block_attention_mask) + return query_logits, block_logits + + def embed_query(self, query_tokens, query_attention_mask): + """Embed a batch of tokens using the query model""" + if self.use_query_model: + query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0) + query_ict_logits, _ = self.query_model.forward(query_tokens, query_attention_mask, query_types) + return query_ict_logits + else: + raise ValueError("Cannot embed query without query model.") + + def embed_block(self, block_tokens, block_attention_mask): + """Embed a batch of tokens using the block model""" + if self.use_block_model: + block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0) + block_ict_logits, _ = self.block_model.forward(block_tokens, block_attention_mask, block_types) + return block_ict_logits + else: + raise ValueError("Cannot embed block without block model.") + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Save dict with state dicts of each of the models.""" + state_dict_ = {} + if self.use_query_model: + state_dict_[self._query_key] \ + = self.query_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + if self.use_block_model: + state_dict_[self._block_key] \ + = self.block_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Load the state dicts of each of the models""" + if self.use_query_model: + print("Loading ICT query model", flush=True) + self.query_model.load_state_dict( + state_dict[self._query_key], strict=strict) + + if self.use_block_model: + print("Loading ICT block model", flush=True) + self.block_model.load_state_dict( + state_dict[self._block_key], strict=strict) + + def init_state_dict_from_bert(self): + """Initialize the state from a pretrained BERT model on iteration zero of ICT pretraining""" + args = get_args() + tracker_filename = get_checkpoint_tracker_filename(args.bert_load) + if not os.path.isfile(tracker_filename): + raise FileNotFoundError("Could not find BERT load for ICT") + with open(tracker_filename, 'r') as f: + iteration = int(f.read().strip()) + assert iteration > 0 + + checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False) + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + try: + state_dict = torch.load(checkpoint_name, map_location='cpu') + except BaseException: + raise ValueError("Could not load checkpoint") + + # load the LM state dict into each model + model_dict = state_dict['model']['language_model'] + self.query_model.language_model.load_state_dict(model_dict) + self.block_model.language_model.load_state_dict(model_dict) + + # give each model the same ict_head to begin with as well + query_ict_head_state_dict = self.state_dict_for_save_checkpoint()[self._query_key]['ict_head'] + self.block_model.ict_head.load_state_dict(query_ict_head_state_dict) + + +class IREncoderBertModel(MegatronModule): + """BERT-based encoder for queries or blocks used for learned information retrieval.""" + def __init__(self, 
ict_head_size, num_tokentypes=2, parallel_output=True): + super(IREncoderBertModel, self).__init__() + args = get_args() + + self.ict_head_size = ict_head_size + self.parallel_output = parallel_output + init_method = init_method_normal(args.init_method_std) + scaled_init_method = scaled_init_method_normal(args.init_method_std, + args.num_layers) + + self.language_model, self._language_model_key = get_language_model( + num_tokentypes=num_tokentypes, + add_pooler=True, + encoder_attn_mask_type=AttnMaskType.padding, + init_method=init_method, + scaled_init_method=scaled_init_method) + + self.ict_head = get_linear_layer(args.hidden_size, ict_head_size, init_method) + self._ict_head_key = 'ict_head' + + def forward(self, input_ids, attention_mask, tokentype_ids=None): + extended_attention_mask = bert_extended_attention_mask( + attention_mask, next(self.language_model.parameters()).dtype) + position_ids = bert_position_ids(input_ids) + + lm_output, pooled_output = self.language_model( + input_ids, + position_ids, + extended_attention_mask, + tokentype_ids=tokentype_ids) + + # Output. + ict_logits = self.ict_head(pooled_output) + return ict_logits, None + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + state_dict_[self._ict_head_key] \ + = self.ict_head.state_dict(prefix=prefix, + keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + self.ict_head.load_state_dict( + state_dict[self._ict_head_key], strict=strict) + + diff --git a/training/DeepSpeed-Domino/megatron/model/t5_model.py b/training/DeepSpeed-Domino/megatron/model/t5_model.py new file mode 100644 index 000000000..f9fabd340 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/t5_model.py @@ -0,0 +1,186 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""T5 model.""" + +import torch + +from megatron import get_args +from megatron.core import tensor_parallel +from megatron.model.enums import AttnMaskType +from megatron.model.language_model import parallel_lm_logits, get_language_model +from megatron.model import LayerNorm +from megatron.model.utils import ( + openai_gelu, + get_linear_layer +) +from .module import MegatronModule + + +def t5_extended_attention_mask(attention_mask_list): + + def attn_mask_postprocess(attn_mask): + # [b, 1, s, s] + extended_attention_mask = attn_mask.unsqueeze(1) + return extended_attention_mask + + return [attn_mask_postprocess(attn_mask) for attn_mask in attention_mask_list] + + +def t5_position_ids(token_ids): + # Create position ids + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, + device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids + + +class T5LMHead(MegatronModule): + """Masked LM head for T5 + + Arguments: + mpu_vocab_size: model parallel size of vocabulary. + parallel_output: wether output logits being distributed or not. 
+ """ + + def __init__(self, mpu_vocab_size, parallel_output): + super(T5LMHead, self).__init__() + + self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) + self.bias.model_parallel = True + self.bias.partition_dim = 0 + self.bias.stride = 1 + self.parallel_output = parallel_output + + def forward(self, hidden_states, word_embeddings_weight): + output = parallel_lm_logits(hidden_states, + word_embeddings_weight, + self.parallel_output, + bias=self.bias) + return output + + +class T5Model(MegatronModule): + """T5 Language model.""" + + def __init__(self, + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True): + super().__init__(config=config) + args = get_args() + + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.add_encoder = add_encoder + self.add_decoder = add_decoder + + self.language_model, self._language_model_key = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + add_pooler=False, + add_encoder=add_encoder, + add_decoder=add_decoder, + encoder_attn_mask_type=AttnMaskType.padding, + pre_process=self.pre_process, + post_process=self.post_process) + + self.initialize_word_embeddings() + + if self.post_process and self.add_decoder: + self.lm_head = T5LMHead( + self.shared_embedding_or_output_weight().size(0), + parallel_output) + self._lm_head_key = 'lm_head' + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask, + decoder_attn_mask, encoder_decoder_attn_mask, + tokentype_ids=None, lm_labels=None, enc_hidden_states=None): + + # Converting the attention masks to proper parameter settings + encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask = t5_extended_attention_mask( + [encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask]) + + encoder_position_ids = t5_position_ids(encoder_input_ids) + decoder_position_ids = t5_position_ids(decoder_input_ids) + + lm_output = self.language_model(encoder_input_ids, + encoder_position_ids, + encoder_attn_mask, + decoder_input_ids, + decoder_position_ids, + decoder_attn_mask, + encoder_decoder_attn_mask, + tokentype_ids=tokentype_ids, + enc_hidden_states=enc_hidden_states) + + if self.post_process and self.add_decoder: + decoder_output, encoder_output = lm_output + # Output. 
[s, b, h] + lm_logits = self.lm_head(decoder_output, + self.shared_embedding_or_output_weight()) + + if lm_labels is None: + # [s b h] => [b s h] + return lm_logits.transpose(0,1).contiguous() + else: + # [b s] => [s b] + lm_labels = lm_labels.transpose(0,1).contiguous() + if self.fp16_lm_cross_entropy: + assert lm_logits.dtype == torch.half + lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits, lm_labels) + else: + lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits.float(), + lm_labels) + # [s b] => [b s] + lm_loss = lm_loss.transpose(0,1).contiguous() + return lm_loss + elif self.add_decoder and not self.add_encoder: + decoder_output, encoder_output = lm_output + return decoder_output + else: + encoder_output = lm_output + return encoder_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load when model is combined with other heads, + add an extra key.""" + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process and self.add_decoder: + state_dict_[self._lm_head_key] \ + = self.lm_head.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + # Save word_embeddings. + if self.post_process and not self.pre_process and self.add_decoder: + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + self.language_model.load_state_dict( + state_dict[self._language_model_key], strict=strict) + if self.post_process and self.add_decoder: + self.lm_head.load_state_dict(state_dict[self._lm_head_key], + strict=strict) + # Load word embeddings. + if self.post_process and not self.pre_process and self.add_decoder: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) diff --git a/training/DeepSpeed-Domino/megatron/model/transformer.py b/training/DeepSpeed-Domino/megatron/model/transformer.py new file mode 100644 index 000000000..a7898156f --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/transformer.py @@ -0,0 +1,1710 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Transformer.""" +from contextlib import nullcontext +import math +import numpy as np +import torch +import torch.nn.functional as F +from typing import Optional + +from megatron import get_timers, get_args, get_retro_args, core, get_num_microbatches +from .module import MegatronModule +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.model import LayerNorm +from megatron.model.enums import AttnMaskType, LayerType, AttnType +from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.model.fused_bias_gelu import bias_gelu_impl +from megatron.core.models.common.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu + +try: + from einops import rearrange +except ImportError: + rearrange = None + +try: + from flash_attn.flash_attn_interface import flash_attn_unpadded_func +except ImportError: + try: + from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func + except ImportError: + flash_attn_unpadded_func = None + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters +""" + +class DropPath(MegatronModule): + """Drop paths (Stochastic Depth) per sample + (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=0.): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_state): + if self.drop_prob == 0. or not self.training: + return hidden_state + keep_prob = 1 - self.drop_prob + # work with diff dim tensors, not just 2D ConvNets + # hidden_state: [s, b, h] + shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2) + random_tensor = keep_prob + \ + torch.rand(shape, dtype=hidden_state.dtype, device=hidden_state.device) + random_tensor.floor_() # binarize + output = hidden_state.div(keep_prob) * random_tensor + return output + +class ParallelMLP(MegatronModule): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config): + super(ParallelMLP, self).__init__() + args = get_args() + + self.add_bias = config.add_bias_linear + + ffn_hidden_size = config.ffn_hidden_size + if config.gated_linear_unit: + ffn_hidden_size *= 2 + + # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + ffn_hidden_size, + config=config, + init_method=config.init_method, + bias=self.add_bias, + gather_output=False, + skip_bias_add=True, + ) + + self.bias_gelu_fusion = False + self.activation_func = None + self.swiglu = args.swiglu + + if args.openai_gelu: + self.activation_func = openai_gelu + elif args.onnx_safe: + self.activation_func = erf_gelu + elif args.swiglu: + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + self.activation_func = swiglu + elif args.squared_relu: + def squared_relu(x): + return torch.pow(F.relu(x), 2) + self.activation_func = squared_relu + else: + self.bias_gelu_fusion = args.bias_gelu_fusion + self.activation_func = F.gelu + + # Project back to h. + self.dense_4h_to_h = tensor_parallel.RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + config=config, + init_method=config.output_layer_init_method, + bias=self.add_bias, + input_is_parallel=True + ) + + def forward(self, hidden_states): + + # [s, b, 4hp] + intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) + + if self.bias_gelu_fusion: + assert self.add_bias is True + assert self.activation_func == F.gelu + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + if bias_parallel is not None: + intermediate_parallel = intermediate_parallel + bias_parallel + intermediate_parallel = self.activation_func(intermediate_parallel) + + # [s, b, h] + output, output_bias = self.dense_4h_to_h(intermediate_parallel) + return output, output_bias + +class SwitchMLP(MegatronModule): + """ + Routes input to one of N MLP "experts" + """ + def __init__(self, config): + super(SwitchMLP, self).__init__() + args = get_args() + self.router = torch.nn.Linear(config.hidden_size, args.num_experts) + self.experts = torch.nn.ModuleList() + for i in range(args.num_experts): + self.experts.append(ParallelMLP(config)) + + def forward(self, hidden_states): + # hidden_states: [s, b, h] + s = hidden_states.size(0) + b = hidden_states.size(1) + h = hidden_states.size(2) + route = self.router(hidden_states) + route = torch.nn.functional.softmax(route, dim=2) + max_prob, max_ind = torch.max(route, dim=2) + max_prob = torch.unsqueeze(max_prob, 2) # [s b 1] + + # TODO (rprenger) TODO this could be made easier to read + # Converting [s, b, h] to [s*b, h]. 
+ # Each vector could be routed differently + hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [s*b h] + max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1] + max_ind = max_ind.view(-1) # [s*b] + + output_total = torch.empty_like(hidden_states) + output_bias_total = torch.empty_like(hidden_states) + #TODO (rprenger) This does each expert in serial, but it could be parallelized + + for expert_num, expert in enumerate(self.experts): + local_indices = (max_ind == expert_num).nonzero() + hidden = hidden_states[local_indices,:] + output, output_bias = expert(hidden) + if output_bias is not None: + output_bias = output_bias.expand_as(output) + output_bias_total[local_indices,:] = output_bias + output_total[local_indices,:] = output + + output_total = output_total*max_prob + output_total = output_total.view(s, b, h) + if output_bias is not None: + output_bias_total = output_bias_total*max_prob + output_bias_total = output_bias_total.view(s, b, h) + else: + output_bias_total = None + + return output_total, output_bias_total + + +class CoreAttention(MegatronModule): + + def __init__(self, layer_number, config, + attn_mask_type=AttnMaskType.padding): + super(CoreAttention, self).__init__() + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.sequence_parallel = config.sequence_parallel + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = core.utils.divide(projection_size, + world_size) + self.hidden_size_per_attention_head = core.utils.divide( + projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = core.utils.divide( + config.num_attention_heads, world_size) + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + self.fp16, self.bf16, + self.attn_mask_type, + config.masked_softmax_fusion, + attention_mask_func, + self.attention_softmax_in_fp32, + coeff) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, + value_layer, attention_mask): + + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.reshape(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor( + (output_size[0]*output_size[1], output_size[2], output_size[3]), + query_layer.dtype, "mpu") + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, alpha=(1.0/self.norm_factor)) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + if not self.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class FlashSelfAttention(torch.nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. 
+ (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, + device=None, dtype=None): + super().__init__() + assert flash_attn_unpadded_func is not None, ('Please install FlashAttention first, ' + 'e.g., with pip install flash-attn') + assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' + self.causal = causal + self.softmax_scale = softmax_scale + self.dropout_p = attention_dropout + + def forward(self, q, k, v): + """Implements the multihead softmax attention. + Arguments + --------- + q, k, v: The tensor containing the query, key, and value. (B, S, H, D) + """ + + assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v))) + assert all((i.is_cuda for i in (q,k,v))) + + batch_size, seqlen_q = q.shape[0], q.shape[1] + seqlen_k = k.shape[1] + + q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] + cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, + device=q.device) + + if self.training: + # during training q,k,v always have same seqlen + assert seqlen_k == seqlen_q + + is_causal = self.causal + cu_seqlens_k = cu_seqlens_q + dropout_p = self.dropout_p + else: + # turn off FA causal mask after first inference autoregressive iteration + # only on first autoregressive step q,k,v have same seqlen + is_causal = seqlen_q == seqlen_k + cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, + device=q.device) + dropout_p = 0 + + output = flash_attn_unpadded_func( + q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, + dropout_p, + softmax_scale=self.softmax_scale, causal=is_causal + ) + + output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) + return output + + +class ParallelAttention(MegatronModule): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. 
+ """ + + def __init__(self, config, layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.padding): + super(ParallelAttention, self).__init__() + args = get_args() + self.layer_number = max(1, layer_number) + self.attention_type = attention_type + self.attn_mask_type = attn_mask_type + self.params_dtype = config.params_dtype + self.sequence_parallel = config.sequence_parallel + + self.group_query_attention = args.group_query_attention + self.num_query_groups = args.num_query_groups + + query_projection_size = config.kv_channels * config.num_attention_heads + if self.group_query_attention: + kv_projection_size = args.kv_channels * args.num_query_groups + else: + kv_projection_size = args.kv_channels * args.num_attention_heads + + self.use_flash_attn = args.use_flash_attn \ + and attention_type == AttnType.self_attn \ + and self.attn_mask_type == AttnMaskType.causal + if self.use_flash_attn: + if flash_attn_unpadded_func is None: + raise ImportError('FlashAttention is not installed, please install with ' + 'pip install flash-attn') + assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' + 'self-attention for now') + assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' + 'supports causal mask for now') + if rearrange is None: + raise ImportError('einops is not installed, please install with pip install einops') + + # Per attention head and per partition values. + world_size = mpu.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = core.utils.divide( + query_projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = core.utils.divide( + config.num_attention_heads, world_size) + + if self.group_query_attention: + if args.num_query_groups % world_size != 0: + raise NotImplementedError('Currently the num_query_groups should be ' + 'a multiple of the tensor parallel size') + self.num_query_groups_per_partition = core.utils.divide( + args.num_query_groups, world_size) + else: + self.num_query_groups_per_partition = self.num_attention_heads_per_partition + + # Strided linear layer. + if attention_type == AttnType.self_attn: + self.query_key_value = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + query_projection_size + 2 * kv_projection_size, + config=config, + init_method=config.init_method, + bias=args.add_bias_linear, + gather_output=False) + else: + assert attention_type == AttnType.cross_attn + + if self.group_query_attention: + raise NotImplementedError("Grouped query attention not implemented for cross-attention.") + assert query_projection_size == kv_projection_size + + self.query = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + query_projection_size, + config=config, + init_method=config.init_method, + bias=config.add_bias_linear, + gather_output=False) + + self.key_value = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + 2 * kv_projection_size, + config=config, + init_method=config.init_method, + bias=config.add_bias_linear, + gather_output=False) + + self.core_attention = CoreAttention(self.layer_number, config, + self.attn_mask_type) + self.checkpoint_core_attention = config.recompute_granularity == 'selective' + + if self.use_flash_attn: + self.core_attention_flash = FlashSelfAttention( + causal=True, attention_dropout=config.attention_dropout + ) + + # Output. 
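+        # Row-parallel projection back to the full hidden size. skip_bias_add=True
+        # defers the bias so it can be fused with dropout and the residual add
+        # in the transformer layer (see bias_dropout_add below).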
+ self.dense = tensor_parallel.RowParallelLinear( + query_projection_size, + config.hidden_size, + config=config, + init_method=config.output_layer_init_method, + bias=args.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True) + + def _checkpointed_attention_forward(self, query_layer, key_layer, + value_layer, attention_mask, + rotary_pos_emb=None): + """Forward method with activation checkpointing.""" + def custom_forward(*inputs): + query_layer = inputs[0] + key_layer = inputs[1] + value_layer = inputs[2] + attention_mask = inputs[3] + output_ = self.core_attention(query_layer, key_layer, + value_layer, attention_mask) + return output_ + + q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \ + else rotary_pos_emb + + hidden_states = tensor_parallel.checkpoint( + custom_forward, + False, query_layer, key_layer, value_layer, attention_mask, + q_pos_emb, k_pos_emb) + + return hidden_states + + def _allocate_memory(self, inference_max_sequence_len, batch_size, num_attention_heads): + return torch.empty( + inference_max_sequence_len, + batch_size, + num_attention_heads, + self.hidden_size_per_attention_head, + dtype=self.params_dtype, + device=torch.cuda.current_device()) + + def forward(self, hidden_states, attention_mask, + encoder_output=None, inference_params=None, + rotary_pos_emb=None): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + is_first_step = False + if inference_params: + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_len = inference_params.max_sequence_length + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size, + self.num_query_groups_per_partition) + inference_value_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size, + self.num_query_groups_per_partition) + + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, inference_value_memory) + is_first_step = True + else: + inference_key_memory, inference_value_memory = \ + inference_params.key_value_memory_dict[self.layer_number] + + # ===================== + # Query, Key, and Value + # ===================== + if self.attention_type == AttnType.self_attn: + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query_layer, + key_layer, + value_layer) = torch.split( + mixed_x_layer, + [ + ( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head + ], + dim=3) + # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] - + query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head) + else: + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + mixed_kv_layer, _ = self.key_value(encoder_output) + + # [sk, b, (np 
* 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head) + mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) + + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key_layer, + value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2) + + # Attention head [sq, b, h] --> [sq, b, hp] + query_layer, _ = self.query(hidden_states) + # [sq, b, hp] --> [sq, b, np, hn] + new_tensor_shape = query_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + query_layer = query_layer.view(*new_tensor_shape) + + # ================================== + # Adjust key and value for inference + # ================================== + + # duplicate the pos_emb for self attention + if rotary_pos_emb is not None: + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = ((rotary_pos_emb,) * 2) + + if inference_params: + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key_layer.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key_layer.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = key_layer + inference_value_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = value_layer + key_layer = inference_key_memory[ + :sequence_end, batch_start:batch_end, ...] + value_layer = inference_value_memory[ + :sequence_end, batch_start:batch_end, ...] + + + # adjust the key rotary positional embedding + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + # need to cross check this condition during inference + # if not set_inference_key_value_memory: + if not is_first_step: + # In inference, we compute one token at a time. + # Select the correct positional embedding + # (only the last token in the sequence) + q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end] + else: + # In the first forward pass of inference, + # we use the entire provided prefix. + # q_pos_emb here has the rope embeddings of the entire + # prefix + to-be-generated output so + # we slice to just the prefix. + q_pos_emb = q_pos_emb[:sequence_end, :, :, :] + k_pos_emb = k_pos_emb[:sequence_end, :, :, :] + rotary_pos_emb = (q_pos_emb, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn] + key_layer = key_layer.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, + dim = 2 + ) + value_layer = value_layer.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, + dim = 2 + ) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb) + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. 
+ # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + if not self.use_flash_attn: + if self.checkpoint_core_attention: + context_layer = self._checkpointed_attention_forward( + query_layer, key_layer, value_layer, attention_mask) + else: + context_layer = self.core_attention( + query_layer, key_layer, value_layer, attention_mask) + else: + q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() + for x in (query_layer, key_layer, value_layer)] + if not self.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + context_layer = self.core_attention_flash(q, k, v) + else: + context_layer = self.core_attention_flash(q, k, v) + context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + + # ================= + # Output. [sq, b, h] + # ================= + + output, bias = self.dense(context_layer) + + return output, bias + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train(x: torch.Tensor, + bias: Optional[torch.Tensor], + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x: torch.Tensor, + bias: Optional[torch.Tensor], + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) + + +class ParallelTransformerLayer(MegatronModule): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config, + layer_number, layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + drop_path_rate=0.): + # retriever=None): + args = get_args() + + super(ParallelTransformerLayer, self).__init__() + self.layer_number = layer_number + self.layer_type = layer_type + + self.apply_residual_connection_post_layernorm \ + = config.apply_residual_connection_post_layernorm + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + + # Layernorm on the input data. + self.input_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=args.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel, + apply_layernorm_1p=args.apply_layernorm_1p) + + # Self attention. + self.self_attention = ParallelAttention( + config, + layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=self_attn_mask_type) + self.hidden_dropout = config.hidden_dropout + self.bias_dropout_fusion = config.bias_dropout_fusion + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=not config.persist_layer_norm, + sequence_parallel=config.sequence_parallel, + apply_layernorm_1p=args.apply_layernorm_1p) + + # Cross attention. 
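+        # Only built for layer types that attend over an encoder or retriever
+        # output; plain encoder layers have no inter-attention block.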
+ if self.layer_type in (LayerType.decoder, + LayerType.retro_decoder, + LayerType.retro_decoder_with_retriever, + LayerType.retro_encoder): + self.inter_attention = ParallelAttention( + config, + layer_number, + attention_type=AttnType.cross_attn) + # Layernorm on the attention output. + self.post_inter_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=not config.persist_layer_norm, + sequence_parallel=config.sequence_parallel, + apply_layernorm_1p=args.apply_layernorm_1p) + + # MLP + if args.num_experts is not None: + self.mlp = SwitchMLP(config) + else: + self.mlp = ParallelMLP(config) + + # Set bias+dropout+add fusion grad_enable execution handler. + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + self.bias_dropout_add_exec_handler = \ + nullcontext if use_nvfuser else torch.enable_grad + + if args.retro_add_retriever: + retro_args = get_retro_args() + self.retro_num_neighbors = args.retro_num_neighbors + self.retro_chunk_length = retro_args.retro_gpt_chunk_length + self.retro_retrieved_length = retro_args.retro_gpt_retrieved_length + + # Retriever (bi-directional transformer with cross attention) + if layer_type == LayerType.retro_decoder_with_retriever: + self.retriever = ParallelTransformer( + config=config, + model_type=ModelType.retro_encoder, + self_attn_mask_type=AttnMaskType.padding, + pre_process=True, + post_process=False, + ) + self._retriever_key = 'retriever' + else: + self.retriever = None + + def default_decoder_cross_attention(self, + encoder_output, + enc_dec_attn_mask, + layernorm_input, + layernorm_output, + bias_dropout_add_func): + '''Cross attention for a standard encoder-decoder model.''' + + # Attention. + attention_output, attention_bias = \ + self.inter_attention(layernorm_output, + enc_dec_attn_mask, + encoder_output=encoder_output) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + if attention_bias is not None: + attention_bias = attention_bias.expand_as(residual) + + # Bias-dropout-add. + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, + attention_bias, + residual, + self.hidden_dropout) + + # Layer norm. + layernorm_output = self.post_inter_attention_layernorm(layernorm_input) + + return layernorm_input, layernorm_output + + def retro_encoder_cross_attention(self, + retriever_output, + layernorm_input, + layernorm_output, + bias_dropout_add_func): + """Cross attention for Retro encoder. + + Notation: + ns : Sequence length. + bs : Batch size. + d : Hidden size. + l : Number of chunks per sample (i.e., seq_length/chunk_length). + k : Number of neighbors. + r : Number of retrieved tokens (neighbors + continuation). + """ + + ns, bs, d = layernorm_output.shape # [r, bs * l * k, d] + + # Divide sequence dimension into chunks. + chunked_outputs = layernorm_output.reshape(self.retro_retrieved_length, + -1, + self.retro_num_neighbors, + d) + chunked_outputs_before_layer_norm = \ + layernorm_input.reshape(self.retro_retrieved_length, -1, + self.retro_num_neighbors, d) # [r, bs*l, k, d] + + # Per-chunk attention. + layernorm_inputs = [] + layernorm_outputs = [] + for k in range(self.retro_num_neighbors): + + # Attention. 
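+            # Each retrieved neighbor k is attended to separately: its chunked
+            # embedding is the query, the retriever output provides keys/values.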
+ chunked_output = chunked_outputs[:,:,k].contiguous() + attention_output, attention_bias = \ + self.inter_attention( + chunked_output, # Q (neighbor embedding) + None, + encoder_output=retriever_output) # K, V (hidden act) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = chunked_output + else: + residual = chunked_outputs_before_layer_norm[:,:,k] + + # Re-enable torch grad to enable fused optimization. + with torch.enable_grad(): + layernorm_input = bias_dropout_add_func( + attention_output, + None if attention_bias is None else attention_bias.expand_as(residual), + residual, + self.hidden_dropout) + layernorm_inputs.append(layernorm_input) + + # Layer norm. + layernorm_output = \ + self.post_inter_attention_layernorm(layernorm_input) + layernorm_outputs.append(layernorm_output) + + # Concatenate layer norms. + # layernorm_input : [r, k * bs * l, d] + # layernorm_output : [r, k * bs * l, d] + layernorm_input = \ + torch.stack(layernorm_inputs, dim=1).reshape(ns, bs, d) + layernorm_output = \ + torch.stack(layernorm_outputs, dim=1).reshape(ns, bs, d) + + return layernorm_input, layernorm_output + + def retro_decoder_cross_attention(self, + retriever_input, + retriever_output, + retriever_attn_mask, + layernorm_input, + layernorm_output, + inference_params, + bias_dropout_add_func): + """Cross attention for Retro decoder. + + Notation: + ns : Sequence length. + bs : Batch size. + d : Hidden size. + l : Number of chunks per sample (i.e., seq_length/chunk_length). + m : Number of tokens per chunk. + k : Number of neighbors. + r : Number of retrieved tokens (neighbors + continuation). + """ + + ns, bs, d = layernorm_output.shape + l = int(np.ceil(ns / self.retro_chunk_length)) + + # Retrieve neighbors. + if self.layer_type == LayerType.retro_decoder_with_retriever: + first_ns = ns % self.retro_chunk_length + if first_ns > 0: + raise Exception("test this case.") + first_chunk, rest_chunk = \ + layernorm_output[:first_ns], layernorm_output[first_ns:] + first_chunk = torch.nn.functional.pad( + first_chunk, + (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), + 'constant', + 0) + chunked_output = \ + torch.cat((first_chunk, rest_chunk), dim=0) # [l * m, bs, d] + else: + chunked_output = layernorm_output # [l * m, bs, d] + chunked_output = chunked_output \ + .reshape(l, self.retro_chunk_length, bs, d) \ + .permute(1, 2, 0, 3) \ + .reshape(self.retro_chunk_length, bs * l, d) \ + .contiguous() + + # Get Encoder Output + retriever_output = self.retriever( + hidden_states=retriever_input, + attention_mask=retriever_attn_mask, + retriever_output=chunked_output, + retriever_attn_mask=retriever_attn_mask, + inference_params=inference_params) # [r, k * bs * l , d] + retriever_output = retriever_output.reshape( + self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d) # [r * k, bs * l, d] + + # Chunks. + pad = (ns - 1) % self.retro_chunk_length + attending_chunks = layernorm_output[pad:] + padded_chunks = torch.nn.functional.pad( + attending_chunks, + (0, 0, 0, 0, 0, self.retro_chunk_length - 1), + 'constant', 0) + padded_chunked_output = padded_chunks \ + .reshape(l, self.retro_chunk_length, bs, d) \ + .permute(1, 2, 0, 3) + padded_chunked_output = padded_chunked_output.reshape( + self.retro_chunk_length, bs * l, d).contiguous() + + # Encoder output. + attention_output, attention_bias = \ + self.inter_attention(padded_chunked_output, + None, + encoder_output=retriever_output) + + # Residual connection. 
+ if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + # Re-enable torch grad to enable fused optimization. + with torch.enable_grad(): + layernorm_input = bias_dropout_add_func( + attention_output, + None if attention_bias is None else attention_bias.expand_as(attention_output), + torch.zeros_like(attention_output), + self.hidden_dropout) + layernorm_input = layernorm_input \ + .reshape(self.retro_chunk_length, bs, l, d) \ + .permute(2, 0, 1, 3) # [l, m, bs, d] + layernorm_input = layernorm_input.reshape(self.retro_chunk_length * l, bs, d) + layernorm_input = torch.nn.functional.pad( + layernorm_input, + (0, 0, 0, 0, pad, 0), + 'constant', 0)[:ns] # [ns, b, d] + layernorm_input = layernorm_input + residual + + # Layer norm post the decoder attention + layernorm_output = self.post_inter_attention_layernorm(layernorm_input) + + return retriever_output, layernorm_input, layernorm_output + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + retriever_input=None, + retriever_output=None, + retriever_attn_mask=None, + inference_params=None, + rotary_pos_emb=None): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output, attention_bias = \ + self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + if self.drop_path is None: + # jit scripting for a nn.module (with dropout) is not + # trigerring the fusion kernel. For now, we use two + # different nn.functional routines to account for varying + # dropout semantics during training and inference phases. + if self.bias_dropout_fusion: + if self.training: + bias_dropout_add_func = bias_dropout_add_fused_train + else: + bias_dropout_add_func = bias_dropout_add_fused_inference + else: + bias_dropout_add_func = get_bias_dropout_add(self.training) + + if attention_bias is not None: + attention_bias = attention_bias.expand_as(residual) + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, + attention_bias, + residual, + self.hidden_dropout) + else: + out = torch.nn.functional.dropout(attention_output + attention_bias, + p=self.hidden_dropout, + training=self.training) + layernorm_input = residual + self.drop_path(out) + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # Cross attention. 
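+        # Dispatch on layer type: plain encoder layers skip cross attention,
+        # decoder layers attend over encoder_output, and the Retro variants
+        # run the chunked retrieval cross-attention paths defined above.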
+ if self.layer_type == LayerType.encoder: + pass + elif self.layer_type == LayerType.decoder: + layernorm_input, layernorm_output = \ + self.default_decoder_cross_attention( + encoder_output, + enc_dec_attn_mask, + layernorm_input, + layernorm_output, + bias_dropout_add_func) + elif self.layer_type == LayerType.retro_encoder: + layernorm_input, layernorm_output = \ + self.retro_encoder_cross_attention( + retriever_output, + layernorm_input, + layernorm_output, + bias_dropout_add_func) + elif self.layer_type in (LayerType.retro_decoder, + LayerType.retro_decoder_with_retriever): + retriever_output, layernorm_input, layernorm_output = \ + self.retro_decoder_cross_attention( + retriever_input, + retriever_output, + retriever_attn_mask, + layernorm_input, + layernorm_output, + inference_params, + bias_dropout_add_func) + else: + raise Exception("Unsupported layer type, '%s'." % + self.layer_type.name) + + # MLP. + mlp_output, mlp_bias = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + if self.drop_path is None: + if mlp_bias is not None: + mlp_bias = mlp_bias.expand_as(residual) + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func( + mlp_output, + mlp_bias, + residual, + self.hidden_dropout) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = core.utils.make_viewless_tensor(inp = output, + requires_grad = output.requires_grad, + keep_graph = True) + + else: + if mlp_bias is not None: + mlp_output = mlp_output + mlp_bias + out = torch.nn.functional.dropout(mlp_output, + p=self.hidden_dropout, + training=self.training) + output = residual + self.drop_path(out) + + if self.layer_type == LayerType.retro_decoder_with_retriever: + return output, retriever_output + else: + return output + + +class NoopTransformerLayer(MegatronModule): + """A single 'no-op' transformer layer. + + The sole purpose of this layer is for when a standalone embedding layer + is used (i.e., args.standalone_embedding_stage == True). In this case, + zero transformer layers are assigned when pipeline rank == 0. Additionally, + when virtual pipeline rank >= 1, zero total model parameters are created + (virtual rank 0 contains the input embedding). This results in the model's + input and output tensors being the same, which causes an error when + performing certain memory optimiations on the output tensor (e.g., + deallocating it). Thus, this layer disconnects the input from the output + via a clone. Since ranks containing a no-op layer are generally under- + utilized (both compute and memory), there's no worry of any performance + degredation. 
+ """ + + def __init__(self, layer_number): + super().__init__() + self.layer_number = layer_number + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): + return hidden_states.clone() + + +def _get_num_layers(args, model_type, is_decoder=False): + """Compute the number of transformer layers resident on the current rank.""" + is_encoder_and_decoder_model = (model_type == ModelType.encoder_and_decoder) + if model_type == ModelType.retro_encoder: + num_layers = args.retro_encoder_layers + elif mpu.get_pipeline_model_parallel_world_size() > 1: + if is_encoder_and_decoder_model: + assert args.pipeline_model_parallel_split_rank is not None + + # When a standalone embedding stage is used, a rank is taken from + # the encoder's ranks, to be used for the encoder's embedding + # layer. This way, the rank referenced by the 'split rank' remains + # the same whether or not a standalone embedding stage is used. + num_ranks_in_encoder = ( + args.pipeline_model_parallel_split_rank - 1 + if args.standalone_embedding_stage else + args.pipeline_model_parallel_split_rank + ) + num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder + assert args.encoder_num_layers % num_ranks_in_encoder == 0, \ + 'encoder_num_layers (%d) must be divisible by number of ranks given to encoder (%d)' % (args.encoder_num_layers, num_ranks_in_encoder) + assert args.decoder_num_layers % num_ranks_in_decoder == 0, \ + 'decoder_num_layers (%d) must be divisible by number of ranks given to decoder (%d)' % (args.decoder_num_layers, num_ranks_in_decoder) + if mpu.is_pipeline_stage_before_split(): + num_layers = ( + 0 + if args.standalone_embedding_stage + and mpu.get_pipeline_model_parallel_rank() == 0 else + args.encoder_num_layers // num_ranks_in_encoder + ) + else: + num_layers = args.decoder_num_layers // num_ranks_in_decoder + else: + assert args.num_layers == args.encoder_num_layers + assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \ + 'num_layers must be divisible by transformer_pipeline_model_parallel_size' + + # When a standalone embedding stage is used, all transformer layers + # are divided among pipeline rank >= 1, while on pipeline rank 0, + # ranks either contain the input embedding layer (virtual pp rank 0), + # or no layers at all (virtual pp rank >= 1). + num_layers = ( + 0 + if args.standalone_embedding_stage + and mpu.get_pipeline_model_parallel_rank() == 0 else + args.num_layers // args.transformer_pipeline_model_parallel_size + ) + else: + if not is_decoder: + num_layers = args.encoder_num_layers + else: + num_layers = args.decoder_num_layers + return num_layers + + +def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, + layer_number): + args = get_args() + if args.retro_add_retriever and layer_number in retro_layer_numbers: + if model_type == ModelType.retro_decoder: + return LayerType.retro_decoder_with_retriever \ + if layer_number == retro_layer_numbers[0] \ + else LayerType.retro_decoder + elif model_type == ModelType.retro_encoder: + return LayerType.retro_encoder + else: + raise Exception("Unsupported model type, '%s'." 
% model_type) + else: + return default_layer_type + + +class ParallelTransformer(MegatronModule): + """Transformer class.""" + + def __init__(self, config, + model_type, layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, + post_process=True, + drop_path_rate=0.0): + super(ParallelTransformer, self).__init__() + args = get_args() + + self.layer_type = layer_type + self.model_type = model_type + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + self.input_tensor = None + self.drop_path_rate = drop_path_rate + self.transformer_impl = args.transformer_impl + self.retro_add_retriever = args.retro_add_retriever + + # Store activation checkpoiting flag. + self.recompute_granularity = config.recompute_granularity + self.recompute_method = config.recompute_method + self.recompute_num_layers = config.recompute_num_layers + self.distribute_saved_activations = \ + config.distribute_saved_activations and not config.sequence_parallel + + self.sequence_parallel = config.sequence_parallel + + # Transformer Engine Init. + self.transformer_engine_v_0_10 = False + self.transformer_engine_v_0_11 = False + self.transformer_engine_v_0_8 = False + if self.transformer_impl == 'transformer_engine': + global transformer_engine + import transformer_engine + from importlib.metadata import version + from pkg_resources import packaging + + te_version = packaging.version.Version(version("transformer-engine")) + if te_version >= packaging.version.Version("0.8.0"): + self.transformer_engine_v_0_8 = True + if te_version >= packaging.version.Version("0.10.0"): + self.transformer_engine_v_0_10 = True + if te_version >= packaging.version.Version("0.11.0"): + self.transformer_engine_v_0_11 = True + + del version, packaging + + assert not args.squared_relu, "TransformerEngine does not support squared relu activation." + + self.use_fp8 = args.fp8 is not None + self.fp8_recipe = None + self.fp8_group = None + if self.use_fp8: + assert args.transformer_impl == 'transformer_engine', \ + 'transformer-engine required for fp8 training and inference' + self.fp8_group = mpu.get_amax_reduction_group() + if args.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif args.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("The DelayedScaling recipe only supports E4M3 and HYBRID formats.") + self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=args.fp8_margin, + interval=args.fp8_interval, + fp8_format=fp8_format, + amax_history_len=args.fp8_amax_history_len, + amax_compute_algo=args.fp8_amax_compute_algo, + override_linear_precision=(False, False, not args.fp8_wgrad), + ) + + self.num_microbatches_in_previous_step = -1 + self.microbatch_count = 0 + self.checkpoint_core_attention = config.recompute_granularity == 'selective' + + # Number of layers. 
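+        # Layers resident on this pipeline rank; _get_num_layers accounts for
+        # pipeline splits, the standalone embedding stage, and Retro encoders.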
+ self.num_layers = _get_num_layers(args, model_type, + layer_type==LayerType.decoder) + + self.drop_path_rates = [ + rate.item() for rate in + torch.linspace(0, self.drop_path_rate, config.num_layers)] + + self.retro_layer_numbers = None + if model_type == ModelType.retro_decoder: + retro_layer_start = 6 if config.num_layers <= 15 else 9 + self.retro_layer_numbers = \ + np.arange(retro_layer_start, args.num_layers + 1, 3).tolist() + if model_type == ModelType.retro_encoder: + self.retro_layer_numbers = [1] + + # Transformer layers. + if args.retro_add_retriever: + assert self.recompute_granularity != 'full', \ + "Full recompute not supported for Retro." + assert args.transformer_impl == 'local', \ + "Transformer engine does not support Retro layers." + def build_layer(layer_number): + if args.transformer_impl == 'local': + current_layer_type = _get_layer_type( + model_type, layer_type, self.retro_layer_numbers, + layer_number) + return ParallelTransformerLayer( + config, + layer_number, + layer_type=current_layer_type, + self_attn_mask_type=self_attn_mask_type, + drop_path_rate=self.drop_path_rates[layer_number - 1]) + else: + # This argument is only available from TE v0.10 onwards. + extra_transformer_engine_kwargs = {} + if self.transformer_engine_v_0_8: + extra_transformer_engine_kwargs["bias"] = args.add_bias_linear + if self.transformer_engine_v_0_10: + extra_transformer_engine_kwargs["activation"] = "swiglu" if args.swiglu else "gelu" + if self.transformer_engine_v_0_11: + extra_transformer_engine_kwargs["normalization"] = args.normalization + return transformer_engine.pytorch.TransformerLayer( + config.hidden_size, + config.ffn_hidden_size, + config.num_attention_heads, + layernorm_epsilon=config.layernorm_epsilon, + hidden_dropout=config.hidden_dropout, + attention_dropout=config.attention_dropout, + init_method=config.init_method, + output_layer_init_method=config.output_layer_init_method, + layer_number=layer_number, + kv_channels=config.kv_channels, + self_attn_mask_type=self_attn_mask_type.name, + tp_group=mpu.get_tensor_model_parallel_group(), + get_rng_state_tracker=tensor_parallel.get_cuda_rng_tracker, + fuse_wgrad_accumulation=config.gradient_accumulation_fusion, + apply_query_key_layer_scaling=config.apply_query_key_layer_scaling, + attention_softmax_in_fp32=config.attention_softmax_in_fp32, + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + sequence_parallel=config.sequence_parallel, + params_dtype=config.params_dtype, + apply_residual_connection_post_layernorm=config.apply_residual_connection_post_layernorm, + output_layernorm=False, + layer_type="encoder", + drop_path_rate=self.drop_path_rates[layer_number - 1], + set_parallel_mode=True, + fuse_qkv_params=True, + **extra_transformer_engine_kwargs) + + if config.virtual_pipeline_model_parallel_size is not None: + assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \ + 'num_layers_per_stage must be divisible by ' \ + 'virtual_pipeline_model_parallel_size' + assert args.model_type != ModelType.encoder_and_decoder + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. 
+ self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size + # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + offset = mpu.get_virtual_pipeline_model_parallel_rank() * ( + config.num_layers // config.virtual_pipeline_model_parallel_size) + \ + (mpu.get_pipeline_model_parallel_rank() * self.num_layers) + else: + # Each stage gets a contiguous set of layers. + if args.model_type == ModelType.encoder_and_decoder and \ + mpu.get_pipeline_model_parallel_world_size() > 1: + pipeline_rank = mpu.get_pipeline_model_parallel_rank() + if layer_type == LayerType.encoder: + offset = pipeline_rank * self.num_layers + else: + num_ranks_in_enc = args.pipeline_model_parallel_split_rank + offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers + else: + offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers + + if self.num_layers == 0: + # When a standalone embedding stage is used (e.g., + # args.standalone_embedding_stage == True), virtual pipeline ranks + # on pipeline rank 0 will have zero transformer layers assigned to + # them. This results in the model's input and output tensors to be + # the same, which will cause failure for certain output tensor + # optimizations (e.g., pipeline output deallocation). To remedy + # this, we assign a 'no-op' layer on these ranks, which will + # disconnect the input tensor from the output tensor. + self.num_layers = 1 + self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ]) + else: + self.layers = torch.nn.ModuleList( + [build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + # Update dropout rate for Retro encoder. + if model_type == ModelType.retro_encoder: + for layer in self.layers: + if layer.self_attention.use_flash_attn: + layer.self_attention.core_attention_flash.dropout_p = \ + torch.nn.Dropout(args.retro_encoder_attention_dropout) + else: + layer.self_attention.core_attention.attention_dropout.p =\ + args.retro_encoder_attention_dropout + layer.hidden_dropout = args.retro_encoder_hidden_dropout + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. 
+ self.final_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=args.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel, + apply_layernorm_1p=args.apply_layernorm_1p) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def _checkpointed_forward(self, hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + rotary_pos_emb, is_first_microbatch): + """Forward method with activation checkpointing.""" + def custom(start, end): + def custom_forward(*args, **kwargs): + x_, *args = args + for index in range(start, end): + layer = self._get_layer(index) + x_ = layer(x_, *args, **kwargs) + return x_ + return custom_forward + + te_forward_kwargs = {} + if self.transformer_impl == 'transformer_engine': + te_forward_kwargs['is_first_microbatch'] = is_first_microbatch + if self.transformer_engine_v_0_10: + te_forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + + if self.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and + # checkpoint the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + l = 0 + while l < self.num_layers: + if self.transformer_impl == 'transformer_engine': + hidden_states = transformer_engine.pytorch.checkpoint( + custom(l, l + self.recompute_num_layers), + self.distribute_saved_activations, + tensor_parallel.get_cuda_rng_tracker, + mpu.get_tensor_model_parallel_group(), + hidden_states, attention_mask, encoder_output, + enc_dec_attn_mask, **te_forward_kwargs) + else: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + self.recompute_num_layers), + self.distribute_saved_activations, + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) + + l += self.recompute_num_layers + + elif self.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + for l in range(self.num_layers): + if l < self.recompute_num_layers: + if self.transformer_impl == 'transformer_engine': + hidden_states = transformer_engine.pytorch.checkpoint( + custom(l, l + 1), + self.distribute_saved_activations, + tensor_parallel.get_cuda_rng_tracker, + mpu.get_tensor_model_parallel_group(), + hidden_states, attention_mask, encoder_output, + enc_dec_attn_mask, **te_forward_kwargs) + else: + hidden_states = tensor_parallel.checkpoint( + custom(l, l + 1), + self.distribute_saved_activations, + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) + else: + if self.transformer_impl == 'transformer_engine': + hidden_states = custom(l, l + 1)( + hidden_states, attention_mask, encoder_output, + enc_dec_attn_mask, **te_forward_kwargs) + else: + hidden_states = custom(l, l + 1)( + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. 
This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + retriever_input=None, + retriever_output=None, + retriever_attn_mask=None, + inference_params=None, + rotary_pos_emb=None): + # hidden_states: [s, b, h] + + # Checks. + if inference_params: + assert self.recompute_granularity is None, \ + 'inference does not work with activation checkpointing' + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = core.utils.make_viewless_tensor( + hidden_states, + requires_grad=True, + keep_graph=True, + ) + + # RNG context. + if self.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + # Forward layers. + with rng_context: + # The fp8_autocast context manager is a no-op when enabled=True + # The if...else serves to short circuit name resolution for fp8_autocast + with transformer_engine.pytorch.fp8_autocast( + enabled=self.use_fp8, + fp8_recipe=self.fp8_recipe, + fp8_group=self.fp8_group + ) if self.use_fp8 else nullcontext(): + # Determine if the current iteration is first microbatch + if self.num_microbatches_in_previous_step != get_num_microbatches(): + self.microbatch_count = 0 # Reset count on new batch size rampup interval + self.num_microbatches_in_previous_step = get_num_microbatches() + is_first_microbatch = self.microbatch_count % get_num_microbatches() == 0 + + # Forward pass. + if self.recompute_granularity == 'full': + hidden_states = self._checkpointed_forward(hidden_states, + attention_mask, + encoder_output, + enc_dec_attn_mask, + rotary_pos_emb, + is_first_microbatch) + else: + forward_kwargs = { + 'encoder_output': encoder_output, + 'enc_dec_attn_mask': enc_dec_attn_mask, + 'inference_params': inference_params, + } + + if self.transformer_impl == 'transformer_engine': + forward_kwargs['is_first_microbatch'] = is_first_microbatch + forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention + if self.transformer_engine_v_0_10: + forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + else: + forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + forward_kwargs['retriever_input'] = retriever_input + forward_kwargs['retriever_output'] = retriever_output + forward_kwargs['retriever_attn_mask'] = retriever_attn_mask + + for index in range(self.num_layers): + layer = self._get_layer(index) + + hidden_states = layer( + hidden_states, + attention_mask, + **forward_kwargs) + + # First Retro decoder layer returns both hidden_states + # and retriever_output. 
Make retriever_output available + # to subsequence Retro layers. + if isinstance(hidden_states, tuple): + assert len(hidden_states) == 2 + hidden_states, retriever_output = hidden_states + forward_kwargs["retriever_output"] = retriever_output + + # Skip counter update for eval and activation checkpointing + if torch.is_grad_enabled() and self.training: + self.microbatch_count += 1 + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states diff --git a/training/DeepSpeed-Domino/megatron/model/utils.py b/training/DeepSpeed-Domino/megatron/model/utils.py new file mode 100644 index 000000000..cf3727c02 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/utils.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for models.""" + +import math + +import torch + +from megatron import get_args + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +def get_linear_layer(rows, columns, init_method): + """Simple linear layer with weight initialization.""" + layer = torch.nn.Linear(rows, columns) + if get_args().perform_initialization: + init_method(layer.weight) + with torch.no_grad(): + layer.bias.zero_() + return layer + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * + (1.0 + 0.044715 * x * x))) +def openai_gelu(x): + return gelu_impl(x) + +#This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter +@torch.jit.script +def erf_gelu(x): + return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) diff --git a/training/DeepSpeed-Domino/megatron/model/vision/classification.py b/training/DeepSpeed-Domino/megatron/model/vision/classification.py new file mode 100644 index 000000000..4d1a4e902 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/vision/classification.py @@ -0,0 +1,85 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
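+#
+# Two classification heads are defined in this file: VitClassificationModel
+# (a VitBackbone plus VitMlpHead, or a zero-initialized linear probe when
+# finetuning) and MitClassificationModel (a mit_b3_avg backbone plus a
+# linear head).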
+ +"""Vision Transformer(VIT) model.""" + +import torch +from torch.nn.init import trunc_normal_ +from megatron import get_args +from megatron.model.utils import get_linear_layer +from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead +from megatron.model.vision.mit_backbone import mit_b3_avg +from megatron.model.module import MegatronModule + +class VitClassificationModel(MegatronModule): + """Vision Transformer Model.""" + + def __init__(self, config, num_classes, finetune=False, + pre_process=True, post_process=True): + super(VitClassificationModel, self).__init__() + args = get_args() + + self.hidden_size = args.hidden_size + self.num_classes = num_classes + self.finetune = finetune + self.pre_process = pre_process + self.post_process = post_process + self.backbone = VitBackbone( + config=config, + pre_process=self.pre_process, + post_process=self.post_process, + single_token_output=True + ) + + if self.post_process: + if not self.finetune: + self.head = VitMlpHead(self.hidden_size, self.num_classes) + else: + self.head = get_linear_layer( + self.hidden_size, + self.num_classes, + torch.nn.init.zeros_ + ) + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.backbone.set_input_tensor(input_tensor) + + def forward(self, input): + hidden_states = self.backbone(input) + + if self.post_process: + hidden_states = self.head(hidden_states) + + return hidden_states + + +class MitClassificationModel(MegatronModule): + """Mix vision Transformer Model.""" + + def __init__(self, num_classes, + pre_process=True, post_process=True): + super(MitClassificationModel, self).__init__() + args = get_args() + + self.hidden_size = args.hidden_size + self.num_classes = num_classes + + self.backbone = mit_b3_avg() + self.head = torch.nn.Linear(512, num_classes) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, torch.nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, torch.nn.Linear) and m.bias is not None: + torch.nn.init.constant_(m.bias, 0) + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + pass + + def forward(self, input): + hidden_states = self.backbone(input) + hidden_states = self.head(hidden_states) + + return hidden_states diff --git a/training/DeepSpeed-Domino/megatron/model/vision/dino.py b/training/DeepSpeed-Domino/megatron/model/vision/dino.py new file mode 100644 index 000000000..1c577d2e1 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/vision/dino.py @@ -0,0 +1,290 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. + +# copied from https://github.com/facebookresearch/dino/blob/main/main_dino.py +# reworked/refactored some parts to make it run in Megatron. 
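+#
+# The components below implement DINO self-distillation: DINOLoss
+# (cross-entropy between teacher and student outputs, with a teacher-temperature
+# warmup schedule and a momentum-updated center), DINOHead (an MLP projection
+# head with a weight-normalized last layer), MultiCropWrapper (one backbone
+# pass per crop resolution), and DINOPretrainModel (a student/teacher pair
+# whose teacher is updated with an EMA momentum schedule).
+#
+# Illustrative shape example (assuming 2 global 224x224 crops and 8 local
+# 96x96 crops): unique_consecutive over the crop widths yields counts [2, 8],
+# so MultiCropWrapper computes idx_crops = [2, 10] and runs the backbone once
+# per resolution group before the head sees the concatenated features.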
+import math +import apex +import einops +import torch +import numpy as np +import torch.nn.functional as F +from torch.nn.init import trunc_normal_ +from megatron import get_args, print_rank_0 +from megatron.model.utils import get_linear_layer +from megatron.model.vision.vit_backbone import VitBackbone +from megatron.model.module import MegatronModule +from megatron.model.vision.mit_backbone import mit_b5_avg +from megatron.model.vision.esvit_swin_backbone import get_swin + + +class DINOLoss(torch.nn.Module): + def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp, + warmup_teacher_temp_epochs, nepochs, student_temp=0.1, + center_momentum=0.9): + super().__init__() + self.student_temp = student_temp + self.center_momentum = center_momentum + self.ncrops = ncrops + self.register_buffer("center", torch.zeros(1, out_dim)) + # we apply a warm up for the teacher temperature because + # a too high temperature makes the training instable at the beginning + self.teacher_temp_schedule = np.concatenate(( + np.linspace(warmup_teacher_temp, + teacher_temp, warmup_teacher_temp_epochs), + np.ones(nepochs - warmup_teacher_temp_epochs) * teacher_temp + )) + self.teacher_temp = teacher_temp + + def forward(self, student_output, teacher_output, iteration): + """ + Cross-entropy between softmax outputs of the teacher + and student network. + """ + args = get_args() + student_out = student_output / self.student_temp + student_out = student_out.chunk(self.ncrops) + + epoch = iteration // args.iter_per_epoch + + # teacher centering and sharpening + temp = self.teacher_temp_schedule[epoch] + teacher_out = F.softmax((teacher_output - self.center) / temp, dim=-1) + + teacher_out = teacher_out.detach().chunk(2) + + total_loss = 0 + n_loss_terms = 0 + for iq, q in enumerate(teacher_out): + for v in range(len(student_out)): + if v == iq: + # we skip cases where student and teacher operate on the same view + continue + loss = torch.sum(-q * F.log_softmax(student_out[v], dim=-1), dim=-1) + total_loss += loss.mean() + n_loss_terms += 1 + total_loss /= n_loss_terms + self.update_center(teacher_output) + return total_loss + + @torch.no_grad() + def update_center(self, teacher_output): + """ + Update center used for teacher output. 
+ """ + batch_center = torch.sum(teacher_output, dim=0, keepdim=True) + torch.distributed.all_reduce(batch_center) + batch_center = batch_center / (len(teacher_output) * torch.distributed.get_world_size()) + self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum) + +class DINOHead(torch.nn.Module): + def __init__(self, in_dim, out_dim, norm_last_layer=True, nlayers=3): + super().__init__() + args = get_args() + hidden_dim = args.dino_head_hidden_size + bottleneck_dim = args.dino_bottleneck_size + nlayers = max(nlayers, 1) + if nlayers == 1: + self.mlp = torch.nn.Linear(in_dim, bottleneck_dim) + else: + layers = [torch.nn.Linear(in_dim, hidden_dim)] + layers.append(torch.nn.GELU()) + for _ in range(nlayers - 2): + layers.append(torch.nn.Linear(hidden_dim, hidden_dim)) + layers.append(torch.nn.GELU()) + layers.append(torch.nn.Linear(hidden_dim, bottleneck_dim)) + self.mlp = torch.nn.Sequential(*layers) + self.apply(self._init_weights) + self.last_layer = torch.nn.utils.weight_norm(torch.nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + if norm_last_layer: + self.last_layer.weight_g.requires_grad = False + + def _init_weights(self, m): + if isinstance(m, torch.nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, torch.nn.Linear) and m.bias is not None: + torch.nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + x = torch.nn.functional.normalize(x, dim=-1, p=2) + x = self.last_layer(x) + return x + + +class MultiCropWrapper(MegatronModule): + + """ + Perform forward pass separately on each resolution input. + The inputs corresponding to a single resolution are clubbed and single + forward is run on the same resolution inputs. Hence we do several + forward passes = number of different resolutions used. We then + concatenate all the output features and run the head forward on these + concatenated features. + """ + def __init__(self, backbone, head): + super(MultiCropWrapper, self).__init__() + # disable layers dedicated to ImageNet labels classification + #backbone.fc, backbone.head = torch.nn.Identity(), torch.nn.Identity() + self.backbone = backbone + self.head = head + + def forward(self, x): + # convert to list + if not isinstance(x, list): + x = [x] + idx_crops = torch.cumsum(torch.unique_consecutive( + torch.tensor([inp.shape[-1] for inp in x]), + return_counts=True, + )[1], 0) + + start_idx = 0 + for end_idx in idx_crops: + _out = self.backbone(torch.cat(x[start_idx: end_idx])) + if start_idx == 0: + output = _out + else: + output = torch.cat((output, _out)) + start_idx = end_idx + # Run the head forward on the concatenated features. 
+ if self.training: + return self.head(output) + else: + return output + + +def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, + warmup_epochs=0, start_warmup_value=0): + warmup_schedule = np.array([]) + warmup_iters = warmup_epochs * niter_per_ep + if warmup_epochs > 0: + warmup_schedule = \ + np.linspace(start_warmup_value, base_value, warmup_iters) + + iters = np.arange(epochs * niter_per_ep - warmup_iters) + schedule = final_value + 0.5 * (base_value - final_value) \ + * (1 + np.cos(np.pi * iters / len(iters))) + + schedule = np.concatenate((warmup_schedule, schedule)) + assert len(schedule) == epochs * niter_per_ep + return schedule + + +def get_student_backbone_and_num_features(config, pre_process=True, post_process=True): + args = get_args() + + if args.vision_backbone_type == 'vit': + student = VitBackbone(config, + pre_process=pre_process, + post_process=post_process, + drop_path_rate=0.1, + single_token_output=True) + num_features = args.hidden_size + elif args.vision_backbone_type == 'mit': + student = mit_b5_avg(drop_path_rate=0.1) + num_features = 512 + elif args.vision_backbone_type == 'swin': + student = get_swin() + num_features = student.num_features + else: + raise Exception('{} vision backbone is not supported.'.format( + args.vision_backbone_type)) + + return student, num_features + +def get_teacher_backbone_and_num_features(config, pre_process=True, post_process=True): + args = get_args() + + if args.vision_backbone_type == 'vit': + teacher = VitBackbone(config, + pre_process=pre_process, + post_process=post_process, + single_token_output=True) + num_features = args.hidden_size + elif args.vision_backbone_type == 'mit': + teacher = mit_b5_avg(drop_path_rate=0.0) + num_features = 512 + elif args.vision_backbone_type == 'swin': + teacher = get_swin(is_teacher=True) + num_features = teacher.num_features + else: + raise Exception('{} vision backbone is not supported.'.format( + args.vision_backbone_type)) + return teacher, num_features + + +class DINOPretrainModel(MegatronModule): + def __init__(self, config, pre_process=True, post_process=True): + super(DINOPretrainModel, self).__init__() + args = get_args() + self.out_dim = 65536 + + self.dino_loss = DINOLoss( + self.out_dim, + args.dino_local_crops_number + 2, + args.dino_warmup_teacher_temp, + args.dino_teacher_temp, + args.dino_warmup_teacher_temp_epochs, + 300, + ) + + self.pre_process = pre_process + self.post_process = post_process + self.momentum_teacher = 0.996 + + student_backbone, num_features = \ + get_student_backbone_and_num_features(config, pre_process, post_process) + + self.student = MultiCropWrapper( + student_backbone, + DINOHead(num_features, self.out_dim, + norm_last_layer=args.dino_norm_last_layer) + ) + + self.momentum_schedule = cosine_scheduler( + self.momentum_teacher, 1, + args.train_iters // args.iter_per_epoch, + args.iter_per_epoch + ) + + teacher_backbone, num_features = \ + get_teacher_backbone_and_num_features(config, pre_process, post_process) + self.teacher = MultiCropWrapper( + teacher_backbone, + DINOHead(num_features, self.out_dim) + ) + self.teacher.load_state_dict(self.student.state_dict()) + + for p in self.teacher.parameters(): + if hasattr(p, "requires_grad") and p.requires_grad is not None: + p.requires_grad = False + + def set_input_tensor(self, tensor): + pass + + def forward(self, input): + student_output = None + if self.training: + student_output = self.student(input) + teacher_output = self.teacher(input[:2]) + else: + teacher_output = self.teacher(input) 
+ return student_output, teacher_output + + def cancel_gradients_last_layer(self, iteration): + args = get_args() + epoch = iteration // args.iter_per_epoch + if epoch < args.dino_freeze_last_layer: + for n, p in self.student.named_parameters(): + if "last_layer" in n: + p.grad = None + + def update_momentum(self, iteration): + with torch.no_grad(): + m = self.momentum_schedule[iteration] + for param_q, param_k in zip(self.student.parameters(), self.teacher.parameters()): + param_k.data.mul_(m).add_((1 - m) * param_q.detach().data) + diff --git a/training/DeepSpeed-Domino/megatron/model/vision/esvit_swin_backbone.py b/training/DeepSpeed-Domino/megatron/model/vision/esvit_swin_backbone.py new file mode 100644 index 000000000..70aee3db4 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/vision/esvit_swin_backbone.py @@ -0,0 +1,849 @@ +# Copyright (c) 2021 Microsoft +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# Modified by Chunyuan Li (chunyl@microsoft.com) +# Swin Transformer +# -------------------------------------------------------- + +import os +import logging +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial +import torch.distributed as dist +from torch.nn.init import trunc_normal_ +from megatron.model.transformer import DropPath +from megatron import get_args +from megatron.model import LayerNorm +import numpy as np +from math import sqrt + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, + out_features=None, act_layer=nn.GELU, drop=0.): + super(Mlp, self).__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r"""Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. 
Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super(WindowAttention, self).__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2 Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0).type(attn.type()) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn_out = attn + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x, attn_out + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // 
self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + @staticmethod + def compute_macs(module, input, output): + B, N, C = input[0].shape + + module.__flops__ += module.flops(N) * B + + +class SwinTransformerBlock(nn.Module): + r"""Swin Transformer Block. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=(self.window_size, self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.H = input_resolution[0] + self.W = input_resolution[1] + + self.attn_mask_dict = {} + + + def create_attn_mask(self, H, W): + # calculate attention mask for SW-MSA + + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1)) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + + return attn_mask + + + def forward(self, x): + B, L, C = x.shape + H = int(sqrt(L)) + W = H + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + + if H in self.attn_mask_dict.keys(): + attn_mask = self.attn_mask_dict[H] + else: + self.attn_mask_dict[H] = self.create_attn_mask(self.H, self.W).to(x.device) + attn_mask = self.attn_mask_dict[H] + + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows, attn = self.attn(x_windows, attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x, attn + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size} mlp_ratio={self.mlp_ratio}" + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): 
+ r"""Patch Merging Layer. + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + H = int(sqrt(L)) + W = H + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. 
Default: None + """ + + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) + for i in range(depth)]) + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + x, _ = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def forward_with_features(self, x): + fea = [] + for blk in self.blocks: + x, _ = blk(x) + fea.append(x) + if self.downsample is not None: + x = self.downsample(x) + return x, fea + + def forward_with_attention(self, x): + attns = [] + for blk in self.blocks: + x, attn = blk(x) + attns.append(attn) + if self.downsample is not None: + x = self.downsample(x) + return x, attns + + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None): + super().__init__() + img_size = (img_size, img_size) + patch_size = (patch_size, patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + + x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + img_size (int | tuple(int)): Input image size. + patch_size (int | tuple(int)): Patch size. + in_chans (int): Number of input channels. + num_classes (int): Number of classes for classification head. + embed_dim (int): Embedding dimension. + depths (tuple(int)): Depth of Swin Transformer layers. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. + drop_path_rate (float): Stochastic depth rate. + norm_layer (nn.Module): normalization layer. + ape (bool): If True, add absolute position embedding to the patch embedding. + patch_norm (bool): If True, add normalization after patch embedding. + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, **kwargs): + super().__init__() + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1d(1) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + # todo: to be implemented + return {'relative_position_bias_table'} + + def forward(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x_region = self.norm(x) # B L C + x = self.avgpool(x_region.transpose(1, 2)) # B C 1 + x = torch.flatten(x, 1) + + return x + + + def forward_feature_maps(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x_grid = self.norm(x) # B L C + x = self.avgpool(x_grid.transpose(1, 2)) # B C 1 + x = torch.flatten(x, 1) + + return x, 
x_grid + + + def forward_selfattention(self, x, n=1): + # n=1 return the last layer attn map; otherwise return attn maps in all layers + + + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + if n==1: + return self.forward_last_selfattention(x) + else: + return self.forward_all_selfattention(x) + + def forward_last_selfattention(self, x): + + for i, layer in enumerate(self.layers): + if i < len(self.layers) - 1: + x = layer(x) + else: + x, attns = layer.forward_with_attention(x) + return attns[-1] + + def forward_all_selfattention(self, x): + attn_out = [] + + for layer in self.layers: + x, attns = layer.forward_with_attention(x) + attn_out += attns + + return attn_out + + + def forward_return_n_last_blocks(self, x, n=1, return_patch_avgpool=False, depth=[]): + + num_blks = sum(depth) + start_idx = num_blks - n + + sum_cur = 0 + for i, d in enumerate(depth): + sum_cur_new = sum_cur + d + if start_idx >= sum_cur and start_idx < sum_cur_new: + start_stage = i + start_blk = start_idx - sum_cur + sum_cur = sum_cur_new + + + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + # we will return the averaged token features from the `n` last blocks + # note: there is no [CLS] token in Swin Transformer + output = [] + s = 0 + for i, layer in enumerate(self.layers): + x, fea = layer.forward_with_features(x) + + if i >= start_stage: + for x_ in fea[start_blk:]: + + if i == len(self.layers)-1: # use the norm in the last stage + x_ = self.norm(x_) + + x_avg = torch.flatten(self.avgpool(x_.transpose(1, 2)), 1) # B C + # print(f'Stage {i}, x_avg {x_avg.shape}') + output.append(x_avg) + + start_blk = 0 + + return torch.cat(output, dim=-1) + + + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + if dist.get_rank() == 0: + print(f"GFLOPs layer_{i}: {layer.flops() / 1e9}") + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops + + def init_weights(self, pretrained='', pretrained_layers=[], verbose=True): + if os.path.isfile(pretrained): + pretrained_dict = torch.load(pretrained, map_location='cpu') + logging.info(f'=> loading pretrained model {pretrained}') + model_dict = self.state_dict() + pretrained_dict = { + k: v for k, v in pretrained_dict.items() + if k in model_dict.keys() + } + need_init_state_dict = {} + for k, v in pretrained_dict.items(): + need_init = ( + k.split('.')[0] in pretrained_layers + or pretrained_layers[0] is '*' + or 'relative_position_index' not in k + or 'attn_mask' not in k + ) + + if need_init: + if verbose: + logging.info(f'=> init {k} from {pretrained}') + + if 'relative_position_bias_table' in k and v.size() != model_dict[k].size(): + relative_position_bias_table_pretrained = v + relative_position_bias_table_current = model_dict[k] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + if nH1 != nH2: + logging.info(f"Error in loading {k}, passing") + else: + if L1 != L2: + logging.info( + '=> load_pretrained: resized variant: {} to {}' + .format((L1, nH1), (L2, nH2)) + ) + S1 = int(L1 ** 0.5) + S2 = int(L2 ** 0.5) + relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate( + relative_position_bias_table_pretrained.permute(1, 0).view(1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + v = 
relative_position_bias_table_pretrained_resized.view(nH2, L2).permute(1, 0)
+
+                    if 'absolute_pos_embed' in k and v.size() != model_dict[k].size():
+                        absolute_pos_embed_pretrained = v
+                        absolute_pos_embed_current = model_dict[k]
+                        _, L1, C1 = absolute_pos_embed_pretrained.size()
+                        _, L2, C2 = absolute_pos_embed_current.size()
+                        if C1 != C2:
+                            logging.info(f"Error in loading {k}, passing")
+                        else:
+                            if L1 != L2:
+                                logging.info(
+                                    '=> load_pretrained: resized variant: {} to {}'
+                                    .format((1, L1, C1), (1, L2, C2))
+                                )
+                                S1 = int(L1 ** 0.5)
+                                S2 = int(L2 ** 0.5)
+                                absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.reshape(-1, S1, S1, C1)
+                                absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.permute(0, 3, 1, 2)
+                                absolute_pos_embed_pretrained_resized = torch.nn.functional.interpolate(
+                                    absolute_pos_embed_pretrained, size=(S2, S2), mode='bicubic')
+                                v = absolute_pos_embed_pretrained_resized.permute(0, 2, 3, 1).flatten(1, 2)
+
+                    need_init_state_dict[k] = v
+            self.load_state_dict(need_init_state_dict, strict=False)
+
+    def freeze_pretrained_layers(self, frozen_layers=[]):
+        for name, module in self.named_modules():
+            if (
+                name.split('.')[0] in frozen_layers
+                or '.'.join(name.split('.')[0:2]) in frozen_layers
+                or (len(frozen_layers) > 0 and frozen_layers[0] == '*')
+            ):
+                for _name, param in module.named_parameters():
+                    param.requires_grad = False
+                logging.info(
+                    '=> set param {} requires grad to False'
+                    .format(name)
+                )
+        for name, param in self.named_parameters():
+            if (
+                name.split('.')[0] in frozen_layers
+                or (len(frozen_layers) > 0 and frozen_layers[0] == '*')
+                and param.requires_grad is True
+            ):
+                param.requires_grad = False
+                logging.info(
+                    '=> set param {} requires grad to False'
+                    .format(name)
+                )
+        return self
+
+
+def get_swin(is_teacher=False):
+    args = get_args()
+
+    if args.swin_backbone_type == "tiny":
+        embed_dim = 96
+        depths = [2, 2, 6, 2]
+        num_heads = [3, 6, 12, 24]
+        drop_path_rate = 0.1
+    elif args.swin_backbone_type == 'h3':
+        embed_dim = 384
+        depths = [2, 2, 18, 2]
+        num_heads = [6, 12, 24, 48]
+        drop_path_rate = 0.2
+    else:
+        embed_dim = 128
+        depths = [2, 2, 18, 2]
+        num_heads = [4, 8, 16, 32]
+        drop_path_rate = 0.2
+
+    swin = SwinTransformer(
+        img_size=224,
+        in_chans=3,
+        num_classes=1000,
+        patch_size=4,
+        embed_dim=embed_dim,
+        depths=depths,
+        num_heads=num_heads,
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        drop_rate=0,
+        attn_drop_rate=0,
+        drop_path_rate=(0.0 if is_teacher else drop_path_rate),
+        norm_layer=partial(LayerNorm, eps=1e-6),
+        ape=False,
+        patch_norm=True,
+    )
+
+    return swin
+
diff --git a/training/DeepSpeed-Domino/megatron/model/vision/inpainting.py b/training/DeepSpeed-Domino/megatron/model/vision/inpainting.py
new file mode 100644
index 000000000..cda03315b
--- /dev/null
+++ b/training/DeepSpeed-Domino/megatron/model/vision/inpainting.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
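+#
+# Two inpainting heads are defined in this file: VitInpaintingModel, which
+# linearly decodes VitBackbone features back into image patches, and
+# MitInpaintingModel, which fuses the four mix-vision-transformer feature
+# maps through per-scale linear embeddings before predicting flattened
+# patches.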
+
+import math
+import apex
+import einops
+import torch
+import torch.nn.functional as F
+from megatron import get_args, print_rank_0
+from megatron.model.utils import get_linear_layer
+from megatron.model.vision.vit_backbone import VitBackbone
+from megatron.model.module import MegatronModule
+from megatron.model.vision.mit_backbone import mit_b3
+from megatron.model.vision.utils import resize
+
+
+class VitInpaintingModel(MegatronModule):
+
+    def __init__(self, config, pre_process=True, post_process=True):
+        super(VitInpaintingModel, self).__init__()
+        args = get_args()
+
+        self.pre_process = pre_process
+        self.post_process = post_process
+        self.hidden_size = config.hidden_size
+        self.backbone = VitBackbone(
+            config=config,
+            pre_process=self.pre_process,
+            post_process=self.post_process,
+            class_token=False,
+        )
+        self.patch_dim = args.patch_dim
+        self.img_h = args.img_h
+        self.img_w = args.img_w
+        self.seq_length = args.seq_length
+        # full mask
+
+        if self.post_process:
+            self.linear_decoder = get_linear_layer(
+                self.hidden_size,
+                self.backbone.flatten_dim,
+                torch.nn.init.zeros_
+            )
+
+    def set_input_tensor(self, input_tensor):
+        self.backbone.set_input_tensor(input_tensor)
+
+    def forward(self, input):
+
+        hidden_states = self.backbone(input)
+
+        if not self.post_process:
+            return hidden_states
+        decoded_output = self.linear_decoder(hidden_states)
+        output = einops.rearrange(
+            decoded_output,
+            "b (h w) (p1 p2 c) -> b c (h p1) (w p2)",
+            p1=self.patch_dim,
+            p2=self.patch_dim,
+            h=self.img_h//self.patch_dim,
+            w=self.img_w//self.patch_dim,
+        )
+
+        return output
+
+
+class MLP(torch.nn.Module):
+    """
+    Linear Embedding
+    """
+    def __init__(self, input_dim=2048, embed_dim=768):
+        super().__init__()
+        self.proj = torch.nn.Linear(input_dim, embed_dim)
+
+    def forward(self, x):
+        x = x.flatten(2).transpose(1, 2)
+        x = self.proj(x)
+        return x
+
+
+class MitInpaintingModel(MegatronModule):
+    """Mix vision Transformer Model."""
+
+    def __init__(self, pre_process=True, post_process=True):
+        super(MitInpaintingModel, self).__init__()
+        self.pre_process = pre_process
+        self.post_process = post_process
+
+        args = get_args()
+        self.patch_dim = args.patch_dim
+        self.img_h = args.img_h
+        self.img_w = args.img_w
+        self.flatten_dim = self.patch_dim * self.patch_dim * 3
+        self.backbone = mit_b3()
+
+        self.in_channels = [64, 128, 320, 512]
+        self.embedding_dim = 768
+
+        c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.in_channels
+
+        self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=self.embedding_dim)
+        self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=self.embedding_dim)
+        self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=self.embedding_dim)
+        self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=self.embedding_dim)
+
+        self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4, self.embedding_dim, 1, 1, bias=False)
+        self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim)
+        self.dropout = torch.nn.Dropout2d(0.1)
+
+        self.linear_pred = torch.nn.Conv2d(self.embedding_dim, self.flatten_dim, kernel_size=1)
+
+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        pass
+
+    def forward(self, input):
+        c1, c2, c3, c4 = self.backbone(input)
+
+        n, _, h, w = c4.shape
+        _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3])
+        _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False)
+
+        _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2],
c3.shape[3]) + _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3]) + _c2 = resize(_c2, size=c1.size()[2:], mode='bilinear', align_corners=False) + + _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3]) + + _c = torch.cat([_c4, _c3, _c2, _c1], dim=1) + _c = self.conv_fuse(_c) + + x = self.norm(_c) + x = F.relu(x, inplace=True) + x = self.dropout(x) + + x = self.linear_pred(x) + + output = einops.rearrange( + x, + "b (c p1 p2) h w -> b c (h p1) (w p2)", + p1=self.patch_dim, + p2=self.patch_dim, + h=self.img_h//self.patch_dim, + w=self.img_w//self.patch_dim, + ) + + return output diff --git a/training/DeepSpeed-Domino/megatron/model/vision/knn_monitor.py b/training/DeepSpeed-Domino/megatron/model/vision/knn_monitor.py new file mode 100644 index 000000000..a7d79854e --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/vision/knn_monitor.py @@ -0,0 +1,129 @@ +import torch.nn.functional as F +import torch +from megatron import print_rank_0, get_args +from megatron.core import mpu +from megatron.data.vit_dataset import ClassificationTransform +from megatron.data.image_folder import ImageFolder + +_FEATURE_BANK = None + + +def build_data_loader(dataset, drop_last=True, shuffle=False): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + # Sampler. + args = get_args() + micro_batch_size = 16 + num_workers = args.num_workers + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, + drop_last=drop_last, shuffle=shuffle + ) + + # Data loader. Note that batch size is the per GPU batch size. 
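+    # Shuffling and per-rank sharding are handled by the DistributedSampler
+    # above, so the DataLoader itself is created with shuffle=False.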
+ data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=micro_batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=not drop_last, + pin_memory=True, + ) + return data_loader + + +def compute_feature_bank(model): + args = get_args() + global _FEATURE_BANK + feature_bank = [] + feature_label = [] + + train_ds = ImageFolder( + root=args.data_path[0], + transform=ClassificationTransform((args.img_h, args.img_w), train=False), + data_per_class_fraction=1.0 + ) + classes = len(train_ds.classes) + dataloader = build_data_loader(train_ds) + + for m in model: + m.eval() + + with torch.no_grad(): + for i, batch in enumerate(dataloader): + images = batch[0].cuda().contiguous() + labels = batch[1].cuda().contiguous() + student_feature, teacher_feature = model[0](images) + feature = F.normalize(teacher_feature.float(), dim=1) + feature_bank.append(feature) + feature_label.append(labels) + + for m in model: + m.train() + + # [N', D] + feature_bank = torch.cat(feature_bank, dim=0).contiguous() + feature_label = torch.cat(feature_label, dim=0).contiguous() + + feature_banks = [torch.zeros_like(feature_bank) + for i in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather(feature_banks, + feature_bank, + group=mpu.get_data_parallel_group()) + + assert torch.all(torch.eq(feature_banks[mpu.get_data_parallel_rank()], + feature_bank)) + + feature_labels = [torch.zeros_like(feature_label) + for i in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather(feature_labels, + feature_label, + group=mpu.get_data_parallel_group()) + + # [D, N] + feature_banks = torch.cat(feature_banks, dim=0).t().contiguous() + # [N] + feature_labels = torch.cat(feature_labels, dim=0).contiguous() + print_rank_0("feature_banks size is {}".format(feature_banks.size())) + print_rank_0("feature labels size is {}".format(feature_labels.size())) + + _FEATURE_BANK = (feature_banks, feature_labels, classes) + + +def get_feature_bank(): + global _FEATURE_BANK + assert _FEATURE_BANK is not None + return _FEATURE_BANK + + +# knn monitor as in InstDisc https://arxiv.org/abs/1805.01978 +# implementation follows http://github.com/zhirongw/lemniscate.pytorch and +# https://github.com/leftthomas/SimCLR +def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t): + # compute cos similarity between each feature vector and feature bank ---> [B, N] + sim_matrix = torch.mm(feature, feature_bank) + # [B, K] + sim_weight, sim_indices = sim_matrix.topk(k=knn_k, dim=-1) + # [B, K] + sim_labels = torch.gather(feature_labels.expand(feature.size(0), -1), + dim=-1, + index=sim_indices) + sim_weight = (sim_weight / knn_t).exp() + + # counts for each class + one_hot_label = torch.zeros(feature.size(0) * knn_k, + classes, + device=sim_labels.device) + # [B*K, C] + one_hot_label = one_hot_label.scatter(dim=-1, + index=sim_labels.view(-1, 1), + value=1.0) + # weighted score ---> [B, C] + pred_scores = torch.sum( + one_hot_label.view(feature.size(0), -1, classes) * sim_weight.unsqueeze(dim=-1), + dim=1) + + pred_labels = pred_scores.argsort(dim=-1, descending=True) + return pred_labels diff --git a/training/DeepSpeed-Domino/megatron/model/vision/mit_backbone.py b/training/DeepSpeed-Domino/megatron/model/vision/mit_backbone.py new file mode 100644 index 000000000..6640b105d --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/vision/mit_backbone.py @@ -0,0 +1,415 @@ +# Copyright (c) 2023, NVIDIA Corporation. All rights reserved. 
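+# MixVisionTransformer (MiT) backbone in the style of the SegFormer encoder:
+# four stages of overlapping patch embedding, each followed by transformer
+# blocks with spatial-reduction (sr_ratio) attention. The mit_b0..mit_b5
+# variants below differ only in embedding dims and per-stage depths; the
+# *_avg variants additionally mean-pool the final-stage output (output_avg).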
+ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial +from torch.nn.init import trunc_normal_ +from megatron.model.transformer import DropPath +from megatron.model import LayerNorm + + +class Mlp(nn.Module): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = LayerNorm(dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + if self.sr_ratio > 1: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + else: + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., 
attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=LayerNorm, sr_ratio=1): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class OverlapPatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768): + super().__init__() + img_size = (img_size, img_size) + patch_size = (patch_size, patch_size) + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = LayerNorm(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + + return x, H, W + + +class MixVisionTransformer(nn.Module): + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., + attn_drop_rate=0., drop_path_rate=0., norm_layer=LayerNorm, + depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], output_avg=False): + super().__init__() + self.num_classes = num_classes + self.depths = depths + self.output_avg = output_avg + + # patch_embed + self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans, + embed_dim=embed_dims[0]) + self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0], + embed_dim=embed_dims[1]) + self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1], + embed_dim=embed_dims[2]) + self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2], + embed_dim=embed_dims[3]) + + # transformer encoder + dpr = [x.item() for x in 
torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + cur = 0 + self.block1 = nn.ModuleList([Block( + dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, + sr_ratio=sr_ratios[0]) + for i in range(depths[0])]) + self.norm1 = norm_layer(embed_dims[0]) + + cur += depths[0] + self.block2 = nn.ModuleList([Block( + dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, + sr_ratio=sr_ratios[1]) + for i in range(depths[1])]) + self.norm2 = norm_layer(embed_dims[1]) + + cur += depths[1] + self.block3 = nn.ModuleList([Block( + dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, + sr_ratio=sr_ratios[2]) + for i in range(depths[2])]) + self.norm3 = norm_layer(embed_dims[2]) + + cur += depths[2] + self.block4 = nn.ModuleList([Block( + dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, + sr_ratio=sr_ratios[3]) + for i in range(depths[3])]) + self.norm4 = norm_layer(embed_dims[3]) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def reset_drop_path(self, drop_path_rate): + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))] + cur = 0 + for i in range(self.depths[0]): + self.block1[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[0] + for i in range(self.depths[1]): + self.block2[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[1] + for i in range(self.depths[2]): + self.block3[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[2] + for i in range(self.depths[3]): + self.block4[i].drop_path.drop_prob = dpr[cur + i] + + def freeze_patch_emb(self): + self.patch_embed1.requires_grad = False + + def forward_features(self, x): + B = x.shape[0] + outs = [] + + # stage 1 + x, H, W = self.patch_embed1(x) + for i, blk in enumerate(self.block1): + x = blk(x, H, W) + x = self.norm1(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 2 + x, H, W = self.patch_embed2(x) + for i, blk in enumerate(self.block2): + x = blk(x, H, W) + x = self.norm2(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 3 + x, H, W = self.patch_embed3(x) + for i, blk in enumerate(self.block3): + x = blk(x, H, W) + x = self.norm3(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + # stage 4 + x, H, W = self.patch_embed4(x) + for i, blk in enumerate(self.block4): + x = blk(x, H, W) + x = self.norm4(x) + if not self.output_avg: + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + 
outs.append(x) + + return outs + + def forward(self, x): + x = self.forward_features(x) + + if self.output_avg: + x = x[3].mean(dim=1) + + return x + + +class DWConv(nn.Module): + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, H, W) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + return x + +class mit_b0(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b0, self).__init__( + patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + + +class mit_b1(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b1, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + + +class mit_b2(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b2, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + + +class mit_b3(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b3, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + +class mit_b3_avg(MixVisionTransformer): + def __init__(self, drop_path_rate=0.1, **kwargs): + super(mit_b3_avg, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=drop_path_rate, output_avg=True) + +class mit_b4(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b4, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + +class mit_b5(MixVisionTransformer): + def __init__(self, **kwargs): + super(mit_b5, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=0.1) + +class mit_b5_avg(MixVisionTransformer): + def __init__(self, drop_path_rate=0.1, **kwargs): + super(mit_b5_avg, self).__init__( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, drop_path_rate=drop_path_rate, output_avg=True) + diff --git a/training/DeepSpeed-Domino/megatron/model/vision/swin_backbone.py b/training/DeepSpeed-Domino/megatron/model/vision/swin_backbone.py new file mode 100644 index 000000000..9a622c707 --- /dev/null +++ 
b/training/DeepSpeed-Domino/megatron/model/vision/swin_backbone.py @@ -0,0 +1,625 @@ +# Copyright (c) 2021 Microsoft +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# Swin Transformer +# -------------------------------------------------------- + +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from math import sqrt + +from megatron import get_args +from functools import partial + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, + out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. 
+ window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.H = input_resolution[0] + self.W = input_resolution[1] + + self.attn_mask_dict = {} + + def create_attn_mask(self, H, W): + # calculate attention mask for SW-MSA + + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1)) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + + return attn_mask + + + def forward(self, x): + B, L, C = x.shape + H = int(sqrt(L)) + W = H + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows 
= attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." + + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x_b4_ds = x + if self.downsample is not None: + x = self.downsample(x) + return x_b4_ds, x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Module): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. 
Default: 3 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3, + norm_layer=partial(nn.LayerNorm, eps=1e-6), ape=False, patch_norm=True, + use_checkpoint=False, output_avg=False, **kwargs): + super().__init__() + + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + self.img_size = to_2tuple(img_size) + self.patch_size = to_2tuple(patch_size) + self.output_avg = output_avg + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def 
no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + def forward(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + h = self.img_size[0] // self.patch_size[0] + w = self.img_size[1] // self.patch_size[1] + outs = [] + + for i, layer in enumerate(self.layers): + px, x = layer(x) + b, n, c = px.shape + + if i != len(self.layers) - 1 or not self.output_avg: + px = px.permute(0, 2, 1).contiguous() + px = px.reshape(b, c, h, w) + # is this a fair assumption ?? i think it's baked into the architecture + h, w = h//2, w//2 + outs.append(px) + + if self.output_avg: + return outs[-1].mean(dim=1) + + return outs + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops + + +def get_swin(drop_path_rate=0.3, output_avg=False): + args = get_args() + + window_size = 7 + embed_dim = 128 + depths = [2, 2, 18, 2] + num_heads = [4, 8, 16, 32] + swin = SwinTransformer( + img_size=(args.img_h, args.img_w,), + in_chans=3, + patch_size=args.patch_dim, + embed_dim=embed_dim, + depths=depths, + num_heads=num_heads, + window_size=window_size, + drop_path_rate=drop_path_rate, + output_avg=output_avg, + ) + + return swin + diff --git a/training/DeepSpeed-Domino/megatron/model/vision/utils.py b/training/DeepSpeed-Domino/megatron/model/vision/utils.py new file mode 100644 index 000000000..b4068912c --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/vision/utils.py @@ -0,0 +1,27 @@ +import warnings +import torch +import torch.nn.functional as F + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + if isinstance(size, torch.Size): + size = tuple(int(x) for x in size) + return F.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/training/DeepSpeed-Domino/megatron/model/vision/vit_backbone.py b/training/DeepSpeed-Domino/megatron/model/vision/vit_backbone.py new file mode 100644 index 000000000..1efef9c17 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/model/vision/vit_backbone.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Vision Transformer(VIT) model.""" + +import math +import einops +import torch +import apex +import torch.nn.functional as F +from megatron import get_args +from megatron.model.transformer import ParallelTransformer +from megatron.model.utils import ( + get_linear_layer, + init_method_normal, + scaled_init_method_normal, +) +from megatron.model.module import MegatronModule + +CLASS_TOKEN_LENGTH = 8 + +class VitMlpHead(MegatronModule): + """Pooler layer. + + Pool hidden states of a specific token (for example start of the + sequence) and add a linear transformation followed by a tanh. 
+ + Arguments: + hidden_size: hidden size + init_method: weight initialization method for the linear layer. + bias is set to zero. + """ + + def __init__(self, hidden_size, num_classes): + super(VitMlpHead, self).__init__() + self.dense_in = torch.nn.Linear(hidden_size, hidden_size) + self.relu = torch.nn.ReLU() + self.dense_out = torch.nn.Linear(hidden_size, num_classes) + torch.nn.init.constant_(self.dense_out.bias, -10) + + def forward(self, hidden_states): + # hidden_states: [b, 1, h] + # sequence_index: index of the token to pool. + dense_in_result = self.dense_in(hidden_states) + tanh_result = torch.tanh(dense_in_result) + dense_out_result = self.dense_out(tanh_result) + return dense_out_result + + +def isPerfectSquare(x): + if(x >= 0): + sr = math.sqrt(x) + return (int(sr) * int(sr) == x) + return False + + +def twod_interpolate_position_embeddings_hook( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, +): + + args = get_args() + num_patches_per_dim_h = args.img_h // args.patch_dim + num_patches_per_dim_w = args.img_w // args.patch_dim + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + hidden_size = args.hidden_size + + key = prefix + "weight" + + assert key in state_dict + if key in state_dict: + input_param = state_dict[key] + + input_seq_len = input_param.shape[0] + assert(isPerfectSquare(input_seq_len) or isPerfectSquare(input_seq_len - CLASS_TOKEN_LENGTH)) + input_has_class_token = not isPerfectSquare(input_seq_len) + num_tok_input = input_seq_len - CLASS_TOKEN_LENGTH if input_has_class_token else input_seq_len + num_tok_output = num_patches + output_has_class_token = args.class_token_present + + # update input_param and load it to state_dict[key] + if input_has_class_token: + input_param_tok = input_param[:CLASS_TOKEN_LENGTH, :] + input_param_grid = input_param[CLASS_TOKEN_LENGTH:, :] + else: + input_param_tok = torch.zeros(CLASS_TOKEN_LENGTH, hidden_size) + input_param_grid = input_param + + assert input_param.shape[1] == hidden_size + + if num_tok_input != num_tok_output: + + gs_input = int(math.sqrt(num_tok_input)) + gs_new = (num_patches_per_dim_h, num_patches_per_dim_w) + + input_param_grid = input_param_grid.transpose(0, 1).contiguous() + input_param_grid = input_param_grid.reshape( + (1, -1, gs_input, gs_input) + ) + input_param_grid = input_param_grid.float() + scale_factor = (gs_new[0] / gs_input, gs_new[1] / gs_input) + + input_param_grid = F.interpolate( + input_param_grid, scale_factor=scale_factor, mode="bilinear" + ) + + input_param_grid = input_param_grid.half() + input_param_grid = input_param_grid.reshape((-1, num_tok_output)) + input_param_grid = input_param_grid.transpose(0, 1).contiguous() + + assert input_param_grid.shape[1] == hidden_size + + input_param = input_param_grid + assert ( + input_param.shape[0] == num_tok_output + and input_param.shape[1] == hidden_size + ) + + if output_has_class_token: + input_param = torch.cat((input_param_tok, input_param), dim=0) + + state_dict[key] = input_param + + +class VitBackbone(MegatronModule): + """Vision Transformer Model.""" + + def __init__(self, + config, + pre_process=True, + post_process=True, + class_token=True, + single_token_output=False, + post_layer_norm=True, + drop_path_rate=0.0): + super(VitBackbone, self).__init__(share_embeddings_and_output_weights=False) + args = get_args() + + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + + self.pre_process = pre_process + self.post_process = post_process + self.class_token = 
class_token + self.post_layer_norm = post_layer_norm + self.hidden_size = args.hidden_size + self.patch_dim = args.patch_dim + self.img_h = args.img_h + self.img_w = args.img_w + self.micro_batch_size = args.micro_batch_size + self.single_token_output = single_token_output + self.drop_path_rate = drop_path_rate + + assert self.img_h % self.patch_dim == 0 + assert self.img_w % self.patch_dim == 0 + self.num_patches_per_dim_h = self.img_h // self.patch_dim + self.num_patches_per_dim_w = self.img_w // self.patch_dim + self.num_patches = self.num_patches_per_dim_h * self.num_patches_per_dim_w + self.seq_length = self.num_patches + (CLASS_TOKEN_LENGTH if self.class_token else 0) + self.flatten_dim = self.patch_dim * self.patch_dim * args.num_channels + self.input_tensor = None + self.position_ids = None + + if self.pre_process: + # cls_token + if self.class_token: + self.cls_token = torch.nn.Parameter( + torch.randn(1, CLASS_TOKEN_LENGTH, self.hidden_size) + ) + torch.nn.init.zeros_(self.cls_token) + self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() + + # Linear encoder + self.linear_encoder = torch.nn.Linear( + self.flatten_dim, self.hidden_size + ) + + # embedding + self.position_embeddings = torch.nn.Embedding( + self.seq_length, self.hidden_size + ) + init_method_normal(args.init_method_std)( + self.position_embeddings.weight + ) + + args.class_token_present = self.class_token + self.position_embeddings._register_load_state_dict_pre_hook( + twod_interpolate_position_embeddings_hook + ) + + self.embedding_dropout = torch.nn.Dropout(args.hidden_dropout) + + # Transformer + self.transformer = ParallelTransformer( + config, + pre_process=self.pre_process, + post_process=self.post_process, + post_layer_norm=self.post_layer_norm, + drop_path_rate=self.drop_path_rate + ) + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.transformer.set_input_tensor(input_tensor) + + def forward(self, input): + + if self.pre_process: + rearranged_input = einops.rearrange( + input, + "b c (h p1) (w p2) -> b (h w) (p1 p2 c)", + p1=self.patch_dim, + p2=self.patch_dim, + ) + + assert rearranged_input.dtype == torch.half + encoder_output = self.linear_encoder(rearranged_input) + + concatenated_tokens = encoder_output + if self.class_token: + cls_tokens = self.cls_token.expand(encoder_output.shape[0], -1, -1) + concatenated_tokens = torch.cat((cls_tokens, encoder_output), dim=1) + + token_embeddings = concatenated_tokens + \ + self.position_embeddings(self.position_ids[:, :concatenated_tokens.shape[1]]) + # [b, s, h] => [s, b, h] + token_embeddings = token_embeddings.transpose(0, 1).contiguous() + hidden_states = self.embedding_dropout(token_embeddings) + else: + hidden_states = input + + hidden_states = self.transformer(hidden_states, None) + + if self.post_process: + # [s b h] => [b s h] + if self.single_token_output: + hidden_states = hidden_states[0] + else: + hidden_states = hidden_states.transpose(0, 1).contiguous() + + return hidden_states + diff --git a/training/DeepSpeed-Domino/megatron/text_generation_server.py b/training/DeepSpeed-Domino/megatron/text_generation_server.py new file mode 100644 index 000000000..8bd6c26fc --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/text_generation_server.py @@ -0,0 +1,241 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
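+# Minimal Flask (flask_restful) text-generation REST server. MegatronServer,
+# defined at the bottom of this file, registers MegatronGenerate at the /api
+# endpoint; PUT requests are validated and then dispatched to either
+# generate_and_post_process or beam_search_and_post_process, after a
+# broadcast tells the other ranks which code path to enter.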
+import datetime +import torch +import json +import threading +from flask import Flask, request, jsonify, current_app +from flask_restful import Resource, Api +from megatron import get_args +from megatron.text_generation import generate_and_post_process +from megatron.text_generation import beam_search_and_post_process + + +GENERATE_NUM = 0 +BEAM_NUM = 1 +lock = threading.Lock() + +class MegatronGenerate(Resource): + def __init__(self, model): + self.model = model + + @staticmethod + def send_do_generate(): + choice = torch.cuda.LongTensor([GENERATE_NUM]) + torch.distributed.broadcast(choice, 0) + + @staticmethod + def send_do_beam_search(): + choice = torch.cuda.LongTensor([BEAM_NUM]) + torch.distributed.broadcast(choice, 0) + + def put(self): + args = get_args() + + if not "prompts" in request.get_json(): + return "prompts argument required", 400 + + if "max_len" in request.get_json(): + return "max_len is no longer used. Replace with tokens_to_generate", 400 + + if "sentences" in request.get_json(): + return "sentences is no longer used. Replace with prompts", 400 + + prompts = request.get_json()["prompts"] + if not isinstance(prompts, list): + return "prompts is not a list of strings", 400 + + if len(prompts) == 0: + return "prompts is empty", 400 + + if len(prompts) > 128: + return "Maximum number of prompts is 128", 400 + + tokens_to_generate = 64 # Choosing hopefully sane default. Full sequence is slow + if "tokens_to_generate" in request.get_json(): + tokens_to_generate = request.get_json()["tokens_to_generate"] + if not isinstance(tokens_to_generate, int): + return "tokens_to_generate must be an integer greater than 0" + if tokens_to_generate < 0: + return "tokens_to_generate must be an integer greater than or equal to 0" + + logprobs = False + if "logprobs" in request.get_json(): + logprobs = request.get_json()["logprobs"] + if not isinstance(logprobs, bool): + return "logprobs must be a boolean value" + + if tokens_to_generate == 0 and not logprobs: + return "tokens_to_generate=0 implies logprobs should be True" + + temperature = 1.0 + if "temperature" in request.get_json(): + temperature = request.get_json()["temperature"] + if not (type(temperature) == int or type(temperature) == float): + return "temperature must be a positive number less than or equal to 100.0" + if not (0.0 < temperature <= 100.0): + return "temperature must be a positive number less than or equal to 100.0" + + top_k = 0.0 + if "top_k" in request.get_json(): + top_k = request.get_json()["top_k"] + if not (type(top_k) == int): + return "top_k must be an integer equal to or greater than 0 and less than or equal to 1000" + if not (0 <= top_k <= 1000): + return "top_k must be equal to or greater than 0 and less than or equal to 1000" + + top_p = 0.0 + if "top_p" in request.get_json(): + top_p = request.get_json()["top_p"] + if not (type(top_p) == float): + return "top_p must be a positive float less than or equal to 1.0" + if top_p > 0.0 and top_k > 0.0: + return "cannot set both top-k and top-p samplings." 
+ if not (0 <= top_p <= 1.0): + return "top_p must be less than or equal to 1.0" + + top_p_decay = 0.0 + if "top_p_decay" in request.get_json(): + top_p_decay = request.get_json()["top_p_decay"] + if not (type(top_p_decay) == float): + return "top_p_decay must be a positive float less than or equal to 1.0" + if top_p == 0.0: + return "top_p_decay cannot be set without top_p" + if not (0 <= top_p_decay <= 1.0): + return "top_p_decay must be less than or equal to 1.0" + + top_p_bound = 0.0 + if "top_p_bound" in request.get_json(): + top_p_bound = request.get_json()["top_p_bound"] + if not (type(top_p_bound) == float): + return "top_p_bound must be a positive float less than or equal to top_p" + if top_p == 0.0: + return "top_p_bound cannot be set without top_p" + if not (0.0 < top_p_bound <= top_p): + return "top_p_bound must be greater than 0 and less than top_p" + + add_BOS = False + if "add_BOS" in request.get_json(): + add_BOS = request.get_json()["add_BOS"] + if not isinstance(add_BOS, bool): + return "add_BOS must be a boolean value" + + if any([len(prompt) == 0 for prompt in prompts]) and not add_BOS: + return "Empty prompts require add_BOS=true" + + stop_on_double_eol = False + if "stop_on_double_eol" in request.get_json(): + stop_on_double_eol = request.get_json()["stop_on_double_eol"] + if not isinstance(stop_on_double_eol, bool): + return "stop_on_double_eol must be a boolean value" + + stop_on_eol = False + if "stop_on_eol" in request.get_json(): + stop_on_eol = request.get_json()["stop_on_eol"] + if not isinstance(stop_on_eol, bool): + return "stop_on_eol must be a boolean value" + + prevent_newline_after_colon = False + if "prevent_newline_after_colon" in request.get_json(): + prevent_newline_after_colon = request.get_json()["prevent_newline_after_colon"] + if not isinstance(prevent_newline_after_colon, bool): + return "prevent_newline_after_colon must be a boolean value" + + random_seed = -1 + if "random_seed" in request.get_json(): + random_seed = request.get_json()["random_seed"] + if not isinstance(random_seed, int): + return "random_seed must be integer" + if random_seed < 0: + return "random_seed must be a positive integer" + + no_log = False + if "no_log" in request.get_json(): + no_log = request.get_json()["no_log"] + if not isinstance(no_log, bool): + return "no_log must be a boolean value" + + beam_width = None + if "beam_width" in request.get_json(): + beam_width = request.get_json()["beam_width"] + if not isinstance(beam_width, int): + return "beam_width must be integer" + if beam_width < 1: + return "beam_width must be an integer > 1" + if len(prompts) > 1: + return "When doing beam_search, batch size must be 1" + + stop_token=50256 + if "stop_token" in request.get_json(): + stop_token = request.get_json()["stop_token"] + if not isinstance(stop_token, int): + return "stop_token must be an integer" + + length_penalty = 1 + if "length_penalty" in request.get_json(): + length_penalty = request.get_json()["length_penalty"] + if not isinstance(length_penalty, float): + return "length_penalty must be a float" + + with lock: # Need to get lock to keep multiple threads from hitting code + + if not no_log: + print("request IP: " + str(request.remote_addr)) + print(json.dumps(request.get_json()),flush=True) + print("start time: ", datetime.datetime.now()) + + try: + if beam_width is not None: + MegatronGenerate.send_do_beam_search() # Tell other ranks we're doing beam_search + response, response_seg, response_scores = \ + beam_search_and_post_process( + self.model, + 
prompts=prompts, + tokens_to_generate=tokens_to_generate, + beam_size = beam_width, + add_BOS=add_BOS, + stop_token=stop_token, + num_return_gen=beam_width, # Returning whole beam + length_penalty=length_penalty, + prevent_newline_after_colon=prevent_newline_after_colon + ) + + return jsonify({"text": response, + "segments": response_seg, + "scores": response_scores}) + else: + MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate + response, response_seg, response_logprobs, _ = \ + generate_and_post_process( + self.model, + prompts=prompts, + tokens_to_generate=tokens_to_generate, + return_output_log_probs=logprobs, + top_k_sampling=top_k, + top_p_sampling=top_p, + top_p_decay=top_p_decay, + top_p_bound=top_p_bound, + temperature=temperature, + add_BOS=add_BOS, + use_eod_token_for_early_termination=True, + stop_on_double_eol=stop_on_double_eol, + stop_on_eol=stop_on_eol, + prevent_newline_after_colon=prevent_newline_after_colon, + random_seed=random_seed) + + return jsonify({"text": response, + "segments": response_seg, + "logprobs": response_logprobs}) + + except ValueError as ve: + return ve.args[0] + print("end time: ", datetime.datetime.now()) + + +class MegatronServer(object): + def __init__(self, model): + self.app = Flask(__name__, static_url_path='') + api = Api(self.app) + api.add_resource(MegatronGenerate, '/api', resource_class_args=[model]) + + def run(self, url, port): + self.app.run(url, threaded=True, debug=False, port=port) diff --git a/training/DeepSpeed-Domino/megatron/timers.py b/training/DeepSpeed-Domino/megatron/timers.py new file mode 100644 index 000000000..a9478fa01 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/timers.py @@ -0,0 +1,304 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron timers.""" + +from abc import ABC +from abc import abstractmethod +import time + +import torch + + + +class TimerBase(ABC): + + def __init__(self, name): + self.name = name + + @abstractmethod + def start(self, barrier=False): + pass + + @abstractmethod + def stop(self, barrier=False): + pass + + @abstractmethod + def reset(self): + pass + + @abstractmethod + def elapsed(self, reset=True, barrier=False): + pass + + + +class DummyTimer(TimerBase): + + def __init__(self): + super().__init__('dummy timer') + + def start(self, barrier=False): + return + + def stop(self, barrier=False): + return + + def reset(self): + return + + def elapsed(self, reset=True, barrier=False): + raise Exception('dummy timer should not be used to ' + 'calculate elapsed time') + + + +class Timer(TimerBase): + """ + Comment on using `barrier`: If this flag is passed, then all + the caller processes will wait till all reach the timing routine. + It is up to the user to make sure all the ranks in `barrier_group` + call it otherwise, it will result in a hang. + Comment on `barrier_group`: By default it is set to None which + in torch distributed land, it will result in the global communicator. 
+ """ + + def __init__(self, name): + super().__init__(name) + self._elapsed = 0.0 + self._started = False + # Note that None will default to the global process group + self._barrier_group = None + self._start_time = time.time() + + + def set_barrier_group(self, barrier_group): + self._barrier_group = barrier_group + + + def start(self, barrier=False): + """Start the timer.""" + assert not self._started, 'timer has already been started' + if barrier: + torch.distributed.barrier(group=self._barrier_group) + torch.cuda.synchronize() + self._start_time = time.time() + self._started = True + + + def stop(self, barrier=False): + """Stop the timer.""" + assert self._started, 'timer is not started' + if barrier: + torch.distributed.barrier(group=self._barrier_group) + torch.cuda.synchronize() + self._elapsed += (time.time() - self._start_time) + self._started = False + + + def reset(self): + """Reset timer.""" + self._elapsed = 0.0 + self._started = False + + + def elapsed(self, reset=True, barrier=False): + """Calculate the elapsed time.""" + _started = self._started + # If the timing in progress, end it first. + if self._started: + self.stop(barrier=barrier) + # Get the elapsed time. + _elapsed = self._elapsed + # Reset the elapsed time + if reset: + self.reset() + # If timing was in progress, set it back. + if _started: + self.start(barrier=barrier) + return _elapsed + + + +class Timers: + """Group of timers.""" + + def __init__(self, log_level, log_option): + self._log_level = log_level + self._log_option = log_option + self._timers = {} + self._log_levels = {} + self._dummy_timer = DummyTimer() + self._max_log_level = 2 + + + def __call__(self, name, log_level=None): + # If the timer has already been set, then check if the log-level + # is provided, it matches the one that the timer was created with. + if name in self._timers: + if log_level is not None: + assert log_level == self._log_levels[name], \ + 'input log level {} does not match already existing '\ + 'log level {} for {} timer'.format( + log_level, self._log_levels[name], name) + return self._timers[name] + # If timer does not exist and no log level is provided, + # set it to the max log level which is 2. + if log_level is None: + log_level = self._max_log_level + assert log_level <= self._max_log_level, \ + 'log level {} is larger than max supported log level {}'.format( + log_level, self._max_log_level) + # Now if the input log level is larger than the one set for + # the timers class, just ignore it and return a dummy timer. + if log_level > self._log_level: + return self._dummy_timer + # Otherwise, initalize the timer and set the level. + self._timers[name] = Timer(name) + self._log_levels[name] = log_level + return self._timers[name] + + + def _get_elapsed_time_all_ranks(self, names, reset, barrier): + """ + Assumptions: + - All the ranks call this function. + - `names` are identical on all ranks. + If the above assumptions are not met, calling this function will + result in hang. + Arguments: + - names: list of timer names + - reset: reset the timer after recording the elapsed time + - barrier: if set, do a global barrier before time measurments + """ + + # First make sure all the callers are in sync. + if barrier: + torch.distributed.barrier() + + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + + # Here we can use gather on the rank we want to print the + # timing, however, there is no gather_base support in + # pytorch yet. 
It is simpler to deal with a single tensor + # and since we are only gathering a small amount of data, + # it should be ok to use all-gather instead of gather. + rank_name_to_time = torch.zeros((world_size, len(names)), + dtype=torch.float, + device=torch.cuda.current_device()) + for i, name in enumerate(names): + if name in self._timers: + # Here we don't need to pass the barrier flag as all + # the processes are already in sync. This avoids the + # issue of different timers having different barrier + # groups inside their class. + rank_name_to_time[rank, i] = self._timers[name].elapsed( + reset=reset) + + # See the note above for why we are not using gather. + torch.distributed._all_gather_base(rank_name_to_time.view(-1), + rank_name_to_time[rank, :].view(-1)) + + return rank_name_to_time + + + def _get_global_min_max_time(self, names, reset, barrier, normalizer): + """Report only min and max times across all ranks.""" + + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, + barrier) + name_to_min_max_time = {} + for i, name in enumerate(names): + rank_to_time = rank_name_to_time[:, i] + # filter out the ones we did not have any timings for + rank_to_time = rank_to_time[rank_to_time > 0.0] + # If the timer exists: + if rank_to_time.numel() > 0: + name_to_min_max_time[name] = ( + rank_to_time.min().item() / normalizer, + rank_to_time.max().item() / normalizer) + return name_to_min_max_time + + + def _get_global_min_max_time_string(self, names, reset, barrier, + normalizer, max_only): + name_to_min_max_time = self._get_global_min_max_time( + names, reset, barrier, normalizer) + if not name_to_min_max_time: + return None + output_string = '(min, max) time across ranks (ms):' + for name in name_to_min_max_time: + min_time, max_time = name_to_min_max_time[name] + if max_only: + output_string += '\n {}: {:.2f}'.format( + (name+' ').ljust(48, '.'), max_time) + else: + output_string += '\n {}: ({:.2f}, {:.2f})'.format( + (name+' ').ljust(48, '.'), min_time, max_time) + return output_string + + + def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): + """Report times across all ranks.""" + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, + barrier) + + output_string = 'times across ranks (ms):' + no_reported_timing = True + for i, name in enumerate(names): + not_yet_found = True + for rank in range(torch.distributed.get_world_size()): + if rank_name_to_time[rank, i] > 0: + no_reported_timing = False + if not_yet_found: + not_yet_found = False + output_string += '\n {}:'.format(name) + output_string += '\n rank {:2d}: {:.2f}'.format( + rank, rank_name_to_time[rank, i] / normalizer) + if no_reported_timing: + return None + return output_string + + + def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): + """Log a group of timers.""" + + # Print. + assert normalizer > 0.0 + if self._log_option in ['max', 'minmax']: + max_only = False + if self._log_option == 'max': + max_only = True + output_string = self._get_global_min_max_time_string( + names, reset, barrier, normalizer/1000.0, max_only) + elif self._log_option == 'all': + output_string = self._get_all_ranks_time_string(names, + reset, barrier, + normalizer/1000.0) + else: + raise Exception('unknown timing log option {}'.format( + self._log_option)) + + # If no input rank is provided, log on last rank. 
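+        # Illustrative call pattern for this timers group (the timer name and
+        # the `log_interval` normalizer below are examples, not values defined
+        # by this patch):
+        #     timers = Timers(log_level=2, log_option='minmax')
+        #     timers('forward-compute', log_level=1).start(barrier=True)
+        #     ...  # run the forward pass
+        #     timers('forward-compute').stop()
+        #     timers.log(['forward-compute'], normalizer=log_interval)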
+ if rank is None: + rank = torch.distributed.get_world_size() - 1 + if rank == torch.distributed.get_rank() and output_string is not None: + print(output_string, flush=True) + + + def write(self, names, writer, iteration, normalizer=1.0, + reset=False, barrier=False): + """Write timers to a tensorboard writer + Note that we only report maximum time across ranks to tensorboard. + """ + # currently when using add_scalars, + # torch.utils.add_scalars makes each timer its own run, which + # polutes the runs list, so we just add each as a scalar + assert normalizer > 0.0 + name_to_min_max_time = self._get_global_min_max_time( + names, reset, barrier, normalizer) + if writer is not None: + for name in name_to_min_max_time: + _, max_time = name_to_min_max_time[name] + writer.add_scalar(name + '-time', max_time, iteration) diff --git a/training/DeepSpeed-Domino/megatron/tokenizer/__init__.py b/training/DeepSpeed-Domino/megatron/tokenizer/__init__.py new file mode 100644 index 000000000..59ceb3386 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/tokenizer/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +from .tokenizer import build_tokenizer diff --git a/training/DeepSpeed-Domino/megatron/tokenizer/bert_tokenization.py b/training/DeepSpeed-Domino/megatron/tokenizer/bert_tokenization.py new file mode 100644 index 000000000..642041e77 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/tokenizer/bert_tokenization.py @@ -0,0 +1,431 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. 
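+    # For example (paths are illustrative), an init_checkpoint such as
+    # "uncased_L-12_H-768_A-12/bert_model.ckpt" is expected to be used with
+    # do_lower_case=True, while "cased_L-12_H-768_A-12/bert_model.ckpt" is
+    # expected to be used with do_lower_case=False.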
+ + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r", encoding = "utf-8") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + @staticmethod + def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True): + """ Converts a sequence of tokens (string) in a single string. """ + + def clean_up_tokenization(out_string): + """ Clean up a list of simple English tokenization artifacts + like spaces before punctuations and abreviated forms. + """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + text = ' '.join(tokens).replace(' ##', '').strip() + if clean_up_tokenization_spaces: + clean_text = clean_up_tokenization(text) + return clean_text + else: + return text + + def vocab_size(self): + return len(self.vocab) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. 
This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. 
This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/training/DeepSpeed-Domino/megatron/tokenizer/gpt2_tokenization.py b/training/DeepSpeed-Domino/megatron/tokenizer/gpt2_tokenization.py new file mode 100644 index 000000000..3f37e4490 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/tokenizer/gpt2_tokenization.py @@ -0,0 +1,321 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for OpenAI GPT.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE + # tokenizer on python 2 right now. 
+ def lru_cache(): + return lambda func: func + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \ + list(range(ord("®"), ord("ÿ") + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info("loading special tokens file {}".format(special_tokens_file)) + # redirect to the cache, if necessary + try: + from .file_utils import cached_path + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) + return None + if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info("loading merges file {}".format(merges_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + logger.info("loading merges file {} from cache at {}".format( + merges_file, resolved_merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls( + resolved_vocab_file, + resolved_merges_file, + special_tokens=special_tokens, + *inputs, + **kwargs) + return tokenizer + + def __init__(self, vocab_file, merges_file, errors='replace', + special_tokens=None, max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for + # capitalized versions of contractions + self.pat = re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) + for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()} + logger.info("Special tokens {}".format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except BaseException: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). 
Running this" + " sequence through the model will result in indexing errors".format( + len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/training/DeepSpeed-Domino/megatron/tokenizer/tokenizer.py b/training/DeepSpeed-Domino/megatron/tokenizer/tokenizer.py new file mode 100644 index 000000000..79dab75a0 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/tokenizer/tokenizer.py @@ -0,0 +1,536 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron tokenizers.""" + +from abc import ABC +from abc import abstractmethod + +from .bert_tokenization import FullTokenizer as FullBertTokenizer +from .gpt2_tokenization import GPT2Tokenizer + + +def build_tokenizer(args): + """Initialize tokenizer.""" + if args.rank == 0: + print('> building {} tokenizer ...'.format(args.tokenizer_type), + flush=True) + + # Select and instantiate the tokenizer. 
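+    # For example, the GPT-2 BPE branch below expects arguments along these
+    # lines (illustrative values matching the gpt2-vocab.json / gpt2-merges.txt
+    # files referenced by the example pretrain scripts in this patch):
+    #     args.tokenizer_type = 'GPT2BPETokenizer'
+    #     args.vocab_file = 'dataset/gpt2-vocab.json'
+    #     args.merge_file = 'dataset/gpt2-merges.txt'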
+ if args.tokenizer_type == 'BertWordPieceLowerCase': + assert args.vocab_file is not None + tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, + lower_case=True, + vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'BertWordPieceCase': + assert args.vocab_file is not None + tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, + lower_case=False, + vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'GPT2BPETokenizer': + assert args.vocab_file is not None + assert args.merge_file is not None + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + elif args.tokenizer_type == 'SentencePieceTokenizer': + assert args.tokenizer_model is not None + tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids) + elif args.tokenizer_type == 'GPTSentencePieceTokenizer': + assert args.tokenizer_model is not None + tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'NullTokenizer': + assert args.vocab_size is not None + tokenizer = _NullTokenizer(args.vocab_size) + else: + raise NotImplementedError('{} tokenizer is not ' + 'implemented.'.format(args.tokenizer_type)) + + # Add vocab size. + args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, + args) + + return tokenizer + + +def _vocab_size_with_padding(orig_vocab_size, args): + """Pad vocab size so it is divisible by model parallel size and + still having GPU friendly size.""" + + after = orig_vocab_size + multiple = args.make_vocab_size_divisible_by * \ + args.tensor_model_parallel_size + while (after % multiple) != 0: + after += 1 + if args.rank == 0: + print(' > padded vocab (size: {}) with {} dummy tokens ' + '(new size: {})'.format( + orig_vocab_size, after - orig_vocab_size, after), flush=True) + return after + + +class AbstractTokenizer(ABC): + """Abstract class for tokenizer.""" + + def __init__(self, name): + self.name = name + super().__init__() + + @property + @abstractmethod + def vocab_size(self): + pass + + @property + @abstractmethod + def vocab(self): + """Dictionary from vocab text token to id token.""" + pass + + @property + @abstractmethod + def inv_vocab(self): + """Dictionary from vocab id token to text token.""" + pass + + @abstractmethod + def tokenize(self, text): + pass + + def detokenize(self, token_ids): + raise NotImplementedError('detokenizer is not implemented for {} ' + 'tokenizer'.format(self.name)) + + @property + def cls(self): + raise NotImplementedError('CLS is not provided for {} ' + 'tokenizer'.format(self.name)) + + @property + def sep(self): + raise NotImplementedError('SEP is not provided for {} ' + 'tokenizer'.format(self.name)) + + @property + def pad(self): + raise NotImplementedError('PAD is not provided for {} ' + 'tokenizer'.format(self.name)) + + @property + def eod(self): + raise NotImplementedError('EOD is not provided for {} ' + 'tokenizer'.format(self.name)) + + @property + def mask(self): + raise NotImplementedError('MASK is not provided for {} ' + 'tokenizer'.format(self.name)) + + +class _BertWordPieceTokenizer(AbstractTokenizer): + """Original BERT wordpiece tokenizer.""" + + def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): + if lower_case: + name = 'BERT Lower Case' + else: + name = 'BERT Upper Case' + super().__init__(name) + self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=lower_case) + self.cls_id = self.tokenizer.vocab['[CLS]'] + self.sep_id = self.tokenizer.vocab['[SEP]'] + self.pad_id = 
self.tokenizer.vocab['[PAD]'] + self.mask_id = self.tokenizer.vocab['[MASK]'] + self._additional_special_tokens = [] + + # (dsachan) Add BOS and EOS tokens + SPECIAL_TOKENS = {'eos_token': '[EOS]', + 'bos_token': '[BOS]'} + self._bos_token = '[BOS]' + self.add_token(self._bos_token) + self._bos_token_id = self.vocab.get(self._bos_token) + + self._eos_token = '[EOS]' + self.add_token(self._eos_token) + self._eos_token_id = self.vocab.get(self._eos_token) + + # (dsachan) Add additional special tokens + # These can be used as sentinel tokens in T5 model inputs + additional_special_tokens = [] + additional_special_tokens.extend( + ["".format(i) for i in range(vocab_extra_ids)]) + self.add_additional_special_tokens(additional_special_tokens) + + def add_token(self, token): + if token not in self.vocab: + self.inv_vocab[self.vocab_size] = token + # self.vocab_size comes from len(vocab) + # and it will increase as we add elements + self.vocab[token] = self.vocab_size + + def add_additional_special_tokens(self, tokens_list): + setattr(self, "additional_special_tokens", tokens_list) + for value in tokens_list: + self.add_token(value) + + @property + def vocab_size(self): + return self.tokenizer.vocab_size() + + @property + def vocab(self): + return self.tokenizer.vocab + + @property + def inv_vocab(self): + return self.tokenizer.inv_vocab + + def tokenize(self, text): + text_tokens = self.tokenizer.tokenize(text) + return self.tokenizer.convert_tokens_to_ids(text_tokens) + + def decode(self, ids): + tokens = self.tokenizer.convert_ids_to_tokens(ids) + return self.tokenizer.convert_tokens_to_string(tokens) + + def decode_token_ids(self, token_ids): + tokens = self.tokenizer.convert_ids_to_tokens(token_ids) + exclude_list = ['[PAD]', '[CLS]'] + non_pads = [t for t in tokens if t not in exclude_list] + + result = "" + for s in non_pads: + if s.startswith("##"): + result += s[2:] + else: + result += " " + s + + return result + + @property + def cls(self): + return self.cls_id + + @property + def sep(self): + return self.sep_id + + @property + def pad(self): + return self.pad_id + + @property + def mask(self): + return self.mask_id + + @property + def bos_token(self): + """ Beginning of sentence token id """ + return self._bos_token + + @property + def eos_token(self): + """ End of sentence token id """ + return self._eos_token + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings).""" + return self._additional_special_tokens + + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary.""" + return self._bos_token_id + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary.""" + return self._eos_token_id + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers).""" + return [self.vocab.get(token) for token in self._additional_special_tokens] + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + +class _GPT2BPETokenizer(AbstractTokenizer): + """Original GPT2 BPE tokenizer.""" + + def __init__(self, vocab_file, merge_file): + name = 'GPT2 BPE' + super().__init__(name) + + self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', + special_tokens=[], max_len=None) + self.eod_id = self.tokenizer.encoder['<|endoftext|>'] + + @property + def vocab_size(self): + return 
len(self.tokenizer.encoder) + + @property + def vocab(self): + return self.tokenizer.encoder + + @property + def inv_vocab(self): + return self.tokenizer.decoder + + def tokenize(self, text): + return self.tokenizer.encode(text) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + + +class _SentencePieceTokenizer(AbstractTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file, vocab_extra_ids=0): + name = 'SentencePieceTokenizer' + super().__init__(name) + + import sentencepiece + self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file) + self._initalize(vocab_extra_ids) + + def _populate_vocab(self): + self._vocab = {} + self._inv_vocab = {} + + for i in range(len(self.tokenizer)): + t = self.tokenizer.id_to_piece(i) + self._inv_vocab[i] = t + self._vocab[t] = i + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + self._special_tokens = {} + self._inv_special_tokens = {} + + self._t5_tokens = [] + + def _add_special_token(t): + if t not in self._vocab: + next_id = len(self._vocab) + self._vocab[t] = next_id + self._inv_vocab[next_id] = t + self._special_tokens[t] = self._vocab[t] + self._inv_special_tokens[self._vocab[t]] = t + + _add_special_token('') + self._cls_id = self._vocab[''] + _add_special_token('') + self._sep_id = self._vocab[''] + _add_special_token('') + self._eod_id = self._vocab[''] + _add_special_token('') + self._mask_id = self._vocab[''] + + pad_id = self.tokenizer.pad_id() + try: + pad_token = self.tokenizer.id_to_piece(pad_id) + except IndexError: + pad_token = '' + _add_special_token(pad_token) + self._pad_id = self._vocab[pad_token] + + bos_id = self.tokenizer.bos_id() + try: + bos_token = self.tokenizer.id_to_piece(bos_id) + except IndexError: + bos_token = '' + _add_special_token(bos_token) + self._bos_id = self._vocab[bos_token] + + eos_id = self.tokenizer.eos_id() + try: + eos_token = self.tokenizer.id_to_piece(eos_id) + except IndexError: + eos_token = '' + _add_special_token(eos_token) + self._eos_id = self._vocab[eos_token] + + for i in range(vocab_extra_ids): + t = "".format(i) + _add_special_token(t) + self._t5_tokens += [t] + + @property + def vocab_size(self): + return len(self._vocab) + + @property + def vocab(self): + return self._vocab + + @property + def inv_vocab(self): + return self._inv_vocab + + @property + def decoder(self): + return self._inv_vocab + + @property + def encoder(self): + return self._vocab + + # From: + # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L89 + def tokenize(self, text): + ids = [] + idx = 0 + + while 1: + indices = {} + for token in self._special_tokens: + try: + indices[token] = text[idx:].index(token) + except ValueError: + continue + if len(indices) == 0: + break + + next_token = min(indices, key=indices.get) + next_idx = idx + indices[next_token] + + ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx])) + ids.append(self._special_tokens[next_token]) + idx = next_idx + len(next_token) + + ids.extend(self.tokenizer.encode_as_ids(text[idx:])) + return ids + + # From: + # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L125 + def detokenize(self, ids): + text = "" + last_i = 0 + + for i, id in enumerate(ids): + if id in self._inv_special_tokens: + text += 
self.tokenizer.decode_ids(ids[last_i:i]) + " " + text += self._inv_special_tokens[id] + " " + last_i = i + 1 + + text += self.tokenizer.decode_ids(ids[last_i:]) + return text + + @property + def cls(self): + return self._cls_id + + @property + def sep(self): + return self._sep_id + + @property + def pad(self): + return self._pad_id + + @property + def bos_token_id(self): + return self._bos_id + + @property + def bos(self): + return self._bos_id + + @property + def eod(self): + return self._eod_id + + @property + def eos_token_id(self): + return self._eos_id + + @property + def eos(self): + return self._eos_id + + @property + def mask(self): + return self._mask_id + + @property + def additional_special_tokens_ids(self): + return [self.vocab[k] for k in self._t5_tokens] + +class _GPTSentencePieceTokenizer(_SentencePieceTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file,): + super().__init__(model_file, vocab_extra_ids=0) + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + + self._pad_id = self.tokenizer.pad_id() + self._bos_id = self.tokenizer.bos_id() + self._eos_id = self.tokenizer.eos_id() + + def tokenize(self, text): + return self.tokenizer.encode_as_ids(text) + + def detokenize(self, ids): + return self.tokenizer.decode_ids(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self._eos_id + + @property + def additional_special_tokens_ids(self): + return None + +class _NullTokenizer: + def __init__(self, vocab_size): + vocab_size = int(vocab_size) + self._eos_id = vocab_size + self.vocab_size = vocab_size+1 + + def tokenize(self, text): + return [int(x) for x in text.split(' ')] + + def detokenize(self, ids): + text = [str(x) for x in ids] + return ' '.join(text) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self._eos_id + + @property + def additional_special_tokens_ids(self): + return None diff --git a/training/DeepSpeed-Domino/megatron/utils.py b/training/DeepSpeed-Domino/megatron/utils.py new file mode 100644 index 000000000..008f89fa8 --- /dev/null +++ b/training/DeepSpeed-Domino/megatron/utils.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""General utilities.""" + +import sys + +import torch +from torch.nn.parallel import DistributedDataParallel as torchDDP + +from apex.multi_tensor_apply import multi_tensor_applier +import amp_C + +from megatron import ( + get_args, + get_adlr_autoresume, +) +from megatron.core import mpu +from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate +from megatron.model.module import param_is_not_shared + + +def unwrap_model(model, module_instances=(torchDDP)): + return_list = True + if not isinstance(model, list): + model = [model] + return_list = False + unwrapped_model = [] + for model_module in model: + while isinstance(model_module, module_instances): + model_module = model_module.module + unwrapped_model.append(model_module) + if not return_list: + return unwrapped_model[0] + return unwrapped_model + + +def calc_params_l2_norm(model): + """Calculate l2 norm of parameters """ + args = get_args() + if not isinstance(model, list): + model = [model] + # Remove duplicate params. 
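+    # "Duplicates" here are (a) parameters flagged as shared between modules
+    # (e.g. tied embedding weights) and (b) parameters replicated across
+    # tensor-model-parallel ranks; both would otherwise be counted more than
+    # once.  Conceptually, the computation below amounts to (sketch only):
+    #     local_sq = sum(p.float().norm() ** 2 for p in params_data)
+    #     total_sq = all-reduce (SUM) of local_sq over the model-parallel group
+    #     return total_sq ** 0.5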
+ params_data = [] + for model_ in model: + for param in model_.parameters(): + is_not_shared = param_is_not_shared(param) + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if is_not_shared and is_not_tp_duplicate: + if args.bf16: + params_data.append(param.data.float()) + else: + params_data.append(param.data) + # Calculate norm + dummy_overflow_buf = torch.cuda.IntTensor([0]) + norm, _ = multi_tensor_applier( + amp_C.multi_tensor_l2norm, + dummy_overflow_buf, + [params_data], + False # no per-parameter norm + ) + norm_2 = norm * norm + # Sum across all model-parallel GPUs. + torch.distributed.all_reduce(norm_2, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) + return norm_2.item() ** 0.5 + + +def average_losses_across_data_parallel_group(losses): + """Reduce a tensor of losses across all GPUs.""" + averaged_losses = torch.cat( + [loss.clone().detach().view(1) for loss in losses]) + torch.distributed.all_reduce(averaged_losses, + group=mpu.get_data_parallel_group()) + averaged_losses = averaged_losses / \ + torch.distributed.get_world_size(group=mpu.get_data_parallel_group()) + + return averaged_losses + + +def report_memory(name): + """Simple GPU memory report.""" + mega_bytes = 1024.0 * 1024.0 + string = name + ' memory (MB)' + string += ' | allocated: {}'.format( + torch.cuda.memory_allocated() / mega_bytes) + string += ' | max allocated: {}'.format( + torch.cuda.max_memory_allocated() / mega_bytes) + string += ' | reserved: {}'.format( + torch.cuda.memory_reserved() / mega_bytes) + string += ' | max reserved: {}'.format( + torch.cuda.max_memory_reserved() / mega_bytes) + if mpu.get_data_parallel_rank() == 0: + print("[Rank {}] {}".format(torch.distributed.get_rank(), string), + flush=True) + + +def print_params_min_max_norm(optimizer, iteration): + """Print min, max, and norm of all parameters.""" + index = 0 + rank = torch.distributed.get_rank() + string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n' + optimizer_ = optimizer.optimizer + for param_group in optimizer_.param_groups: + for param in param_group['params']: + index += 1 + min_ = param.data.min() + max_ = param.data.max() + norm = torch.linalg.norm(param.data) + string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( + iteration, rank, index, int(param.tensor_model_parallel)) + string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) + print(string, flush=True) + + +def check_adlr_autoresume_termination(iteration, model, + optimizer, opt_param_scheduler): + """Check for autoresume signal and exit if it is received.""" + from megatron.checkpointing import save_checkpoint + + args = get_args() + autoresume = get_adlr_autoresume() + # Add barrier to ensure consistnecy. + torch.distributed.barrier() + if autoresume.termination_requested(): + if args.save: + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + print_rank_0(">>> autoresume termination request found!") + if torch.distributed.get_rank() == 0: + autoresume.request_resume() + print_rank_0(">>> training terminated. Returning") + sys.exit(0) + + +def get_ltor_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + micro_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). 
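+    # For seq_length = 4, a single lower-triangular mask looks like
+    #     [[1, 0, 0, 0],
+    #      [1, 1, 0, 0],
+    #      [1, 1, 1, 0],
+    #      [1, 1, 1, 1]]
+    # where row i marks the positions token i may attend to; it is converted
+    # to a boolean "is masked" tensor (mask < 0.5) at the end of this function.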
+ if reset_attention_mask: + att_mask_batch = micro_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril(torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device)).view( + att_mask_batch, 1, seq_length, seq_length) + + # Loss mask. + loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, + device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + + return attention_mask, loss_mask, position_ids + + +def print_rank_0(message): + """If distributed is initialized, print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + +def is_last_rank(): + return torch.distributed.get_rank() == ( + torch.distributed.get_world_size() - 1) + +def print_rank_last(message): + """If distributed is initialized, print only on last rank.""" + if torch.distributed.is_initialized(): + if is_last_rank(): + print(message, flush=True) + else: + print(message, flush=True) diff --git a/training/DeepSpeed-Domino/pretrain_base.py b/training/DeepSpeed-Domino/pretrain_base.py new file mode 100644 index 000000000..800305555 --- /dev/null +++ b/training/DeepSpeed-Domino/pretrain_base.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from pretrain_base.py in Megatron-LM + +from functools import partial +import torch +from domino.arguments import get_args +from domino.arguments import get_tokenizer +from domino.utils import get_ltor_masks_and_position_ids +from domino.utils import average_losses_across_data_parallel_group +from domino.tensor_parallel.data import broadcast_data + +def forward_step(data_iterator, model): + """Forward step.""" + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + output_tensor = model(tokens, position_ids, attention_mask, labels=labels) + return output_tensor, partial(loss_func, loss_mask) + + +def get_batch(data_iterator): + """Generate a batch.""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
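+    # Note on the slicing above: for a sample [t0, t1, t2, t3, t4], the model
+    # input `tokens` is [t0, t1, t2, t3] and `labels` is the left-shifted
+    # [t1, t2, t3, t4], so position i is trained to predict token i + 1.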
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + + return tokens, labels, loss_mask, attention_mask, position_ids + + +def loss_func(loss_mask, output_tensor): + """Loss function.""" + raw_loss = output_tensor.view(-1).float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(raw_loss * loss_mask) / loss_mask.sum() + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + return loss, {'lm loss': averaged_loss[0]} \ No newline at end of file diff --git a/training/DeepSpeed-Domino/pretrain_gpt.py b/training/DeepSpeed-Domino/pretrain_gpt.py new file mode 100644 index 000000000..7fc4650e1 --- /dev/null +++ b/training/DeepSpeed-Domino/pretrain_gpt.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from pretrain_gpt.py in Megatron-LM + +import time +import torch +from domino.utils import print_rank_0 +from domino.initialize import initialize_domino, set_jit_fusion_options +from domino.arguments import get_args, core_transformer_config_from_args +from domino.data.gpt_dataset import build_train_valid_test_datasets +from domino.training import pretrain +from domino.modules.module import DominoModule +from domino.modules.enums import AttnMaskType +from domino.language_model import parallel_lm_logits +from domino.language_model import get_language_model +from domino.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy + + +_TRAIN_START_TIME = time.time() + +def post_language_model_processing(lm_output, labels, logit_weights, parallel_output): + output = parallel_lm_logits(lm_output, logit_weights, parallel_output) + labels = labels.transpose(0, 1).contiguous() + loss = vocab_parallel_cross_entropy(output.float(), labels) + loss = loss.transpose(0, 1).contiguous() + return loss + + +class GPTModel(DominoModule): + def __init__( + self, + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True, + ): + super().__init__(config=config) + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.language_model = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + encoder_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process, + ) + self.initialize_word_embeddings() + + def set_input_tensor(self, input_tensor): + self.language_model.set_input_tensor(input_tensor) + + def forward( + self, + input_ids, + position_ids, + attention_mask, + labels=None, + inference_params=None, + ): + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + inference_params=inference_params, + ) + + if self.post_process: + return post_language_model_processing( + lm_output, + labels, + self.shared_embedding_or_output_weight(), + self.parallel_output, + ) + else: + return lm_output + +def main(): + initialize_domino() + + # Set pytorch JIT layer fusion options and warmup JIT functions. + set_jit_fusion_options() + + # Adjust the startup time so it reflects the largest value. + # This will be closer to what scheduler will see (outside of + # image ... launches. 
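+    # All ranks reduce their start times with MIN below, so every rank reports
+    # the earliest start time, i.e. the largest observed startup cost.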
+ global _TRAIN_START_TIME + start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME]) + torch.distributed.all_reduce(start_time_tensor, + op=torch.distributed.ReduceOp.MIN) + _TRAIN_START_TIME = start_time_tensor.item() + print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( + time.time() - _TRAIN_START_TIME)) + + print_rank_0('Building GPT model ...') + config = core_transformer_config_from_args(get_args()) + model = GPTModel( + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True + ) + + args = get_args() + print_rank_0('Load GPT dataset ...') + # Number of train/valid/test samples. + if args.train_samples: + train_samples = args.train_samples + else: + train_samples = args.train_iters * args.global_batch_size + eval_iters = (args.train_iters // args.eval_interval + 1) * \ + args.eval_iters + test_iters = args.eval_iters + train_val_test_num_samples = [train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size] + print_rank_0(' > datasets target sizes (minimum size):') + print_rank_0(' train: {}'.format(train_val_test_num_samples[0])) + print_rank_0(' validation: {}'.format(train_val_test_num_samples[1])) + print_rank_0(' test: {}'.format(train_val_test_num_samples[2])) + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path) + + pretrain(model, train_ds, valid_ds, test_ds) + + +if __name__ == "__main__": + main() diff --git a/training/DeepSpeed-Domino/pretrain_gpt3_2.7b.sh b/training/DeepSpeed-Domino/pretrain_gpt3_2.7b.sh new file mode 100644 index 000000000..c22b6866e --- /dev/null +++ b/training/DeepSpeed-Domino/pretrain_gpt3_2.7b.sh @@ -0,0 +1,72 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# This file is adapted from pretrain_gpt.sh in Megatron-LM + +#!/bin/bash --login + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=2 +MASTER_ADDR=localhost +MASTER_PORT=6001 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +CHECKPOINT_PATH=checkpoint +rm -rf $CHECKPOINT_PATH/* +VOCAB_FILE="dataset/gpt2-vocab.json" +MERGE_FILE="dataset/gpt2-merges.txt" +DATA_PATH="dataset/my-gpt2_text_document" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +export PYTHONPATH=$SCRIPT_DIR:$PYTHONPATH + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --num-layers 12 \ + --hidden-size 2560 \ + --num-attention-heads 32 \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --micro-batch-size 64 \ + --global-batch-size 64 \ + --lr 0.00015 \ + --train-iters 10 \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --lr-warmup-fraction .01 \ + --clip-grad 1.0 \ + --fp16 \ + --tensor-model-parallel-size $WORLD_SIZE \ + --seed 3407 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --split 949,50,1 +" + +OUTPUT_ARGS=" + --log-interval 1 \ +" + +cmd="deepspeed --num_gpus $WORLD_SIZE \ + pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS + " + +echo $cmd +eval $cmd diff --git a/training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh b/training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh new file mode 100644 index 000000000..131411d2b --- /dev/null +++ b/training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh @@ -0,0 +1,71 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# This file is adapted from pretrain_gpt.sh in Megatron-LM + +#!/bin/bash --login + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=2 +MASTER_ADDR=localhost +MASTER_PORT=6001 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +CHECKPOINT_PATH=checkpoint +rm -rf $CHECKPOINT_PATH/* +VOCAB_FILE="dataset/gpt2-vocab.json" +MERGE_FILE="dataset/gpt2-merges.txt" +DATA_PATH="dataset/my-gpt2_text_document" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +export PYTHONPATH=$SCRIPT_DIR:$PYTHONPATH + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --micro-batch-size 8 \ + --global-batch-size 8 \ + --lr 0.00015 \ + --train-iters 80 \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --lr-warmup-fraction .01 \ + --clip-grad 1.0 \ + --no-gradient-accumulation-fusion \ + --fp16 \ + --tensor-model-parallel-size $WORLD_SIZE +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --split 949,50,1 +" + +OUTPUT_ARGS=" + --log-interval 1 \ +" + +cmd="deepspeed --num_gpus $WORLD_SIZE \ + pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS + " +echo $cmd +eval $cmd diff --git a/training/DeepSpeed-Domino/pretrain_llama.py b/training/DeepSpeed-Domino/pretrain_llama.py new file mode 100644 index 000000000..25e8b01a0 --- /dev/null +++ b/training/DeepSpeed-Domino/pretrain_llama.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# This file is adapted from pretrain_llama.py in Megatron-LM + +import time +import torch +from domino.utils import print_rank_0 +from domino.initialize import initialize_domino, set_jit_fusion_options +from domino.arguments import get_args, core_transformer_config_from_args +from domino.data.gpt_dataset import build_train_valid_test_datasets +from domino.training import pretrain +from domino.modules.module import DominoModule +from domino.modules.enums import AttnMaskType +from domino.language_model import parallel_lm_logits +from domino.language_model import get_language_model +from domino.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy + + +# The earliest we can measure the start time. +_TRAIN_START_TIME = time.time() + +def post_language_model_processing(lm_output, labels, logit_weights, parallel_output): + output = parallel_lm_logits(lm_output, logit_weights, parallel_output) + labels = labels.transpose(0, 1).contiguous() + loss = vocab_parallel_cross_entropy(output.float(), labels) + loss = loss.transpose(0, 1).contiguous() + return loss + + +class LLaMAModel(DominoModule): + """LLaMA Language model.""" + + def __init__( + self, + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True, + ): + args = get_args() + super(LLaMAModel, self).__init__( + config=config, + share_embeddings_and_output_weights=True) + + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.padded_vocab_size = args.padded_vocab_size + self.language_model = get_language_model( + config=config, + num_tokentypes=num_tokentypes, + encoder_attn_mask_type=AttnMaskType.causal, + pre_process=self.pre_process, + post_process=self.post_process, + ) + self.initialize_word_embeddings() + self.lm_head = torch.nn.Linear( + args.hidden_size, args.padded_vocab_size, bias=False + ) + + def set_input_tensor(self, input_tensor): + self.language_model.set_input_tensor(input_tensor) + + def _causal_lm_process(self, lm_output, labels): + lm_output = lm_output.transpose(0, 1) + logits = self.lm_head(lm_output) + loss = None + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., :-1].contiguous() + loss_fct = torch.nn.CrossEntropyLoss(ignore_index=0) + shift_logits = shift_logits.view(-1, self.padded_vocab_size) + shift_labels = shift_labels.view(-1) + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + return loss + + def forward( + self, + input_ids, + position_ids, + attention_mask, + labels=None, + inference_params=None, + ): + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + inference_params=inference_params, + ) + + if self.post_process: + return self._causal_lm_process(lm_output=lm_output, labels=labels) + else: + return lm_output + + +def main(): + initialize_domino() + + # Set pytorch JIT layer fusion options and warmup JIT functions. + set_jit_fusion_options() + + # Adjust the startup time so it reflects the largest value. + # This will be closer to what scheduler will see (outside of + # image ... launches. 
+ global _TRAIN_START_TIME + start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME]) + torch.distributed.all_reduce(start_time_tensor, + op=torch.distributed.ReduceOp.MIN) + _TRAIN_START_TIME = start_time_tensor.item() + print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( + time.time() - _TRAIN_START_TIME)) + + print_rank_0('Building LLaMA model ...') + config = core_transformer_config_from_args(get_args()) + model = LLaMAModel( + config, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True + ) + + args = get_args() + print_rank_0('Load LLaMA dataset ...') + # Number of train/valid/test samples. + if args.train_samples: + train_samples = args.train_samples + else: + train_samples = args.train_iters * args.global_batch_size + eval_iters = (args.train_iters // args.eval_interval + 1) * \ + args.eval_iters + test_iters = args.eval_iters + train_val_test_num_samples = [train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size] + print_rank_0(' > datasets target sizes (minimum size):') + print_rank_0(' train: {}'.format(train_val_test_num_samples[0])) + print_rank_0(' validation: {}'.format(train_val_test_num_samples[1])) + print_rank_0(' test: {}'.format(train_val_test_num_samples[2])) + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path) + + pretrain(model, train_ds, valid_ds, test_ds) + + +if __name__ == "__main__": + main() diff --git a/training/DeepSpeed-Domino/pretrain_llama_13b.sh b/training/DeepSpeed-Domino/pretrain_llama_13b.sh new file mode 100644 index 000000000..1a438513a --- /dev/null +++ b/training/DeepSpeed-Domino/pretrain_llama_13b.sh @@ -0,0 +1,84 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# This file is adapted from pretrain_llama.sh in Megatron-LM + +#!/bin/bash + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=2 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +CHECKPOINT_PATH=checkpoint +rm -rf $CHECKPOINT_PATH/* +VOCAB_FILE="dataset/gpt2-vocab.json" +MERGE_FILE="dataset/gpt2-merges.txt" +DATA_PATH="dataset/my-gpt2_text_document" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +export PYTHONPATH=$SCRIPT_DIR:$PYTHONPATH + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +LLAMA_ARGS=" + --llama-model \ + --num-layers 2 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --position-embedding-type rope \ + --swiglu \ + --ffn-hidden-size 11008 \ + --disable-bias-linear \ + --normalization RMSNorm \ + --layernorm-epsilon 1e-6 \ + --micro-batch-size 8 \ + --global-batch-size 8 \ + --lr 0.00015 \ + --train-iters 100 \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --lr-warmup-fraction .01 \ + --clip-grad 1.0 \ + --no-gradient-accumulation-fusion \ + --fp16 \ + --tensor-model-parallel-size $WORLD_SIZE \ + --seed 3407 \ + --causal-lm +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --split 949,50,1 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 1 +" + +cmd="deepspeed --num_gpus $WORLD_SIZE \ + pretrain_llama.py \ + $LLAMA_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS + " +echo $cmd +eval $cmd diff --git a/training/DeepSpeed-Domino/pretrain_llama_7b.sh b/training/DeepSpeed-Domino/pretrain_llama_7b.sh new file mode 100644 index 000000000..ddef81382 --- /dev/null +++ b/training/DeepSpeed-Domino/pretrain_llama_7b.sh @@ -0,0 +1,80 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# This file is adapted from pretrain_llama.sh in Megatron-LM + +#!/bin/bash + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=2 +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +CHECKPOINT_PATH=checkpoint +rm -rf $CHECKPOINT_PATH/* +VOCAB_FILE="dataset/gpt2-vocab.json" +MERGE_FILE="dataset/gpt2-merges.txt" +DATA_PATH="dataset/my-gpt2_text_document" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +export PYTHONPATH=$SCRIPT_DIR:$PYTHONPATH + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +LLAMA_ARGS=" + --num-layers 32 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --position-embedding-type rope \ + --swiglu \ + --ffn-hidden-size 11008\ + --normalization RMSNorm \ + --layernorm-epsilon 1e-6 \ + --micro-batch-size 16 \ + --global-batch-size 16 \ + --lr 0.00015 \ + --train-iters 80 \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --min-lr 1.0e-5 \ + --weight-decay 1e-2 \ + --lr-warmup-fraction .01 \ + --clip-grad 1.0 \ + --fp16 \ + --tensor-model-parallel-size $WORLD_SIZE \ + --seed 3407 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --split 949,50,1 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 1 +" + +cmd="deepspeed --num_gpus $WORLD_SIZE \ + pretrain_llama.py \ + $LLAMA_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS + " +echo $cmd +eval $cmd diff --git a/training/DeepSpeed-Domino/requirements.txt b/training/DeepSpeed-Domino/requirements.txt new file mode 100644 index 000000000..a39083e71 --- /dev/null +++ b/training/DeepSpeed-Domino/requirements.txt @@ -0,0 +1,6 @@ +apex +deepspeed +nltk +pybind11 +transformers + From 600db32e8b86b0be7b92c7d1db51f7d734e565af Mon Sep 17 00:00:00 2001 From: Zheyu Shen Date: Wed, 27 Nov 2024 17:16:09 -0500 Subject: [PATCH 04/13] Update DeepSpeed version requirement to >=0.16.0 in requirements.txt for Domino (#939) Signed-off-by: zhangsmallshark --- training/DeepSpeed-Domino/domino/data/.DS_Store | Bin 6148 -> 0 bytes training/DeepSpeed-Domino/requirements.txt | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 training/DeepSpeed-Domino/domino/data/.DS_Store diff --git a/training/DeepSpeed-Domino/domino/data/.DS_Store b/training/DeepSpeed-Domino/domino/data/.DS_Store deleted file mode 100644 index d02a1acaaed83168d65321c3f703c83ad2b2c9a0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJxc>Y5S>X<1OgJQEVs50EbNyM5o@v5#0w#CCSoFJcYn$sFBQO$Xc*q`V?zX&8jymAG8K!JCW Ca#9Td diff --git a/training/DeepSpeed-Domino/requirements.txt b/training/DeepSpeed-Domino/requirements.txt index a39083e71..53f617b9a 100644 --- a/training/DeepSpeed-Domino/requirements.txt +++ b/training/DeepSpeed-Domino/requirements.txt @@ -1,5 +1,5 @@ apex -deepspeed +deepspeed>=0.16.0 nltk pybind11 transformers From 2b906c93b89a84d3c4bf6d96b892bdfdd4d5880a Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Fri, 13 Dec 2024 09:57:51 -0800 Subject: [PATCH 05/13] Example and benchmark of APIs to offload states (#942) * add benchmarking for offloading states * fix api names Signed-off-by: zhangsmallshark --- training/offload_states/README.md | 25 ++++ 
training/offload_states/offload_states.py | 152 ++++++++++++++++++++++ training/offload_states/output_table.py | 28 ++++ training/offload_states/run_benchmark.sh | 28 ++++ 4 files changed, 233 insertions(+) create mode 100644 training/offload_states/README.md create mode 100644 training/offload_states/offload_states.py create mode 100644 training/offload_states/output_table.py create mode 100644 training/offload_states/run_benchmark.sh diff --git a/training/offload_states/README.md b/training/offload_states/README.md new file mode 100644 index 000000000..4add7404b --- /dev/null +++ b/training/offload_states/README.md @@ -0,0 +1,25 @@ +# Offloading States Example + +The script `offload_states.py` demonstrates how to offload the state of a model. Here is the example usage. + +```bash +$ deepspeed --num_gpus=4 offload_states.py --hidden_dim 32768 --nlayers 4 --pin_memory --non_blocking +... +Memory usage (0): include=None, pin_memory=True, non_blocking=True alloc_before_offload=18198419456 alloc_after_offload=17763840 +Memory usage (1): include=None, pin_memory=True, non_blocking=True alloc_before_offload=18198760960 alloc_after_offload=17763840 +... +Summary: pin_memory=True non_blocking=True offload=5.643414640426636 load=2.4087101459503173 +``` + +`run_benchmark.sh` shows how to run the script with different configurations. The script outputs the time for offloading and loading the states. + +```bash +$ ./run_benchmark.sh +... +| |pin_memory=0_non_blocking=0|pin_memory=0_non_blocking=1|pin_memory=1_non_blocking=0|pin_memory=1_non_blocking=1| +|--:|---------------------------|---------------------------|---------------------------|---------------------------| +| 1|4.34 / 3.42 |4.99 / 2.37 |6.5 / 2.42 |6.0 / 2.39 | +| 2|9.9 / 3.28 |5.1 / 2.34 |6.21 / 2.42 |6.25 / 2.45 | +| 3|9.92 / 3.19 |6.71 / 2.35 |6.33 / 2.38 |5.93 / 2.42 | +| 4|9.55 / 2.82 |7.11 / 2.39 |6.9 / 2.38 |6.5 / 2.43 |... +``` diff --git a/training/offload_states/offload_states.py b/training/offload_states/offload_states.py new file mode 100644 index 000000000..f80b06e05 --- /dev/null +++ b/training/offload_states/offload_states.py @@ -0,0 +1,152 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import time +import argparse + +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +import torch + +import deepspeed +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum + + +class SimpleModel(torch.nn.Module): + + def __init__(self, hidden_dim, empty_grad=False, nlayers=1): + super(SimpleModel, self).__init__() + self.linears = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim) for _ in range(nlayers)]) + if empty_grad: + self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + for l in self.linears: + x = l(x) + return self.cross_entropy_loss(x, y) + + +def random_dataset(total_samples, hidden_dim, device, dtype): + train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype) + train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) + train_dataset = torch.utils.data.TensorDataset(train_data, train_label) + return train_dataset + + +def random_dataloader(model, total_samples, hidden_dim, device, dtype): + batch_size = model.train_micro_batch_size_per_gpu() + train_dataset = random_dataset(total_samples, hidden_dim, device, dtype=dtype) + train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) + return train_loader + + +def run_model(model, config_dict, hidden_dim, dtype, include, pin_memory, non_blocking, iteration, warmup): + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=iteration, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + + time_offload_list = [] + time_load_list = [] + + dist.barrier() + for i, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + # Start offloading + alloc_before_offload = get_accelerator().memory_allocated() + dist.barrier() + + time_start = time.time() + model.offload_states(include=include, + device=OffloadDeviceEnum.cpu, + pin_memory=pin_memory, + non_blocking=non_blocking) + dist.barrier() + time_after_offload = time.time() + alloc_after_offload = get_accelerator().memory_allocated() + assert alloc_after_offload < alloc_before_offload, f"Allocated memory should decrease after offload" + + # Load offloaded states back + model.reload_states() + dist.barrier() + time_after_load = time.time() + + time_offload_list.append(time_after_offload - time_start) + time_load_list.append(time_after_load - time_after_offload) + + assert alloc_after_offload < get_accelerator().memory_allocated( + ), f"Allocated memory should increase after offload back" + + if dist.get_rank() == 0: + print( + f"Memory usage ({i}): include={include}, pin_memory={pin_memory}, non_blocking={non_blocking} alloc_before_offload={alloc_before_offload} alloc_after_offload={alloc_after_offload}" + ) + + # remove warmup + time_offload_list = time_offload_list[warmup:] + time_load_list = time_load_list[warmup:] + + if dist.get_rank() == 0: + with open("offload_states.log", "a") as f: + offload_time = sum(time_offload_list) / len(time_offload_list) + load_time = sum(time_load_list) / len(time_load_list) + msg = f"{1 if pin_memory else 0},{1 if non_blocking else 0},{offload_time},{load_time}" + f.write(f"{msg}\n") + print(f"Summary: pin_memory={pin_memory} non_blocking={non_blocking} offload={offload_time} 
load={load_time}")
+
+    # Needed in ZeRO 3. Not doing so can cause a memory leak.
+    model.destroy()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Test Offload States")
+    parser.add_argument("--included_state", type=str, choices=[e.name for e in OffloadStateTypeEnum] + [None], default=None, help="State to include")
+    parser.add_argument("--pin_memory", action='store_true', help="Pin memory")
+    parser.add_argument("--non_blocking", action='store_true', help="Non blocking")
+    parser.add_argument("--nlayers", type=int, default=1, help="Number of layers")
+    parser.add_argument("--hidden_dim", type=int, default=1024, help="Hidden dimension")
+    parser.add_argument('--dtype', choices=['torch.bfloat16', 'torch.float16', 'torch.float32'], default='torch.bfloat16', help='Data type')
+    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank")
+    parser.add_argument("--iteration", type=int, default=10, help="Number of iterations")
+    parser.add_argument("--warmup", type=int, default=5, help="Warmup")
+
+    args = parser.parse_args()
+
+    dtype = eval(args.dtype)
+    hidden_dim = args.hidden_dim
+
+    config_dict = {
+        "train_micro_batch_size_per_gpu": 1,
+        "optimizer": {
+            "type": "Adam",
+            "params": {
+                "lr": 1e-6
+            }
+        },
+        "zero_optimization": {
+            "stage": 3,
+        },
+    }
+
+    if dtype == torch.float16:
+        config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+    elif dtype == torch.bfloat16:
+        config_dict["bf16"] = {"enabled": True}
+
+    with deepspeed.zero.Init(config_dict_or_path=config_dict):
+        model = SimpleModel(hidden_dim, nlayers=args.nlayers)
+
+    included_state = None if args.included_state is None else [OffloadStateTypeEnum[args.included_state]]
+    run_model(model, config_dict, hidden_dim, dtype, included_state, args.pin_memory, args.non_blocking, args.iteration, args.warmup)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/training/offload_states/output_table.py b/training/offload_states/output_table.py
new file mode 100644
index 000000000..fc1a5b840
--- /dev/null
+++ b/training/offload_states/output_table.py
@@ -0,0 +1,28 @@
+import pandas as pd
+from pytablewriter import MarkdownTableWriter
+
+
+def read_csv(file_path):
+    return pd.read_csv(file_path)
+
+df = read_csv('offload_states.log')
+df.columns = ['pin_memory', 'non_blocking', 'offload_time', 'load_time']
+
+df['ratio_string'] = df['offload_time'].round(2).astype(str) + " / " + df['load_time'].round(2).astype(str)
+
+result_df = pd.DataFrame({
+    'pin_memory=0_non_blocking=0': df[(df['pin_memory'] == 0) & (df['non_blocking'] == 0)]['ratio_string'].reset_index(drop=True),
+    'pin_memory=0_non_blocking=1': df[(df['pin_memory'] == 0) & (df['non_blocking'] == 1)]['ratio_string'].reset_index(drop=True),
+    'pin_memory=1_non_blocking=0': df[(df['pin_memory'] == 1) & (df['non_blocking'] == 0)]['ratio_string'].reset_index(drop=True),
+    'pin_memory=1_non_blocking=1': df[(df['pin_memory'] == 1) & (df['non_blocking'] == 1)]['ratio_string'].reset_index(drop=True)
+})
+result_df = result_df.dropna()
+result_df.index = range(1, len(result_df) + 1)
+result_df.index.name = 'trial'
+# print(result_df)
+
+writer = MarkdownTableWriter()
+writer.from_dataframe(result_df,
+                      add_index_column=True,
+)
+writer.write_table()
\ No newline at end of file
diff --git a/training/offload_states/run_benchmark.sh b/training/offload_states/run_benchmark.sh
new file mode 100644
index 000000000..ba18da03e
--- /dev/null
+++ b/training/offload_states/run_benchmark.sh
@@ -0,0 +1,28 @@
+NGPUS=4
+HIDDEN_SIZE=32768
+NUM_LAYERS=4
+
+TRIALS=10
+
+PIN_MEMORY_OPTS=(0 1) +NON_BLOCKING_OPTS=(0 1) + +for i in $(seq 1 $TRIALS); do + for PIN_MEMORY in "${PIN_MEMORY_OPTS[@]}"; do + PIN_MEMORY_ARG="" + if [ $PIN_MEMORY -eq 1 ]; then + PIN_MEMORY_ARG="--pin_memory" + fi + + for NON_BLOCKING in "${NON_BLOCKING_OPTS[@]}"; do + NON_BLOCKING_ARG="" + if [ $NON_BLOCKING -eq 1 ]; then + NON_BLOCKING_ARG="--non_blocking" + fi + + echo "Running iteration $i" + deepspeed --num_gpus=$NGPUS offload_states.py --hidden_dim $HIDDEN_SIZE --nlayers $NUM_LAYERS $PIN_MEMORY_ARG $NON_BLOCKING_ARG + done + done +done +python output_table.py From 937f452c7621c8fdd859c2a79ae73244ec8da2b5 Mon Sep 17 00:00:00 2001 From: zhangsmallshark Date: Mon, 16 Dec 2024 13:54:58 -0600 Subject: [PATCH 06/13] add checkpoint Signed-off-by: zhangsmallshark --- training/DeepSpeed-Domino/domino/arguments.py | 13 +- .../DeepSpeed-Domino/domino/initialize.py | 3 + .../DeepSpeed-Domino/domino/language_model.py | 153 +++++++++++++++++- .../DeepSpeed-Domino/domino/modules/module.py | 16 +- training/DeepSpeed-Domino/domino/training.py | 11 ++ .../megatron/checkpointing.py | 29 ++-- .../DeepSpeed-Domino/megatron/initialize.py | 2 +- 7 files changed, 213 insertions(+), 14 deletions(-) diff --git a/training/DeepSpeed-Domino/domino/arguments.py b/training/DeepSpeed-Domino/domino/arguments.py index 8bc59223a..d4d0a7123 100644 --- a/training/DeepSpeed-Domino/domino/arguments.py +++ b/training/DeepSpeed-Domino/domino/arguments.py @@ -204,9 +204,20 @@ def parse_args(): 'validation set.') parser.add_argument('--log-interval', type=int, default=100, help='Report loss and timing interval.') + parser.add_argument('--save', type=str, default=None, + help='Output directory to save checkpoints to.') + parser.add_argument('--no-save-optim', action='store_true', default=None, + help='Do not save current optimizer.') + parser.add_argument('--no-save-rng', action='store_true', default=None, + help='Do not save current rng state.') parser.add_argument('--save-interval', type=int, default=None, help='Number of iterations between checkpoint saves.') - + parser.add_argument('--load', type=str, default=None, + help='Directory containing a model checkpoint.') + parser.add_argument('--no-load-optim', action='store_true', default=None, + help='Do not load optimizer when loading checkpoint.') + parser.add_argument('--no-load-rng', action='store_true', default=None, + help='Do not load rng state when loading checkpoint.') args = parser.parse_args() args.rank = int(os.getenv('RANK', '0')) diff --git a/training/DeepSpeed-Domino/domino/initialize.py b/training/DeepSpeed-Domino/domino/initialize.py index 36e0fa1bc..a144bbdf2 100644 --- a/training/DeepSpeed-Domino/domino/initialize.py +++ b/training/DeepSpeed-Domino/domino/initialize.py @@ -14,6 +14,7 @@ from domino.modules.fused_bias_gelu import bias_gelu from megatron import fused_kernels +import deepspeed def initialize_domino(): @@ -37,6 +38,8 @@ def initialize_domino(): world_size=args.world_size, rank=args.rank ) + deepspeed.init_distributed() + mpu.initialize_model_parallel(args.tensor_model_parallel_size) seed_ = args.seed data_parallel_random_init = False diff --git a/training/DeepSpeed-Domino/domino/language_model.py b/training/DeepSpeed-Domino/domino/language_model.py index 2cfb2f9fd..4916db61f 100644 --- a/training/DeepSpeed-Domino/domino/language_model.py +++ b/training/DeepSpeed-Domino/domino/language_model.py @@ -84,6 +84,70 @@ def forward(self, input_ids, position_ids): return combined_embeds + def state_dict_for_save_checkpoint(self, prefix='', 
keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._word_embeddings_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.add_position_embedding: + state_dict_[self._position_embeddings_key] \ + = self.position_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.num_tokentypes > 0: + state_dict_[self._tokentype_embeddings_key] \ + = self.tokentype_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Word embedding. + if self._word_embeddings_key in state_dict: + state_dict_ = state_dict[self._word_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'word_embeddings' in key: + state_dict_[key.split('word_embeddings.')[1]] \ + = state_dict[key] + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + + # Position embedding. + if self.add_position_embedding: + if self._position_embeddings_key in state_dict: + state_dict_ = state_dict[self._position_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'position_embeddings' in key: + state_dict_[key.split('position_embeddings.')[1]] \ + = state_dict[key] + self.position_embeddings.load_state_dict(state_dict_, strict=strict) + + # Tokentype embedding. + if self.num_tokentypes > 0: + state_dict_ = {} + if self._tokentype_embeddings_key in state_dict: + state_dict_ = state_dict[self._tokentype_embeddings_key] + else: + # for backward compatibility. + for key in state_dict.keys(): + if 'tokentype_embeddings' in key: + state_dict_[key.split('tokentype_embeddings.')[1]] \ + = state_dict[key] + if len(state_dict_.keys()) > 0: + self.tokentype_embeddings.load_state_dict(state_dict_, + strict=strict) + else: + print('***WARNING*** expected tokentype embeddings in the ' + 'checkpoint but could not find it', flush=True) + class RotaryEmbedding(nn.Module): def __init__(self, dim, seq_len_interpolation_factor=None): @@ -190,4 +254,91 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, encoder_output = encoder_output_t return encoder_output - \ No newline at end of file + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + if self.pre_process: + state_dict_[self._embedding_key] \ + = self.embedding.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.add_encoder: + state_dict_[self._encoder_key] \ + = self.encoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + if self.add_pooler: + state_dict_[self._pooler_key] \ + = self.pooler.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.untie_embeddings_and_output_weights: + state_dict_[self._output_layer_key] \ + = self.output_layer.state_dict(prefix=prefix, keep_vars=keep_vars) + + if self.add_decoder: + state_dict_[self._decoder_key] \ + = self.decoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Embedding. + if self.pre_process: + if self._embedding_key in state_dict: + state_dict_ = state_dict[self._embedding_key] + else: + # for backward compatibility. 
+ state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.embedding.load_state_dict(state_dict_, strict=strict) + + # Encoder. + if self.add_encoder: + if self._encoder_key in state_dict: + state_dict_ = state_dict[self._encoder_key] + # For backward compatibility. + elif 'transformer' in state_dict: + state_dict_ = state_dict['transformer'] + else: + # For backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'transformer.' in key: + state_dict_[key.split('transformer.')[1]] = state_dict[key] + + # For backward compatibility. + state_dict_self_attention = {} + for key in state_dict_.keys(): + if '.attention.' in key: + state_dict_self_attention[key.replace(".attention.", + ".self_attention.")] = state_dict_[key] + else: + state_dict_self_attention[key] = state_dict_[key] + state_dict_ = state_dict_self_attention + + self.encoder.load_state_dict(state_dict_, strict=strict) + + # Pooler. + if self.post_process: + if self.add_pooler: + assert 'pooler' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.pooler.load_state_dict(state_dict[self._pooler_key], + strict=strict) + if self.untie_embeddings_and_output_weights: + assert 'output_layer' in state_dict, \ + 'could not find data for output_layer in the checkpoint' + self.output_layer.load_state_dict(state_dict[self._output_layer_key], + strict=strict) + # Decoder. + if self.add_decoder: + assert 'decoder' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.decoder.load_state_dict(state_dict[self._decoder_key], + strict=strict) diff --git a/training/DeepSpeed-Domino/domino/modules/module.py b/training/DeepSpeed-Domino/domino/modules/module.py index b89bbc21f..2cb02002b 100644 --- a/training/DeepSpeed-Domino/domino/modules/module.py +++ b/training/DeepSpeed-Domino/domino/modules/module.py @@ -25,6 +25,13 @@ def __init__(self, config=None, share_embeddings_and_output_weights=True): self.config = config self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Use this function to override the state dict for + saving checkpoints. 
+ """ + + return self.state_dict(prefix=prefix, keep_vars=keep_vars) + def initialize_word_embeddings(self): self.share_embeddings_and_output_weights = True return @@ -74,7 +81,8 @@ def float_conversion(val): return conversion_helper(val, float_conversion) -class Float16Module(torch.nn.Module): +# class Float16Module(torch.nn.Module): +class Float16Module(DominoModule): def __init__(self, module, args): super(Float16Module, self).__init__() @@ -91,3 +99,9 @@ def forward(self, *inputs, **kwargs): outputs = float16_to_fp32(outputs) return outputs + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """ Retrieve state_dict from the module being wrapped.""" + return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) \ No newline at end of file diff --git a/training/DeepSpeed-Domino/domino/training.py b/training/DeepSpeed-Domino/domino/training.py index 59e253fcf..d699b4ed5 100644 --- a/training/DeepSpeed-Domino/domino/training.py +++ b/training/DeepSpeed-Domino/domino/training.py @@ -16,6 +16,8 @@ from domino.initialize import set_jit_fusion_options from domino.tensor_parallel.data import broadcast_data +from megatron.checkpointing import load_checkpoint +from megatron.checkpointing import save_checkpoint def is_rank_0(): # if torch.cuda.current_device() == 0: @@ -109,6 +111,11 @@ def setup_model_and_optimizer(base_model, optimizer = get_megatron_optimizer(models, no_wd_decay_cond, scale_lr_cond) opt_param_scheduler = get_optimizer_param_scheduler(optimizer) + if args.load is not None: + args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler) + else: + args.iteration = 0 + args.iteration = 0 return model, optimizer, opt_param_scheduler @@ -297,6 +304,10 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, config) iteration += 1 + if args.save and args.save_interval and \ + iteration % args.save_interval == 0: + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + args.consumed_train_samples += mpu.get_data_parallel_world_size() * \ args.micro_batch_size * get_num_microbatches() diff --git a/training/DeepSpeed-Domino/megatron/checkpointing.py b/training/DeepSpeed-Domino/megatron/checkpointing.py index e88b58513..19aef958f 100644 --- a/training/DeepSpeed-Domino/megatron/checkpointing.py +++ b/training/DeepSpeed-Domino/megatron/checkpointing.py @@ -9,12 +9,16 @@ import torch -from megatron import update_num_microbatches -from megatron.core import mpu, tensor_parallel -from .global_vars import get_args -from .utils import (unwrap_model, - print_rank_0) +# from megatron.core import mpu, tensor_parallel +# from .global_vars import get_args +# from .utils import (unwrap_model, +# print_rank_0) +# from megatron import update_num_microbatches +import domino.parallel_state as mpu +from domino.tensor_parallel.random import get_cuda_rng_tracker +from domino.arguments import get_args +from domino.utils import unwrap_model, print_rank_0 _CHECKPOINT_VERSION = None @@ -194,7 +198,7 @@ def get_rng_state(): 'np_rng_state': np.random.get_state(), 'torch_rng_state': torch.get_rng_state(), 'cuda_rng_state': torch.cuda.get_rng_state(), - 'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states()} + 'rng_tracker_states': get_cuda_rng_tracker().get_states()} rng_state_list = None if torch.distributed.is_initialized() and \ @@ -218,6 +222,8 @@ def save_checkpoint(iteration, model, optimizer, 
opt_param_scheduler): # Only rank zero of the data parallel writes to the disk. model = unwrap_model(model) + model_module = model.module + model = [model_module] print_rank_0('saving checkpoint at iteration {:7d} to {}'.format( iteration, args.save)) @@ -241,7 +247,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): # Arguments, iteration, and model. state_dict = {} - state_dict['args'] = args + # state_dict['args'] = args state_dict['checkpoint_version'] = 3.0 state_dict['iteration'] = iteration if len(model) == 1: @@ -503,6 +509,8 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri load_dir = getattr(args, load_arg) model = unwrap_model(model) + model_module = model.module + model = [model_module] state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=False) @@ -522,6 +530,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri set_checkpoint_version(state_dict.get('checkpoint_version', 0)) # Set iteration. + args.finetune = False if args.finetune or release: iteration = 0 else: @@ -544,7 +553,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri check_checkpoint_args(checkpoint_args) args.consumed_train_samples = getattr(checkpoint_args, 'consumed_train_samples', 0) - update_num_microbatches(consumed_samples=args.consumed_train_samples) + # update_num_microbatches(consumed_samples=args.consumed_train_samples) args.consumed_valid_samples = getattr(checkpoint_args, 'consumed_valid_samples', 0) else: @@ -614,7 +623,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri # Check for empty states array if not rng_state['rng_tracker_states']: raise KeyError - tensor_parallel.get_cuda_rng_tracker().set_states( + get_cuda_rng_tracker().set_states( rng_state['rng_tracker_states']) else: # backward compatability random.setstate(state_dict['random_rng_state']) @@ -624,7 +633,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri # Check for empty states array if not state_dict['rng_tracker_states']: raise KeyError - tensor_parallel.get_cuda_rng_tracker().set_states( + get_cuda_rng_tracker().set_states( state_dict['rng_tracker_states']) except KeyError: print_rank_0('Unable to load rng state from checkpoint {}. 
' diff --git a/training/DeepSpeed-Domino/megatron/initialize.py b/training/DeepSpeed-Domino/megatron/initialize.py index 367ba85cb..b8aa6d3e8 100644 --- a/training/DeepSpeed-Domino/megatron/initialize.py +++ b/training/DeepSpeed-Domino/megatron/initialize.py @@ -16,7 +16,7 @@ from megatron import get_tensorboard_writer from megatron.core import mpu, tensor_parallel from megatron.arguments import parse_args, validate_args -from megatron.checkpointing import load_args_from_checkpoint +# from megatron.checkpointing import load_args_from_checkpoint from megatron.global_vars import set_global_variables from megatron.model.transformer import bias_dropout_add_fused_train from megatron.model.fused_bias_gelu import bias_gelu From 6b8109e93068674e357c524d9fbc4f05286e03f2 Mon Sep 17 00:00:00 2001 From: zhangsmallshark Date: Wed, 22 Jan 2025 13:47:24 -0600 Subject: [PATCH 07/13] fix args Signed-off-by: zhangsmallshark --- training/DeepSpeed-Domino/domino/arguments.py | 5 +++++ training/DeepSpeed-Domino/domino/training.py | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/training/DeepSpeed-Domino/domino/arguments.py b/training/DeepSpeed-Domino/domino/arguments.py index d4d0a7123..0faf58d93 100644 --- a/training/DeepSpeed-Domino/domino/arguments.py +++ b/training/DeepSpeed-Domino/domino/arguments.py @@ -218,6 +218,11 @@ def parse_args(): help='Do not load optimizer when loading checkpoint.') parser.add_argument('--no-load-rng', action='store_true', default=None, help='Do not load rng state when loading checkpoint.') + group.add_argument('--exit-on-missing-checkpoint', action='store_true', + help="If '--load' is set, but checkpoint is not found " + "(e.g., path typo), then exit instead of random " + "initialization.") + args = parser.parse_args() args.rank = int(os.getenv('RANK', '0')) diff --git a/training/DeepSpeed-Domino/domino/training.py b/training/DeepSpeed-Domino/domino/training.py index d699b4ed5..88fbfaa8d 100644 --- a/training/DeepSpeed-Domino/domino/training.py +++ b/training/DeepSpeed-Domino/domino/training.py @@ -116,8 +116,6 @@ def setup_model_and_optimizer(base_model, else: args.iteration = 0 - args.iteration = 0 - return model, optimizer, opt_param_scheduler From 0be49e3763c743d89b026cec6d1e86aacebc5a02 Mon Sep 17 00:00:00 2001 From: Shijie <821898965@qq.com> Date: Tue, 24 Dec 2024 22:55:45 +0800 Subject: [PATCH 08/13] remove-redundant-code (#947) Signed-off-by: zhangsmallshark --- applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py index 7e3e6776b..e53042075 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py @@ -211,12 +211,7 @@ def create_dataset_split(current_dataset, raw_dataset, train_phase, tokenizer, padding="max_length", truncation=True, return_tensors="pt") - chosen_token["input_ids"] = chosen_token["input_ids"] - chosen_token["attention_mask"] = chosen_token["attention_mask"] chosen_dataset.append(chosen_token) - - reject_token["input_ids"] = reject_token["input_ids"] - reject_token["attention_mask"] = reject_token["attention_mask"] reject_dataset.append(reject_token) print( f'Creating dataset {raw_dataset.dataset_name_clean} for {train_phase=} size={len(chosen_dataset)}' From f7628f44567776e70b39549685b4da685e2d445e Mon Sep 17 00:00:00 2001 From: stceum 
<50257864+stceum@users.noreply.github.com> Date: Tue, 7 Jan 2025 03:11:44 +0800 Subject: [PATCH 09/13] Add DPO support for DeepSpeed-Chat (#828) * Add label_smoothing while calculating step2 DPO loss in DeepSpeed-Chat. * Add training scripts for step2 DPO in DeepSpeed-Chat. * Remove unused packages and format the code of step2 DPO in DeepSpeed-Chat. * Update training scripts of step2 DPO in DeepSpeed-Chat. * Follow upstream fixes. * Update README.md for Step2 DPO finetuning. * Add opt 350M training log demo for step 2 dpo finetuning in DeepSpeed-Chat. * Address the formatting issue in step2 dpo finetuning in DeepSpeed-Chat. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase Signed-off-by: zhangsmallshark --- .../training/step2_dpo_finetuning/README.md | 26 + .../training/step2_dpo_finetuning/main.py | 528 ++ .../opt-350M_globalBatchSize-32.log | 6409 +++++++++++++++++ .../training_scripts/README.md | 6 + .../training_scripts/llama2/run_llama2_7b.sh | 35 + .../llama2/run_llama2_7b_lora.sh | 37 + .../opt/multi_node/run_350m.sh | 34 + .../opt/single_gpu/run_350m.sh | 20 + .../opt/single_node/run_350m.sh | 34 + .../opt/single_node/sweep/README.md | 20 + .../opt/single_node/sweep/run_single.sh | 46 + .../opt/single_node/sweep/run_step2_sweep.sh | 21 + 12 files changed, 7216 insertions(+) create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/README.md create mode 100755 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/main.py create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md create mode 100755 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh create mode 100755 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/README.md b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/README.md new file mode 100644 index 000000000..9a086003c --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/README.md @@ -0,0 +1,26 @@ +# 🐕 Direct Preference Optimization (DPO) finetuning +[Direct Preference Optimization (DPO)](https://arxiv.org/abs/2305.18290) is a novel approach to preference learning, which directly optimizes the policy without explicit reward modeling or reinforcement learning. It leverages a specific parameterization of the reward model that enables the extraction of the corresponding optimal policy in closed form. 
By using a simple classification loss, DPO aligns language models with human preferences, avoiding the complexity and instability often associated with RLHF. + +As the paper says, "Your Language Model is Secretly a Reward Model." Therefore, the training arguments and the training process of DPO are mostly the same as the reward model, as shown in [step2 "Reward Model (RM) finetuning"](../step2_reward_model_finetuning/README.md). After the training of DPO, you will get a model that has been aligned with human preferences. + +## 🏃 How to train the model + +We provide the script for OPT-350m, which you can test by launching the command + +```bash +training_scripts/opt/single_node/run_350m.sh +``` + +We also provide the script for llama2, which you can test by launching the command + +```bash +training_scripts/llama2/run_llama2_7b.sh +``` + +## 🏃 How to evaluate the DPO checkpoint? + +The checkpoint of DPO is exactly the language model that can be evaluated as [step1 "Supervised Finetuning"](../step1_supervised_finetuning/README.md). + +## 💁 Datasets + +Because DPO treats the language model as a reward model, the dataset for DPO is in the same format as that used for reward model fine-tuning. Each item in the dataset includes one "chosen" and one "rejected" output for the same input. diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/main.py new file mode 100755 index 000000000..f36e7a69b --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/main.py @@ -0,0 +1,528 @@ +#!/usr/bin/env python +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import argparse +import math + +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler + +from transformers import ( + AutoModelForCausalLM, + SchedulerType, + get_scheduler, +) + +import deepspeed +from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +from deepspeed import get_accelerator + +from dschat.utils.data.data_utils import create_prompt_dataset, DataCollatorReward +from dschat.utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer +from dschat.utils.ds_utils import get_train_ds_config, get_eval_ds_config +from dschat.utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible +from dschat.utils.model.model_utils import create_hf_model, causal_lm_model_to_fp32_loss +from dschat.utils.perf import print_throughput + + +def parse_args(): + parser = argparse.ArgumentParser( + description= + "Finetune a transformers model on a causal language modeling task") + parser.add_argument('--data_path', + nargs='*', + default=['Dahoas/rm-static'], + help='Path to the training dataset. Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-path dataset2-path ...') + parser.add_argument('--data_split', + type=str, + default='2,4,4', + help='Comma-separated list of proportions for training' + 'phase 1, 2, and 3 data. 
For example the split `6,2,2`' + 'will use 60%% of data for phase 1, 20%% for phase 2' + 'and 20%% for phase 3.') + parser.add_argument( + '--data_output_path', + type=str, + default='/tmp/data_files/', + help= + 'Where to store the data-related files such as shuffle index. This needs to be on a local storage of a node (not on a shared storage)' + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help= + "Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=16, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=16, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=512, + help="The maximum sequence length.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-3, + help= + "Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", + type=float, + default=0., + help="Weight decay to use.") + parser.add_argument("--num_train_epochs", + type=int, + default=1, + help="Total number of training epochs to perform.") + # Reference: https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py + parser.add_argument( + "--beta", + type=float, + default=1e-1, + help= + "Temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. We ignore the reference model as beta -> 0." + ) + parser.add_argument( + "--label_smoothing", + type=float, + default=0.0, + help= + "conservativeness for DPO loss, which assumes that preferences are noisy (flipped with probability label_smoothing)" + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help= + "Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="cosine", + help="The scheduler type to use.", + choices=[ + "linear", "cosine", "cosine_with_restarts", "polynomial", + "constant", "constant_with_warmup" + ], + ) + parser.add_argument( + "--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.") + parser.add_argument("--output_dir", + type=str, + default=None, + help="Where to store the model.") + parser.add_argument("--seed", + type=int, + default=1234, + help="A seed for reproducible training.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--gradient_checkpointing', + action='store_true', + help='Enable HF gradient checkpointing for model.') + parser.add_argument( + "--dropout", + type=float, + default=None, + help="If dropout configured, use it. 
" + "Otherwise, keep the default dropout configuration of the model.") + # deepspeed features + parser.add_argument('--offload', + action='store_true', + help='Enable ZeRO Offload techniques.') + parser.add_argument('--dtype', + type=str, + default='fp16', + choices=['fp16', 'bf16'], + help='Training data type') + parser.add_argument( + '--offload_reference_model', + action='store_true', + help='Enable ZeRO Offload techniques for reference model.') + parser.add_argument( + '--zero_stage', + type=int, + default=0, + help='ZeRO optimization stage for Actor model (and clones).') + ## LoRA for efficient training setting + parser.add_argument("--lora_dim", + type=int, + default=0, + help="If > 0, use LoRA for efficient training.") + parser.add_argument("--lora_module_name", + type=str, + default="decoder.layers.", + help="The scope of LoRA.") + parser.add_argument('--only_optimize_lora', + action='store_true', + help='Only optimize the LoRA parameters.') + parser.add_argument( + "--lora_learning_rate", + type=float, + default=5e-4, + help= + "Initial LoRA learning rate (after the potential warmup period) to use." + ) + ## low precision + parser.add_argument( + '--compute_fp32_loss', + action='store_true', + help='Relevant for low precision dtypes (fp16, bf16, etc.). ' + 'If specified, loss is calculated in fp32.') + ## Tensorboard logging + parser.add_argument('--enable_tensorboard', + action='store_true', + help='Enable tensorboard logging') + parser.add_argument('--tensorboard_path', + type=str, + default="step2_tensorboard") + ## Tokenizer + parser.add_argument( + "--add_eot_token", + action='store_true', + help="Add <|endoftext|> as additional special token to tokenizer") + ## Print loss + parser.add_argument('--print_loss', + action='store_true', + help='Prints loss at each step.') + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args() + + return args + + +# Reference: https://github.com/huggingface/trl/blob/main/trl/trainer/dpo_trainer.py +def get_batch_logps(logits, input_ids, label_mask): + labels = input_ids.clone() * label_mask + assert logits.shape[:-1] == labels.shape, \ + "Logits (batch and sequence length dim) and labels must have the same shape." + labels = labels[:, 1:] + label_mask = label_mask[:, 1:] + logits = logits[:, :-1, :] + per_token_logps = torch.gather(logits.log_softmax(-1), + dim=2, + index=labels.unsqueeze(2)).squeeze(2) + return (per_token_logps * label_mask).sum(-1) + + +def main(): + args = parse_args() + + if args.local_rank == -1: + device = torch.device(get_accelerator().device_name()) + else: + get_accelerator().set_device(args.local_rank) + device = torch.device(get_accelerator().device_name(), args.local_rank) + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + # torch.distributed.init_process_group(backend='nccl') + deepspeed.init_distributed() + + args.global_rank = torch.distributed.get_rank() + + ds_config = get_train_ds_config(offload=args.offload, + dtype=args.dtype, + stage=args.zero_stage, + enable_tensorboard=args.enable_tensorboard, + tb_path=args.tensorboard_path, + tb_name="step2_model") + ds_config[ + 'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size + ds_config[ + 'train_batch_size'] = args.per_device_train_batch_size * torch.distributed.get_world_size( + ) * args.gradient_accumulation_steps + + # If passed along, set the training seed now. 
+ set_random_seed(args.seed) + + torch.distributed.barrier() + + # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family + args.end_of_conversation_token = "<|endoftext|>" + additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None + tokenizer = load_hf_tokenizer(args.model_name_or_path, + fast_tokenizer=True, + add_special_tokens=additional_special_tokens) + + model = create_hf_model(AutoModelForCausalLM, + args.model_name_or_path, + tokenizer, + ds_config, + dropout=args.dropout) + + # DS Config for ref model + ref_zero_stage = args.zero_stage + if ref_zero_stage != 3: + # If it is ZeRO-3 then we use it for everything, otherwise assume we have enough memory for ref model + ref_zero_stage = 0 + ref_ds_config = get_eval_ds_config(args.offload_reference_model, + args.dtype, ref_zero_stage) + ref_ds_config[ + 'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size + ref_ds_config[ + 'train_batch_size'] = args.per_device_train_batch_size * torch.distributed.get_world_size( + ) * args.gradient_accumulation_steps + ref_ds_eval_config = get_eval_ds_config(offload=False, + dtype=args.dtype, + stage=ref_zero_stage) + ref_ds_eval_config[ + 'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size + ref_ds_eval_config[ + 'train_batch_size'] = args.per_device_train_batch_size * torch.distributed.get_world_size( + ) * args.gradient_accumulation_steps + ref_model = create_hf_model(AutoModelForCausalLM, + args.model_name_or_path, + tokenizer, + ref_ds_eval_config, + dropout=args.dropout) + # End of DS config for ref model + + if args.compute_fp32_loss: + print_rank_0( + f"Using model {model.__class__.__name__} with loss in fp32", + args.global_rank) + causal_lm_model_to_fp32_loss(model) + + # Copied from ../step2_reward_model_finetuning/main.py. + # Model bigscience/bloom-560m has large variance at ln_f.weight parameter + # This makes bf16 finetuning hard. + # In general, since we are replacing the model head, it makes sense to reset + # the LN that precedes it. 
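The per-sequence log-probabilities that the DPO objective compares are produced by get_batch_logps() defined earlier in this file. The following self-contained sketch (toy shapes and random tensors, not part of the patch) walks through the same reduction from token-level logits to one summed log-probability per sequence:

    import torch

    batch, seq_len, vocab = 2, 5, 11                    # toy sizes, illustrative only
    logits = torch.randn(batch, seq_len, vocab)         # stand-in for the model output
    input_ids = torch.randint(0, vocab, (batch, seq_len))
    label_mask = torch.ones(batch, seq_len, dtype=torch.long)  # 1 = position is scored

    labels = (input_ids.clone() * label_mask)[:, 1:]    # next-token targets
    mask = label_mask[:, 1:]
    shifted_logits = logits[:, :-1, :]                  # last position has no target
    per_token_logps = torch.gather(shifted_logits.log_softmax(-1),
                                   dim=2,
                                   index=labels.unsqueeze(2)).squeeze(2)
    seq_logps = (per_token_logps * mask).sum(-1)        # shape (batch,), one value per sequence
    print(seq_logps.shape)                              # torch.Size([2])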
+    force_optimize_params = []
+    if "bigscience/bloom-" in args.model_name_or_path:
+        zero_init_enabled = (args.zero_stage == 3)
+        params = [
+            model.transformer.ln_f.weight, model.transformer.ln_f.bias
+        ]
+        with deepspeed.zero.GatheredParameters(params,
+                                               modifier_rank=0,
+                                               enabled=zero_init_enabled):
+            if deepspeed.comm.get_rank() == 0 or not zero_init_enabled:
+                torch.nn.init.ones_(model.transformer.ln_f.weight)
+                torch.nn.init.zeros_(model.transformer.ln_f.bias)
+        force_optimize_params.extend(
+            ['transformer.ln_f.weight', 'transformer.ln_f.bias'])
+
+    if args.lora_dim > 0:
+        model = convert_linear_layer_to_lora(model, args.lora_module_name,
+                                             args.lora_dim)
+        if args.only_optimize_lora:
+            model = only_optimize_lora_parameters(model)
+            model = make_model_gradient_checkpointing_compatible(model)
+
+    # Prepare the data
+    train_phase = 2
+    train_dataset, eval_dataset = create_prompt_dataset(
+        args.local_rank, args.data_path, args.data_split,
+        args.data_output_path, train_phase, args.seed, tokenizer,
+        args.max_seq_len)
+
+    # DataLoaders creation:
+    data_collator = DataCollatorReward()
+    if args.local_rank == -1:
+        train_sampler = RandomSampler(train_dataset)
+        eval_sampler = SequentialSampler(eval_dataset)
+    else:
+        train_sampler = DistributedSampler(train_dataset)
+        eval_sampler = DistributedSampler(eval_dataset)
+    train_dataloader = DataLoader(train_dataset,
+                                  collate_fn=data_collator,
+                                  sampler=train_sampler,
+                                  batch_size=args.per_device_train_batch_size)
+    eval_dataloader = DataLoader(eval_dataset,
+                                 collate_fn=data_collator,
+                                 sampler=eval_sampler,
+                                 batch_size=args.per_device_eval_batch_size)
+
+    def evaluation(model, ref_model, tokenizer, eval_dataloader):
+        model.eval()
+        losses = 0
+        for step, batch in enumerate(eval_dataloader):
+            batch = to_device(batch, device)
+            batch_size = batch['input_ids'].shape[0] // 2
+            chosen_input_ids = batch['input_ids'][:batch_size]
+            rejected_input_ids = batch['input_ids'][batch_size:]
+            label_mask = (batch['input_ids'] != tokenizer.pad_token_id).int()
+            for i in range(batch_size):
+                divergence_ind = (chosen_input_ids[i] !=
+                                  rejected_input_ids[i]).nonzero().squeeze(-1)
+                if len(divergence_ind) > 0:
+                    divergence_ind = divergence_ind[0]
+                else:
+                    divergence_ind = 0
+                label_mask[i][:divergence_ind] = 0
+                label_mask[i + batch_size][:divergence_ind] = 0
+            with torch.no_grad():
+                outputs = model(**batch)
+                ref_outputs = ref_model(**batch)
+
+            logps = get_batch_logps(outputs.logits, batch['input_ids'],
+                                    label_mask)
+            ref_logps = get_batch_logps(ref_outputs.logits, batch['input_ids'],
+                                        label_mask)
+
+            chosen_logps = logps[:batch_size]
+            rejected_logps = logps[batch_size:]
+            ref_chosen_logps = ref_logps[:batch_size]
+            ref_rejected_logps = ref_logps[batch_size:]
+
+            logits = args.beta * ((chosen_logps - ref_chosen_logps) -
+                                  (rejected_logps - ref_rejected_logps))
+            loss = (- torch.nn.functional.logsigmoid(logits) * (1 - args.label_smoothing) - \
+                    torch.nn.functional.logsigmoid(-logits) * args.label_smoothing).mean(0)
+            losses += loss.float()
+        losses = losses / (step + 1)
+        try:
+            losses = get_all_reduce_mean(losses)
+        except:
+            pass
+        chosen_rewards = args.beta * (chosen_logps - ref_chosen_logps).detach()
+        rejected_rewards = args.beta * (rejected_logps -
+                                        ref_rejected_logps).detach()
+        return chosen_rewards.mean().item(), rejected_rewards.mean().item(
+        ), losses.item()
+
+    # Split weights in two groups, one with weight decay and the other not.
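Inside evaluation() above (and again in the training loop below), label_mask is zeroed for every position before the first token at which the chosen and rejected sequences diverge, so the shared prompt does not contribute to the summed log-probabilities. A small worked example with made-up token ids (not part of the patch):

    import torch

    # A shared 3-token prompt followed by different continuations (made-up ids).
    chosen = torch.tensor([5, 8, 2, 7, 9])
    rejected = torch.tensor([5, 8, 2, 4, 4])

    divergence_ind = (chosen != rejected).nonzero().squeeze(-1)
    divergence_ind = divergence_ind[0] if len(divergence_ind) > 0 else 0
    print(int(divergence_ind))        # 3 -> the prompt occupies positions 0..2

    label_mask = torch.ones_like(chosen)
    label_mask[:divergence_ind] = 0
    print(label_mask.tolist())        # [0, 0, 0, 1, 1]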
+ optimizer_grouped_parameters = get_optimizer_grouped_parameters( + model, args.weight_decay, args.lora_learning_rate) + + AdamOptimizer = DeepSpeedCPUAdam if args.offload else FusedAdam + optimizer = AdamOptimizer(optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(0.9, 0.95)) + + num_update_steps_per_epoch = math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps) + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.num_train_epochs * num_update_steps_per_epoch, + ) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + optimizer=optimizer, + args=args, + config=ds_config, + lr_scheduler=lr_scheduler, + dist_init_required=True) + ref_model, *_ = deepspeed.initialize(model=ref_model, config=ref_ds_config) + ref_model.eval() + + if args.gradient_checkpointing: + model.gradient_checkpointing_enable() + + # Train! + print_rank_0("***** Running training *****", args.global_rank) + print_rank_0( + f"***** Evaluating rewards, Epoch {1}/{args.num_train_epochs} *****", + args.global_rank) + chosen_rewards, rejected_rewards, eval_loss = evaluation( + model, ref_model, tokenizer, eval_dataloader) + print_rank_0( + f"chosen: {chosen_rewards}, rejected: {rejected_rewards}, loss: {eval_loss}", + args.global_rank) + + for epoch in range(args.num_train_epochs): + print_rank_0( + f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Micro Batches {len(train_dataloader)}", + args.global_rank) + model.train() + import time + for step, batch in enumerate(train_dataloader): + start = time.time() + batch = to_device(batch, device) + batch_size = batch['input_ids'].shape[0] // 2 + chosen_input_ids = batch['input_ids'][:batch_size] + rejected_input_ids = batch['input_ids'][batch_size:] + label_mask = (batch['input_ids'] != tokenizer.pad_token_id).int() + for i in range(batch_size): + divergence_ind = (chosen_input_ids[i] != + rejected_input_ids[i]).nonzero().squeeze(-1) + if len(divergence_ind) > 0: + divergence_ind = divergence_ind[0] + else: + divergence_ind = 0 + label_mask[i][:divergence_ind] = 0 + label_mask[i + batch_size][:divergence_ind] = 0 + outputs = model(**batch, use_cache=False) + with torch.no_grad(): + ref_outputs = ref_model(**batch) + + logps = get_batch_logps(outputs.logits, batch['input_ids'], + label_mask) + ref_logps = get_batch_logps(ref_outputs.logits, batch['input_ids'], + label_mask) + + chosen_logps = logps[:batch_size] + rejected_logps = logps[batch_size:] + ref_chosen_logps = ref_logps[:batch_size] + ref_rejected_logps = ref_logps[batch_size:] + + logits = args.beta * ((chosen_logps - ref_chosen_logps) - + (rejected_logps - ref_rejected_logps)) + loss = (- torch.nn.functional.logsigmoid(logits) * (1 - args.label_smoothing) - \ + torch.nn.functional.logsigmoid(-logits) * args.label_smoothing).mean(0) + if args.print_loss: + print( + f"Epoch: {epoch}, Step: {step}, Rank: {torch.distributed.get_rank()}, loss = {loss}" + ) + model.backward(loss) + model.step() + end = time.time() + if torch.distributed.get_rank() == 0: + print_throughput(model.model, args, end - start, + args.global_rank) + + # Evaluate rewards on the validation set. 
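For reference, the loss computed in both evaluation() and the training loop above is the label-smoothed ("conservative") DPO objective from the trainer referenced earlier in this file. With beta = args.beta, eps = args.label_smoothing, sigma the logistic sigmoid, and y_w / y_l the chosen / rejected responses, what the code evaluates per preference pair can be written as

    \Delta = \bigl(\log \pi_\theta(y_w \mid x) - \log \pi_{\mathrm{ref}}(y_w \mid x)\bigr)
           - \bigl(\log \pi_\theta(y_l \mid x) - \log \pi_{\mathrm{ref}}(y_l \mid x)\bigr)

    \mathcal{L} = -(1 - \varepsilon)\,\log \sigma(\beta \Delta) - \varepsilon\,\log \sigma(-\beta \Delta)

The logged "chosen" and "rejected" rewards are the batch means of \beta\,(\log \pi_\theta(y \mid x) - \log \pi_{\mathrm{ref}}(y \mid x)) for the chosen and rejected responses respectively.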
+ print_rank_0( + f"***** Evaluating rewards, Epoch {epoch+1}/{args.num_train_epochs} *****", + args.global_rank) + chosen_rewards, rejected_rewards, eval_loss = evaluation( + model, ref_model, tokenizer, eval_dataloader) + print_rank_0( + f"chosen: {chosen_rewards}, rejected: {rejected_rewards}, loss: {eval_loss}", + args.global_rank) + model.tput_timer.update_epoch_count() + + if args.output_dir is not None: + print_rank_0('saving the final model ...', args.global_rank) + model = convert_lora_to_linear_layer(model) + + if args.global_rank == 0: + save_hf_format(model, tokenizer, args) + + if args.zero_stage == 3: + # For zero stage 3, each gpu only has a part of the model, so we need a special save function + save_zero_three_model(model, + args.global_rank, + args.output_dir, + zero_stage=args.zero_stage) + + +if __name__ == "__main__": + main() diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log new file mode 100644 index 000000000..2f75b0a5d --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log @@ -0,0 +1,6409 @@ +[2025-01-02 15:45:19,529] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:22,160] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2025-01-02 15:45:22,160] [INFO] [runner.py:607:main] cmd = .venv/dsexamples/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf --data_split 2,4,4 --model_name_or_path facebook/opt-350m --per_device_train_batch_size 2 --per_device_eval_batch_size 4 --max_seq_len 512 --learning_rate 5e-5 --weight_decay 0.1 --num_train_epochs 1 --dropout 0.0 --gradient_accumulation_steps 2 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 0 --deepspeed --output_dir ./output +[2025-01-02 15:45:24,254] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:26,831] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2025-01-02 15:45:26,831] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=8, node_rank=0 +[2025-01-02 15:45:26,831] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2025-01-02 15:45:26,832] [INFO] [launch.py:164:main] dist_world_size=8 +[2025-01-02 15:45:26,832] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +[2025-01-02 15:45:26,832] [INFO] [launch.py:256:main] process 106031 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=0', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 
15:45:26,833] [INFO] [launch.py:256:main] process 106032 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=1', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,833] [INFO] [launch.py:256:main] process 106033 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=2', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,834] [INFO] [launch.py:256:main] process 106034 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=3', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,834] [INFO] [launch.py:256:main] process 106035 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=4', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,835] [INFO] [launch.py:256:main] process 106036 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=5', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,835] [INFO] [launch.py:256:main] process 106037 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=6', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', 
'2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,836] [INFO] [launch.py:256:main] process 106038 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=7', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:30,732] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:30,910] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,042] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,049] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,103] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,135] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,144] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,147] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:33,071] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:33,584] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:33,584] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[rank3]:[W102 15:45:34.866823429 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,383] [INFO] [comm.py:652:init_distributed] cdb=None +[rank4]:[W102 15:45:34.247342944 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,450] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:34,472] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:34,472] [INFO] [comm.py:652:init_distributed] cdb=None +[rank2]:[W102 15:45:34.334422354 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. 
This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank1]:[W102 15:45:34.339768589 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank6]:[W102 15:45:34.340404849 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,557] [INFO] [comm.py:652:init_distributed] cdb=None +[rank7]:[W102 15:45:34.425895009 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,604] [INFO] [comm.py:652:init_distributed] cdb=None +[rank5]:[W102 15:45:34.470924200 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank0]:[W102 15:45:34.726812378 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. 
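The repeated ProcessGroupNCCL warnings above are emitted because the first torch.distributed.barrier() is issued before the NCCL process group has been associated with a specific GPU, so NCCL falls back to guessing the device from the rank index; in this run the guessed mapping was correct and the log continues normally. If one wanted to silence the warning, its own suggestion could be followed along the lines of this sketch (hypothetical tweak, not part of the patch; assumes an already-initialized NCCL process group):

    import torch.distributed as dist

    def first_barrier(local_rank: int) -> None:
        # Pin the barrier to the local GPU so NCCL does not have to guess the
        # rank -> device mapping (device_ids is honoured by the NCCL backend).
        if local_rank >= 0:
            dist.barrier(device_ids=[local_rank])
        else:
            dist.barrier()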
+Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. 
This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
+ return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. 
+ warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.05063033103942871 seconds +[2025-01-02 15:46:44,140] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.04431414604187012 seconds +[2025-01-02 15:46:44,304] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.11332392692565918 seconds +[2025-01-02 15:46:44,637] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.2, git-hash=unknown, git-branch=unknown +[2025-01-02 15:46:44,637] [INFO] [comm.py:677:init_distributed] Distributed backend already initialized +[2025-01-02 15:46:44,637] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.10501360893249512 seconds +[2025-01-02 15:46:45,431] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... 
+.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.033315420150756836 seconds +[2025-01-02 15:46:48,423] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.1015481948852539 seconds +[2025-01-02 15:46:49,014] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.13011908531188965 seconds +[2025-01-02 15:46:50,464] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.20447063446044922 seconds +[2025-01-02 15:46:50,545] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,765] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2025-01-02 15:46:55,766] [INFO] [logging.py:128:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2025-01-02 15:46:55,767] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-01-02 15:46:55,785] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2025-01-02 15:46:55,785] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 optimizer with dynamic loss scale +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = FP16_Optimizer +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05, 5e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:46:55,968] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] amp_params ................... False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] bfloat16_enabled ............. False +[2025-01-02 15:46:55,969] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] comms_config ................. +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-01-02 15:46:55,969] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dump_state ................... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_auto_cast ............... 
False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_enabled ................. True +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 2 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 65536 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] loss_scale ................... 0 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-01-02 15:46:55,971] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='step2_tensorboard/ds_tensorboard_logs/', job_name='step2_model_tensorboard') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_name ............... None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_params ............. None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pld_params ................... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-01-02 15:46:55,971] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] sparse_attention ............. 
None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] steps_per_print .............. 10 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] train_batch_size ............. 32 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 2 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] world_size ................... 8 +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=9223372036854775807 max_live_parameters=30000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_enabled ................. False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_optimization_stage ...... 
0 +[2025-01-02 15:46:55,972] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,972] [INFO] [config.py:989:print_user_config] json = { + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 2, + "steps_per_print": 10, + "zero_optimization": { + "stage": 0, + "overlap_comm": true, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "stage3_param_persistence_threshold": 1.000000e+04, + "stage3_max_live_parameters": 3.000000e+07, + "stage3_prefetch_bucket_size": 3.000000e+07, + "memory_efficient_linear": false + }, + "fp16": { + "enabled": true, + "loss_scale_window": 100 + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false, + "hybrid_engine": { + "enabled": false, + "max_out_tokens": 512, + "inference_tp_size": 1, + "release_inference_cache": false, + "pin_parameters": true, + "tp_gather_partition_size": 8 + }, + "tensorboard": { + "enabled": false, + "output_path": "step2_tensorboard/ds_tensorboard_logs/", + "job_name": "step2_model_tensorboard" + } +} +[2025-01-02 15:46:55,972] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.2, git-hash=unknown, git-branch=unknown +[2025-01-02 15:46:55,972] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,978] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:56,023] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:47:02,563] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2025-01-02 15:47:02,565] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] amp_params ................... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] bfloat16_enabled ............. 
False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] comms_config ................. +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dump_state ................... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_verbose ........... 
False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_auto_cast ............... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_enabled ................. True +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 2 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 65536 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] loss_scale ................... 0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_name ............... None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_params ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pld_params ................... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] prescale_gradients ........... 
False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] steps_per_print .............. 10 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] train_batch_size ............. 32 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 2 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] world_size ................... 8 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=10000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_enabled ................. False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_optimization_stage ...... 
0 +[2025-01-02 15:47:02,568] [INFO] [config.py:989:print_user_config] json = { + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 2, + "steps_per_print": 10, + "zero_optimization": { + "stage": 0, + "stage3_param_persistence_threshold": 1.000000e+04, + "offload_param": { + "device": "none" + }, + "memory_efficient_linear": false + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false +} +***** Running training ***** +***** Evaluating rewards, Epoch 1/1 ***** +chosen: 0.0, rejected: 0.0, loss: 0.693359375 +Beginning of Epoch 1/1, Total Micro Batches 4708 +Model Parameters: 0.331 B, Latency: 0.27s, TFLOPs: 1.02, Samples/sec: 7.28, Time/seq 0.14s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:32,711] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,711] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale fr[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,713] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 65536, reducing to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,713] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.06, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.10s, TFLOPs: 2.70, Samples/sec: 19.26, Time/seq 0.05s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale fr[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +Model Parameters: 0.331 B, Latency: 0.52s, TFLOPs: 0.54, Samples/sec: 3.84, Time/seq 0.26s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 16384.0, reducing to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.62s, TFLOPs: 0.46, Samples/sec: 3.25, Time/seq 0.31s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 
+[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,841] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=4, lr=[4.999919851200522e-05, 4.999919851200522e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:38,842] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=46.98077573584925, CurrSamplesPerSec=50.86885115471782, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:45,728] [INFO] [logging.py:128:log_dist] [Rank 0] 
step=20, skipped=4, lr=[4.999430071591966e-05, 4.999430071591966e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:45,746] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=46.82057321463456, CurrSamplesPerSec=46.3240790591193, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:52,687] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=4, lr=[4.998495126612987e-05, 4.998495126612987e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:52,708] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=46.657677501465244, CurrSamplesPerSec=46.740073109032714, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:59,588] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=4, lr=[4.9971151827835975e-05, 4.9971151827835975e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:59,608] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=46.67284201254282, CurrSamplesPerSec=46.70549424381561, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, 
Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:06,507] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=4, lr=[4.995290485881111e-05, 4.995290485881111e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:06,528] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=46.658721893844756, CurrSamplesPerSec=46.780295098825675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:13,403] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=4, lr=[4.993021360896366e-05, 4.993021360896366e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:13,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=46.67496477850472, CurrSamplesPerSec=46.74603117536787, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:20,321] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=4, lr=[4.99030821197584e-05, 4.99030821197584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:20,340] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=46.66552508644222, CurrSamplesPerSec=46.58816877398781, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] 
[fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.08, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:27,157] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=5, lr=[4.987487135239265e-05, 4.987487135239265e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:27,178] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=46.72639900113086, CurrSamplesPerSec=46.69866909880398, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, 
Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:34,057] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=5, lr=[4.983931737433311e-05, 4.983931737433311e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:34,078] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=46.72344397631967, CurrSamplesPerSec=46.670138855517386, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:40,995] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=5, lr=[4.979933934614882e-05, 4.979933934614882e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:41,016] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=46.69784876342507, CurrSamplesPerSec=46.6644759276489, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:47,897] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=5, lr=[4.9754944388196535e-05, 4.9754944388196535e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:47,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=46.697522740072756, CurrSamplesPerSec=46.6885325659028, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:54,803] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=5, lr=[4.970614040751798e-05, 4.970614040751798e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:54,823] [INFO] [timer.py:264:stop] epoch=0/micro_step=240/global_step=120, RunningAvgSamplesPerSec=46.69541297477131, CurrSamplesPerSec=46.75664876248707, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 
0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:01,714] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=5, lr=[4.96529360964316e-05, 4.96529360964316e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:01,734] [INFO] [timer.py:264:stop] epoch=0/micro_step=260/global_step=130, RunningAvgSamplesPerSec=46.69230036582147, CurrSamplesPerSec=46.71986606957944, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:08,600] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=6, lr=[4.9601297749741036e-05, 4.9601297749741036e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:49:08,620] [INFO] [timer.py:264:stop] epoch=0/micro_step=280/global_step=140, RunningAvgSamplesPerSec=46.701067188567556, CurrSamplesPerSec=46.34668162363741, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:15,555] [INFO] [logging.py:128:log_dist] [Rank 0] step=150, skipped=6, lr=[4.9539759563783176e-05, 4.9539759563783176e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:49:15,575] [INFO] [timer.py:264:stop] epoch=0/micro_step=300/global_step=150, RunningAvgSamplesPerSec=46.676625606909724, CurrSamplesPerSec=46.37509004775261, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:22,519] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=6, lr=[4.947385068096907e-05, 4.947385068096907e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:49:22,537] [INFO] [timer.py:264:stop] epoch=0/micro_step=320/global_step=160, RunningAvgSamplesPerSec=46.652045629756806, CurrSamplesPerSec=45.754688762495974, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:29,477] [INFO] [logging.py:128:log_dist] [Rank 0] step=170, skipped=6, lr=[4.940358284011574e-05, 4.940358284011574e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:49:29,498] [INFO] [timer.py:264:stop] epoch=0/micro_step=340/global_step=170, RunningAvgSamplesPerSec=46.63235423832134, CurrSamplesPerSec=46.33578549044944, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:36,430] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=6, lr=[4.9328968556400026e-05, 4.9328968556400026e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:49:36,451] [INFO] [timer.py:264:stop] epoch=0/micro_step=360/global_step=180, RunningAvgSamplesPerSec=46.617390821416194, CurrSamplesPerSec=46.339240977849904, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:43,383] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, skipped=6, lr=[4.9250021119129636e-05, 4.9250021119129636e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:49:43,403] [INFO] [timer.py:264:stop] epoch=0/micro_step=380/global_step=190, RunningAvgSamplesPerSec=46.60359803968563, CurrSamplesPerSec=46.34333703776871, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:50,336] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=6, lr=[4.916675458937614e-05, 4.916675458937614e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:49:50,356] [INFO] [timer.py:264:stop] epoch=0/micro_step=400/global_step=200, RunningAvgSamplesPerSec=46.59140057519339, CurrSamplesPerSec=46.578597409153254, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale]
+Grad overflow on iteration 208
+[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0
+[2025-01-02 15:49:56,558] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 1024.0, reducing to 512.0
+Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:49:57,226] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=7, lr=[4.908813412994094e-05, 4.908813412994094e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:49:57,247] [INFO] [timer.py:264:stop] epoch=0/micro_step=420/global_step=210, RunningAvgSamplesPerSec=46.60008012821061, CurrSamplesPerSec=46.86441592162194, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:04,214] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=7, lr=[4.899670281569845e-05, 4.899670281569845e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:04,232] [INFO] [timer.py:264:stop] epoch=0/micro_step=440/global_step=220, RunningAvgSamplesPerSec=46.580701736622736, CurrSamplesPerSec=45.654661323911306, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:11,163] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=7, lr=[4.890099752667294e-05, 4.890099752667294e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:11,183] [INFO] [timer.py:264:stop] epoch=0/micro_step=460/global_step=230, RunningAvgSamplesPerSec=46.57159456525261, CurrSamplesPerSec=46.32703709314177, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:18,110] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=7, lr=[4.880103530862256e-05, 4.880103530862256e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:18,131] [INFO] [timer.py:264:stop] epoch=0/micro_step=480/global_step=240, RunningAvgSamplesPerSec=46.563629676576625, CurrSamplesPerSec=46.309614118584825, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:25,065] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=7, lr=[4.86968339654932e-05, 4.86968339654932e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:25,086] [INFO] [timer.py:264:stop] epoch=0/micro_step=500/global_step=250, RunningAvgSamplesPerSec=46.554810311450424, CurrSamplesPerSec=46.38267044164213, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:32,025] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=7, lr=[4.858841205624759e-05, 4.858841205624759e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:32,046] [INFO] [timer.py:264:stop] epoch=0/micro_step=520/global_step=260, RunningAvgSamplesPerSec=46.54628059347242, CurrSamplesPerSec=46.90926209286344, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:38,997] [INFO] [logging.py:128:log_dist] [Rank 0] step=270, skipped=7, lr=[4.8475788891559783e-05, 4.8475788891559783e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:39,018] [INFO] [timer.py:264:stop] epoch=0/micro_step=540/global_step=270, RunningAvgSamplesPerSec=46.535565503469485, CurrSamplesPerSec=46.32256021546509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:45,948] [INFO] [logging.py:128:log_dist] [Rank 0] step=280, skipped=7, lr=[4.835898453037574e-05, 4.835898453037574e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:45,969] [INFO] [timer.py:264:stop] epoch=0/micro_step=560/global_step=280, RunningAvgSamplesPerSec=46.52910793247543, CurrSamplesPerSec=46.34376908630242, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:52,901] [INFO] [logging.py:128:log_dist] [Rank 0] step=290, skipped=7, lr=[4.823801977634082e-05, 4.823801977634082e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:52,922] [INFO] [timer.py:264:stop] epoch=0/micro_step=580/global_step=290, RunningAvgSamplesPerSec=46.52320489698547, CurrSamplesPerSec=46.32582185506885, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:50:59,855] [INFO] [logging.py:128:log_dist] [Rank 0] step=300, skipped=7, lr=[4.811291617409437e-05, 4.811291617409437e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:50:59,875] [INFO] [timer.py:264:stop] epoch=0/micro_step=600/global_step=300, RunningAvgSamplesPerSec=46.51706229660043, CurrSamplesPerSec=46.36179428641033, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:51:06,787] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations
+[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0
+[2025-01-02 15:51:06,807] [INFO] [logging.py:128:log_dist] [Rank 0] step=310, skipped=7, lr=[4.7983696005432587e-05, 4.7983696005432587e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:51:06,827] [INFO] [timer.py:264:stop] epoch=0/micro_step=620/global_step=310, RunningAvgSamplesPerSec=46.511774321108796, CurrSamplesPerSec=46.36275517013998, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:51:13,760] [INFO] [logging.py:128:log_dist] [Rank 0] step=320, skipped=7, lr=[4.7850382285339924e-05, 4.7850382285339924e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:51:13,780] [INFO] [timer.py:264:stop] epoch=0/micro_step=640/global_step=320, RunningAvgSamplesPerSec=46.50668977864453, CurrSamplesPerSec=46.34026492494737, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 15:51:20,713] [INFO] [logging.py:128:log_dist] [Rank 0] step=330, skipped=7, lr=[4.771299875788999e-05, 4.771299875788999e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:20,734] [INFO] [timer.py:264:stop] epoch=0/micro_step=660/global_step=330, RunningAvgSamplesPerSec=46.502571796260625, CurrSamplesPerSec=46.319203124992384, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:27,664] [INFO] [logging.py:128:log_dist] [Rank 0] step=340, skipped=7, lr=[4.7571569892016555e-05, 4.7571569892016555e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:27,685] [INFO] [timer.py:264:stop] epoch=0/micro_step=680/global_step=340, RunningAvgSamplesPerSec=46.49864772289161, CurrSamplesPerSec=46.32430289713105, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:34,611] [INFO] [logging.py:128:log_dist] [Rank 0] step=350, skipped=7, lr=[4.742612087715547e-05, 4.742612087715547e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:34,631] [INFO] [timer.py:264:stop] epoch=0/micro_step=700/global_step=350, RunningAvgSamplesPerSec=46.495692632814894, CurrSamplesPerSec=46.390253314093556, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:41,565] [INFO] [logging.py:128:log_dist] [Rank 0] step=360, skipped=7, lr=[4.727667761875828e-05, 4.727667761875828e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:41,586] [INFO] [timer.py:264:stop] epoch=0/micro_step=720/global_step=360, RunningAvgSamplesPerSec=46.491304442529376, CurrSamplesPerSec=46.325166292582765, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:48,518] [INFO] [logging.py:128:log_dist] [Rank 0] step=370, skipped=7, lr=[4.712326673367824e-05, 4.712326673367824e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:48,538] [INFO] [timer.py:264:stop] epoch=0/micro_step=740/global_step=370, RunningAvgSamplesPerSec=46.487412684576384, CurrSamplesPerSec=46.33804109742901, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:55,472] [INFO] [logging.py:128:log_dist] [Rank 0] step=380, skipped=7, lr=[4.696591554542973e-05, 4.696591554542973e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:55,493] [INFO] [timer.py:264:stop] epoch=0/micro_step=760/global_step=380, RunningAvgSamplesPerSec=46.48344060347751, CurrSamplesPerSec=46.31380082968855, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:02,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=390, skipped=7, lr=[4.6804652079321726e-05, 4.6804652079321726e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:02,441] [INFO] [timer.py:264:stop] epoch=0/micro_step=780/global_step=390, RunningAvgSamplesPerSec=46.48118928657953, CurrSamplesPerSec=46.933735963905434, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:09,377] [INFO] [logging.py:128:log_dist] [Rank 0] step=400, skipped=7, lr=[4.663950505746629e-05, 4.663950505746629e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:09,398] [INFO] [timer.py:264:stop] epoch=0/micro_step=800/global_step=400, RunningAvgSamplesPerSec=46.47709234422624, CurrSamplesPerSec=46.35471700350897, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:16,305] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] 
Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,307] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,309] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,309] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,312] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,312] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,325] [INFO] [logging.py:128:log_dist] [Rank 0] step=410, skipped=7, lr=[4.6470503893662995e-05, 4.6470503893662995e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:16,345] [INFO] [timer.py:264:stop] epoch=0/micro_step=820/global_step=410, RunningAvgSamplesPerSec=46.47536079054126, CurrSamplesPerSec=46.36307547356772, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:23,279] [INFO] [logging.py:128:log_dist] [Rank 0] step=420, skipped=7, lr=[4.6297678688160096e-05, 4.6297678688160096e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:23,299] [INFO] [timer.py:264:stop] epoch=0/micro_step=840/global_step=420, RunningAvgSamplesPerSec=46.47249330297309, CurrSamplesPerSec=46.40106279118714, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:30,233] [INFO] [logging.py:128:log_dist] [Rank 0] step=430, skipped=7, lr=[4.612106022229352e-05, 4.612106022229352e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:30,254] [INFO] [timer.py:264:stop] epoch=0/micro_step=860/global_step=430, RunningAvgSamplesPerSec=46.469326490503796, CurrSamplesPerSec=46.32311977785527, MemAllocated=5.53GB, MaxMemAllocated=8.61GB 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:37,190] [INFO] [logging.py:128:log_dist] [Rank 0] step=440, skipped=7, lr=[4.594067995300447e-05, 4.594067995300447e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:37,210] [INFO] [timer.py:264:stop] epoch=0/micro_step=880/global_step=440, RunningAvgSamplesPerSec=46.466282683995, CurrSamplesPerSec=46.33493769496381, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:44,142] [INFO] [logging.py:128:log_dist] [Rank 0] step=450, skipped=7, lr=[4.57565700072367e-05, 4.57565700072367e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:44,162] [INFO] [timer.py:264:stop] epoch=0/micro_step=900/global_step=450, RunningAvgSamplesPerSec=46.463773986613354, CurrSamplesPerSec=46.314376162125654, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:51,093] [INFO] [logging.py:128:log_dist] [Rank 0] step=460, skipped=7, lr=[4.556876317621458e-05, 4.556876317621458e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:51,114] [INFO] [timer.py:264:stop] epoch=0/micro_step=920/global_step=460, RunningAvgSamplesPerSec=46.46132522763121, CurrSamplesPerSec=46.33345012880701, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:58,047] [INFO] [logging.py:128:log_dist] [Rank 0] step=470, skipped=7, lr=[4.5377292909602656e-05, 4.5377292909602656e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:58,068] [INFO] [timer.py:264:stop] epoch=0/micro_step=940/global_step=470, RunningAvgSamplesPerSec=46.45878779560237, CurrSamplesPerSec=46.43445319841673, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:04,937] [INFO] [logging.py:128:log_dist] [Rank 0] step=480, skipped=8, lr=[4.520186560426292e-05, 4.520186560426292e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:04,958] [INFO] [timer.py:264:stop] epoch=0/micro_step=960/global_step=480, RunningAvgSamplesPerSec=46.4652574508203, CurrSamplesPerSec=46.38100350402843, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:11,893] [INFO] [logging.py:128:log_dist] [Rank 0] step=490, skipped=8, lr=[4.5003529295830075e-05, 4.5003529295830075e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:11,913] [INFO] [timer.py:264:stop] epoch=0/micro_step=980/global_step=490, RunningAvgSamplesPerSec=46.46303714998046, CurrSamplesPerSec=46.34392910632221, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+...
+[2025-01-02 15:53:18,845] [INFO] [logging.py:128:log_dist] [Rank 0] step=500, skipped=8, lr=[4.4801630223777665e-05, 4.4801630223777665e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:53:18,865] [INFO] [timer.py:264:stop] epoch=0/micro_step=1000/global_step=500, RunningAvgSamplesPerSec=46.46079008466478, CurrSamplesPerSec=46.36059323775675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:53:25,798] [INFO] [logging.py:128:log_dist] [Rank 0] step=510, skipped=8, lr=[4.459620434769351e-05, 4.459620434769351e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:53:25,819] [INFO] [timer.py:264:stop] epoch=0/micro_step=1020/global_step=510, RunningAvgSamplesPerSec=46.45906494240357, CurrSamplesPerSec=46.75834281138947, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:53:32,751] [INFO] [logging.py:128:log_dist] [Rank 0] step=520, skipped=8, lr=[4.438728825531305e-05, 4.438728825531305e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:53:32,772] [INFO] [timer.py:264:stop] epoch=0/micro_step=1040/global_step=520, RunningAvgSamplesPerSec=46.45683775721524, CurrSamplesPerSec=46.369770828201965, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:53:39,703] [INFO] [logging.py:128:log_dist] [Rank 0] step=530, skipped=8, lr=[4.417491915600285e-05, 4.417491915600285e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:53:39,724] [INFO] [timer.py:264:stop] epoch=0/micro_step=1060/global_step=530, RunningAvgSamplesPerSec=46.454981721927275, CurrSamplesPerSec=46.67327109354972, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:53:46,661] [INFO] [logging.py:128:log_dist] [Rank 0] step=540, skipped=8, lr=[4.395913487413324e-05, 4.395913487413324e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:53:46,681] [INFO] [timer.py:264:stop] epoch=0/micro_step=1080/global_step=540, RunningAvgSamplesPerSec=46.45267445975501, CurrSamplesPerSec=46.34778592305895, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:53:53,612] [INFO] [logging.py:128:log_dist] [Rank 0] step=550, skipped=8, lr=[4.37399738423417e-05, 4.37399738423417e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:53:53,632] [INFO] [timer.py:264:stop] epoch=0/micro_step=1100/global_step=550, RunningAvgSamplesPerSec=46.45093776947002, CurrSamplesPerSec=46.396587634057866, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:00,566] [INFO] [logging.py:128:log_dist] [Rank 0] step=560, skipped=8, lr=[4.351747509468763e-05, 4.351747509468763e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:00,586] [INFO] [timer.py:264:stop] epoch=0/micro_step=1120/global_step=560, RunningAvgSamplesPerSec=46.44909549888697, CurrSamplesPerSec=46.45555564637373, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:07,514] [INFO] [logging.py:128:log_dist] [Rank 0] step=570, skipped=8, lr=[4.3291678259700163e-05, 4.3291678259700163e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:07,534] [INFO] [timer.py:264:stop] epoch=0/micro_step=1140/global_step=570, RunningAvgSamplesPerSec=46.44810234293339, CurrSamplesPerSec=46.34565739295321, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:10,277] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations
+[2025-01-02 15:54:10,277] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0
+...
+[2025-01-02 15:54:14,467] [INFO] [logging.py:128:log_dist] [Rank 0] step=580, skipped=8, lr=[4.306262355332006e-05, 4.306262355332006e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:14,487] [INFO] [timer.py:264:stop] epoch=0/micro_step=1160/global_step=580, RunningAvgSamplesPerSec=46.446247391835065, CurrSamplesPerSec=46.379192448009945, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:21,419] [INFO] [logging.py:128:log_dist] [Rank 0] step=590, skipped=8, lr=[4.2830351771736965e-05, 4.2830351771736965e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:21,440] [INFO] [timer.py:264:stop] epoch=0/micro_step=1180/global_step=590, RunningAvgSamplesPerSec=46.444511706837304, CurrSamplesPerSec=46.33538558305289, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:28,372] [INFO] [logging.py:128:log_dist] [Rank 0] step=600, skipped=8, lr=[4.259490428412335e-05, 4.259490428412335e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:28,392] [INFO] [timer.py:264:stop] epoch=0/micro_step=1200/global_step=600, RunningAvgSamplesPerSec=46.44315240764414, CurrSamplesPerSec=46.37826293436061, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:35,326] [INFO] [logging.py:128:log_dist] [Rank 0] step=610, skipped=8, lr=[4.235632302526635e-05, 4.235632302526635e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:35,347] [INFO] [timer.py:264:stop] epoch=0/micro_step=1220/global_step=610, RunningAvgSamplesPerSec=46.4417772135784, CurrSamplesPerSec=46.33301827265184, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:42,279] [INFO] [logging.py:128:log_dist] [Rank 0] step=620, skipped=8, lr=[4.2114650488098936e-05, 4.2114650488098936e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:42,299] [INFO] [timer.py:264:stop] epoch=0/micro_step=1240/global_step=620, RunningAvgSamplesPerSec=46.44041818234045, CurrSamplesPerSec=46.349610533055575, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:49,231] [INFO] [logging.py:128:log_dist] [Rank 0] step=630, skipped=8, lr=[4.1869929716131605e-05, 4.1869929716131605e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:49,251] [INFO] [timer.py:264:stop] epoch=0/micro_step=1260/global_step=630, RunningAvgSamplesPerSec=46.43896915559678, CurrSamplesPerSec=46.32174487732559, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:54:54,756] [INFO] [fused_optimizer.py:392:_update_scale]
+Grad overflow on iteration 637
+[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0
+[2025-01-02 15:54:54,757] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0
+...
+[2025-01-02 15:54:56,121] [INFO] [logging.py:128:log_dist] [Rank 0] step=640, skipped=9, lr=[4.164711079369153e-05, 4.164711079369153e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:54:56,141] [INFO] [timer.py:264:stop] epoch=0/micro_step=1280/global_step=640, RunningAvgSamplesPerSec=46.44426552356086, CurrSamplesPerSec=46.362723140040615, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:03,067] [INFO] [logging.py:128:log_dist] [Rank 0] step=650, skipped=9, lr=[4.1396718898658025e-05, 4.1396718898658025e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:03,087] [INFO] [timer.py:264:stop] epoch=0/micro_step=1300/global_step=650, RunningAvgSamplesPerSec=46.443367108838785, CurrSamplesPerSec=46.33520962598524, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:10,025] [INFO] [logging.py:128:log_dist] [Rank 0] step=660, skipped=9, lr=[4.1143406637287735e-05, 4.1143406637287735e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:10,045] [INFO] [timer.py:264:stop] epoch=0/micro_step=1320/global_step=660, RunningAvgSamplesPerSec=46.44134794762593, CurrSamplesPerSec=46.33818507979863, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:16,979] [INFO] [logging.py:128:log_dist] [Rank 0] step=670, skipped=9, lr=[4.088721912620461e-05, 4.088721912620461e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:17,000] [INFO] [timer.py:264:stop] epoch=0/micro_step=1340/global_step=670, RunningAvgSamplesPerSec=46.43998496782961, CurrSamplesPerSec=47.037639814383006, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:23,929] [INFO] [logging.py:128:log_dist] [Rank 0] step=680, skipped=9, lr=[4.0628201994134016e-05, 4.0628201994134016e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:23,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=1360/global_step=680, RunningAvgSamplesPerSec=46.43896855112754, CurrSamplesPerSec=46.311851198280706, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:30,886] [INFO] [logging.py:128:log_dist] [Rank 0] step=690, skipped=9, lr=[4.036640137377588e-05, 4.036640137377588e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:30,907] [INFO] [timer.py:264:stop] epoch=0/micro_step=1380/global_step=690, RunningAvgSamplesPerSec=46.437531826930254, CurrSamplesPerSec=45.93531883513, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:37,833] [INFO] [logging.py:128:log_dist] [Rank 0] step=700, skipped=9, lr=[4.010186389358825e-05, 4.010186389358825e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:37,854] [INFO] [timer.py:264:stop] epoch=0/micro_step=1400/global_step=700, RunningAvgSamplesPerSec=46.436798376271994, CurrSamplesPerSec=46.37616365270356, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:44,791] [INFO] [logging.py:128:log_dist] [Rank 0] step=710, skipped=9, lr=[3.983463666948233e-05, 3.983463666948233e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:44,811] [INFO] [timer.py:264:stop] epoch=0/micro_step=1420/global_step=710, RunningAvgSamplesPerSec=46.43521967480529, CurrSamplesPerSec=46.30605121647706, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:51,744] [INFO] [logging.py:128:log_dist] [Rank 0] step=720, skipped=9, lr=[3.9564767296430877e-05, 3.9564767296430877e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:51,765] [INFO] [timer.py:264:stop] epoch=0/micro_step=1440/global_step=720, RunningAvgSamplesPerSec=46.434081332236666, CurrSamplesPerSec=46.35162737441071, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+...
+[2025-01-02 15:55:58,693] [INFO] [logging.py:128:log_dist] [Rank 0] step=730, skipped=9, lr=[3.929230383999124e-05, 3.929230383999124e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 15:55:58,714] [INFO] [timer.py:264:stop] epoch=0/micro_step=1460/global_step=730, RunningAvgSamplesPerSec=46.43326436780265, CurrSamplesPerSec=46.40451197798106, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.54, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,941] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,941] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,942] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,942] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic 
loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,945] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,946] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:05,667] [INFO] [logging.py:128:log_dist] [Rank 0] step=740, skipped=9, lr=[3.901729482774453e-05, 3.901729482774453e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:05,684] [INFO] [timer.py:264:stop] epoch=0/micro_step=1480/global_step=740, RunningAvgSamplesPerSec=46.43108873755449, CurrSamplesPerSec=45.89287960413895, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:12,611] [INFO] 
[logging.py:128:log_dist] [Rank 0] step=750, skipped=9, lr=[3.8739789240652524e-05, 3.8739789240652524e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:12,632] [INFO] [timer.py:264:stop] epoch=0/micro_step=1500/global_step=750, RunningAvgSamplesPerSec=46.430940342580826, CurrSamplesPerSec=46.33469775847788, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:19,572] [INFO] [logging.py:128:log_dist] [Rank 0] step=760, skipped=9, lr=[3.845983650433384e-05, 3.845983650433384e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:19,593] [INFO] [timer.py:264:stop] epoch=0/micro_step=1520/global_step=760, RunningAvgSamplesPerSec=46.42927334106034, CurrSamplesPerSec=46.41226248648767, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:26,522] [INFO] [logging.py:128:log_dist] [Rank 0] step=770, skipped=9, lr=[3.817748648026087e-05, 3.817748648026087e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:26,542] [INFO] [timer.py:264:stop] epoch=0/micro_step=1540/global_step=770, RunningAvgSamplesPerSec=46.42870842332397, CurrSamplesPerSec=46.3644207962945, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:33,474] [INFO] [logging.py:128:log_dist] [Rank 0] step=780, skipped=9, lr=[3.78927894568792e-05, 3.78927894568792e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:33,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=1560/global_step=780, RunningAvgSamplesPerSec=46.427907498021575, CurrSamplesPerSec=46.35952035358448, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:40,426] [INFO] [logging.py:128:log_dist] [Rank 0] step=790, skipped=9, lr=[3.7605796140650764e-05, 3.7605796140650764e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:40,447] [INFO] [timer.py:264:stop] epoch=0/micro_step=1580/global_step=790, RunningAvgSamplesPerSec=46.42688505692321, CurrSamplesPerSec=46.38398484232087, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:47,379] [INFO] [logging.py:128:log_dist] [Rank 0] step=800, skipped=9, lr=[3.73165576470228e-05, 3.73165576470228e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:47,399] [INFO] [timer.py:264:stop] epoch=0/micro_step=1600/global_step=800, RunningAvgSamplesPerSec=46.42593307965039, CurrSamplesPerSec=46.31824404565779, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:54,330] [INFO] [logging.py:128:log_dist] [Rank 0] step=810, skipped=9, lr=[3.70251254913238e-05, 3.70251254913238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:54,350] [INFO] [timer.py:264:stop] epoch=0/micro_step=1620/global_step=810, RunningAvgSamplesPerSec=46.42510739233998, CurrSamplesPerSec=46.36661512111462, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:01,282] [INFO] [logging.py:128:log_dist] [Rank 0] step=820, skipped=9, lr=[3.673155157958827e-05, 3.673155157958827e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:01,303] [INFO] [timer.py:264:stop] epoch=0/micro_step=1640/global_step=820, RunningAvgSamplesPerSec=46.42418767305092, CurrSamplesPerSec=46.3146478518564, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, 
Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:08,234] [INFO] [logging.py:128:log_dist] [Rank 0] step=830, skipped=9, lr=[3.6435888199311916e-05, 3.6435888199311916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:08,254] [INFO] [timer.py:264:stop] epoch=0/micro_step=1660/global_step=830, RunningAvgSamplesPerSec=46.42334132276314, CurrSamplesPerSec=46.34514529458703, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:14,472] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,474] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,475] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,479] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,479] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:15,187] [INFO] [logging.py:128:log_dist] [Rank 0] step=840, 
skipped=9, lr=[3.6138188010138916e-05, 3.6138188010138916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:15,208] [INFO] [timer.py:264:stop] epoch=0/micro_step=1680/global_step=840, RunningAvgSamplesPerSec=46.42245676253584, CurrSamplesPerSec=46.35514926425546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:22,140] [INFO] [logging.py:128:log_dist] [Rank 0] step=850, skipped=9, lr=[3.583850403448287e-05, 3.583850403448287e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:22,161] [INFO] [timer.py:264:stop] epoch=0/micro_step=1700/global_step=850, RunningAvgSamplesPerSec=46.42177760625656, CurrSamplesPerSec=46.53673661663851, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:29,092] [INFO] [logging.py:128:log_dist] [Rank 0] step=860, skipped=9, lr=[3.5536889648083114e-05, 3.5536889648083114e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:29,113] [INFO] [timer.py:264:stop] epoch=0/micro_step=1720/global_step=860, RunningAvgSamplesPerSec=46.421073151354676, CurrSamplesPerSec=46.339704948271745, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:36,046] [INFO] [logging.py:128:log_dist] [Rank 0] step=870, skipped=9, lr=[3.523339857049819e-05, 3.523339857049819e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:36,066] [INFO] [timer.py:264:stop] epoch=0/micro_step=1740/global_step=870, RunningAvgSamplesPerSec=46.420195916054574, CurrSamplesPerSec=46.33840905415208, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss 
scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:42,936] [INFO] [logging.py:128:log_dist] [Rank 0] step=880, skipped=10, lr=[3.495869669843086e-05, 3.495869669843086e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:42,956] [INFO] [timer.py:264:stop] epoch=0/micro_step=1760/global_step=880, RunningAvgSamplesPerSec=46.42414880906899, CurrSamplesPerSec=46.31806821875464, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:49,887] [INFO] [logging.py:128:log_dist] [Rank 0] step=890, skipped=10, lr=[3.4651789094342044e-05, 3.4651789094342044e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:49,907] [INFO] [timer.py:264:stop] epoch=0/micro_step=1780/global_step=890, RunningAvgSamplesPerSec=46.42345409193999, CurrSamplesPerSec=46.35346829552468, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:56,836] [INFO] [logging.py:128:log_dist] [Rank 0] step=900, skipped=10, lr=[3.434316244145236e-05, 3.434316244145236e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:56,857] [INFO] [timer.py:264:stop] epoch=0/micro_step=1800/global_step=900, RunningAvgSamplesPerSec=46.42303656570756, CurrSamplesPerSec=46.4751966952545, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:03,784] [INFO] [logging.py:128:log_dist] [Rank 0] step=910, skipped=10, lr=[3.403287170825234e-05, 3.403287170825234e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:03,805] [INFO] [timer.py:264:stop] epoch=0/micro_step=1820/global_step=910, RunningAvgSamplesPerSec=46.42259453897555, CurrSamplesPerSec=46.36161812871359, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:10,733] [INFO] [logging.py:128:log_dist] [Rank 0] step=920, skipped=10, lr=[3.3720972159616496e-05, 3.3720972159616496e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:10,751] [INFO] [timer.py:264:stop] epoch=0/micro_step=1840/global_step=920, RunningAvgSamplesPerSec=46.42241240368338, CurrSamplesPerSec=46.487559281055454, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:17,702] [INFO] [logging.py:128:log_dist] [Rank 0] step=930, skipped=10, lr=[3.340751934696017e-05, 
3.340751934696017e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:17,722] [INFO] [timer.py:264:stop] epoch=0/micro_step=1860/global_step=930, RunningAvgSamplesPerSec=46.420715482541, CurrSamplesPerSec=46.42123576943527, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:24,619] [INFO] [logging.py:128:log_dist] [Rank 0] step=940, skipped=10, lr=[3.309256909834556e-05, 3.309256909834556e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:24,639] [INFO] [timer.py:264:stop] epoch=0/micro_step=1880/global_step=940, RunningAvgSamplesPerSec=46.42255350808398, CurrSamplesPerSec=46.7765126862402, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:31,518] [INFO] [logging.py:128:log_dist] [Rank 0] step=950, skipped=10, lr=[3.2776177508538304e-05, 3.2776177508538304e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:31,538] [INFO] [timer.py:264:stop] epoch=0/micro_step=1900/global_step=950, RunningAvgSamplesPerSec=46.42571670265899, CurrSamplesPerSec=46.77811035486935, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 
0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:38,418] [INFO] [logging.py:128:log_dist] [Rank 0] step=960, skipped=10, lr=[3.245840092901662e-05, 3.245840092901662e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:38,439] [INFO] [timer.py:264:stop] epoch=0/micro_step=1920/global_step=960, RunningAvgSamplesPerSec=46.428712246785175, CurrSamplesPerSec=46.81064173887472, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:45,316] [INFO] [logging.py:128:log_dist] [Rank 0] step=970, skipped=10, lr=[3.213929595793479e-05, 3.213929595793479e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:45,336] [INFO] [timer.py:264:stop] epoch=0/micro_step=1940/global_step=970, RunningAvgSamplesPerSec=46.431706559142256, CurrSamplesPerSec=46.806527953026816, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:47,365] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,365] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] 
[fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,367] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.59, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:52,204] [INFO] [logging.py:128:log_dist] [Rank 0] step=980, skipped=10, lr=[3.1818919430042524e-05, 3.1818919430042524e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:52,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=1960/global_step=980, RunningAvgSamplesPerSec=46.43535870004616, CurrSamplesPerSec=46.78776389902927, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:59,091] [INFO] [logging.py:128:log_dist] [Rank 0] step=990, skipped=10, lr=[3.1497328406562476e-05, 3.1497328406562476e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:59,112] [INFO] [timer.py:264:stop] epoch=0/micro_step=1980/global_step=990, RunningAvgSamplesPerSec=46.43904306339881, CurrSamplesPerSec=46.83297000125497, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.94, Samples/sec: 13.81, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:05,986] [INFO] [logging.py:128:log_dist] [Rank 0] step=1000, skipped=10, lr=[3.117458016502711e-05, 3.117458016502711e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:06,006] [INFO] [timer.py:264:stop] epoch=0/micro_step=2000/global_step=1000, RunningAvgSamplesPerSec=46.44231158064333, CurrSamplesPerSec=46.81330302519387, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.53, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:12,898] [INFO] [logging.py:128:log_dist] [Rank 0] step=1010, skipped=10, lr=[3.0850732189077236e-05, 3.0850732189077236e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:12,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=2020/global_step=1010, RunningAvgSamplesPerSec=46.444083453859605, CurrSamplesPerSec=46.8065442761449, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, 
TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:19,784] [INFO] [logging.py:128:log_dist] [Rank 0] step=1020, skipped=10, lr=[3.05258421582238e-05, 3.05258421582238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:19,804] [INFO] [timer.py:264:stop] epoch=0/micro_step=2040/global_step=1020, RunningAvgSamplesPerSec=46.447488410641995, CurrSamplesPerSec=46.75251188976779, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:26,682] [INFO] [logging.py:128:log_dist] [Rank 0] step=1030, skipped=10, lr=[3.0199967937574774e-05, 3.0199967937574774e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:26,699] [INFO] [timer.py:264:stop] epoch=0/micro_step=2060/global_step=1030, RunningAvgSamplesPerSec=46.45030464457745, CurrSamplesPerSec=46.19032410349784, 
MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:27,333] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,333] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,335] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,335] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,335] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 4096.0, reducing to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.10, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:33,531] [INFO] [logging.py:128:log_dist] [Rank 0] step=1040, skipped=11, lr=[2.9905887623649602e-05, 2.9905887623649602e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:33,551] [INFO] [timer.py:264:stop] epoch=0/micro_step=2080/global_step=1040, RunningAvgSamplesPerSec=46.45609527814709, CurrSamplesPerSec=46.61336089809432, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:40,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=1050, skipped=11, lr=[2.9578303480235774e-05, 2.9578303480235774e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:40,441] [INFO] [timer.py:264:stop] epoch=0/micro_step=2100/global_step=1050, RunningAvgSamplesPerSec=46.45922477354279, CurrSamplesPerSec=46.80225168829745, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, 
TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:47,310] [INFO] [logging.py:128:log_dist] [Rank 0] step=1060, skipped=11, lr=[2.9249903910062116e-05, 2.9249903910062116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:47,330] [INFO] [timer.py:264:stop] epoch=0/micro_step=2120/global_step=1060, RunningAvgSamplesPerSec=46.46228296302194, CurrSamplesPerSec=46.73650876277751, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.54, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:54,242] [INFO] [logging.py:128:log_dist] [Rank 0] step=1070, skipped=11, lr=[2.8920747403309247e-05, 2.8920747403309247e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:54,262] [INFO] [timer.py:264:stop] epoch=0/micro_step=2140/global_step=1070, RunningAvgSamplesPerSec=46.46284313048135, CurrSamplesPerSec=46.67312502129054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:01,069] [INFO] [logging.py:128:log_dist] [Rank 0] step=1080, skipped=12, lr=[2.8623907817398308e-05, 2.8623907817398308e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:01,090] [INFO] [timer.py:264:stop] 
epoch=0/micro_step=2160/global_step=1080, RunningAvgSamplesPerSec=46.469588801539444, CurrSamplesPerSec=46.82241573242371, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:07,965] [INFO] [logging.py:128:log_dist] [Rank 0] step=1090, skipped=12, lr=[2.8293474746020472e-05, 2.8293474746020472e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:07,984] [INFO] [timer.py:264:stop] epoch=0/micro_step=2180/global_step=1090, RunningAvgSamplesPerSec=46.47228214300523, CurrSamplesPerSec=46.75736546050338, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:14,870] [INFO] [logging.py:128:log_dist] [Rank 0] step=1100, skipped=12, lr=[2.7962455084554778e-05, 2.7962455084554778e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:14,890] [INFO] [timer.py:264:stop] epoch=0/micro_step=2200/global_step=1100, RunningAvgSamplesPerSec=46.47416601952744, CurrSamplesPerSec=46.769764555584786, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:21,778] [INFO] [logging.py:128:log_dist] [Rank 0] step=1110, skipped=12, lr=[2.763090778983777e-05, 2.763090778983777e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:21,798] [INFO] [timer.py:264:stop] epoch=0/micro_step=2220/global_step=1110, RunningAvgSamplesPerSec=46.47592055429421, CurrSamplesPerSec=46.91998677298789, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:28,694] [INFO] [logging.py:128:log_dist] [Rank 0] step=1120, skipped=12, lr=[2.729889191268107e-05, 2.729889191268107e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:28,714] [INFO] [timer.py:264:stop] epoch=0/micro_step=2240/global_step=1120, RunningAvgSamplesPerSec=46.477148202453265, CurrSamplesPerSec=46.72287486730245, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:35,589] [INFO] [logging.py:128:log_dist] [Rank 0] step=1130, skipped=12, lr=[2.696646658735396e-05, 2.696646658735396e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:35,609] [INFO] [timer.py:264:stop] epoch=0/micro_step=2260/global_step=1130, RunningAvgSamplesPerSec=46.479514044879025, CurrSamplesPerSec=46.664411030997236, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:42,488] [INFO] [logging.py:128:log_dist] [Rank 0] 
step=1140, skipped=12, lr=[2.6633691021051226e-05, 2.6633691021051226e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:42,509] [INFO] [timer.py:264:stop] epoch=0/micro_step=2280/global_step=1140, RunningAvgSamplesPerSec=46.481591664999485, CurrSamplesPerSec=46.77589321144054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:49,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=1150, skipped=12, lr=[2.6300624483347926e-05, 2.6300624483347926e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:49,440] [INFO] [timer.py:264:stop] epoch=0/micro_step=2300/global_step=1150, RunningAvgSamplesPerSec=46.481755312094066, CurrSamplesPerSec=46.77967552383947, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:56,307] [INFO] [logging.py:128:log_dist] [Rank 0] step=1160, skipped=12, lr=[2.596732629564309e-05, 2.596732629564309e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:56,328] [INFO] [timer.py:264:stop] epoch=0/micro_step=2320/global_step=1160, RunningAvgSamplesPerSec=46.48439949209591, CurrSamplesPerSec=46.79316314116429, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:03,200] [INFO] [logging.py:128:log_dist] [Rank 0] step=1170, skipped=12, lr=[2.56338558205942e-05, 2.56338558205942e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:03,221] [INFO] [timer.py:264:stop] epoch=0/micro_step=2340/global_step=1170, RunningAvgSamplesPerSec=46.48707192862625, CurrSamplesPerSec=46.97179330281597, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:04,561] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,561] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] 
[fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,563] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,563] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,564] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,564] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:10,094] [INFO] [logging.py:128:log_dist] [Rank 0] step=1180, skipped=12, lr=[2.5300272451544234e-05, 2.5300272451544234e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:10,114] [INFO] [timer.py:264:stop] epoch=0/micro_step=2360/global_step=1180, RunningAvgSamplesPerSec=46.48953479190863, CurrSamplesPerSec=46.803459409078194, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, 
Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:16,986] [INFO] [logging.py:128:log_dist] [Rank 0] step=1190, skipped=12, lr=[2.496663560194338e-05, 2.496663560194338e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:17,006] [INFO] [timer.py:264:stop] epoch=0/micro_step=2380/global_step=1190, RunningAvgSamplesPerSec=46.49180947433954, CurrSamplesPerSec=46.80528742936384, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.53, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:19,727] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:23,847] [INFO] [logging.py:128:log_dist] [Rank 0] step=1200, skipped=13, lr=[2.4666365824494565e-05, 2.4666365824494565e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:23,868] [INFO] [timer.py:264:stop] epoch=0/micro_step=2400/global_step=1200, RunningAvgSamplesPerSec=46.49591524184215, CurrSamplesPerSec=46.686599976696954, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:30,757] [INFO] [logging.py:128:log_dist] [Rank 0] step=1210, skipped=13, lr=[2.4332791071488294e-05, 2.4332791071488294e-05], mom=[(0.9, 0.95), (0.9, 
0.95)] +[2025-01-02 16:01:30,777] [INFO] [timer.py:264:stop] epoch=0/micro_step=2420/global_step=1210, RunningAvgSamplesPerSec=46.49732960112968, CurrSamplesPerSec=46.76973196064543, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:37,682] [INFO] [logging.py:128:log_dist] [Rank 0] step=1220, skipped=13, lr=[2.3999335152896784e-05, 2.3999335152896784e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:37,703] [INFO] [timer.py:264:stop] epoch=0/micro_step=2440/global_step=1220, RunningAvgSamplesPerSec=46.497692834200365, CurrSamplesPerSec=46.50880673295993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.18, Samples/sec: 15.49, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.49, Samples/sec: 3.50, Time/seq 0.29s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:44,633] [INFO] [logging.py:128:log_dist] [Rank 0] step=1230, skipped=13, lr=[2.3666057459470436e-05, 2.3666057459470436e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:44,654] [INFO] [timer.py:264:stop] epoch=0/micro_step=2460/global_step=1230, RunningAvgSamplesPerSec=46.49675848239761, CurrSamplesPerSec=46.493952415474645, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 16:01:51,580] [INFO] [logging.py:128:log_dist] [Rank 0] step=1240, skipped=13, lr=[2.3333017350216558e-05, 2.3333017350216558e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:01:51,600] [INFO] [timer.py:264:stop] epoch=0/micro_step=2480/global_step=1240, RunningAvgSamplesPerSec=46.49602569155278, CurrSamplesPerSec=46.5161407241932, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:01:58,509] [INFO] [logging.py:128:log_dist] [Rank 0] step=1250, skipped=13, lr=[2.300027414182708e-05, 2.300027414182708e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:01:58,529] [INFO] [timer.py:264:stop] epoch=0/micro_step=2500/global_step=1250, RunningAvgSamplesPerSec=46.49621684629403, CurrSamplesPerSec=46.497528184225175, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:02:05,438] [INFO] [logging.py:128:log_dist] [Rank 0] step=1260, skipped=13, lr=[2.2667887098113915e-05, 2.2667887098113915e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:02:05,458] [INFO] [timer.py:264:stop] epoch=0/micro_step=2520/global_step=1260, RunningAvgSamplesPerSec=46.496336564272966, CurrSamplesPerSec=46.48627120484043, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:02:12,370] [INFO] [logging.py:128:log_dist] [Rank 0] step=1270, skipped=13, lr=[2.233591541945361e-05, 2.233591541945361e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:02:12,390] [INFO] [timer.py:264:stop] epoch=0/micro_step=2540/global_step=1270, RunningAvgSamplesPerSec=46.49631392383694, CurrSamplesPerSec=46.47926853952197, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:02:19,299] [INFO] [logging.py:128:log_dist] [Rank 0] step=1280, skipped=13, lr=[2.2004418232243425e-05, 2.2004418232243425e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:02:19,320] [INFO] [timer.py:264:stop] epoch=0/micro_step=2560/global_step=1280, RunningAvgSamplesPerSec=46.496336834420134, CurrSamplesPerSec=46.493743040656476, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:02:26,231] [INFO] [logging.py:128:log_dist] [Rank 0] step=1290, skipped=13, lr=[2.1673454578370484e-05, 2.1673454578370484e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:02:26,251] [INFO] [timer.py:264:stop] epoch=0/micro_step=2580/global_step=1290, RunningAvgSamplesPerSec=46.49628521737437, CurrSamplesPerSec=46.5082749070585, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations
+[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0
+[2025-01-02 16:02:33,187] [INFO] [logging.py:128:log_dist] [Rank 0] step=1300, skipped=13, lr=[2.1343083404695983e-05, 2.1343083404695983e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:02:33,205] [INFO] [timer.py:264:stop] epoch=0/micro_step=2600/global_step=1300, RunningAvgSamplesPerSec=46.49535814691071, CurrSamplesPerSec=46.25004359522548, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:02:40,113] [INFO] [logging.py:128:log_dist] [Rank 0] step=1310, skipped=13, lr=[2.101336355255645e-05, 2.101336355255645e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:02:40,134] [INFO] [timer.py:264:stop] epoch=0/micro_step=2620/global_step=1310, RunningAvgSamplesPerSec=46.495657776731484, CurrSamplesPerSec=46.5038918355933, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:02:47,050] [INFO] [logging.py:128:log_dist] [Rank 0] step=1320, skipped=13, lr=[2.0684353747283626e-05, 2.0684353747283626e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:02:47,070] [INFO] [timer.py:264:stop] epoch=0/micro_step=2640/global_step=1320, RunningAvgSamplesPerSec=46.49561196265634, CurrSamplesPerSec=46.49445170149724, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:02:53,981] [INFO] [logging.py:128:log_dist] [Rank 0] step=1330, skipped=13, lr=[2.035611258774508e-05, 2.035611258774508e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:02:54,002] [INFO] [timer.py:264:stop] epoch=0/micro_step=2660/global_step=1330, RunningAvgSamplesPerSec=46.495620556823845, CurrSamplesPerSec=46.64968416069277, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:00,913] [INFO] [logging.py:128:log_dist] [Rank 0] step=1340, skipped=13, lr=[2.0028698535907454e-05, 2.0028698535907454e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:00,934] [INFO] [timer.py:264:stop] epoch=0/micro_step=2680/global_step=1340, RunningAvgSamplesPerSec=46.49557111930108, CurrSamplesPerSec=46.49372693497934, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:07,851] [INFO] [logging.py:128:log_dist] [Rank 0] step=1350, skipped=13, lr=[1.970216990642385e-05, 1.970216990642385e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:07,871] [INFO] [timer.py:264:stop] epoch=0/micro_step=2700/global_step=1350, RunningAvgSamplesPerSec=46.4952625195588, CurrSamplesPerSec=46.51009605835737, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:14,772] [INFO] [logging.py:128:log_dist] [Rank 0] step=1360, skipped=13, lr=[1.9376584856247734e-05, 1.9376584856247734e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:14,793] [INFO] [timer.py:264:stop] epoch=0/micro_step=2720/global_step=1360, RunningAvgSamplesPerSec=46.49589953343763, CurrSamplesPerSec=46.493340402075496, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:21,676] [INFO] [logging.py:128:log_dist] [Rank 0] step=1370, skipped=13, lr=[1.9052001374274694e-05, 1.9052001374274694e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:21,696] [INFO] [timer.py:264:stop] epoch=0/micro_step=2740/global_step=1370, RunningAvgSamplesPerSec=46.497266423476795, CurrSamplesPerSec=46.77222560471587, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:28,578] [INFO] [logging.py:128:log_dist] [Rank 0] step=1380, skipped=13, lr=[1.8728477271014252e-05, 1.8728477271014252e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:28,598] [INFO] [timer.py:264:stop] epoch=0/micro_step=2760/global_step=1380, RunningAvgSamplesPerSec=46.498741093520614, CurrSamplesPerSec=46.613199012037946, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:35,473] [INFO] [logging.py:128:log_dist] [Rank 0] step=1390, skipped=13, lr=[1.8406070168293457e-05, 1.8406070168293457e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:35,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=2780/global_step=1390, RunningAvgSamplesPerSec=46.500554058102814, CurrSamplesPerSec=46.81554004591409, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:38,900] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations
+[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
+[2025-01-02 16:03:42,364] [INFO] [logging.py:128:log_dist] [Rank 0] step=1400, skipped=13, lr=[1.8084837488994006e-05, 1.8084837488994006e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:42,384] [INFO] [timer.py:264:stop] epoch=0/micro_step=2800/global_step=1400, RunningAvgSamplesPerSec=46.50243920837039, CurrSamplesPerSec=46.78748663020288, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:49,284] [INFO] [logging.py:128:log_dist] [Rank 0] step=1410, skipped=13, lr=[1.7764836446824833e-05, 1.7764836446824833e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:49,304] [INFO] [timer.py:264:stop] epoch=0/micro_step=2820/global_step=1410, RunningAvgSamplesPerSec=46.50316609477529, CurrSamplesPerSec=46.82542141733676, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:03:56,173] [INFO] [logging.py:128:log_dist] [Rank 0] step=1420, skipped=13, lr=[1.7446124036132035e-05, 1.7446124036132035e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:03:56,194] [INFO] [timer.py:264:stop] epoch=0/micro_step=2840/global_step=1420, RunningAvgSamplesPerSec=46.50514626952885, CurrSamplesPerSec=46.80305138832927, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:04:03,071] [INFO] [logging.py:128:log_dist] [Rank 0] step=1430, skipped=13, lr=[1.71287570217477e-05, 1.71287570217477e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:04:03,091] [INFO] [timer.py:264:stop] epoch=0/micro_step=2860/global_step=1430, RunningAvgSamplesPerSec=46.50676320685188, CurrSamplesPerSec=46.848777623724516, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:04:07,853] [INFO] [fused_optimizer.py:392:_update_scale]
+Grad overflow on iteration 1436
+[2025-01-02 16:04:07,853] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
+[2025-01-02 16:04:09,908] [INFO] [logging.py:128:log_dist] [Rank 0] step=1440, skipped=14, lr=[1.684432374584351e-05, 1.684432374584351e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:04:09,929] [INFO] [timer.py:264:stop] epoch=0/micro_step=2880/global_step=1440, RunningAvgSamplesPerSec=46.51114454430859, CurrSamplesPerSec=46.99939345444054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:04:16,798] [INFO] [logging.py:128:log_dist] [Rank 0] step=1450, skipped=14, lr=[1.6529668505230238e-05, 1.6529668505230238e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:04:16,818] [INFO] [timer.py:264:stop] epoch=0/micro_step=2900/global_step=1450, RunningAvgSamplesPerSec=46.51307636486264, CurrSamplesPerSec=46.83207123269369, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:04:23,735] [INFO] [logging.py:128:log_dist] [Rank 0] step=1460, skipped=14, lr=[1.6216521887842863e-05, 1.6216521887842863e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:04:23,756] [INFO] [timer.py:264:stop] epoch=0/micro_step=2920/global_step=1460, RunningAvgSamplesPerSec=46.51272734068831, CurrSamplesPerSec=46.46213298539395, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11,
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:30,666] [INFO] [logging.py:128:log_dist] [Rank 0] step=1470, skipped=14, lr=[1.59049396672081e-05, 1.59049396672081e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:30,687] [INFO] [timer.py:264:stop] epoch=0/micro_step=2940/global_step=1470, RunningAvgSamplesPerSec=46.51279988022624, CurrSamplesPerSec=46.513899984808106, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:37,603] [INFO] [logging.py:128:log_dist] [Rank 0] step=1480, skipped=14, lr=[1.5594977338223077e-05, 1.5594977338223077e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:37,622] [INFO] [timer.py:264:stop] 
epoch=0/micro_step=2960/global_step=1480, RunningAvgSamplesPerSec=46.512692069553914, CurrSamplesPerSec=46.78629604259554, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:44,541] [INFO] [logging.py:128:log_dist] [Rank 0] step=1490, skipped=14, lr=[1.528669010727125e-05, 1.528669010727125e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:44,562] [INFO] [timer.py:264:stop] epoch=0/micro_step=2980/global_step=1490, RunningAvgSamplesPerSec=46.51222927903836, CurrSamplesPerSec=46.50253840695117, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:51,473] [INFO] [logging.py:128:log_dist] [Rank 0] step=1500, skipped=14, lr=[1.4980132882389835e-05, 1.4980132882389835e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:51,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=3000/global_step=1500, RunningAvgSamplesPerSec=46.51210125057641, CurrSamplesPerSec=46.50220006209979, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:58,403] [INFO] [logging.py:128:log_dist] [Rank 0] step=1510, skipped=14, lr=[1.4675360263490295e-05, 1.4675360263490295e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:58,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=3020/global_step=1510, RunningAvgSamplesPerSec=46.51201324992706, CurrSamplesPerSec=46.546904191841676, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:05,351] [INFO] [logging.py:128:log_dist] [Rank 0] step=1520, skipped=14, lr=[1.4372426532633664e-05, 1.4372426532633664e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:05,372] [INFO] [timer.py:264:stop] epoch=0/micro_step=3040/global_step=1520, RunningAvgSamplesPerSec=46.51129958829241, CurrSamplesPerSec=46.45904510287717, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 
B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:12,297] [INFO] [logging.py:128:log_dist] [Rank 0] step=1530, skipped=14, lr=[1.4071385644362672e-05, 1.4071385644362672e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:12,318] [INFO] [timer.py:264:stop] epoch=0/micro_step=3060/global_step=1530, RunningAvgSamplesPerSec=46.51070949284205, CurrSamplesPerSec=46.50381127215948, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:17,818] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 
16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,824] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,824] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:19,220] [INFO] [logging.py:128:log_dist] [Rank 0] step=1540, skipped=14, lr=[1.3772291216091954e-05, 1.3772291216091954e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:19,241] [INFO] [timer.py:264:stop] epoch=0/micro_step=3080/global_step=1540, RunningAvgSamplesPerSec=46.5110599747054, CurrSamplesPerSec=46.807442065172594, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, 
TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:26,152] [INFO] [logging.py:128:log_dist] [Rank 0] step=1550, skipped=14, lr=[1.347519651855848e-05, 1.347519651855848e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:26,173] [INFO] [timer.py:264:stop] epoch=0/micro_step=3100/global_step=1550, RunningAvgSamplesPerSec=46.51086523978462, CurrSamplesPerSec=46.47960655075998, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:33,087] [INFO] [logging.py:128:log_dist] [Rank 0] step=1560, skipped=14, lr=[1.3180154466333705e-05, 1.3180154466333705e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:33,108] [INFO] [timer.py:264:stop] epoch=0/micro_step=3120/global_step=1560, RunningAvgSamplesPerSec=46.51062351613674, CurrSamplesPerSec=46.46309803286058, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 
16:05:40,050] [INFO] [logging.py:128:log_dist] [Rank 0] step=1570, skipped=14, lr=[1.2887217608399083e-05, 1.2887217608399083e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:40,071] [INFO] [timer.py:264:stop] epoch=0/micro_step=3140/global_step=1570, RunningAvgSamplesPerSec=46.50958519849321, CurrSamplesPerSec=46.503472908785604, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:46,982] [INFO] [logging.py:128:log_dist] [Rank 0] step=1580, skipped=14, lr=[1.2596438118786732e-05, 1.2596438118786732e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:47,003] [INFO] [timer.py:264:stop] epoch=0/micro_step=3160/global_step=1580, RunningAvgSamplesPerSec=46.50945444674376, CurrSamplesPerSec=46.49416179217857, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.52, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.95, Samples/sec: 13.88, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:53,931] [INFO] [logging.py:128:log_dist] [Rank 0] step=1590, skipped=14, lr=[1.2307867787286942e-05, 1.2307867787286942e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:53,951] [INFO] [timer.py:264:stop] epoch=0/micro_step=3180/global_step=1590, RunningAvgSamplesPerSec=46.50879849785462, CurrSamplesPerSec=46.54303030602095, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:00,889] [INFO] [logging.py:128:log_dist] [Rank 0] step=1600, skipped=14, lr=[1.2021558010224001e-05, 1.2021558010224001e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:00,908] [INFO] [timer.py:264:stop] epoch=0/micro_step=3200/global_step=1600, RunningAvgSamplesPerSec=46.50777483497731, CurrSamplesPerSec=46.43972299579574, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.18, Samples/sec: 15.53, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:07,814] [INFO] [logging.py:128:log_dist] [Rank 0] step=1610, skipped=14, lr=[1.1737559781302185e-05, 1.1737559781302185e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:07,835] [INFO] [timer.py:264:stop] epoch=0/micro_step=3220/global_step=1610, RunningAvgSamplesPerSec=46.50800788060955, CurrSamplesPerSec=46.49611069658042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:14,759] [INFO] [logging.py:128:log_dist] [Rank 0] step=1620, skipped=14, lr=[1.1455923682523475e-05, 1.1455923682523475e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:14,778] [INFO] [timer.py:264:stop] epoch=0/micro_step=3240/global_step=1620, RunningAvgSamplesPerSec=46.507528936833005, CurrSamplesPerSec=45.83951018076546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 16:06:21,714] [INFO] [logging.py:128:log_dist] [Rank 0] step=1630, skipped=14, lr=[1.1176699875178485e-05, 1.1176699875178485e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:06:21,735] [INFO] [timer.py:264:stop] epoch=0/micro_step=3260/global_step=1630, RunningAvgSamplesPerSec=46.5067150409607, CurrSamplesPerSec=46.487349963813216, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations
+[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0
+[2025-01-02 16:06:28,650] [INFO] [logging.py:128:log_dist] [Rank 0] step=1640, skipped=14, lr=[1.0899938090912464e-05, 1.0899938090912464e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:06:28,671] [INFO] [timer.py:264:stop] epoch=0/micro_step=3280/global_step=1640, RunningAvgSamplesPerSec=46.50646437840029, CurrSamplesPerSec=46.486061899197516, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:06:35,580] [INFO] [logging.py:128:log_dist] [Rank 0] step=1650, skipped=14, lr=[1.0625687622867731e-05, 1.0625687622867731e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:06:35,601] [INFO] [timer.py:264:stop] epoch=0/micro_step=3300/global_step=1650, RunningAvgSamplesPerSec=46.506480546092504, CurrSamplesPerSec=46.49532145114602, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:06:40,393] [INFO] [fused_optimizer.py:392:_update_scale]
+Grad overflow on iteration 1656
+[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0
+[2025-01-02 16:06:40,394] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0
+[2025-01-02 16:06:42,454] [INFO] [logging.py:128:log_dist] [Rank 0] step=1660, skipped=15, lr=[1.038104975748232e-05, 1.038104975748232e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:06:42,473] [INFO] [timer.py:264:stop] epoch=0/micro_step=3320/global_step=1660, RunningAvgSamplesPerSec=46.5088144037295, CurrSamplesPerSec=46.2695112142726, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:06:49,394] [INFO] [logging.py:128:log_dist] [Rank 0] step=1670, skipped=15, lr=[1.011170498391135e-05, 1.011170498391135e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:06:49,413] [INFO] [timer.py:264:stop] epoch=0/micro_step=3340/global_step=1670, RunningAvgSamplesPerSec=46.508717776579374, CurrSamplesPerSec=46.62398308028986, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:06:56,325] [INFO] [logging.py:128:log_dist] [Rank 0] step=1680, skipped=15, lr=[9.845011916199696e-06, 9.845011916199696e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:06:56,346] [INFO] [timer.py:264:stop] epoch=0/micro_step=3360/global_step=1680, RunningAvgSamplesPerSec=46.50875869239154, CurrSamplesPerSec=46.45526622191193, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:03,255] [INFO] [logging.py:128:log_dist] [Rank 0] step=1690, skipped=15, lr=[9.581018054183269e-06, 9.581018054183269e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:03,276] [INFO] [timer.py:264:stop] epoch=0/micro_step=3380/global_step=1690, RunningAvgSamplesPerSec=46.50870012606984, CurrSamplesPerSec=46.556898566121795, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:10,185] [INFO] [logging.py:128:log_dist] [Rank 0] step=1700, skipped=15, lr=[9.31977041695123e-06, 9.31977041695123e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:10,206] [INFO] [timer.py:264:stop] epoch=0/micro_step=3400/global_step=1700, RunningAvgSamplesPerSec=46.508622175904215, CurrSamplesPerSec=46.45679378061621, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:17,116] [INFO] [logging.py:128:log_dist] [Rank 0] step=1710, skipped=15, lr=[9.061315534471568e-06, 9.061315534471568e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:17,137] [INFO] [timer.py:264:stop] epoch=0/micro_step=3420/global_step=1710, RunningAvgSamplesPerSec=46.50870541176424, CurrSamplesPerSec=46.7551991450285, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:24,056] [INFO] [logging.py:128:log_dist] [Rank 0] step=1720, skipped=15, lr=[8.805699439303772e-06, 8.805699439303772e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:24,077] [INFO] [timer.py:264:stop] epoch=0/micro_step=3440/global_step=1720, RunningAvgSamplesPerSec=46.50831570128103, CurrSamplesPerSec=46.47883396086827, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:30,991] [INFO] [logging.py:128:log_dist] [Rank 0] step=1730, skipped=15, lr=[8.552967658400174e-06, 8.552967658400174e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:31,012] [INFO] [timer.py:264:stop] epoch=0/micro_step=3460/global_step=1730, RunningAvgSamplesPerSec=46.50817874207723, CurrSamplesPerSec=46.92904261360235, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:37,918] [INFO] [logging.py:128:log_dist] [Rank 0] step=1740, skipped=15, lr=[8.303165204997231e-06, 8.303165204997231e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:37,939] [INFO] [timer.py:264:stop] epoch=0/micro_step=3480/global_step=1740, RunningAvgSamplesPerSec=46.5082478561663, CurrSamplesPerSec=46.53083177923073, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:44,850] [INFO] [logging.py:128:log_dist] [Rank 0] step=1750, skipped=15, lr=[8.056336570598434e-06, 8.056336570598434e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:44,871] [INFO] [timer.py:264:stop] epoch=0/micro_step=3500/global_step=1750, RunningAvgSamplesPerSec=46.50811673391772, CurrSamplesPerSec=46.45566820130503, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations
+[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0
+[2025-01-02 16:07:51,779] [INFO] [logging.py:128:log_dist] [Rank 0] step=1760, skipped=15, lr=[7.812525717049999e-06, 7.812525717049999e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:51,799] [INFO] [timer.py:264:stop] epoch=0/micro_step=3520/global_step=1760, RunningAvgSamplesPerSec=46.50828314448669, CurrSamplesPerSec=46.52642832694709, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:07:58,711] [INFO] [logging.py:128:log_dist] [Rank 0] step=1770, skipped=15, lr=[7.571776068710998e-06, 7.571776068710998e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:07:58,731] [INFO] [timer.py:264:stop] epoch=0/micro_step=3540/global_step=1770, RunningAvgSamplesPerSec=46.50829869356961, CurrSamplesPerSec=46.50495529908099, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:08:05,651] [INFO] [logging.py:128:log_dist] [Rank 0] step=1780, skipped=15, lr=[7.334130504719211e-06, 7.334130504719211e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:08:05,671] [INFO] [timer.py:264:stop] epoch=0/micro_step=3560/global_step=1780, RunningAvgSamplesPerSec=46.507906608593274, CurrSamplesPerSec=46.49524091740414, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:08:12,577] [INFO] [logging.py:128:log_dist] [Rank 0] step=1790, skipped=15, lr=[7.099631351354036e-06, 7.099631351354036e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:08:12,597] [INFO] [timer.py:264:stop] epoch=0/micro_step=3580/global_step=1790, RunningAvgSamplesPerSec=46.50816179436258, CurrSamplesPerSec=46.53826953800675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:08:19,505] [INFO] [logging.py:128:log_dist] [Rank 0] step=1800, skipped=15, lr=[6.868320374497869e-06, 6.868320374497869e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:08:19,526] [INFO] [timer.py:264:stop] epoch=0/micro_step=3600/global_step=1800, RunningAvgSamplesPerSec=46.50819381179044, CurrSamplesPerSec=46.46337147026462, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:08:26,432] [INFO] [logging.py:128:log_dist] [Rank 0] step=1810, skipped=15, lr=[6.64023877219738e-06, 6.64023877219738e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:08:26,452] [INFO] [timer.py:264:stop] epoch=0/micro_step=3620/global_step=1810, RunningAvgSamplesPerSec=46.5083076047805, CurrSamplesPerSec=46.491520562684975, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:08:33,369] [INFO] [logging.py:128:log_dist] [Rank 0] step=1820, skipped=15, lr=[6.415427167325794e-06, 6.415427167325794e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:08:33,389] [INFO] [timer.py:264:stop] epoch=0/micro_step=3640/global_step=1820, RunningAvgSamplesPerSec=46.50816781684397, CurrSamplesPerSec=46.81537675249691, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+[2025-01-02 16:08:40,298] [INFO] [logging.py:128:log_dist] [Rank 0] step=1830, skipped=15, lr=[6.19392560034775e-06, 6.19392560034775e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+[2025-01-02 16:08:40,318] [INFO] [timer.py:264:stop] epoch=0/micro_step=3660/global_step=1830, RunningAvgSamplesPerSec=46.50820709570212, CurrSamplesPerSec=46.5024095130939, MemAllocated=5.53GB, MaxMemAllocated=8.61GB
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2,
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:47,228] [INFO] [logging.py:128:log_dist] [Rank 0] step=1840, skipped=15, lr=[5.975773522187763e-06, 5.975773522187763e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:47,248] [INFO] [timer.py:264:stop] epoch=0/micro_step=3680/global_step=1840, RunningAvgSamplesPerSec=46.508121168997306, CurrSamplesPerSec=46.481586429598636, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:54,034] [INFO] [logging.py:128:log_dist] [Rank 0] step=1850, skipped=17, lr=[5.803689621959219e-06, 5.803689621959219e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:54,054] [INFO] [timer.py:264:stop] epoch=0/micro_step=3700/global_step=1850, RunningAvgSamplesPerSec=46.51260312244167, CurrSamplesPerSec=46.48744657153694, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:00,972] [INFO] [logging.py:128:log_dist] [Rank 0] step=1860, skipped=17, lr=[5.5916641313404325e-06, 5.5916641313404325e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:00,991] [INFO] [timer.py:264:stop] epoch=0/micro_step=3720/global_step=1860, RunningAvgSamplesPerSec=46.51238471774673, CurrSamplesPerSec=46.237791067849386, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:07,897] [INFO] [logging.py:128:log_dist] [Rank 0] step=1870, skipped=17, lr=[5.383095396373447e-06, 5.383095396373447e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:07,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=3740/global_step=1870, RunningAvgSamplesPerSec=46.51254589556029, CurrSamplesPerSec=46.49611069658042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:14,834] [INFO] [logging.py:128:log_dist] [Rank 0] step=1880, skipped=17, lr=[5.178020564558106e-06, 5.178020564558106e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:14,855] [INFO] [timer.py:264:stop] epoch=0/micro_step=3760/global_step=1880, RunningAvgSamplesPerSec=46.51221216844289, CurrSamplesPerSec=46.48469340875124, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +[2025-01-02 16:09:21,777] [INFO] [logging.py:128:log_dist] [Rank 0] step=1890, skipped=17, lr=[4.976476161106478e-06, 4.976476161106478e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:21,798] [INFO] [timer.py:264:stop] epoch=0/micro_step=3780/global_step=1890, RunningAvgSamplesPerSec=46.512024339691905, CurrSamplesPerSec=46.72973962932564, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:28,720] [INFO] [logging.py:128:log_dist] [Rank 0] step=1900, skipped=17, lr=[4.778498082437544e-06, 4.778498082437544e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:28,738] [INFO] [timer.py:264:stop] epoch=0/micro_step=3800/global_step=1900, RunningAvgSamplesPerSec=46.511614340757966, CurrSamplesPerSec=45.800076335051905, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:35,653] [INFO] [logging.py:128:log_dist] [Rank 0] step=1910, skipped=17, lr=[4.584121589783738e-06, 4.584121589783738e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:35,674] [INFO] [timer.py:264:stop] epoch=0/micro_step=3820/global_step=1910, RunningAvgSamplesPerSec=46.51156047823584, CurrSamplesPerSec=46.491182378142085, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 
14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:42,583] [INFO] [logging.py:128:log_dist] [Rank 0] step=1920, skipped=17, lr=[4.39338130291071e-06, 4.39338130291071e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:42,604] [INFO] [timer.py:264:stop] epoch=0/micro_step=3840/global_step=1920, RunningAvgSamplesPerSec=46.51154169085933, CurrSamplesPerSec=46.508935662282894, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence 
Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:49,550] [INFO] [logging.py:128:log_dist] [Rank 0] step=1930, skipped=17, lr=[4.206311193951332e-06, 4.206311193951332e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:49,571] [INFO] [timer.py:264:stop] epoch=0/micro_step=3860/global_step=1930, RunningAvgSamplesPerSec=46.51063862305707, CurrSamplesPerSec=46.89421645226965, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:56,483] [INFO] [logging.py:128:log_dist] [Rank 0] step=1940, skipped=17, lr=[4.022944581354981e-06, 4.022944581354981e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:56,503] [INFO] [timer.py:264:stop] epoch=0/micro_step=3880/global_step=1940, RunningAvgSamplesPerSec=46.51063920775755, CurrSamplesPerSec=46.44960709050509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss 
scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,935] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,935] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:03,413] [INFO] [logging.py:128:log_dist] [Rank 0] step=1950, skipped=17, lr=[3.843314123953354e-06, 3.843314123953354e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:03,434] [INFO] [timer.py:264:stop] epoch=0/micro_step=3900/global_step=1950, RunningAvgSamplesPerSec=46.51056111915757, CurrSamplesPerSec=46.50838771759699, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:10,344] [INFO] [logging.py:128:log_dist] [Rank 0] step=1960, skipped=17, lr=[3.6674518151436744e-06, 3.6674518151436744e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:10,364] [INFO] [timer.py:264:stop] epoch=0/micro_step=3920/global_step=1960, RunningAvgSamplesPerSec=46.51048027244313, CurrSamplesPerSec=46.48772029560139, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:17,273] [INFO] [logging.py:128:log_dist] [Rank 0] step=1970, skipped=17, lr=[3.4953889771904475e-06, 3.4953889771904475e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:17,293] [INFO] [timer.py:264:stop] epoch=0/micro_step=3940/global_step=1970, RunningAvgSamplesPerSec=46.51043553683364, CurrSamplesPerSec=46.44023718641015, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:24,143] [INFO] [logging.py:128:log_dist] [Rank 0] step=1980, skipped=18, lr=[3.343806313235337e-06, 3.343806313235337e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:24,163] [INFO] [timer.py:264:stop] epoch=0/micro_step=3960/global_step=1980, RunningAvgSamplesPerSec=46.51242203854685, CurrSamplesPerSec=46.519187831305196, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:31,073] [INFO] [logging.py:128:log_dist] [Rank 0] step=1990, skipped=18, lr=[3.1790463347058543e-06, 3.1790463347058543e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:31,094] [INFO] [timer.py:264:stop] epoch=0/micro_step=3980/global_step=1990, RunningAvgSamplesPerSec=46.512401921343596, CurrSamplesPerSec=46.516979041899226, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:38,016] [INFO] [logging.py:128:log_dist] [Rank 0] step=2000, skipped=18, lr=[3.0181728153463233e-06, 3.0181728153463233e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:38,035] [INFO] [timer.py:264:stop] epoch=0/micro_step=4000/global_step=2000, RunningAvgSamplesPerSec=46.51207795653655, CurrSamplesPerSec=45.77321075988233, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 
B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:44,970] [INFO] [logging.py:128:log_dist] [Rank 0] step=2010, skipped=18, lr=[2.8612144078166593e-06, 2.8612144078166593e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:44,990] [INFO] [timer.py:264:stop] epoch=0/micro_step=4020/global_step=2010, RunningAvgSamplesPerSec=46.511671133770086, CurrSamplesPerSec=46.50012176602313, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:51,899] [INFO] [logging.py:128:log_dist] [Rank 0] step=2020, skipped=18, lr=[2.708199067468939e-06, 2.708199067468939e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:51,919] 
[INFO] [timer.py:264:stop] epoch=0/micro_step=4040/global_step=2020, RunningAvgSamplesPerSec=46.5116397897557, CurrSamplesPerSec=46.49754429253596, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:58,848] [INFO] [logging.py:128:log_dist] [Rank 0] step=2030, skipped=18, lr=[2.5591540473683453e-06, 2.5591540473683453e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:58,868] [INFO] [timer.py:264:stop] epoch=0/micro_step=4060/global_step=2030, RunningAvgSamplesPerSec=46.511126243282945, CurrSamplesPerSec=46.50880673295993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence 
Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:05,780] [INFO] [logging.py:128:log_dist] [Rank 0] step=2040, skipped=18, lr=[2.414105893439225e-06, 2.414105893439225e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:05,801] [INFO] [timer.py:264:stop] epoch=0/micro_step=4080/global_step=2040, RunningAvgSamplesPerSec=46.5110025907912, CurrSamplesPerSec=46.43461384529517, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:12,744] [INFO] [logging.py:128:log_dist] [Rank 0] step=2050, skipped=18, lr=[2.2730804397370688e-06, 2.2730804397370688e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:12,764] [INFO] [timer.py:264:stop] epoch=0/micro_step=4100/global_step=2050, RunningAvgSamplesPerSec=46.51005633195471, CurrSamplesPerSec=46.5211065850435, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 
0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:19,691] [INFO] [logging.py:128:log_dist] [Rank 0] step=2060, skipped=18, lr=[2.1361028038473034e-06, 2.1361028038473034e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:19,707] [INFO] [timer.py:264:stop] epoch=0/micro_step=4120/global_step=2060, RunningAvgSamplesPerSec=46.509589050477125, CurrSamplesPerSec=45.671672575913625, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:26,611] [INFO] [logging.py:128:log_dist] [Rank 0] step=2070, skipped=18, lr=[2.003197382411673e-06, 2.003197382411673e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:26,631] [INFO] [timer.py:264:stop] epoch=0/micro_step=4140/global_step=2070, RunningAvgSamplesPerSec=46.509808656833094, CurrSamplesPerSec=46.522928741415484, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] 
[fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,062] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,063] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:33,543] [INFO] [logging.py:128:log_dist] [Rank 0] step=2080, skipped=18, lr=[1.8743878467830294e-06, 1.8743878467830294e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:33,564] [INFO] [timer.py:264:stop] epoch=0/micro_step=4160/global_step=2080, RunningAvgSamplesPerSec=46.50968619314036, CurrSamplesPerSec=46.45285449163494, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,281] [INFO] 
[fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from [2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:40,414] [INFO] [logging.py:128:log_dist] [Rank 0] step=2090, skipped=19, lr=[1.7619802236591875e-06, 1.7619802236591875e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:40,434] [INFO] [timer.py:264:stop] epoch=0/micro_step=4180/global_step=2090, RunningAvgSamplesPerSec=46.51158895789503, CurrSamplesPerSec=46.4917138132042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model 
Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:47,345] [INFO] [logging.py:128:log_dist] [Rank 0] step=2100, skipped=19, lr=[1.6410154696242603e-06, 1.6410154696242603e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:47,365] [INFO] [timer.py:264:stop] epoch=0/micro_step=4200/global_step=2100, RunningAvgSamplesPerSec=46.51148902071231, CurrSamplesPerSec=46.4236281604466, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:54,277] [INFO] [logging.py:128:log_dist] [Rank 0] step=2110, skipped=19, lr=[1.5242111084402238e-06, 1.5242111084402238e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:54,297] [INFO] [timer.py:264:stop] epoch=0/micro_step=4220/global_step=2110, RunningAvgSamplesPerSec=46.511339445793595, CurrSamplesPerSec=46.41922892018923, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:01,207] [INFO] [logging.py:128:log_dist] [Rank 0] step=2120, skipped=19, lr=[1.4115879437524043e-06, 1.4115879437524043e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:01,228] [INFO] [timer.py:264:stop] epoch=0/micro_step=4240/global_step=2120, RunningAvgSamplesPerSec=46.51130295982599, CurrSamplesPerSec=46.511304865842796, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:08,136] [INFO] [logging.py:128:log_dist] [Rank 0] step=2130, skipped=19, lr=[1.3031660345068002e-06, 1.3031660345068002e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:08,156] [INFO] [timer.py:264:stop] epoch=0/micro_step=4260/global_step=2130, RunningAvgSamplesPerSec=46.51131404937642, CurrSamplesPerSec=46.51030558048654, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:15,059] [INFO] [logging.py:128:log_dist] [Rank 0] step=2140, skipped=19, lr=[1.1989646913774466e-06, 1.1989646913774466e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:15,080] [INFO] [timer.py:264:stop] epoch=0/micro_step=4280/global_step=2140, RunningAvgSamplesPerSec=46.5114519203876, CurrSamplesPerSec=46.79242903203494, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:22,020] [INFO] [logging.py:128:log_dist] [Rank 0] step=2150, skipped=19, lr=[1.0990024733270572e-06, 1.0990024733270572e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:22,040] [INFO] [timer.py:264:stop] 
epoch=0/micro_step=4300/global_step=2150, RunningAvgSamplesPerSec=46.51052237053515, CurrSamplesPerSec=46.532267517101225, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:28,947] [INFO] [logging.py:128:log_dist] [Rank 0] step=2160, skipped=19, lr=[1.0032971843015576e-06, 1.0032971843015576e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:28,967] [INFO] [timer.py:264:stop] epoch=0/micro_step=4320/global_step=2160, RunningAvgSamplesPerSec=46.5105816299913, CurrSamplesPerSec=46.438469703874325, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:35,881] [INFO] [logging.py:128:log_dist] [Rank 0] step=2170, skipped=19, lr=[9.118658700590616e-07, 9.118658700590616e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:35,902] [INFO] [timer.py:264:stop] epoch=0/micro_step=4340/global_step=2170, RunningAvgSamplesPerSec=46.510394643211036, CurrSamplesPerSec=46.47535762417345, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:42,807] [INFO] [logging.py:128:log_dist] [Rank 0] step=2180, skipped=19, lr=[8.247248151339343e-07, 8.247248151339343e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:42,828] [INFO] [timer.py:264:stop] epoch=0/micro_step=4360/global_step=2180, RunningAvgSamplesPerSec=46.510493742010965, CurrSamplesPerSec=46.50587378396359, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:46,253] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 
+[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,259] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,259] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:49,737] [INFO] [logging.py:128:log_dist] [Rank 0] step=2190, skipped=19, lr=[7.418895399363746e-07, 7.418895399363746e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:49,758] [INFO] [timer.py:264:stop] epoch=0/micro_step=4380/global_step=2190, RunningAvgSamplesPerSec=46.510429443817735, CurrSamplesPerSec=46.49342092923379, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:56,670] [INFO] [logging.py:128:log_dist] [Rank 0] step=2200, skipped=19, lr=[6.633747979881533e-07, 6.633747979881533e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:56,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=4400/global_step=2200, RunningAvgSamplesPerSec=46.51038262533071, CurrSamplesPerSec=46.55770605348971, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:03,604] [INFO] [logging.py:128:log_dist] [Rank 0] step=2210, skipped=19, lr=[5.891945732949017e-07, 5.891945732949017e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:03,623] [INFO] [timer.py:264:stop] epoch=0/micro_step=4420/global_step=2210, RunningAvgSamplesPerSec=46.51031258811781, CurrSamplesPerSec=46.33170675886358, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:10,546] [INFO] [logging.py:128:log_dist] [Rank 0] step=2220, skipped=19, lr=[5.193620778554536e-07, 5.193620778554536e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:10,567] [INFO] [timer.py:264:stop] epoch=0/micro_step=4440/global_step=2220, RunningAvgSamplesPerSec=46.509899108741436, CurrSamplesPerSec=46.37364797007833, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:17,446] [INFO] [logging.py:128:log_dist] [Rank 0] step=2230, skipped=19, lr=[4.538897493087113e-07, 4.538897493087113e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:17,467] [INFO] [timer.py:264:stop] epoch=0/micro_step=4460/global_step=2230, RunningAvgSamplesPerSec=46.51082145039772, CurrSamplesPerSec=46.73201748416546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 
16:13:24,357] [INFO] [logging.py:128:log_dist] [Rank 0] step=2240, skipped=19, lr=[3.927892487184254e-07, 3.927892487184254e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:24,377] [INFO] [timer.py:264:stop] epoch=0/micro_step=4480/global_step=2240, RunningAvgSamplesPerSec=46.51141811493153, CurrSamplesPerSec=46.78347475499735, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:31,255] [INFO] [logging.py:128:log_dist] [Rank 0] step=2250, skipped=19, lr=[3.360714584962621e-07, 3.360714584962621e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:31,275] [INFO] [timer.py:264:stop] epoch=0/micro_step=4500/global_step=2250, RunningAvgSamplesPerSec=46.512457104244405, CurrSamplesPerSec=46.71571944298993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:38,143] [INFO] [logging.py:128:log_dist] [Rank 0] step=2260, skipped=19, lr=[2.83746480463587e-07, 2.83746480463587e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:38,164] [INFO] [timer.py:264:stop] epoch=0/micro_step=4520/global_step=2260, RunningAvgSamplesPerSec=46.51361995388357, CurrSamplesPerSec=46.76160094278518, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:45,038] [INFO] [logging.py:128:log_dist] [Rank 0] step=2270, skipped=19, lr=[2.3582363405225405e-07, 2.3582363405225405e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:45,059] [INFO] [timer.py:264:stop] epoch=0/micro_step=4540/global_step=2270, RunningAvgSamplesPerSec=46.514651946554, CurrSamplesPerSec=46.784420582288035, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:51,929] [INFO] [logging.py:128:log_dist] [Rank 0] step=2280, skipped=19, lr=[1.9231145464475297e-07, 1.9231145464475297e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:51,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=4560/global_step=2280, RunningAvgSamplesPerSec=46.51576697162558, CurrSamplesPerSec=46.77840381607327, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:55,353] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,353] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 
16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,357] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,357] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:58,763] [INFO] [logging.py:128:log_dist] [Rank 0] step=2290, skipped=20, lr=[1.5692803494115337e-07, 1.5692803494115337e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:58,783] [INFO] [timer.py:264:stop] epoch=0/micro_step=4580/global_step=2290, RunningAvgSamplesPerSec=46.51851714180861, CurrSamplesPerSec=46.79658928835533, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:05,670] [INFO] [logging.py:128:log_dist] [Rank 0] step=2300, skipped=20, lr=[1.2181682318773424e-07, 1.2181682318773424e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:05,691] [INFO] [timer.py:264:stop] epoch=0/micro_step=4600/global_step=2300, RunningAvgSamplesPerSec=46.519098342938314, CurrSamplesPerSec=46.81861017418855, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:12,559] [INFO] [logging.py:128:log_dist] [Rank 0] step=2310, skipped=20, lr=[9.11365838208722e-08, 9.11365838208722e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:12,579] [INFO] [timer.py:264:stop] epoch=0/micro_step=4620/global_step=2310, RunningAvgSamplesPerSec=46.520263336208856, CurrSamplesPerSec=46.77643117546071, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 
4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:19,386] [INFO] [logging.py:128:log_dist] [Rank 0] step=2320, skipped=21, lr=[6.731738239555418e-08, 6.731738239555418e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:19,407] [INFO] [timer.py:264:stop] epoch=0/micro_step=4640/global_step=2320, RunningAvgSamplesPerSec=46.52321387815545, CurrSamplesPerSec=46.80081556093043, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:26,272] [INFO] [logging.py:128:log_dist] [Rank 0] step=2330, skipped=21, lr=[4.507039181994299e-08, 4.507039181994299e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:26,293] [INFO] [timer.py:264:stop] epoch=0/micro_step=4660/global_step=2330, RunningAvgSamplesPerSec=46.52443592782957, CurrSamplesPerSec=46.81611158184509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:33,160] [INFO] [logging.py:128:log_dist] [Rank 0] step=2340, skipped=21, lr=[2.726804268846084e-08, 2.726804268846084e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:33,181] [INFO] [timer.py:264:stop] epoch=0/micro_step=4680/global_step=2340, RunningAvgSamplesPerSec=46.52560854377931, CurrSamplesPerSec=46.75790299843455, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:40,046] [INFO] [logging.py:128:log_dist] [Rank 0] step=2350, skipped=21, lr=[1.3913505719678755e-08, 1.3913505719678755e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:40,067] [INFO] [timer.py:264:stop] epoch=0/micro_step=4700/global_step=2350, RunningAvgSamplesPerSec=46.526791452423346, CurrSamplesPerSec=46.81642185005165, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +***** Evaluating rewards, Epoch 1/1 ***** +chosen: -31.9375, rejected: -33.96875, loss: 0.6650807857513428 +saving the final model ... +[rank0]:[W102 16:15:18.696314488 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator()) +[2025-01-02 16:15:21,891] [INFO] [launch.py:351:main] Process 106033 exits successfully. +[2025-01-02 16:15:22,894] [INFO] [launch.py:351:main] Process 106032 exits successfully. +[2025-01-02 16:15:22,895] [INFO] [launch.py:351:main] Process 106038 exits successfully. +[2025-01-02 16:15:22,896] [INFO] [launch.py:351:main] Process 106031 exits successfully. +[2025-01-02 16:15:22,896] [INFO] [launch.py:351:main] Process 106037 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106034 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106036 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106035 exits successfully. diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md new file mode 100644 index 000000000..ca2d5eb70 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md @@ -0,0 +1,6 @@ +### 💁For each folder, the bash scripts are examples of "facebook/opt" family. 
+ +If you want to change your model such as EleutherAI/gpt-j-6b, you may simply replace +`` --model_name_or_path facebook/opt-350m`` to ``--model_name_or_path EleutherAI/gpt-neo-125m ``. + +For the models we support, please see [our landing page](./../../../README.md#-supported-models-) diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh new file mode 100755 index 000000000..aaa059bbe --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=3 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static \ + --data_split 2,4,4 \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --max_seq_len 512 \ + --learning_rate 9.65e-6 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --gradient_checkpointing \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --offload \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh new file mode 100755 index 000000000..ec48de78a --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=3 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static \ + --data_split 2,4,4 \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --max_seq_len 512 \ + --learning_rate 9.65e-6 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --gradient_checkpointing \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --offload \ + --lora_dim 128 \ + --lora_module_name "layers." \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh new file mode 100644 index 000000000..b55beef98 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ + --data_split 2,4,4 \ + --model_name_or_path facebook/opt-350m \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 2 \ + --max_seq_len 512 \ + --learning_rate 5e-5 \ + --weight_decay 0.1 \ + --dropout 0.0 \ + --num_train_epochs 1 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh new file mode 100644 index 000000000..8157865a5 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi +mkdir -p $OUTPUT + +deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \ + --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \ + --enable_tensorboard \ + --tensorboard_path $OUTPUT \ + --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh new file mode 100644 index 000000000..16aed6a42 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ + --data_split 2,4,4 \ + --model_name_or_path facebook/opt-350m \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --max_seq_len 512 \ + --learning_rate 5e-5 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --dropout 0.0 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md new file mode 100644 index 000000000..1f90b9f65 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md @@ -0,0 +1,20 @@ +# DeepSpeed Characterization Script + +# Contents + * [Introduction](#introduction) + * [Usage](#usage) + +# Introduction +The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept: +
+Zero Stage: 2, 3
+Offload: True, False
+
+ +The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay, etc). + +# Usage +The sweep script can be run as follows: +
+DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_dpo_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh
+
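As a hedged illustration of the extension point the README mentions (learning rate, weight decay, etc.), the nested loops in `run_step2_sweep.sh` (added later in this patch) could gain an extra dimension. This is a sketch only, not part of the patch: the fourth positional argument passed to `run_single.sh` is hypothetical and would require a matching change in that script.

```bash
# Sketch only: extends the ZeRO-stage / offload sweep with a learning-rate loop.
# The extra ${lr} argument is hypothetical; run_single.sh as added in this patch
# only accepts ZERO_STAGE, OFFLOAD, and OUTPUT.
for z in {2..3}; do
    for offload in true false; do
        for lr in 5e-5 1e-5; do
            bash training_scripts/opt/single_node/sweep/run_single.sh \
                ${z} ${offload} "z${z}_offload_${offload}_lr_${lr}" ${lr}
        done
    done
done
```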
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh new file mode 100644 index 000000000..6f5453af1 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +ZERO_STAGE=$1 +OFFLOAD=$2 +OUTPUT=$3 +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi +if [ "$OFFLOAD" == true ]; then + OFFLOAD="--offload" +else + OFFLOAD="" +fi +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +mkdir -p $OUTPUT + +cmd="deepspeed main.py \ + --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ + --data_split 2,4,4 \ + --model_name_or_path facebook/opt-350m \ + --num_padding_at_beginning 1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --max_seq_len 512 \ + --learning_rate 5e-5 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --dropout 0.0 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --output_dir $OUTPUT \ + $OFFLOAD" + +echo "----------------------------- DS COMMAND -----------------------------" +echo $cmd + +$cmd &> $OUTPUT/${OUTPUT}.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh new file mode 100644 index 000000000..ad9849e38 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +for z in {2..3} +do + for offload in true false + do + cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ + ${z} \ + ${offload} \ + z${z}_offload_${offload}" + echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" + echo $cmd + $cmd + pkill -9 python + sleep 60 + echo "" + done +done From b436bd488e04934371bf65b5bcb9fafb3587a115 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 20 Jan 2025 16:24:13 -0800 Subject: [PATCH 10/13] Update references to torchvision (#949) Signed-off-by: zhangsmallshark --- training/cifar/cifar10_deepspeed.py | 2 +- training/cifar/cifar10_tutorial.py | 2 +- .../vit_finetuning/main_imagenet.py | 46 +++++++++---------- .../vit_finetuning/utils/get_data.py | 22 ++++----- training/gan/gan_baseline_train.py | 2 +- training/gan/gan_deepspeed_train.py | 2 +- training/imagenet/main.py | 16 +++---- training/pipeline_parallelism/train.py | 2 +- 8 files changed, 47 insertions(+), 47 deletions(-) diff --git a/training/cifar/cifar10_deepspeed.py b/training/cifar/cifar10_deepspeed.py index 9888544d5..e92c43937 100755 --- a/training/cifar/cifar10_deepspeed.py +++ b/training/cifar/cifar10_deepspeed.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F import torchvision -import torchvision.transforms as transforms +from torchvision import transforms from deepspeed.accelerator import get_accelerator from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer diff --git a/training/cifar/cifar10_tutorial.py b/training/cifar/cifar10_tutorial.py index 114e8c5fa..b7c7e01bd 100644 --- a/training/cifar/cifar10_tutorial.py +++ b/training/cifar/cifar10_tutorial.py @@ -57,7 +57,7 @@ """ import torch import torchvision -import torchvision.transforms as transforms +from torchvision import transforms ######################################################################## # The output of torchvision datasets are PILImage images of range [0, 1]. 
diff --git a/training/data_efficiency/vit_finetuning/main_imagenet.py b/training/data_efficiency/vit_finetuning/main_imagenet.py index 4d39ac9af..0042b49b4 100644 --- a/training/data_efficiency/vit_finetuning/main_imagenet.py +++ b/training/data_efficiency/vit_finetuning/main_imagenet.py @@ -19,8 +19,8 @@ import torch.multiprocessing as mp import torch.utils.data import torch.utils.data.distributed -import torchvision.transforms as transforms -import torchvision.datasets as datasets +from torchvision import transforms +from torchvision import datasets import torchvision.models as models from torch.utils.data import Subset import models @@ -105,7 +105,7 @@ def _get_model(args): nchannels = 3 model = models.__dict__[args.arch](num_classes=nclasses, nchannels=nchannels) return model - + def _get_dist_model(gpu, args): ngpus_per_node = torch.cuida.device_count() if args.distributed: @@ -149,9 +149,9 @@ def _get_dist_model(gpu, args): else: model = torch.nn.DataParallel(model).cuda() return model - + def main(): - + args = parser.parse_args() if args.seed is not None: @@ -190,7 +190,7 @@ def main(): def main_worker(gpu, ngpus_per_node, args): global best_acc1 global history - + if args.deepspeed: gpu = args.local_rank args.gpu = gpu @@ -205,7 +205,7 @@ def main_worker(gpu, ngpus_per_node, args): deepspeed.init_distributed() print(f'created model on gpu {gpu}') # exit () - + # define loss function (criterion), optimizer, and learning rate scheduler criterion = nn.CrossEntropyLoss().cuda(args.gpu) @@ -284,14 +284,14 @@ def main_worker(gpu, ngpus_per_node, args): validate(val_loader, model, criterion, args) # return args.completed_step = 0 - + optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) - + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" scheduler = StepLR(optimizer, step_size=int(len(train_loader)*args.epochs//3), gamma=0.1)# None # - + model, optimizer, _, scheduler = deepspeed.initialize( model=model, @@ -311,17 +311,17 @@ def main_worker(gpu, ngpus_per_node, args): time_epoch = time.time() - start_time # evaluate on validation set top5_val, top1_val, losses_val = validate(val_loader, model, criterion, args) - if args.gpu==0: + if args.gpu==0: history["epoch"].append(epoch) history["val_loss"].append(losses_val) - history["val_acc1"].append(top1_val) - history["val_acc5"].append(top5_val) + history["val_acc1"].append(top1_val) + history["val_acc5"].append(top5_val) history["train_loss"].append(losses_train) - history["train_acc1"].append(top1_train) + history["train_acc1"].append(top1_train) history["train_acc5"].append(top5_train) - torch.save(history,f"{args.out_dir}/stat.pt") + torch.save(history,f"{args.out_dir}/stat.pt") try: - print (f'{epoch} epoch at time {time_epoch}s and learning rate {scheduler.get_last_lr()}') + print (f'{epoch} epoch at time {time_epoch}s and learning rate {scheduler.get_last_lr()}') except: print (f'{epoch} epoch at time {time_epoch}s and learning rate {args.lr}') print (f"finish epoch {epoch} or iteration {args.completed_step}, train_accuracy is {top1_train}, val_accuracy {top1_val}") @@ -393,14 +393,14 @@ def train(scheduler, train_loader, model, criterion, optimizer, epoch, args): loss.backward() optimizer.step() scheduler.step() - + # measure elapsed time batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0 and args.gpu==0: + if i % args.print_freq == 0 and args.gpu==0: progress.display(i + 1) - + if args.distributed: 
losses.all_reduce() top1.all_reduce() @@ -432,7 +432,7 @@ def run_validate(loader, base_progress=0): batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0 and args.gpu==0: + if i % args.print_freq == 0 and args.gpu==0: progress.display(i + 1) batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) @@ -509,7 +509,7 @@ def all_reduce(self): def __str__(self): fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' return fmtstr.format(**self.__dict__) - + def summary(self): fmtstr = '' if self.summary_type is Summary.NONE: @@ -522,7 +522,7 @@ def summary(self): fmtstr = '{name} {count:.3f}' else: raise ValueError('invalid summary type %r' % self.summary_type) - + return fmtstr.format(**self.__dict__) @@ -536,7 +536,7 @@ def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] print ('\t'.join(entries)) - + def display_summary(self): entries = [" *"] entries += [meter.summary() for meter in self.meters] diff --git a/training/data_efficiency/vit_finetuning/utils/get_data.py b/training/data_efficiency/vit_finetuning/utils/get_data.py index dfad5f3ba..c2505fd17 100644 --- a/training/data_efficiency/vit_finetuning/utils/get_data.py +++ b/training/data_efficiency/vit_finetuning/utils/get_data.py @@ -13,18 +13,18 @@ # limitations under the License. import torch import os -import torchvision.transforms as transforms -import torchvision.datasets as datasets +from torchvision import transforms +from torchvision import datasets def get_dataset(dataset_name, data_dir, split, rand_fraction=None,clean=False, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if dataset_name in [ 'cifar10', 'cifar100']: - dataset = globals()[f'get_{dataset_name}'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) + dataset = globals()[f'get_{dataset_name}'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) elif dataset_name in [ 'cifar10vit224', 'cifar100vit224','cifar10vit384', 'cifar100vit384',]: imsize = int(dataset_name.split('vit')[-1]) dataset_name = dataset_name.split('vit')[0] #print ('here') - dataset = globals()['get_cifar_vit'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) + dataset = globals()['get_cifar_vit'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) else: assert 'cifar' in dataset_name print (dataset_name) @@ -59,10 +59,10 @@ def get_transform(split, normalize=None, transform=None, imsize=None, aug='large if transform is None: if normalize is None: if aug == 'large': - + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) else: - normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) + normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) transform = transforms.Compose(get_aug(split, imsize=imsize, aug=aug) + [transforms.ToTensor(), normalize]) return transform @@ -71,7 +71,7 @@ def get_transform(split, normalize=None, transform=None, imsize=None, aug='large def get_cifar10(dataset_name, data_dir, split, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if imsize==224: transform = get_transform(split, transform=transform, imsize=imsize, aug='large') - else: + else: transform = get_transform(split, transform=transform, imsize=imsize, aug='small') return datasets.CIFAR10(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) @@ -88,7 +88,7 
@@ def get_cifar100N(dataset_name, data_dir, split, rand_fraction=None,transform=No if split=='train': return CIFAR100N(root=data_dir, train=(split=='train'), transform=transform, download=True, rand_fraction=rand_fraction) else: - return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) + return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) def get_cifar_vit(dataset_name, data_dir, split, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if imsize==224: @@ -111,12 +111,12 @@ def get_cifar_vit(dataset_name, data_dir, split, transform=None, imsize=None, bu if dataset_name =='cifar10': return datasets.CIFAR10(data_dir, train=(split=='train'), transform=transform_data, download=True, **kwargs) elif dataset_name =='cifar100': - + return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform_data, download=True, **kwargs) else: assert dataset_name in ['cifar10', 'cifar100'] else: - + if split=='train': transform_data = transforms.Compose([# transforms.ColorJitter(brightness= 0.4, contrast= 0.4, saturation= 0.4, hue= 0.1), transforms.Resize(imsize), @@ -164,4 +164,4 @@ def get_imagenet_vit(dataset_name, data_dir, split, transform=None, imsize=None, #return torch.utils.data.distributed.DistributedSampler(train_dataset) else: return datasets.ImageFolder(valdir, transform_data) - #Ereturn torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) \ No newline at end of file + #Ereturn torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) diff --git a/training/gan/gan_baseline_train.py b/training/gan/gan_baseline_train.py index 3d223542f..ab2d67740 100755 --- a/training/gan/gan_baseline_train.py +++ b/training/gan/gan_baseline_train.py @@ -3,7 +3,7 @@ import torch.nn as nn import torch.utils.data import torchvision.datasets as dset -import torchvision.transforms as transforms +from torchvision import transforms import torchvision.utils as vutils from torch.utils.tensorboard import SummaryWriter from time import time diff --git a/training/gan/gan_deepspeed_train.py b/training/gan/gan_deepspeed_train.py index f209a4273..ffaf59375 100755 --- a/training/gan/gan_deepspeed_train.py +++ b/training/gan/gan_deepspeed_train.py @@ -3,7 +3,7 @@ import torch.nn as nn import torch.utils.data import torchvision.datasets as dset -import torchvision.transforms as transforms +from torchvision import transforms import torchvision.utils as vutils from torch.utils.tensorboard import SummaryWriter from time import time diff --git a/training/imagenet/main.py b/training/imagenet/main.py index 1558e6ae0..414d152f1 100644 --- a/training/imagenet/main.py +++ b/training/imagenet/main.py @@ -18,9 +18,9 @@ import torch.optim import torch.utils.data import torch.utils.data.distributed -import torchvision.datasets as datasets import torchvision.models as models -import torchvision.transforms as transforms +from torchvision import transforms +from torchvision import datasets from torch.optim.lr_scheduler import StepLR from torch.utils.data import Subset @@ -94,7 +94,7 @@ def main(): 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') - + if args.gpu is not None: warnings.warn('You have chosen a specific GPU. 
This will completely ' 'disable data parallelism.') @@ -112,7 +112,7 @@ def main(): args.world_size = ngpus_per_node * args.world_size t_losses, t_acc1s = main_worker(args.gpu, ngpus_per_node, args) #dist.barrier() - + # Write the losses to an excel file if dist.get_rank() ==0: all_losses = [torch.empty_like(t_losses) for _ in range(ngpus_per_node)] @@ -278,7 +278,7 @@ def print_rank_0(msg): acc1s[epoch] = acc1 scheduler.step() - + # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) @@ -449,7 +449,7 @@ def all_reduce(self): def __str__(self): fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' return fmtstr.format(**self.__dict__) - + def summary(self): fmtstr = '' if self.summary_type is Summary.NONE: @@ -462,7 +462,7 @@ def summary(self): fmtstr = '{name} {count:.3f}' else: raise ValueError('invalid summary type %r' % self.summary_type) - + return fmtstr.format(**self.__dict__) @@ -476,7 +476,7 @@ def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] print('\t'.join(entries)) - + def display_summary(self): entries = [" *"] entries += [meter.summary() for meter in self.meters] diff --git a/training/pipeline_parallelism/train.py b/training/pipeline_parallelism/train.py index 1a418b427..b4bc49bf6 100755 --- a/training/pipeline_parallelism/train.py +++ b/training/pipeline_parallelism/train.py @@ -7,7 +7,7 @@ import torch.distributed as dist import torchvision -import torchvision.transforms as transforms +from torchvision import transforms from torchvision.models import AlexNet from torchvision.models import vgg19 From abfedb94cc96ac3901cb3cfaa44d0c29c30bd8b8 Mon Sep 17 00:00:00 2001 From: zhangsmallshark Date: Mon, 10 Feb 2025 10:32:32 -0600 Subject: [PATCH 11/13] save args Signed-off-by: zhangsmallshark --- training/DeepSpeed-Domino/domino/arguments.py | 2 +- training/DeepSpeed-Domino/megatron/checkpointing.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/training/DeepSpeed-Domino/domino/arguments.py b/training/DeepSpeed-Domino/domino/arguments.py index 0faf58d93..6f238fd6f 100644 --- a/training/DeepSpeed-Domino/domino/arguments.py +++ b/training/DeepSpeed-Domino/domino/arguments.py @@ -218,7 +218,7 @@ def parse_args(): help='Do not load optimizer when loading checkpoint.') parser.add_argument('--no-load-rng', action='store_true', default=None, help='Do not load rng state when loading checkpoint.') - group.add_argument('--exit-on-missing-checkpoint', action='store_true', + parser.add_argument('--exit-on-missing-checkpoint', action='store_true', help="If '--load' is set, but checkpoint is not found " "(e.g., path typo), then exit instead of random " "initialization.") diff --git a/training/DeepSpeed-Domino/megatron/checkpointing.py b/training/DeepSpeed-Domino/megatron/checkpointing.py index 19aef958f..0d26d7967 100644 --- a/training/DeepSpeed-Domino/megatron/checkpointing.py +++ b/training/DeepSpeed-Domino/megatron/checkpointing.py @@ -247,7 +247,10 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): # Arguments, iteration, and model. 
state_dict = {} - # state_dict['args'] = args + t_args = args + t_args.init_method = None + t_args.output_layer_init_method = None + state_dict['args'] = t_args state_dict['checkpoint_version'] = 3.0 state_dict['iteration'] = iteration if len(model) == 1: From d6047a5aa32492df13d5248a2ae0bb63a039fdd0 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:37:32 -0800 Subject: [PATCH 12/13] Cleanup CODEOWNERS (#953) Signed-off-by: zhangsmallshark --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 3eb8710ca..3ed374925 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @duli2012 @mrwyattii @yaozhewei @arashb @xiaoxiawu-microsoft +* @tjruwase @ShadenSmith @awan-10 @minjiaz From df387f5d29c0e1f1611e5ec7b442bbd047d86a76 Mon Sep 17 00:00:00 2001 From: muskonu <117137903+muskonu@users.noreply.github.com> Date: Fri, 31 Jan 2025 06:18:32 +0800 Subject: [PATCH 13/13] fix: the json format of the training imagenet configuration file (#954) Signed-off-by: zhangsmallshark --- training/imagenet/config/ds_config.json | 2 +- training/imagenet/config/ds_fp16_config.json | 2 +- training/imagenet/config/ds_fp16_z1_config.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/training/imagenet/config/ds_config.json b/training/imagenet/config/ds_config.json index d786b90e6..bab0ea4c2 100644 --- a/training/imagenet/config/ds_config.json +++ b/training/imagenet/config/ds_config.json @@ -22,7 +22,7 @@ "zero_allow_untested_optimizer": true, "fp16": { "enabled": false - } + }, "gradient_clipping": 0, "prescale_gradients": false, "cuda_visible_devices": 0, diff --git a/training/imagenet/config/ds_fp16_config.json b/training/imagenet/config/ds_fp16_config.json index 60e28fc8b..646a5d650 100644 --- a/training/imagenet/config/ds_fp16_config.json +++ b/training/imagenet/config/ds_fp16_config.json @@ -23,7 +23,7 @@ "fp16": { "enabled": true, "auto_cast": true - } + }, "gradient_clipping": 0, "prescale_gradients": false, "cuda_visible_devices": 0, diff --git a/training/imagenet/config/ds_fp16_z1_config.json b/training/imagenet/config/ds_fp16_z1_config.json index b3c0d1c68..ae34e5d57 100644 --- a/training/imagenet/config/ds_fp16_z1_config.json +++ b/training/imagenet/config/ds_fp16_z1_config.json @@ -23,7 +23,7 @@ "fp16": { "enabled": true, "auto_cast": true - } + }, "gradient_clipping": 0, "prescale_gradients": false, "cuda_visible_devices": 0,
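The final patch above corrects misplaced commas after the `"fp16"` blocks in the imagenet DeepSpeed config files. A quick way to catch this class of error before launching a run is to pass each config through a JSON parser; the snippet below is a sketch, not part of the patch, and assumes it is run from the repository root with a Python 3 interpreter on PATH.

```bash
# Sketch: validate the imagenet DeepSpeed configs touched by the last patch.
# json.tool exits non-zero and reports the line/column of a syntax error such
# as the missing comma after the "fp16" block that this patch corrects.
for cfg in training/imagenet/config/ds_config.json \
           training/imagenet/config/ds_fp16_config.json \
           training/imagenet/config/ds_fp16_z1_config.json; do
    if python -m json.tool "$cfg" > /dev/null; then
        echo "OK:      $cfg"
    else
        echo "INVALID: $cfg"
    fi
done
```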