Skip to content

Commit 3e42631

Browse files
authored
Print lm loss in the last cm rank process (#21)
* Remove wrong logging in training.py
* Set lm loss logging process
* Update gitmodules commit index
1 parent b87d831 commit 3e42631

File tree

5 files changed

+29
-11
lines changed

5 files changed

+29
-11
lines changed

csrc/external/DeepSpeed

csrc/external/spdlog

Submodule spdlog updated 249 files

external/Megatron-DeepSpeed

megatron/core/parallel_state.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -821,7 +821,9 @@ def get_data_parallel_src_rank():
821821

822822
def get_pipeline_model_parallel_first_rank():
823823
"""Return the global rank of the first process in the pipeline for the
824-
current tensor parallel group"""
824+
current pipeline model parallel group
825+
NOTE (SpiralPipe) Returns `pp rank` of the first `cm rank` process
826+
"""
825827
if _SPIRAL_CROSS_MAPPING:
826828
assert _SPIRAL_CROSS_MAPPING_LIST is not None
827829
return _SPIRAL_CROSS_MAPPING_LIST[0]
@@ -833,7 +835,9 @@ def get_pipeline_model_parallel_first_rank():
833835

834836
def get_pipeline_model_parallel_last_rank():
835837
"""Return the global rank of the last process in the pipeline for the
836-
current tensor parallel group"""
838+
current pipeline model parallel group
839+
NOTE (SpiralPipe) Returns `pp rank` of the last `cm rank` process
840+
"""
837841
if _SPIRAL_CROSS_MAPPING:
838842
assert _SPIRAL_CROSS_MAPPING_LIST is not None
839843
return _SPIRAL_CROSS_MAPPING_LIST[-1]
@@ -844,7 +848,9 @@ def get_pipeline_model_parallel_last_rank():
844848
return _PIPELINE_GLOBAL_RANKS[last_rank_local]
845849

846850
def get_pipeline_model_parallel_next_rank():
847-
"""Return the global rank that follows the caller in the pipeline"""
851+
"""Return the global rank that follows the caller in the pipeline
852+
NOTE (SpiralPipe) Returns `pp rank` of the next `cm rank` process
853+
"""
848854
rank_in_pipeline = get_pipeline_model_parallel_rank()
849855
world_size = get_pipeline_model_parallel_world_size()
850856
if _SPIRAL_CROSS_MAPPING:
@@ -857,7 +863,9 @@ def get_pipeline_model_parallel_next_rank():
857863

858864

859865
def get_pipeline_model_parallel_prev_rank():
860-
"""Return the global rank that precedes the caller in the pipeline"""
866+
"""Return the global rank that precedes the caller in the pipeline
867+
NOTE (SpiralPipe) Returns `pp rank` of the previous `cm rank` process
868+
"""
861869
rank_in_pipeline = get_pipeline_model_parallel_rank()
862870
world_size = get_pipeline_model_parallel_world_size()
863871
if _SPIRAL_CROSS_MAPPING:

megatron/training.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1079,9 +1079,6 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
10791079
iteration,
10801080
)
10811081

1082-
if iteration == 1:
1083-
timers("interval-time").elapsed(barrier=True)
1084-
10851082
if iteration % args.log_interval == 0:
10861083
elapsed_time = timers("interval-time").elapsed(barrier=True)
10871084

@@ -1160,7 +1157,20 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
11601157
total_loss_dict[advanced_iters_key] = 0
11611158
total_loss_dict[skipped_iters_key] = 0
11621159
total_loss_dict[nan_iters_key] = 0
1163-
print_rank_last(log_string)
1160+
# TODO (SpiralPipe) Only the pp rank that applies the `loss_func` saves "lm loss" in the `total_loss_dict`
1161+
# So, we can log loss in two ranks: the rank with last backward stage & the rank with last forward stage
1162+
# As the rank with the last forward stage does not actually need to compute the loss (in a future optimization),
1163+
# we should later print the log string in the rank with last backward stage. Note that this can handle the
1164+
# optimization case where the forward pass ends at the middle of the pipeline ranks (i.e., the last forward
1165+
# stage is not mapped to the last pipeline rank) and does not compute the loss, while the last pipeline stage
1166+
# computes the loss after recomputation.
1167+
if args.spiral:
1168+
# NOTE (SpiralPipe) Currently, the last pipeline rank computes the loss.
1169+
# NOTE (SpiralPipe) We must consider the effect of cross-mapping also.
1170+
if mpu.get_pipeline_model_parallel_rank() == mpu.get_pipeline_model_parallel_world_size() - 1:
1171+
print(log_string, flush=True)
1172+
else:
1173+
print_rank_last(log_string)
11641174
if report_memory_flag and learning_rate > 0.:
11651175
# Report memory after optimizer state has been initialized.
11661176
report_memory('(after {} iterations)'.format(iteration))

0 commit comments

Comments
 (0)