Skip to content

Commit b87d831

Browse files
authored
Fix hang when FP16 enabled and fvs!=bvs (#19)
* fix some miscellaneous issues first * fix hang when fvs!=bvs and fp16 * fix minor issues
1 parent 99fc1c2 commit b87d831

File tree

5 files changed

+29
-11
lines changed

5 files changed

+29
-11
lines changed

megatron/arguments.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -436,10 +436,6 @@ def validate_args(args, defaults={}):
436436
if args.use_distributed_optimizer:
437437
raise RuntimeError(
438438
"SpiralPipe currently does not support distributed optimizer")
439-
if args.spiral_cross_mapping:
440-
if args.spiral_forward_virtual_size != args.spiral_backward_virtual_size:
441-
raise RuntimeError(
442-
"SpiralPipe with cross mapping requires forward and backward virtual size to be the same")
443439

444440
# GQA
445441
if args.num_key_value_heads is None:

megatron/core/parallel_state.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1003,7 +1003,7 @@ def destroy_model_parallel():
10031003

10041004
def apply_spiral_cross_mapping(ranks):
10051005
""" Converts sorted pp rank list into list whose index is cm_rank and element is pp_rank.
1006-
Maps into pattern which eliminates inter-node fetch (when fvs==bvs) and minimizes intra-node activation comm.
1006+
Maps into pattern which eliminates inter-node fetch (when fvs==bvs) and minimizes inter-node activation comm.
10071007
10081008
TODO (SpiralPipe) currently assumes nprocs_per_node=4 and stride=2
10091009
"""

megatron/model/module.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,12 @@ def float_conversion(val):
171171

172172
class Float16Module(MegatronModule):
173173

174-
def __init__(self, module, args):
174+
def __init__(self, module, args, spiral_disable_cast=False):
175+
"""Wrap the module with float16/bfloat16 cast.
176+
177+
Arguments:
178+
spiral_disable_cast: bool, whether to disable float32 <-> float16/bfloat16 cast. SpiralPhaseList contains multiple Float16Modules, while the `fp16/bf16 cast` is only needed at the first Float16Module in the first stage (i.e., fid/bid=0, fbp/bbp=0) and the `fp32 cast` is only needed at the last Float16Module in the last stage (i.e., fid/bid=last, fbp/bbp=last*).
179+
"""
175180
super(Float16Module, self).__init__()
176181

177182
if args.fp16:
@@ -186,17 +191,19 @@ def float16_convertor(val):
186191
raise Exception('should not be here')
187192

188193
self.float16_convertor = float16_convertor
194+
if args.spiral:
195+
self.spiral_disable_cast = spiral_disable_cast
189196

190197

191198
def set_input_tensor(self, input_tensor):
192199
return self.module.set_input_tensor(input_tensor)
193200

194201

195202
def forward(self, *inputs, **kwargs):
196-
if mpu.is_pipeline_first_stage():
203+
if mpu.is_pipeline_first_stage() and not self.spiral_disable_cast:
197204
inputs = fp32_to_float16(inputs, self.float16_convertor)
198205
outputs = self.module(*inputs, **kwargs)
199-
if mpu.is_pipeline_last_stage():
206+
if mpu.is_pipeline_last_stage() and not self.spiral_disable_cast:
200207
outputs = float16_to_fp32(outputs)
201208
return outputs
202209

megatron/spiral/init_context.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,6 @@ def _fetch_data(param, non_blocking=False):
457457
).view(param.spiral_shape)
458458
else:
459459
param.data = torch.empty(param.spiral_shape, device=self.local_device, dtype=param.dtype)
460-
assert not get_args().spiral_cross_mapping, "Spiral cross mapping should eliminate remote fetch"
461460
get_thunder_group().FetchRemoteParam(
462461
param.spiral_id,
463462
non_blocking,
@@ -471,7 +470,6 @@ def _fetch_data(param, non_blocking=False):
471470
param.data.copy_(param.spiral_tensor, non_blocking=non_blocking)
472471
else:
473472
param.data = torch.empty(param.spiral_shape, device=self.local_device, dtype=param.dtype)
474-
assert not get_args().spiral_cross_mapping, "Spiral cross mapping should eliminate remote fetch"
475473
get_thunder_group().FetchRemoteParam(
476474
param.spiral_id,
477475
non_blocking,

megatron/training.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -382,8 +382,25 @@ def _model_provider_func_wrapper(
382382

383383
# wrap model with Float16Module
384384
if args.fp16 or args.bf16:
385+
# only the first or last stage can require cast, where a stage is a spiral phase list
386+
# all modules in a spiral phase list shares fid and bid
387+
# only the first or last module in the same spiral phase list requires cast
388+
enable_cast = (
389+
(fvr == 0 and fbp == 0 and mpu.is_pipeline_first_stage())
390+
or (
391+
fvr == mpu.get_spiral_forward_virtual_size() - 1
392+
and fbp == sbs.get_spiral_forward_stage_build_phase_size() - 1
393+
and mpu.is_pipeline_last_stage()
394+
)
395+
or (bvr == 0 and bbp == 0 and mpu.is_pipeline_first_stage())
396+
or (
397+
bvr == mpu.get_spiral_backward_virtual_size() - 1
398+
and bbp == sbs.get_spiral_backward_stage_build_phase_size() - 1
399+
and mpu.is_pipeline_last_stage()
400+
)
401+
)
385402
with SpiralWrapperInitContext(enabled=True):
386-
this_model = Float16Module(this_model, args)
403+
this_model = Float16Module(this_model, args, spiral_disable_cast=not enable_cast)
387404

388405
# reset states of the callee
389406
if mpu.is_spiral_forward_stage():

0 commit comments

Comments
 (0)