@@ -51,14 +51,21 @@ def __init__(self, client_module, inference=True):
         try:
             from megatron.model.transformer import ParallelTransformerLayer
             MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer
+            MegatronLayerPolicy.version = 1
         except ImportError:
             MegatronLayerPolicy._orig_layer_class = None
 
     def get_hidden_heads(self):
-        return self.client_module.attention.query_key_value.weight.shape[1], \
-                self.client_module.attention.num_attention_heads, \
-                self.client_module.input_layernorm.eps, \
-                DEFAULT_INTERMEDIATE_SIZE
+        if MegatronLayerPolicy.version == 0:
+            return self.client_module.attention.query_key_value.weight.shape[1], \
+                    self.client_module.attention.num_attention_heads, \
+                    self.client_module.input_layernorm.eps, \
+                    DEFAULT_INTERMEDIATE_SIZE
+        else:
+            return self.client_module.self_attention.query_key_value.weight.shape[1], \
+                    self.client_module.self_attention.num_attention_heads, \
+                    self.client_module.input_layernorm.eps, \
+                    DEFAULT_INTERMEDIATE_SIZE
 
     def attention(self, enable_training=False):
         if self.inference:
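For context, a minimal runnable sketch of the dispatch pattern this change introduces: newer Megatron layers expose their attention block as self_attention while older ones use attention, and get_hidden_heads picks the attribute based on a class-level version flag. This is an illustration only, not DeepSpeed or Megatron code; the Sketch class name, the fake-layer helper, the concrete sizes, and the -1 sentinel for DEFAULT_INTERMEDIATE_SIZE are assumptions made for the example.

    from types import SimpleNamespace

    DEFAULT_INTERMEDIATE_SIZE = -1  # placeholder sentinel for "use the default"

    class MegatronLayerPolicySketch:
        version = 0  # bumped to 1 when the newer Megatron import succeeds

        def __init__(self, client_module):
            self.client_module = client_module

        def get_hidden_heads(self):
            # Older Megatron: layer.attention; newer Megatron: layer.self_attention.
            attn = (self.client_module.attention
                    if MegatronLayerPolicySketch.version == 0
                    else self.client_module.self_attention)
            return (attn.query_key_value.weight.shape[1],
                    attn.num_attention_heads,
                    self.client_module.input_layernorm.eps,
                    DEFAULT_INTERMEDIATE_SIZE)

    def make_fake_layer(new_style):
        # Toy stand-in for Megatron's ParallelTransformerLayer attribute layout.
        attn = SimpleNamespace(
            query_key_value=SimpleNamespace(weight=SimpleNamespace(shape=(3072, 1024))),
            num_attention_heads=16)
        norm = SimpleNamespace(eps=1e-5)
        if new_style:
            return SimpleNamespace(self_attention=attn, input_layernorm=norm)
        return SimpleNamespace(attention=attn, input_layernorm=norm)

    MegatronLayerPolicySketch.version = 1
    print(MegatronLayerPolicySketch(make_fake_layer(new_style=True)).get_hidden_heads())
    # -> (1024, 16, 1e-05, -1)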