Skip to content

Commit b498194

Browse files
ko3n1g authored and jaredcasper committed
ADLR/megatron-lm!1954 - Style: Formatting and imports
1 parent ef85bc9 commit b498194

File tree

25 files changed

+571
-411
lines changed

25 files changed

+571
-411
lines changed

.flake8

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
[flake8]
22
max-line-length = 100
3-
extend-ignore = E203
3+
extend-ignore = E203,E501,F401,E402,E714
44
per-file-ignores = __init__.py:F401

.gitlab/stages/01.tests.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -123,8 +123,9 @@ formatting:
123123
stage: test
124124
needs: [build_image]
125125
script:
126+
- env
126127
- git fetch origin main
127-
- CHECK_ONLY=true bash tools/autoformat.sh
128+
- CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh
128129

129130
copyright:
130131
extends: [.tests_common]

.pylintrc

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
[MAIN]
22
ignore-paths=tests
3+
max-line-length=100
34

45
[MESSAGES CONTROL]
56
disable=all
67

7-
enable=C0115,C0116
8+
enable=C0115,C0116,W0611,C0301
89
# C0115: missing-class-docstring
9-
# C0116: missing-function-docstring
10+
# C0116: missing-function-docstring
11+
# W0611: unused-import
12+
# C0301: line-too-long

megatron/core/models/gpt/gpt_layer_specs.py

Lines changed: 36 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
22

3+
from typing import Optional
4+
35
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
46
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
57
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
@@ -9,7 +11,6 @@
911
from megatron.core.transformer.mlp import MLP, MLPSubmodules
1012
from megatron.core.transformer.moe.moe_layer import MoELayer
1113
from megatron.core.transformer.spec_utils import ModuleSpec
12-
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
1314
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
1415

1516
try:
@@ -27,7 +28,7 @@
2728
HAVE_TE = False
2829

2930
try:
30-
import apex
31+
import apex # pylint: disable=unused-import
3132

3233
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
3334

@@ -38,14 +39,26 @@
3839

3940
from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm
4041

41-
warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm')
42+
warnings.warn('Apex is not installed. Falling back to Torch LayerNorm')
4243
LNImpl = WrappedTorchLayerNorm
4344

4445

45-
# Use this spec to use lower level Transformer Engine modules (required for fp8 training)
4646
def get_gpt_layer_with_transformer_engine_spec(
47-
num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False
47+
num_experts: Optional[int] = None,
48+
moe_grouped_gemm: Optional[bool] = False,
49+
qk_layernorm: Optional[bool] = False,
4850
) -> ModuleSpec:
51+
"""Use this spec to use lower-level Transformer Engine modules (required for fp8 training).
52+
53+
54+
Args:
55+
num_experts (int, optional): Number of experts. Defaults to None.
56+
moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
57+
qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
58+
59+
Returns:
60+
ModuleSpec: Module specification with TE modules
61+
"""
4962
mlp = _get_mlp_module_spec(
5063
use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm
5164
)
@@ -73,10 +86,22 @@ def get_gpt_layer_with_transformer_engine_spec(
7386
)
7487

7588

76-
# Use this spec for an implementation using only modules in megatron core
7789
def get_gpt_layer_local_spec(
78-
num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False
90+
num_experts: Optional[int] = None,
91+
moe_grouped_gemm: Optional[bool] = False,
92+
qk_layernorm: Optional[bool] = False,
7993
) -> ModuleSpec:
94+
"""Use this spec for an implementation using only modules in Megatron-Core.
95+
96+
97+
Args:
98+
num_experts (int, optional): Number of experts. Defaults to None.
99+
moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
100+
qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
101+
102+
Returns:
103+
ModuleSpec: Module specification with Megatron-Core modules
104+
"""
80105
mlp = _get_mlp_module_spec(
81106
use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm
82107
)
@@ -107,10 +132,12 @@ def get_gpt_layer_local_spec(
107132
)
108133

109134

110-
# Helper function to get module spec for MLP/MoE
111135
def _get_mlp_module_spec(
112-
use_te: bool = True, num_experts: int = None, moe_grouped_gemm: bool = False
136+
use_te: Optional[bool] = True,
137+
num_experts: Optional[int] = None,
138+
moe_grouped_gemm: Optional[bool] = False,
113139
) -> ModuleSpec:
140+
"""Helper function to get module spec for MLP/MoE"""
114141
if num_experts is None:
115142
# Dense MLP w/ or w/o TE modules.
116143
return ModuleSpec(

megatron/core/models/gpt/gpt_model.py

Lines changed: 41 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,43 +1,58 @@
11
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
22

3-
import logging
43
from collections import OrderedDict
5-
from typing import Dict, Literal, Optional, Tuple, Union
4+
from typing import Dict, Literal, Optional
65

7-
import torch
86
from torch import Tensor
97

10-
from megatron.core import InferenceParams, parallel_state, tensor_parallel
8+
from megatron.core import InferenceParams, tensor_parallel
119
from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk
1210
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
1311
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
1412
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
1513
from megatron.core.models.common.language_module.language_module import LanguageModule
1614
from megatron.core.packed_seq_params import PackedSeqParams
17-
from megatron.core.transformer.enums import AttnMaskType, ModelType
15+
from megatron.core.transformer.enums import ModelType
1816
from megatron.core.transformer.spec_utils import ModuleSpec
1917
from megatron.core.transformer.transformer_block import TransformerBlock
2018
from megatron.core.transformer.transformer_config import TransformerConfig
21-
from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint
2219

2320

2421
class GPTModel(LanguageModule):
2522
"""GPT Transformer language model.
2623
2724
Args:
28-
config (TransformerConfig): Transformer config
29-
transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers
30-
vocab_size (int): Vocabulary size
31-
max_sequence_length (int): maximum size of sequence. This is used for positional embedding
32-
pre_process (bool, optional): Include embedding layer (used with pipeline parallelism). Defaults to True.
33-
post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True.
34-
fp16_lm_cross_entropy (bool, optional): Defaults to False.
35-
parallel_output (bool, optional): Do not gather the outputs, keep them split across tensor parallel ranks. Defaults to True.
36-
share_embeddings_and_output_weights (bool, optional): When True, input embeddings and output logit weights are shared. Defaults to False.
37-
position_embedding_type (Literal[learned_absolute,rope], optional): Position embedding type.. Defaults to 'learned_absolute'.
38-
rotary_percent (float, optional): Percent of rotary dimension to use for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 1.0.
39-
rotary_base (int, optional): Base period for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 10000.
40-
seq_len_interpolation_factor (Optional[float], optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None.
25+
config (TransformerConfig):
26+
Transformer config
27+
transformer_layer_spec (ModuleSpec):
28+
Specifies module to use for transformer layers
29+
vocab_size (int):
30+
Vocabulary size
31+
max_sequence_length (int):
32+
maximum size of sequence. This is used for positional embedding
33+
pre_process (bool, optional):
34+
Include embedding layer (used with pipeline parallelism). Defaults to True.
35+
post_process (bool, optional):
36+
Include an output layer (used with pipeline parallelism). Defaults to True.
37+
fp16_lm_cross_entropy (bool, optional):
38+
Defaults to False.
39+
parallel_output (bool, optional):
40+
Do not gather the outputs, keep them split across tensor
41+
parallel ranks. Defaults to True.
42+
share_embeddings_and_output_weights (bool, optional):
43+
When True, input embeddings and output logit weights are shared. Defaults to False.
44+
position_embedding_type (Literal[learned_absolute,rope], optional):
45+
Position embedding type.. Defaults to 'learned_absolute'.
46+
rotary_percent (float, optional):
47+
Percent of rotary dimension to use for rotary position embeddings.
48+
Ignored unless position_embedding_type is 'rope'. Defaults to 1.0.
49+
rotary_base (int, optional):
50+
Base period for rotary position embeddings. Ignored unless
51+
position_embedding_type is 'rope'.
52+
Defaults to 10000.
53+
seq_len_interpolation_factor (Optional[float], optional):
54+
scale of linearly interpolating RoPE for longer sequences.
55+
The value must be a float larger than 1.0. Defaults to None.
4156
"""
4257

4358
def __init__(
@@ -113,8 +128,9 @@ def __init__(
113128
# all the micro-batches of a global batch for the last pipeline stage. Once we are
114129
# done with all the back props for all the microbatches for the last pipeline stage,
115130
# it will be in the pipeline flush stage. During this pipeline flush we use the
116-
# input activations stored in embedding activation buffer and gradient outputs stored
117-
# in gradient buffer to calculate the weight gradients for the embedding final linear layer.
131+
# input activations stored in embedding activation buffer and gradient outputs
132+
# stored in gradient buffer to calculate the weight gradients for the embedding
133+
# final linear layer.
118134
self.embedding_activation_buffer = []
119135
self.grad_output_buffer = []
120136
else:
@@ -239,7 +255,8 @@ def forward(
239255
def sharded_state_dict(
240256
self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[Dict] = None
241257
) -> ShardedStateDict:
242-
"""Sharded state dict implementation for GPTModel backward-compatibility (removing extra state).
258+
"""Sharded state dict implementation for GPTModel backward-compatibility
259+
(removing extra state).
243260
244261
Args:
245262
prefix (str): Module name prefix.
@@ -252,8 +269,8 @@ def sharded_state_dict(
252269
sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
253270
output_layer_extra_state_key = f'{prefix}output_layer._extra_state'
254271

255-
# Old GPT checkpoints only stored the output layer weight key. So we remove the _extra_state key
256-
# but check that it doesn't contain any data anyway
272+
# Old GPT checkpoints only stored the output layer weight key. So we remove the
273+
# _extra_state key but check that it doesn't contain any data anyway
257274
output_extra_state = sharded_state_dict.pop(output_layer_extra_state_key, None)
258275
assert not (
259276
output_extra_state and output_extra_state.data

megatron/core/parallel_state.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -255,7 +255,8 @@ def __init__(
255255
for name in self.name_to_size.keys():
256256
if name not in order and self.name_to_size[name] != 1:
257257
raise RuntimeError(
258-
f"The size of ({name}) is ({self.name_to_size[name]}), but you haven't specified the order ({self.order})."
258+
f"The size of ({name}) is ({self.name_to_size[name]}), but you haven't"
259+
f"specified the order ({self.order})."
259260
)
260261
elif name not in order:
261262
order = order + '-' + name
@@ -355,6 +356,7 @@ def initialize_model_parallel(
355356
get_embedding_ranks: Optional[Callable[[List[int], Optional[int]], List[int]]] = None,
356357
get_position_embedding_ranks: Optional[Callable[[List[int], Optional[int]], List[int]]] = None,
357358
) -> None:
359+
# pylint: disable=line-too-long
358360
"""Initialize model data parallel groups.
359361
360362
Args:
@@ -524,7 +526,8 @@ def initialize_model_parallel(
524526

525527
if data_parallel_size % expert_model_parallel_size != 0:
526528
raise RuntimeError(
527-
f"data_parallel_size ({data_parallel_size}) is not divisible by expert_model_parallel_size "
529+
f"data_parallel_size ({data_parallel_size}) is not divisible by "
530+
"expert_model_parallel_size "
528531
)
529532

530533
encoder_world_size = encoder_model_size * data_parallel_size
@@ -999,20 +1002,23 @@ def get_tensor_and_context_parallel_group():
9991002

10001003

10011004
def get_expert_model_parallel_group():
1005+
"""Get the expert model parallel group the caller rank belongs to."""
10021006
assert (
10031007
_EXPERT_MODEL_PARALLEL_GROUP is not None
10041008
), 'expert model parallel group is not initialized'
10051009
return _EXPERT_MODEL_PARALLEL_GROUP
10061010

10071011

10081012
def get_tensor_and_expert_parallel_group():
1013+
"""Get the tensor and expert parallel group the caller rank belongs to."""
10091014
assert (
10101015
_TENSOR_AND_EXPERT_PARALLEL_GROUP is not None
10111016
), 'tensor and expert parallel group is not initialized'
10121017
return _TENSOR_AND_EXPERT_PARALLEL_GROUP
10131018

10141019

10151020
def get_data_modulo_expert_parallel_group(with_context_parallel=False):
1021+
"""Get the data modulo expert parallel group the caller rank belongs to."""
10161022
if with_context_parallel:
10171023
assert (
10181024
_DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP is not None
@@ -1026,6 +1032,7 @@ def get_data_modulo_expert_parallel_group(with_context_parallel=False):
10261032

10271033

10281034
def get_data_modulo_expert_parallel_group_gloo(with_context_parallel=False):
1035+
"""Get the data modulo expert parallel group gloo the caller rank belongs to."""
10291036
if with_context_parallel:
10301037
assert (
10311038
_DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO is not None
@@ -1039,6 +1046,7 @@ def get_data_modulo_expert_parallel_group_gloo(with_context_parallel=False):
10391046

10401047

10411048
def set_expert_model_parallel_world_size(world_size):
1049+
"""Sets the expert model parallel world size."""
10421050
global _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE
10431051
_MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = world_size
10441052

@@ -1327,7 +1335,8 @@ def get_pipeline_model_parallel_last_rank():
13271335

13281336
def get_pipeline_model_parallel_next_rank():
13291337
"""Return the global rank that follows the caller in the pipeline, for each pipeline group that
1330-
the rank is part of. If it's just part of one group, an int is returned, otherwise a list of ints.
1338+
the rank is part of. If it's just part of one group, an int is returned,
1339+
otherwise a list of ints.
13311340
"""
13321341
assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
13331342
rank_in_pipeline = get_pipeline_model_parallel_rank()
@@ -1343,7 +1352,8 @@ def get_pipeline_model_parallel_next_rank():
13431352

13441353
def get_pipeline_model_parallel_prev_rank():
13451354
"""Return the global rank that preceeds the caller in the pipeline, for each pipeline group that
1346-
the rank is part of. If it's just part of one group, an int is returned, otherwise a list of ints.
1355+
the rank is part of. If it's just part of one group, an int is returned,
1356+
otherwise a list of ints.
13471357
"""
13481358
assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
13491359
rank_in_pipeline = get_pipeline_model_parallel_rank()

0 commit comments

Comments (0)