NVIDIA
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
Lines changed: 22 additions & 98 deletions
diff --git a/tests/unit_tests/data/test_builder.py b/tests/unit_tests/data/test_builder.py
Lines changed: 10 additions & 18 deletions
diff --git a/tests/unit_tests/data/test_gpt_dataset.py b/tests/unit_tests/data/test_gpt_dataset.py
Lines changed: 11 additions & 17 deletions
diff --git a/tests/unit_tests/data/test_multimodal_dataset.py b/tests/unit_tests/data/test_multimodal_dataset.py
Lines changed: 12 additions & 17 deletions
diff --git a/tests/unit_tests/dist_checkpointing/models/test_bert_model.py b/tests/unit_tests/dist_checkpointing/models/test_bert_model.py
Lines changed: 23 additions & 13 deletions
@@ -112,7 +112,7 @@ metadata:
     - if: '$FUNCTIONAL_TEST == "yes"'
 
 ppp_capacity_statistics:
-  tags: [mcore-ssh-node]
+  tags: [mcore-ssh-node-A]
   stage: .pre
   image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
   script:
@@ -169,7 +169,7 @@ ppp_capacity_statistics:
 
 build_image:
   tags:
-    - mcore-docker-node
+    - 8xL40S-builder
   image: docker:26.1.4-dind
   needs: []  # May start ASAP
   stage: build
@@ -229,113 +229,37 @@ build_image:
       fi
   retry:
     max: 2
-    
-.unit_test_common:
+
+unit_tests:
   image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
   stage: unit_tests
   needs: [build_image]
   tags:
     - 8xL40S
   variables:
     MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE
-  retry:
-    max: 2
-    when: job_execution_timeout
-
-unit_tests:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail tests/unit_tests
-  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
-  artifacts:
-    paths:
-      - coverage
-    expire_in: 30 days
-  rules:
-    - if: '$FUNCTIONAL_TEST == "yes" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "yes"'
-
-unit_tests-data:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/data
-  rules:
-    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
-
-unit_tests-dist-checkpointing:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/dist_checkpointing
-  rules:
-    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
-
-unit_tests-fusions:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/fusions
-  rules:
-    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
-
-unit_tests-inference:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/inference
   rules:
     - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
       allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
-
-unit_tests-models:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/models
-  rules:
-    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
-
-unit_tests-pipeline-parallel:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/pipeline_parallel
-  rules:
-    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
-
-unit_tests-tensor-parallel:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/tensor_parallel
-  rules:
-    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
-
-unit_tests-transformer:
-  extends: [.unit_test_common]
-  script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/transformer
-  rules:
-    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
-
-unit_tests-top-py:
-  extends: [.unit_test_common]
+    - when: always
+  parallel:
+    matrix:
+      - DIR: 
+        - data
+        - dist_checkpointing
+        - distributed
+        - fusions
+        - inference
+        - models
+        - pipeline_parallel
+        - tensor_parallel
+        - transformer
+        - '*.py'
   script:
-    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/*.py
-  rules:
-    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
-      allow_failure: true
-    - if: '$FUNCTIONAL_TEST == "no"'
+    - torchrun --nproc_per_node=8 -m pytest -x -v -s --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail tests/unit_tests/$DIR
+  artifacts:
+    paths:
+      - coverage
 
 docs_build_test:
   image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/python-format:0.0.1
 
@@ -2,35 +2,20 @@
 # Compile megatron.core.datasets.helpers dependencies before BlendedDataset import
 ##
 
-import torch
-
-from megatron.core.datasets.utils import compile_helpers
-from tests.unit_tests.test_utilities import Utils
-
-if torch.distributed.is_available():
-    Utils.initialize_distributed()
-    if torch.distributed.get_rank() == 0:
-        compile_helpers()
-    torch.distributed.barrier()
-else:
-    compile_helpers()
-
-##
-# Done
-##
-
 import os
 import tempfile
 from collections import defaultdict
 from typing import Dict, Optional
 
 import numpy
+import pytest
 import torch
 
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
 from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset
-from megatron.core.datasets.utils import Split, get_blend_from_list
+from megatron.core.datasets.utils import Split, compile_helpers, get_blend_from_list
+from tests.unit_tests.test_utilities import Utils
 
 _NUM_DATASETS = 10
 
@@ -62,6 +47,13 @@ def do_setup(odir):
 
 
 def test_builder():
+    if torch.distributed.is_available():
+        Utils.initialize_distributed()
+        if torch.distributed.get_rank() == 0:
+            compile_helpers()
+        torch.distributed.barrier()
+    else:
+        compile_helpers()
 
     # Define the class here to avoid pytest warnings
 
 
@@ -2,30 +2,16 @@
 # Compile megatron.core.datasets.helpers dependencies before BlendedDataset import
 ##
 
-import torch
-
-from megatron.core.datasets.utils import compile_helpers
-from tests.unit_tests.test_utilities import Utils
-
-if torch.distributed.is_available():
-    Utils.initialize_distributed()
-    if torch.distributed.get_rank() == 0:
-        compile_helpers()
-    torch.distributed.barrier()
-else:
-    compile_helpers()
-
-##
-# Done
-##
-
 import random
 
 import numpy
+import torch
 
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
+from megatron.core.datasets.utils import compile_helpers
 from megatron.training.tokenizer.tokenizer import _NullTokenizer
+from tests.unit_tests.test_utilities import Utils
 
 _MOCK_VOCAB_SIZE = 8192
 
@@ -40,6 +26,14 @@ def sample_N(dataset, N, randomize):
 
 
 def test_mock_gpt_dataset():
+    if torch.distributed.is_available():
+        Utils.initialize_distributed()
+        if torch.distributed.get_rank() == 0:
+            compile_helpers()
+        torch.distributed.barrier()
+    else:
+        compile_helpers()
+
     tokenizer = _NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE)
 
     config = GPTDatasetConfig(
 
@@ -4,33 +4,28 @@
 # Compile megatron.core.datasets.helpers dependencies before BlendedDataset import
 ##
 
-import torch
-
-from megatron.core.datasets.utils import compile_helpers
-from tests.unit_tests.test_utilities import Utils
-
-if torch.distributed.is_available():
-    Utils.initialize_distributed()
-    if torch.distributed.get_rank() == 0:
-        compile_helpers()
-    torch.distributed.barrier()
-else:
-    compile_helpers()
-
-##
-# Done
-##
-
 from types import SimpleNamespace
 
+import torch
+
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig
+from megatron.core.datasets.utils import compile_helpers
 from megatron.training.tokenizer.tokenizer import _NullTokenizer
+from tests.unit_tests.test_utilities import Utils
 
 _MOCK_VOCAB_SIZE = 8192
 
 
 def test_mock_multimodal_dataset():
+    if torch.distributed.is_available():
+        Utils.initialize_distributed()
+        if torch.distributed.get_rank() == 0:
+            compile_helpers()
+        torch.distributed.barrier()
+    else:
+        compile_helpers()
+        
     config = MultimodalDatasetConfig(
         random_seed=1234,
         sequence_length=1024,
 
@@ -1,24 +1,25 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
-from megatron.core.models.bert.bert_model import BertModel
-import pytest
-
 import os
+
+import pytest
 import torch
-from torch.distributed._tensor import DeviceMesh
 
-from megatron.core.dist_checkpointing import save, load, load_plain_tensors
 from megatron.core import parallel_state as ps
-from megatron.core.dist_checkpointing.dict_utils import diff
+from megatron.core.models.bert.bert_layer_specs import (
+    bert_layer_local_spec,
+    bert_layer_with_transformer_engine_spec,
+)
+from megatron.core.models.bert.bert_model import BertModel
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer.transformer_config import TransformerConfig
-from tests.unit_tests.dist_checkpointing import TempNamedDir
-from tests.unit_tests.dist_checkpointing.models.common import \
-    common_test_simple_sharded_state_dict_save_load, \
-    common_test_parallel_reconfiguration_e2e, common_test_state_dict_comparison, \
-    common_test_vocab_size_padding_change
+from tests.unit_tests.dist_checkpointing.models.common import (
+    common_test_parallel_reconfiguration_e2e,
+    common_test_simple_sharded_state_dict_save_load,
+    common_test_state_dict_comparison,
+    common_test_vocab_size_padding_change,
+)
 from tests.unit_tests.test_utilities import Utils
-from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
-from megatron.core.models.bert.bert_layer_specs import bert_layer_local_spec, bert_layer_with_transformer_engine_spec
 
 
 def initialize_bert_model(seed, layer_spec_fn=bert_layer_with_transformer_engine_spec, vocab_size=128, **config_kwargs):
@@ -52,6 +53,12 @@ def test_sharded_state_dict_save_load(self, tmp_path_dist_ckpt,
 
 
 class TestBERTModelReconfiguration:
+    def setup_method(self, method):
+        pass
+    
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+        
     @pytest.mark.parametrize(
         ('use_fpsl', 'src_tp_pp', 'dest_tp_pp', 'src_layer_spec', 'dst_layer_spec'),
         [
@@ -67,6 +74,8 @@ class TestBERTModelReconfiguration:
     def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp,
                                           src_layer_spec, dst_layer_spec, use_fpsl):
         """ Test model saving and loading with different TP/PP """
+        Utils.initialize_model_parallel(src_tp_pp[0], src_tp_pp[1])
+                                        
         common_test_parallel_reconfiguration_e2e(initialize_bert_model, tmp_path_dist_ckpt, src_tp_pp,
                                                  dest_tp_pp, src_layer_spec, dst_layer_spec, use_fpsl)
 
@@ -82,5 +91,6 @@ def test_state_dict_comparison(self, tmp_path_dist_ckpt):
     ])
     def test_vocab_size_padding_change(self, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp):
         """ Test model loading with different vocab size (caused by TP padding). """
+        Utils.initialize_model_parallel(src_tp_pp[0], src_tp_pp[1])
         common_test_vocab_size_padding_change(initialize_bert_model, tmp_path_dist_ckpt, vocab_size_base,
                                               src_tp_pp, dest_tp_pp)