Skip to content

Commit 16eea87

Browse files
committed
Merge branch 'ko3n1g/ci/retry-unit-tests' into 'main'
tests: Make unit tests atomic again. See merge request ADLR/megatron-lm!1810.
2 parents 1637c68 + 02a3f91 commit 16eea87

13 files changed

Lines changed: 235 additions & 232 deletions

.gitlab-ci.yml

Lines changed: 22 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -112,7 +112,7 @@ metadata:
112112
- if: '$FUNCTIONAL_TEST == "yes"'
113113

114114
ppp_capacity_statistics:
115-
tags: [mcore-ssh-node]
115+
tags: [mcore-ssh-node-A]
116116
stage: .pre
117117
image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
118118
script:
@@ -169,7 +169,7 @@ ppp_capacity_statistics:
169169
170170
build_image:
171171
tags:
172-
- mcore-docker-node
172+
- 8xL40S-builder
173173
image: docker:26.1.4-dind
174174
needs: [] # May start ASAP
175175
stage: build
@@ -229,113 +229,37 @@ build_image:
229229
fi
230230
retry:
231231
max: 2
232-
233-
.unit_test_common:
232+
233+
unit_tests:
234234
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
235235
stage: unit_tests
236236
needs: [build_image]
237237
tags:
238238
- 8xL40S
239239
variables:
240240
MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE
241-
retry:
242-
max: 2
243-
when: job_execution_timeout
244-
245-
unit_tests:
246-
extends: [.unit_test_common]
247-
script:
248-
- torchrun --nproc_per_node=8 -m pytest -x -v -s --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail tests/unit_tests
249-
coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
250-
artifacts:
251-
paths:
252-
- coverage
253-
expire_in: 30 days
254-
rules:
255-
- if: '$FUNCTIONAL_TEST == "yes" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
256-
allow_failure: true
257-
- if: '$FUNCTIONAL_TEST == "yes"'
258-
259-
unit_tests-data:
260-
extends: [.unit_test_common]
261-
script:
262-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/data
263-
rules:
264-
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
265-
allow_failure: true
266-
- if: '$FUNCTIONAL_TEST == "no"'
267-
268-
unit_tests-dist-checkpointing:
269-
extends: [.unit_test_common]
270-
script:
271-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/dist_checkpointing
272-
rules:
273-
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
274-
allow_failure: true
275-
- if: '$FUNCTIONAL_TEST == "no"'
276-
277-
unit_tests-fusions:
278-
extends: [.unit_test_common]
279-
script:
280-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/fusions
281-
rules:
282-
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
283-
allow_failure: true
284-
- if: '$FUNCTIONAL_TEST == "no"'
285-
286-
unit_tests-inference:
287-
extends: [.unit_test_common]
288-
script:
289-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/inference
290241
rules:
291242
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
292243
allow_failure: true
293-
- if: '$FUNCTIONAL_TEST == "no"'
294-
295-
unit_tests-models:
296-
extends: [.unit_test_common]
297-
script:
298-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/models
299-
rules:
300-
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
301-
allow_failure: true
302-
- if: '$FUNCTIONAL_TEST == "no"'
303-
304-
unit_tests-pipeline-parallel:
305-
extends: [.unit_test_common]
306-
script:
307-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/pipeline_parallel
308-
rules:
309-
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
310-
allow_failure: true
311-
- if: '$FUNCTIONAL_TEST == "no"'
312-
313-
unit_tests-tensor-parallel:
314-
extends: [.unit_test_common]
315-
script:
316-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/tensor_parallel
317-
rules:
318-
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
319-
allow_failure: true
320-
- if: '$FUNCTIONAL_TEST == "no"'
321-
322-
unit_tests-transformer:
323-
extends: [.unit_test_common]
324-
script:
325-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/transformer
326-
rules:
327-
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
328-
allow_failure: true
329-
- if: '$FUNCTIONAL_TEST == "no"'
330-
331-
unit_tests-top-py:
332-
extends: [.unit_test_common]
244+
- when: always
245+
parallel:
246+
matrix:
247+
- DIR:
248+
- data
249+
- dist_checkpointing
250+
- distributed
251+
- fusions
252+
- inference
253+
- models
254+
- pipeline_parallel
255+
- tensor_parallel
256+
- transformer
257+
- '*.py'
333258
script:
334-
- torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/*.py
335-
rules:
336-
- if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
337-
allow_failure: true
338-
- if: '$FUNCTIONAL_TEST == "no"'
259+
- torchrun --nproc_per_node=8 -m pytest -x -v -s --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail tests/unit_tests/$DIR
260+
artifacts:
261+
paths:
262+
- coverage
339263

340264
docs_build_test:
341265
image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/python-format:0.0.1

tests/unit_tests/data/test_builder.py

Lines changed: 10 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -2,35 +2,20 @@
22
# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import
33
##
44

5-
import torch
6-
7-
from megatron.core.datasets.utils import compile_helpers
8-
from tests.unit_tests.test_utilities import Utils
9-
10-
if torch.distributed.is_available():
11-
Utils.initialize_distributed()
12-
if torch.distributed.get_rank() == 0:
13-
compile_helpers()
14-
torch.distributed.barrier()
15-
else:
16-
compile_helpers()
17-
18-
##
19-
# Done
20-
##
21-
225
import os
236
import tempfile
247
from collections import defaultdict
258
from typing import Dict, Optional
269

2710
import numpy
11+
import pytest
2812
import torch
2913

3014
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
3115
from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
3216
from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset
33-
from megatron.core.datasets.utils import Split, get_blend_from_list
17+
from megatron.core.datasets.utils import Split, compile_helpers, get_blend_from_list
18+
from tests.unit_tests.test_utilities import Utils
3419

3520
_NUM_DATASETS = 10
3621

@@ -62,6 +47,13 @@ def do_setup(odir):
6247

6348

6449
def test_builder():
50+
if torch.distributed.is_available():
51+
Utils.initialize_distributed()
52+
if torch.distributed.get_rank() == 0:
53+
compile_helpers()
54+
torch.distributed.barrier()
55+
else:
56+
compile_helpers()
6557

6658
# Define the class here to avoid pytest warnings
6759

tests/unit_tests/data/test_gpt_dataset.py

Lines changed: 11 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -2,30 +2,16 @@
22
# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import
33
##
44

5-
import torch
6-
7-
from megatron.core.datasets.utils import compile_helpers
8-
from tests.unit_tests.test_utilities import Utils
9-
10-
if torch.distributed.is_available():
11-
Utils.initialize_distributed()
12-
if torch.distributed.get_rank() == 0:
13-
compile_helpers()
14-
torch.distributed.barrier()
15-
else:
16-
compile_helpers()
17-
18-
##
19-
# Done
20-
##
21-
225
import random
236

247
import numpy
8+
import torch
259

2610
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
2711
from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
12+
from megatron.core.datasets.utils import compile_helpers
2813
from megatron.training.tokenizer.tokenizer import _NullTokenizer
14+
from tests.unit_tests.test_utilities import Utils
2915

3016
_MOCK_VOCAB_SIZE = 8192
3117

@@ -40,6 +26,14 @@ def sample_N(dataset, N, randomize):
4026

4127

4228
def test_mock_gpt_dataset():
29+
if torch.distributed.is_available():
30+
Utils.initialize_distributed()
31+
if torch.distributed.get_rank() == 0:
32+
compile_helpers()
33+
torch.distributed.barrier()
34+
else:
35+
compile_helpers()
36+
4337
tokenizer = _NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE)
4438

4539
config = GPTDatasetConfig(

tests/unit_tests/data/test_multimodal_dataset.py

Lines changed: 12 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -4,33 +4,28 @@
44
# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import
55
##
66

7-
import torch
8-
9-
from megatron.core.datasets.utils import compile_helpers
10-
from tests.unit_tests.test_utilities import Utils
11-
12-
if torch.distributed.is_available():
13-
Utils.initialize_distributed()
14-
if torch.distributed.get_rank() == 0:
15-
compile_helpers()
16-
torch.distributed.barrier()
17-
else:
18-
compile_helpers()
19-
20-
##
21-
# Done
22-
##
23-
247
from types import SimpleNamespace
258

9+
import torch
10+
2611
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
2712
from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig
13+
from megatron.core.datasets.utils import compile_helpers
2814
from megatron.training.tokenizer.tokenizer import _NullTokenizer
15+
from tests.unit_tests.test_utilities import Utils
2916

3017
_MOCK_VOCAB_SIZE = 8192
3118

3219

3320
def test_mock_multimodal_dataset():
21+
if torch.distributed.is_available():
22+
Utils.initialize_distributed()
23+
if torch.distributed.get_rank() == 0:
24+
compile_helpers()
25+
torch.distributed.barrier()
26+
else:
27+
compile_helpers()
28+
3429
config = MultimodalDatasetConfig(
3530
random_seed=1234,
3631
sequence_length=1024,

tests/unit_tests/dist_checkpointing/models/test_bert_model.py

Lines changed: 23 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,25 @@
11
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
22

3-
from megatron.core.models.bert.bert_model import BertModel
4-
import pytest
5-
63
import os
4+
5+
import pytest
76
import torch
8-
from torch.distributed._tensor import DeviceMesh
97

10-
from megatron.core.dist_checkpointing import save, load, load_plain_tensors
118
from megatron.core import parallel_state as ps
12-
from megatron.core.dist_checkpointing.dict_utils import diff
9+
from megatron.core.models.bert.bert_layer_specs import (
10+
bert_layer_local_spec,
11+
bert_layer_with_transformer_engine_spec,
12+
)
13+
from megatron.core.models.bert.bert_model import BertModel
14+
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
1315
from megatron.core.transformer.transformer_config import TransformerConfig
14-
from tests.unit_tests.dist_checkpointing import TempNamedDir
15-
from tests.unit_tests.dist_checkpointing.models.common import \
16-
common_test_simple_sharded_state_dict_save_load, \
17-
common_test_parallel_reconfiguration_e2e, common_test_state_dict_comparison, \
18-
common_test_vocab_size_padding_change
16+
from tests.unit_tests.dist_checkpointing.models.common import (
17+
common_test_parallel_reconfiguration_e2e,
18+
common_test_simple_sharded_state_dict_save_load,
19+
common_test_state_dict_comparison,
20+
common_test_vocab_size_padding_change,
21+
)
1922
from tests.unit_tests.test_utilities import Utils
20-
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
21-
from megatron.core.models.bert.bert_layer_specs import bert_layer_local_spec, bert_layer_with_transformer_engine_spec
2223

2324

2425
def initialize_bert_model(seed, layer_spec_fn=bert_layer_with_transformer_engine_spec, vocab_size=128, **config_kwargs):
@@ -52,6 +53,12 @@ def test_sharded_state_dict_save_load(self, tmp_path_dist_ckpt,
5253

5354

5455
class TestBERTModelReconfiguration:
56+
def setup_method(self, method):
57+
pass
58+
59+
def teardown_method(self, method):
60+
Utils.destroy_model_parallel()
61+
5562
@pytest.mark.parametrize(
5663
('use_fpsl', 'src_tp_pp', 'dest_tp_pp', 'src_layer_spec', 'dst_layer_spec'),
5764
[
@@ -67,6 +74,8 @@ class TestBERTModelReconfiguration:
6774
def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp,
6875
src_layer_spec, dst_layer_spec, use_fpsl):
6976
""" Test model saving and loading with different TP/PP """
77+
Utils.initialize_model_parallel(src_tp_pp[0], src_tp_pp[1])
78+
7079
common_test_parallel_reconfiguration_e2e(initialize_bert_model, tmp_path_dist_ckpt, src_tp_pp,
7180
dest_tp_pp, src_layer_spec, dst_layer_spec, use_fpsl)
7281

@@ -82,5 +91,6 @@ def test_state_dict_comparison(self, tmp_path_dist_ckpt):
8291
])
8392
def test_vocab_size_padding_change(self, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp):
8493
""" Test model loading with different vocab size (caused by TP padding). """
94+
Utils.initialize_model_parallel(src_tp_pp[0], src_tp_pp[1])
8595
common_test_vocab_size_padding_change(initialize_bert_model, tmp_path_dist_ckpt, vocab_size_base,
8696
src_tp_pp, dest_tp_pp)

0 commit comments

Comments
 (0)