Skip to content

Commit 394c622

Browse files
authored
Merge pull request #768 from NVIDIA/am/megatron-cmd
Do not enable recompute-activations by default
2 parents 28c4246 + a2a5916 commit 394c622

File tree

3 files changed

+10
-5
lines changed

3 files changed

+10
-5
lines changed

src/cloudai/workloads/megatron_run/megatron_run.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -36,7 +36,7 @@ class MegatronRunCmdArgs(CmdArgs):
3636
num_attention_heads: Optional[int] = 32
3737
num_layers: Optional[int] = 32
3838
pipeline_model_parallel_size: Optional[int] = 1
39-
recompute_activations: Optional[str] = ""
39+
recompute_activations: Optional[str] = None
4040
seq_length: Optional[int] = 4096
4141
tensor_model_parallel_size: Optional[int] = 2
4242

tests/ref_data/megatron-run.sbatch

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,4 +14,4 @@ srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09
1414

1515
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
1616

17-
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python __CLOUDAI_DIR__/run.py --global-batch-size 16 --hidden-size 4096 --max-position-embeddings 4096 --num-attention-heads 32 --num-layers 32 --pipeline-model-parallel-size 1 --recompute-activations --seq-length 4096 --tensor-model-parallel-size 2 --save __CLOUDAI_DIR__ --load __CLOUDAI_DIR__ --tokenizer-model __CLOUDAI_DIR__/model.m"
17+
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python __CLOUDAI_DIR__/run.py --global-batch-size 16 --hidden-size 4096 --max-position-embeddings 4096 --num-attention-heads 32 --num-layers 32 --pipeline-model-parallel-size 1 --seq-length 4096 --tensor-model-parallel-size 2 --save __CLOUDAI_DIR__ --load __CLOUDAI_DIR__ --tokenizer-model __CLOUDAI_DIR__/model.m"

tests/test_test_definitions.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -283,10 +283,15 @@ def test_default(self, megatron_run: MegatronRunTestDefinition):
283283
assert "--num-attention-heads 32" in cmd
284284
assert "--num-layers 32" in cmd
285285
assert "--pipeline-model-parallel-size 1" in cmd
286-
assert "--recompute-activations " in cmd
286+
assert "--recompute-activations" not in cmd
287287
assert "--seq-length 4096" in cmd
288288
assert "--tensor-model-parallel-size 2" in cmd
289289

290+
def test_recompute_activations_set(self, megatron_run: MegatronRunTestDefinition):
291+
megatron_run.cmd_args.recompute_activations = ""
292+
cmd = " ".join([f"{k} {v}" for k, v in megatron_run.cmd_args_dict.items()])
293+
assert "--recompute-activations " in cmd
294+
290295
def test_nones_are_dropped(self, megatron_run: MegatronRunTestDefinition):
291296
to_be_none = {
292297
"hidden_size": None,

0 commit comments

Comments
 (0)