Commit ea2b48f

update config
1 parent f166413

1 file changed: 14 additions, 27 deletions

moe_pretraining/nemo/config_GB300_64x4x32xtp1pp4cp1.sh
```diff
@@ -12,43 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# SSH: username that connects to the remote cluster
-export USER="DUMMY"
-# SSH: remote cluster URL
-export HOST="DUMMY"
+# SLURM: Username on a cluster
+export USER="<SLURM_USER>"
 # Slurm: account for job submission
-export ACCOUNT="DUMMY"
+export ACCOUNT="<SLURM_ACCOUNT>"
 # Slurm: partition for job submission
-export PARTITION="DUMMY"
+export PARTITION="<SLURM_PARTITION>"
 # Slurm: job time limit, defaults to 8 hours
-export TIME="08:00:00"
-# Slurm: --nodes arguments, default to use 288 nodes
+export TIME="02:00:00"
+# Slurm: --nodes arguments
 export NNODES=64
-# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
+# Slurm: --gpus_per_node and --ntasks_per_node argument
 export GPUS_PER_NODE=4
-# Slurm: max job retries for transient job failures, defaults to retry 3 times
+# Slurm: max job retries for transient job failures
 export MAX_RETRIES=1
 
 # Folder mapping:
 # Output directory that holds logs, any path that you like.
-export JOB_DIR="/workspace/code/logs"
+export LOG_DIR="<LOG_DIR>"
 # Image / container path, either local cache file or remote URL
-export IMAGE="DUMMY"
+export IMAGE="<IMAGE>"
 # Dataset: C4 dataset location that contains the dataset after preprocessing
-# export ORIGINAL_C4_PATH="/data/data/C4"
-
-# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
-export PREPROCESSED_PATH="/data/deepseek_v3_671b/data/C4_processed"
-export MERGED_C4_PATH="/data/deepseek_v3_671b/data/C4_merged"
-# Dataset: Numpy index working directory, contains shuffled dataset
-# This path must be able to hold >400GB data
-export TMP_NPY_INDEX="/data/npy_indices"
-# Dataset: Tokenizer path
-# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
-export TOKENIZER_PATH="/data/deepseek_v3_671b/model/DeepSeek-V3-671B-Base"
-# export TOKENIZER_PATH="/data/llama3_405b_ref/tokenizer"
-
-export MODEL_CKPT="$TOKENIZER_PATH"
+export DATA_DIR="<DATA_DIR>"
+# Model checkpoint path
+export MODEL_CKPT="<MODEL_CKPT>"
 
 # Training Configs:
 # Dataloader: Global batch size
```
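For reference, a minimal sketch of how the new placeholders might be filled in before submitting a job. Every concrete value below (username, account, partition, paths) is a hypothetical example, not a value from this repository:

```bash
# Hypothetical example values -- substitute your own cluster details.
export USER="jdoe"                                 # <SLURM_USER>
export ACCOUNT="my_account"                        # <SLURM_ACCOUNT>
export PARTITION="batch"                           # <SLURM_PARTITION>
export LOG_DIR="/lustre/jdoe/logs"                 # <LOG_DIR>: any writable output directory
export IMAGE="/lustre/jdoe/images/nemo.sqsh"       # <IMAGE>: local cache file or remote URL
export DATA_DIR="/lustre/jdoe/data/c4"             # <DATA_DIR>: preprocessed C4 location
export MODEL_CKPT="/lustre/jdoe/ckpt/deepseek-v3"  # <MODEL_CKPT>: model checkpoint path
```

Note that the commit also collapses the old per-dataset variables (PREPROCESSED_PATH, MERGED_C4_PATH, TMP_NPY_INDEX, TOKENIZER_PATH) into a single DATA_DIR, and renames JOB_DIR to LOG_DIR; presumably the launch scripts now derive the individual subpaths from DATA_DIR.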
```diff
@@ -66,7 +53,7 @@ export PIPELINE_PARALLEL_SIZE=4
 export CONTEXT_PARALLEL_SIZE=1
 export EXPERT_PARALLEL_SIZE=64
 export EXPERT_TENSOR_PARALLEL_SIZE=1
-export RECOMPUTE_MODULES="mlp,moe_act"
+export RECOMPUTE_MODULES="mlp,moe"
 export CUDA_GRAPH_IMPLEMENTATION="transformer_engine"
 export CUDA_GRAPH_SCOPE="attn"
 
```
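A back-of-the-envelope check of the parallelism layout these settings imply (TP=1 is inferred from the "tp1" in the filename; the Megatron-style decomposition below is an assumption, not spelled out in the commit):

```bash
# Layout check using values from this config:
#   world size     = NNODES * GPUS_PER_NODE = 64 * 4    = 256 GPUs
#   model parallel = TP * PP * CP           = 1 * 4 * 1 = 4 GPUs per replica
#   data parallel  = 256 / 4                            = 64 replicas
#   expert layers  = EP * expert-TP         = 64 * 1    = 64-way expert sharding
echo "world size: $(( NNODES * GPUS_PER_NODE ))"   # prints 256
```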
