diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
index fd3dddbe1e..a00aaa3a4a 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
@@ -8,6 +8,9 @@ policy:
     pipeline_model_parallel_size: 8
     num_layers_in_first_pipeline_stage: 7
     num_layers_in_last_pipeline_stage: 6
+  generation:
+    vllm_cfg:
+      gpu_memory_utilization: 0.3
 logger:
   log_dir: logs/grpo-deepseek-v3-32n4g
   wandb:
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g.yaml
new file mode 100644
index 0000000000..8e42014b77
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g.yaml
@@ -0,0 +1,9 @@
+defaults: ./grpo-deepseek-v3-32n4g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-deepseek-v3-64n4g
+logger:
+  log_dir: logs/grpo-deepseek-v3-64n4g
+  wandb:
+    name: grpo-deepseek-v3-64n4g
+cluster:
+  num_nodes: 64
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g.yaml
new file mode 100644
index 0000000000..19908585c5
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g.yaml
@@ -0,0 +1,9 @@
+defaults: ./grpo-deepseek-v3-32n8g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-deepseek-v3-64n8g
+logger:
+  log_dir: logs/grpo-deepseek-v3-64n8g
+  wandb:
+    name: grpo-deepseek-v3-64n8g
+cluster:
+  num_nodes: 64
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g.yaml
new file mode 100644
index 0000000000..c21c75f789
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g.yaml
@@ -0,0 +1,9 @@
+defaults: ./grpo-qwen3-235b-16n4g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-235b-32n4g
+logger:
+  log_dir: logs/grpo-qwen3-235b-32n4g
+  wandb:
+    name: grpo-qwen3-235b-32n4g
+cluster:
+  num_nodes: 32
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g.yaml
new file mode 100644
index 0000000000..8871ec4552
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g.yaml
@@ -0,0 +1,9 @@
+defaults: ./grpo-qwen3-235b-16n8g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-235b-32n8g
+logger:
+  log_dir: logs/grpo-qwen3-235b-32n8g
+  wandb:
+    name: grpo-qwen3-235b-32n8g
+cluster:
+  num_nodes: 32
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.yaml
similarity index 91%
rename from examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
rename to examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.yaml
index a9837c87f2..dfe25f6944 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.yaml
@@ -12,13 +12,13 @@ policy:
   megatron_cfg:
     tensor_model_parallel_size: 1
     pipeline_model_parallel_size: 1
-    expert_model_parallel_size: 16
+    expert_model_parallel_size: 8
     sequence_parallel: false
   generation:
     colocated:
       enabled: false
       resources:
-        num_nodes: 4
+        num_nodes: 2
         gpus_per_node: 4
     vllm_cfg:
       async_engine: true
@@ -30,4 +30,4 @@ logger:
-    name: grpo-qwen3-30ba3b-8n4g-async-1off
+    name: grpo-qwen3-30ba3b-4n4g-async-1off
 cluster:
   gpus_per_node: 4
-  num_nodes: 8
+  num_nodes: 4
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh
new file mode 100755
index 0000000000..d5013074a9
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# Use the DeepSeek-V3 checkpoint converted to BF16.
+if [[ -z "$NRL_DEEPSEEK_V3_BF16_CKPT" ]]; then
+    echo "Need to set NRL_DEEPSEEK_V3_BF16_CKPT to the path of DeepSeek-V3 checkpoint converted to BF16. See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md for more details."
+    exit 1
+fi
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=64
+GPUS_PER_NODE=4
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    policy.model_name=$NRL_DEEPSEEK_V3_BF16_CKPT \
+    policy.tokenizer.name=$NRL_DEEPSEEK_V3_BF16_CKPT \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh
new file mode 100755
index 0000000000..362aa69204
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# Use the DeepSeek-V3 checkpoint converted to BF16.
+if [[ -z "$NRL_DEEPSEEK_V3_BF16_CKPT" ]]; then
+    echo "Need to set NRL_DEEPSEEK_V3_BF16_CKPT to the path of DeepSeek-V3 checkpoint converted to BF16. See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md for more details."
+    exit 1
+fi
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=64
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    policy.model_name=$NRL_DEEPSEEK_V3_BF16_CKPT \
+    policy.tokenizer.name=$NRL_DEEPSEEK_V3_BF16_CKPT \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh
new file mode 100755
index 0000000000..51c6f5b4ac
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=32
+GPUS_PER_NODE=4
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh
new file mode 100755
index 0000000000..cbdd66ab9b
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=32
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=115
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
similarity index 99%
rename from tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
rename to tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
index 6c5a04794e..69d1ec7dab 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
@@ -3,7 +3,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env
 
 # ===== BEGIN CONFIG =====
-NUM_NODES=8
+NUM_NODES=4
 GPUS_PER_NODE=4
 STEPS_PER_RUN=10
 MAX_STEPS=10
diff --git a/tests/test_suites/performance_gb200.txt b/tests/test_suites/performance_gb200.txt
index d958386001..c08e78f32a 100644
--- a/tests/test_suites/performance_gb200.txt
+++ b/tests/test_suites/performance_gb200.txt
@@ -5,15 +5,13 @@
 # GB200 BF16
 
 ## SYNC
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
+tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
+tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh
 
 ## ASYNC 1-off
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
diff --git a/tests/test_suites/performance_h100.txt b/tests/test_suites/performance_h100.txt
index ee8fbf7c28..c8b6a349f6 100644
--- a/tests/test_suites/performance_h100.txt
+++ b/tests/test_suites/performance_h100.txt
@@ -5,18 +5,16 @@
 # H100 BF16
 
 ## SYNC
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
+tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
+tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh
 tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh
 
 ## ASYNC 1-off
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
 