Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ policy:
pipeline_model_parallel_size: 8
num_layers_in_first_pipeline_stage: 7
num_layers_in_last_pipeline_stage: 6
generation:
vllm_cfg:
gpu_memory_utilization: 0.3
logger:
log_dir: logs/grpo-deepseek-v3-32n4g
wandb:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# 64-node x 4-GPU scale-up of the 32n4g DeepSeek-V3 GRPO performance config.
# Inherits all settings from the base config; only run identifiers and the
# node count change.
defaults: ./grpo-deepseek-v3-32n4g.yaml
checkpointing:
  checkpoint_dir: results/grpo-deepseek-v3-64n4g
logger:
  log_dir: logs/grpo-deepseek-v3-64n4g
  wandb:
    name: grpo-deepseek-v3-64n4g
cluster:
  num_nodes: 64
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# 64-node x 8-GPU scale-up of the 32n8g DeepSeek-V3 GRPO performance config.
# Inherits all settings from the base config; only run identifiers and the
# node count change.
defaults: ./grpo-deepseek-v3-32n8g.yaml
checkpointing:
  checkpoint_dir: results/grpo-deepseek-v3-64n8g
logger:
  log_dir: logs/grpo-deepseek-v3-64n8g
  wandb:
    name: grpo-deepseek-v3-64n8g
cluster:
  num_nodes: 64
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# 32-node x 4-GPU scale-up of the 16n4g Qwen3-235B GRPO performance config.
# Inherits all settings from the base config; only run identifiers and the
# node count change.
defaults: ./grpo-qwen3-235b-16n4g.yaml
checkpointing:
  checkpoint_dir: results/grpo-qwen3-235b-32n4g
logger:
  log_dir: logs/grpo-qwen3-235b-32n4g
  wandb:
    name: grpo-qwen3-235b-32n4g
cluster:
  num_nodes: 32
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# 32-node x 8-GPU scale-up of the 16n8g Qwen3-235B GRPO performance config.
# Inherits all settings from the base config; only run identifiers and the
# node count change.
defaults: ./grpo-qwen3-235b-16n8g.yaml
checkpointing:
  checkpoint_dir: results/grpo-qwen3-235b-32n8g
logger:
  log_dir: logs/grpo-qwen3-235b-32n8g
  wandb:
    name: grpo-qwen3-235b-32n8g
cluster:
  num_nodes: 32
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ policy:
megatron_cfg:
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 16
expert_model_parallel_size: 8
sequence_parallel: false
generation:
colocated:
enabled: false
resources:
num_nodes: 4
num_nodes: 2
gpus_per_node: 4
vllm_cfg:
async_engine: true
Expand All @@ -30,4 +30,4 @@ logger:
name: grpo-qwen3-30ba3b-8n4g-async-1off
cluster:
gpus_per_node: 4
num_nodes: 8
num_nodes: 4
52 changes: 52 additions & 0 deletions tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
# Performance test: GRPO on DeepSeek-V3 (BF16), 64 nodes x 4 GPUs.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# common.env is expected to provide PROJECT_ROOT, CONFIG_PATH, LOG_DIR,
# CKPT_DIR, RUN_LOG, JSON_METRICS, EXP_NAME and exit_if_max_steps_reached.
# (assumption — TODO confirm against common.env)
source "$SCRIPT_DIR/common.env"
# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0

# Use the DeepSeek-V3 checkpoint converted to BF16.
if [[ -z "$NRL_DEEPSEEK_V3_BF16_CKPT" ]]; then
    echo "Need to set NRL_DEEPSEEK_V3_BF16_CKPT to the path of DeepSeek-V3 checkpoint converted to BF16. See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md for more details." >&2
    exit 1
fi

# ===== BEGIN CONFIG =====
NUM_NODES=64
GPUS_PER_NODE=4
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=240
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT" || exit 1
uv run examples/run_grpo.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    policy.model_name="$NRL_DEEPSEEK_V3_BF16_CKPT" \
    policy.tokenizer.name="$NRL_DEEPSEEK_V3_BF16_CKPT" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    # Consistency fix: also assert the final-step (step 10) value, matching
    # the sibling 64n8g / qwen3-235b performance scripts.
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'median(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["10"] < 1.1'

    # Clean up checkpoint directory after successful run to save space.
    # ${CKPT_DIR:?} aborts instead of expanding to "" if the var is unset.
    rm -rf "${CKPT_DIR:?}"
fi
52 changes: 52 additions & 0 deletions tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
# Performance test: GRPO on DeepSeek-V3 (BF16), 64 nodes x 8 GPUs.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# common.env is expected to provide PROJECT_ROOT, CONFIG_PATH, LOG_DIR,
# CKPT_DIR, RUN_LOG, JSON_METRICS, EXP_NAME and exit_if_max_steps_reached.
# (assumption — TODO confirm against common.env)
source "$SCRIPT_DIR/common.env"
# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0

# Use the DeepSeek-V3 checkpoint converted to BF16.
if [[ -z "$NRL_DEEPSEEK_V3_BF16_CKPT" ]]; then
    echo "Need to set NRL_DEEPSEEK_V3_BF16_CKPT to the path of DeepSeek-V3 checkpoint converted to BF16. See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md for more details." >&2
    exit 1
fi

# ===== BEGIN CONFIG =====
NUM_NODES=64
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=240
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT" || exit 1
uv run examples/run_grpo.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    policy.model_name="$NRL_DEEPSEEK_V3_BF16_CKPT" \
    policy.tokenizer.name="$NRL_DEEPSEEK_V3_BF16_CKPT" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'median(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["10"] < 1.1'

    # Clean up checkpoint directory after successful run to save space.
    # ${CKPT_DIR:?} aborts instead of expanding to "" if the var is unset.
    rm -rf "${CKPT_DIR:?}"
fi
45 changes: 45 additions & 0 deletions tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Performance test: GRPO on Qwen3-235B, 32 nodes x 4 GPUs.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# common.env is expected to provide PROJECT_ROOT, CONFIG_PATH, LOG_DIR,
# CKPT_DIR, RUN_LOG, JSON_METRICS, EXP_NAME and exit_if_max_steps_reached.
# (assumption — TODO confirm against common.env)
source "$SCRIPT_DIR/common.env"
# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0

# ===== BEGIN CONFIG =====
NUM_NODES=32
GPUS_PER_NODE=4
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=100
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT" || exit 1
uv run examples/run_grpo.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'median(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["10"] < 1.1'

    # Clean up checkpoint directory after successful run to save space.
    # ${CKPT_DIR:?} aborts instead of expanding to "" if the var is unset.
    rm -rf "${CKPT_DIR:?}"
fi
44 changes: 44 additions & 0 deletions tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
# Performance test: GRPO on Qwen3-235B, 32 nodes x 8 GPUs.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# common.env is expected to provide PROJECT_ROOT, CONFIG_PATH, LOG_DIR,
# CKPT_DIR, RUN_LOG, JSON_METRICS, EXP_NAME and exit_if_max_steps_reached.
# (assumption — TODO confirm against common.env)
source "$SCRIPT_DIR/common.env"
# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0

# ===== BEGIN CONFIG =====
NUM_NODES=32
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=115
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT" || exit 1
uv run examples/run_grpo.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'median(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["10"] < 1.1'

    # Clean up checkpoint directory after successful run to save space.
    # ${CKPT_DIR:?} aborts instead of expanding to "" if the var is unset.
    rm -rf "${CKPT_DIR:?}"
fi
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source $SCRIPT_DIR/common.env

# ===== BEGIN CONFIG =====
NUM_NODES=8
NUM_NODES=4
GPUS_PER_NODE=4
STEPS_PER_RUN=10
MAX_STEPS=10
Expand Down
8 changes: 3 additions & 5 deletions tests/test_suites/performance_gb200.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,13 @@
# GB200 BF16

## SYNC
tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh
tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh

## ASYNC 1-off
tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
6 changes: 2 additions & 4 deletions tests/test_suites/performance_h100.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
# H100 BF16

## SYNC
tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh
tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh
tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh

## ASYNC 1-off
tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh

Expand Down
Loading