-
Notifications
You must be signed in to change notification settings - Fork 349
Expand file tree
/
Copy pathgrpo-deepseek-v3-64n4g.sh
More file actions
executable file
·52 lines (45 loc) · 1.73 KB
/
grpo-deepseek-v3-64n4g.sh
File metadata and controls
executable file
·52 lines (45 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/bin/bash
# GRPO run of DeepSeek-V3 (BF16 checkpoint) sized by the CONFIG section of
# this script.
#
# Required environment:
#   NRL_DEEPSEEK_V3_BF16_CKPT  path to the DeepSeek-V3 checkpoint converted
#                              to BF16 (used as both model and tokenizer).
# Any command-line arguments are forwarded to examples/run_grpo.py.

# Resolve the directory containing this script so common.env is found
# regardless of the caller's working directory.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# NOTE(review): common.env is not visible here; it presumably defines the
# path variables and helper functions used later in this script — confirm.
source "$SCRIPT_DIR/common.env"

# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0

# Use the DeepSeek-V3 checkpoint converted to BF16.
# The ":-" default keeps this check working even when nounset (set -u) is
# active, so a missing variable yields the message below rather than a bare
# "unbound variable" error.
if [[ -z "${NRL_DEEPSEEK_V3_BF16_CKPT:-}" ]]; then
    echo "Need to set NRL_DEEPSEEK_V3_BF16_CKPT to the path of DeepSeek-V3 checkpoint converted to BF16. See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md for more details." >&2
    exit 1
fi
# Experiment sizing. MAX_STEPS is passed to the trainer as
# grpo.max_num_steps and gates the final metric check in this script.
# NOTE(review): NUM_NODES, GPUS_PER_NODE, NUM_RUNS and NUM_MINUTES are not read
# anywhere in this script — presumably consumed by an external launcher that
# locates this region via the BEGIN/END markers; confirm before reformatting.
# ===== BEGIN CONFIG =====
NUM_NODES=64
GPUS_PER_NODE=4
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up: ceil(MAX_STEPS / STEPS_PER_RUN)
NUM_MINUTES=240
# ===== END CONFIG =====
# NOTE(review): exit_if_max_steps_reached is not defined in this file —
# presumably provided by the sourced common.env and exits early when a
# previous run already reached MAX_STEPS; confirm.
exit_if_max_steps_reached

# Run the experiment
# The script paths below are relative, so a failed cd must abort rather than
# launch from the wrong directory.
cd "$PROJECT_ROOT" || exit 1

# Every expansion is quoted so paths, names and caller-supplied overrides
# containing spaces or glob characters reach run_grpo.py as single arguments.
# "$@" comes after the fixed overrides, preserving the original argument order.
# stdout and stderr are merged and mirrored into $RUN_LOG via tee.
uv run examples/run_grpo.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    policy.model_name="$NRL_DEEPSEEK_V3_BF16_CKPT" \
    policy.tokenizer.name="$NRL_DEEPSEEK_V3_BF16_CKPT" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"
# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Highest step index recorded under "train/loss" in the JSON dump. A jq
# failure is mapped to an empty string so it is treated as "target step not
# reached" instead of aborting the script.
max_logged_step=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") || max_logged_step=""

# Only run metrics if the target step is reached
# The integer check keeps non-numeric jq output (empty, "null") from being
# evaluated as an arithmetic expression.
if [[ "$max_logged_step" =~ ^[0-9]+$ ]] && (( max_logged_step >= MAX_STEPS )); then
    # Propagate a failed metric check explicitly so the checkpoint cleanup
    # below only ever runs after a passing check, with or without errexit.
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'median(data["train/token_mult_prob_error"]) < 1.1' || exit "$?"

    # Clean up checkpoint directory after successful run to save space.
    rm -rf -- "$CKPT_DIR"
fi