diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
index fd3dddbe1e..a00aaa3a4a 100644
--- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml
@@ -8,6 +8,9 @@ policy:
     pipeline_model_parallel_size: 8
     num_layers_in_first_pipeline_stage: 7
     num_layers_in_last_pipeline_stage: 6
+  generation:
+    vllm_cfg:
+      gpu_memory_utilization: 0.3
 logger:
   log_dir: logs/grpo-deepseek-v3-32n4g
   wandb:
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g.yaml
new file mode 100644
index 0000000000..8e42014b77
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g.yaml
@@ -0,0 +1,9 @@
+defaults: ./grpo-deepseek-v3-32n4g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-deepseek-v3-64n4g
+logger:
+  log_dir: logs/grpo-deepseek-v3-64n4g
+  wandb:
+    name: grpo-deepseek-v3-64n4g
+cluster:
+  num_nodes: 64
diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g.yaml
new file mode 100644
index 0000000000..19908585c5
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g.yaml
@@ -0,0 +1,9 @@
+defaults: ./grpo-deepseek-v3-32n8g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-deepseek-v3-64n8g
+logger:
+  log_dir: logs/grpo-deepseek-v3-64n8g
+  wandb:
+    name: grpo-deepseek-v3-64n8g
+cluster:
+  num_nodes: 64
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g.yaml
new file mode 100644
index 0000000000..c21c75f789
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g.yaml
@@ -0,0 +1,9 @@
+defaults: ./grpo-qwen3-235b-16n4g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-235b-32n4g
+logger:
+  log_dir: logs/grpo-qwen3-235b-32n4g
+  wandb:
+    name: grpo-qwen3-235b-32n4g
+cluster:
+  num_nodes: 32
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g.yaml
new file mode 100644
index 0000000000..8871ec4552
--- /dev/null
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n8g.yaml
@@ -0,0 +1,9 @@
+defaults: ./grpo-qwen3-235b-16n8g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-235b-32n8g
+logger:
+  log_dir: logs/grpo-qwen3-235b-32n8g
+  wandb:
+    name: grpo-qwen3-235b-32n8g
+cluster:
+  num_nodes: 32
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.yaml
similarity index 91%
rename from examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
rename to examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.yaml
index a9837c87f2..dfe25f6944 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.yaml
@@ -12,13 +12,13 @@ policy:
   megatron_cfg:
     tensor_model_parallel_size: 1
     pipeline_model_parallel_size: 1
-    expert_model_parallel_size: 16
+    expert_model_parallel_size: 8
     sequence_parallel: false
   generation:
     colocated:
       enabled: false
       resources:
-        num_nodes: 4
+        num_nodes: 2
         gpus_per_node: 4
     vllm_cfg:
       async_engine: true
@@ -30,4 +30,4 @@ logger:
-    name: grpo-qwen3-30ba3b-8n4g-async-1off
+    name: grpo-qwen3-30ba3b-4n4g-async-1off
 cluster:
   gpus_per_node: 4
-  num_nodes: 8
+  num_nodes: 4
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh
new file mode 100755
index 0000000000..d5013074a9
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# Use the DeepSeek-V3 checkpoint converted to BF16.
+if [[ -z "$NRL_DEEPSEEK_V3_BF16_CKPT" ]]; then
+    echo "Need to set NRL_DEEPSEEK_V3_BF16_CKPT to the path of DeepSeek-V3 checkpoint converted to BF16. See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md for more details."
+    exit 1
+fi
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=64
+GPUS_PER_NODE=4
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    policy.model_name=$NRL_DEEPSEEK_V3_BF16_CKPT \
+    policy.tokenizer.name=$NRL_DEEPSEEK_V3_BF16_CKPT \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh
new file mode 100755
index 0000000000..362aa69204
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# Use the DeepSeek-V3 checkpoint converted to BF16.
+if [[ -z "$NRL_DEEPSEEK_V3_BF16_CKPT" ]]; then
+    echo "Need to set NRL_DEEPSEEK_V3_BF16_CKPT to the path of DeepSeek-V3 checkpoint converted to BF16. See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md for more details."
+    exit 1
+fi
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=64
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    policy.model_name=$NRL_DEEPSEEK_V3_BF16_CKPT \
+    policy.tokenizer.name=$NRL_DEEPSEEK_V3_BF16_CKPT \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh
new file mode 100755
index 0000000000..51c6f5b4ac
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=32
+GPUS_PER_NODE=4
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh
new file mode 100755
index 0000000000..cbdd66ab9b
--- /dev/null
+++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+# disable NVLS to avoid OOM issue
+export NCCL_NVLS_ENABLE=0
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=32
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=115
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'median(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
similarity index 99%
rename from tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
rename to tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
index 6c5a04794e..69d1ec7dab 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
@@ -3,7 +3,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env
 
 # ===== BEGIN CONFIG =====
-NUM_NODES=8
+NUM_NODES=4
 GPUS_PER_NODE=4
 STEPS_PER_RUN=10
 MAX_STEPS=10
diff --git a/tests/test_suites/performance_gb200.txt b/tests/test_suites/performance_gb200.txt
index d958386001..c08e78f32a 100644
--- a/tests/test_suites/performance_gb200.txt
+++ b/tests/test_suites/performance_gb200.txt
@@ -5,15 +5,13 @@
 # GB200 BF16
 
 ## SYNC
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
+tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
+tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g.sh
 
 ## ASYNC 1-off
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g-async-1off.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh
diff --git a/tests/test_suites/performance_h100.txt b/tests/test_suites/performance_h100.txt
index ee8fbf7c28..c8b6a349f6 100644
--- a/tests/test_suites/performance_h100.txt
+++ b/tests/test_suites/performance_h100.txt
@@ -5,18 +5,16 @@
 # H100 BF16
 
 ## SYNC
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
+tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
+tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g.sh
 tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh
 
 ## ASYNC 1-off
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
-tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
 