 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# RL Configuration
+# This config consolidates common parameters for RL training across different model sizes.
+
 base_config: "base.yml"
 
-logical_axis_rules: [
-  ['prefill_activation_length', ['data']],
-  ['prefill_activation_norm_length', ['data']],
-  ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
-  ['activation_batch_no_exp', ['data', 'fsdp', 'fsdp_transpose']],
-  ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert']],
-  ['activation_heads', ['tensor', 'tensor_transpose', 'sequence', 'tensor_sequence']],
-  ['activation_kv_heads', ['tensor', 'tensor_transpose', 'sequence', 'tensor_sequence']],
-  ['activation_length', ['context_autoregressive', 'sequence']],
-  ['activation_length', ['context_autoregressive']],
-  ['activation_q_length', ['context_autoregressive']],
-  ['activation_kv_length', ['context_autoregressive']],
-  ['activation_norm_length', ['tensor_sequence', 'sequence']],
-  ['activation_embed', ['tensor_transpose']],
-  ['activation_mlp', ['tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['activation_kv', ['tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['activation_prefill_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
-  ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context_autoregressive']],
-  ['activation_kv_head_dim', ['tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['activation_vocab', ['tensor', 'tensor_transpose', 'sequence', 'tensor_sequence']],
-  ['activation_vocab', ['tensor', 'tensor_transpose']],
-  ['activation_vocab', 'tensor_sequence'],
-  ['activation_vocab', ['sequence', 'context_autoregressive']],
-  ['activation_stage', 'stage'],
-  ['activation_exp', ['expert', 'context_autoregressive']],
-  ['decode_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context_autoregressive']],
-  ['decode_length', []],
-  ['mlp', ['fsdp_transpose', 'tensor', 'tensor_sequence', 'autoregressive']],
-  ['vocab', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive', 'context_autoregressive']],
-  ['heads', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
-  ['q_heads', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
-  ['kv_heads', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
-  ['embed', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'expert']],
-  ['embed', ['fsdp', 'sequence', 'tensor_transpose', 'expert']],
-  ['embed', ['fsdp', 'fsdp_transpose', 'sequence', 'expert']],
-  ['embed', ['fsdp', 'sequence', 'expert']],
-  ['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'context_autoregressive', 'tensor_transpose']],
-  ['embed_no_exp', ['fsdp', 'sequence', 'context_autoregressive', 'tensor_transpose']],
-  ['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'context_autoregressive']],
-  ['embed_no_exp', ['fsdp', 'sequence', 'context_autoregressive']],
-  ['norm', ['tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['layers', 'stage'],
-  ['kv', []],
-  ['kv_head_dim', []],
-  ['cache_batch_prefill', []],
-  ['cache_batch', ['context_autoregressive']],
-  ['cache_heads', ['autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence']],
-  ['cache_heads', ['autoregressive', 'tensor', 'tensor_sequence']],
-  ['cache_kv', []],
-  ['cache_sequence', ['context_autoregressive']],
-  ['cache_scale_sequence', ['context_autoregressive']],
-  ['exp', ['expert', 'context_autoregressive']],
-  ['paged_kv_heads', []],
-  ['num_pages', ['tensor']],
-  ['tokens_per_page', []],
-  ['paged_kv_head_dim_size', []],
-]
-# Axes used for DCN must be earlier in this list than ICI, see (b/339009148) for details
-data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context_autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence', 'expert', 'autoregressive']]
-
-return_log_prob: True
+# ====== Hardware ======
+trainer_devices_fraction: 0.5
+sampler_devices_fraction: 0.5
+chips_per_vm: 4  # depends on hardware; for v5p this is 4
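+# Illustrative arithmetic (not enforced anywhere in this config): on a v5p
+# slice with 8 VMs there are 8 * 4 = 32 chips, so the 0.5/0.5 fractions above
+# give 16 chips to the trainer and 16 to the sampler.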
+
+# ====== Reproducibility ======
+data_shuffle_seed: 42
+
+# ====== GRPO ======
+
+# The number of responses the policy generates for each prompt within a single
+# training step. This corresponds to `G` in Algorithm 1 of the GRPO paper, and
+# is where the "group" in GRPO comes from.
+num_generations: 2
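+# Sketch of how the group is used (illustrative, not MaxText code): given
+# rewards r_1..r_G for the G responses to one prompt, each response is scored
+# by its group-relative advantage
+#   A_i = (r_i - mean(r)) / (std(r) + eps)
+# so no learned value function ("critic") is needed.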
+
+# === Other GRPO configs ===
+# The number of optimization iterations per batch (𝜇 in Algorithm 1).
+num_iterations: 1
+
+# The coefficient for the KL divergence penalty (𝛽) in the GRPO loss function.
+# Keep this high enough; otherwise the KL divergence can grow unchecked.
+grpo_beta: 0.08
+# Epsilon value for clipping (𝜀 in the GRPO loss). As in PPO, it keeps policy
+# updates stable.
+grpo_epsilon: 0.2
+loss_algo: 'grpo'  # grpo or gspo-token
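+# For reference, a schematic of the per-token clipped objective these two
+# knobs enter (following the GRPO paper; notation only, not code):
+#   J = min(rho * A, clip(rho, 1 - grpo_epsilon, 1 + grpo_epsilon) * A) - grpo_beta * KL(pi || pi_ref)
+# where rho is the new/old policy probability ratio and A is the advantage.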
+
+# ====== Models ======
+# Model and tokenizer configuration for MaxText.
+# Override these via CLI:
+#   model_name, tokenizer_path, load_parameters_path
+#
+# Model-specific overrides (examples):
+# For Llama3.1-8B:
+#   model_name: llama3.1-8b
+#   HF tokenizer_path: meta-llama/Llama-3.1-8B-Instruct
+#
+# For Llama3.1-70B with Pathways:
+#   model_name: llama3.1-70b
+#   HF tokenizer_path: meta-llama/Llama-3.1-70B-Instruct
+
+# ====== MaxText configs ======
+weight_dtype: 'bfloat16'
+attention: 'dot_product'
+remat_policy: 'custom'
+decoder_layer_input: 'offload'
+query_proj: 'offload'
+key_proj: 'offload'
+value_proj: 'offload'
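+# Our reading of MaxText's custom remat policy: the tensors marked 'offload'
+# above are moved to host memory during the forward pass rather than kept in
+# HBM or rematerialized, freeing HBM for the sampler (see hbm_utilization_vllm
+# below).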
+checkpoint_storage_use_ocdbt: False  # for Pathways
+checkpoint_storage_use_zarr3: False  # for Pathways
+use_pathways: True
+
+# ====== Debugging ======
+debug:
+  rl: True
+
+# ====== Training ======
+batch_size: 1
+# Increase `batch_size` and `MAX_STEPS` for better results.
+# num_batches: 3738
+num_batches: 4  # 200
+# Keep `num_test_batches` low so that evaluation runs quickly. It can be
+# increased to a maximum of 330 (if batch size is 4).
+num_test_batches: 5  # 200
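+# (The 330 above comes from the GSM8K test split of 1319 examples:
+# 1319 / 4 ≈ 330 batches.)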
+train_fraction: 1.0
+
+eval_interval: 10  # ignored if train_fraction = 1.0
+
+num_epochs: 1  # can potentially train for more epochs
+
+learning_rate: 3e-6
+adam_b1: 0.9  # Exponential decay rate to track the first moment of past gradients.
+adam_b2: 0.99  # Exponential decay rate to track the second moment of past gradients.
+gradient_clipping_threshold: 0.1
+
+# ====== Evaluation ======
+eval_sampling_strategy: "greedy"  # can be "greedy", "standard", or "liberal"
+generation_configs:
+  greedy:
+    eval_temperature: 0.01
+    eval_top_k: 1
+    eval_top_p: 1.0
+  standard:
+    eval_temperature: 0.7
+    eval_top_k: 50
+    eval_top_p: 0.95
+  liberal:
+    eval_temperature: 0.85
+    eval_top_k: 2000
+    eval_top_p: 1.0
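+# "greedy" (eval_top_k = 1) is effectively argmax decoding, making evals
+# reproducible; "standard" and "liberal" trade that determinism for diversity
+# via higher temperature and wider top-k / top-p.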
+
+num_eval_passes: 1  # Number of generation passes during evaluation
+eval_corr_lst: False  # If True, only include correct responses in the list during evaluation
+eval_make_lst: False  # If True, return a list of (question, answer, responses) during evaluation
+
+# ====== Inference ======
+# === Generation during GRPO training ===
+# Max lengths for prompt and completion
+max_prefill_predict_length: 256
+max_target_length: 1024
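+# max_target_length counts prompt plus completion, so each prompt gets up to
+# 256 tokens and each completion up to 1024 - 256 = 768 tokens.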
+kv_cache_buffer: 256
+hbm_utilization_vllm: 0.72
+swap_space_vllm_gb: 2
+# Generation configuration during training.
+# Important to keep a high-ish temperature for varied, diverse responses during
+# training.
+decode_sampling_temperature: 0.9
+decode_sampling_top_k: 50
+decode_sampling_nucleus_p: 1.0
+
+# ====== Checkpoint Configuration ======
+enable_checkpointing: True
+async_checkpointing: False
+checkpoint_period: 50
+max_num_checkpoints_to_keep: 10
+
+# ====== Reward ======
+reward_exact_format_match: 3.0
+reward_white_space_format_match: 1.5
+reward_partial_format_match: 0.5
+reward_ratio_guess_to_answer_high: 0.5
+reward_ratio_guess_to_answer_low: 0.25
+penalty_incorrect_format: -0.5
+penalty_incorrect_answer: -1.0
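+# Illustrative scoring, assuming rewards and penalties are simply summed (the
+# actual combination lives in the reward function, so treat this as a sketch):
+# an exactly formatted response with a wrong answer nets 3.0 - 1.0 = 2.0,
+# while a malformed response with a wrong answer nets -0.5 - 1.0 = -1.5.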
+
+# ====== Special tokens/templates for GSM8K reasoning ======
+reasoning_start_token: '<reasoning>'
+reasoning_end_token: '</reasoning>'
+solution_start_token: '<answer>'
+solution_end_token: '</answer>'
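+# With these tokens, a fully formatted response looks like (illustrative):
+#   <reasoning>step-by-step working...</reasoning><answer>72</answer>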
+chat_template_path: 'src/MaxText/examples/chat_templates/gsm8k_rl.json'
+skip_jax_distributed_system: True
+
+# TODO(@mazumdera): fix this
+# Dataset Configuration
+dataset_name: 'gsm8k'
+train_split: 'train'
+eval_split: 'test'
+tokenizer_type: 'huggingface'