allenai · finbarrtimbers · Jan 16, 2026 · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file.
 - Added the ability to set active tools on a per-sample basis. See the PR for more details: https://github.com/allenai/open-instruct/pull/1382
 - Added a new changelog Github Action that makes sure you contribute to the changelog! https://github.com/allenai/open-instruct/pull/1276
 - Now, we type check `open_instruct/dataset_transformation.py` (https://github.com/allenai/open-instruct/pull/1390).
+- Added a GRPO implementation that uses olmo-core (https://github.com/allenai/open-instruct/pull/1389).
 - Added a linter rule that imports go at the top of the file (https://github.com/allenai/open-instruct/pull/1394).
 
 ### Changed

diff --git a/open_instruct/benchmark_generators.py b/open_instruct/benchmark_generators.py
@@ -27,7 +27,7 @@
 import vllm
 from ray.util import queue as ray_queue
 
-from open_instruct import data_loader, dataset_transformation, grpo_fast, logger_utils, model_utils, utils, vllm_utils
+from open_instruct import data_loader, dataset_transformation, grpo_utils, logger_utils, model_utils, utils, vllm_utils
 from open_instruct.actor_manager import ActorManager
 from open_instruct.data_types import PromptRequest
 
@@ -211,7 +211,7 @@ def free_all_gpu_memory(device: int | str = 0) -> None:
 
 
 def setup_dataset(
-    args: grpo_fast.Args,
+    args: grpo_utils.ExperimentConfig,
     streaming_config: data_loader.StreamingDataLoaderConfig,
     tokenizer_config: dataset_transformation.TokenizerConfig,
 ) -> datasets.Dataset:
@@ -244,7 +244,7 @@ def setup_dataset(
 
 
 def setup_vllm_engines(
-    args: grpo_fast.Args,
+    args: grpo_utils.ExperimentConfig,
     streaming_config: data_loader.StreamingDataLoaderConfig,
     vllm_config: data_loader.VLLMConfig,
     tokenizer_config: dataset_transformation.TokenizerConfig,
@@ -292,7 +292,7 @@ def setup_vllm_engines(
 
 
 def simulate_weight_sync(
-    actor_manager: ray.actor.ActorHandle, vllm_engines: list[ray.actor.ActorHandle], args: grpo_fast.Args
+    actor_manager: ray.actor.ActorHandle, vllm_engines: list[ray.actor.ActorHandle], args: grpo_utils.ExperimentConfig
 ) -> float:
     """Simulate weight sync by pausing all actors.
 
@@ -363,7 +363,7 @@ def run_benchmark(
     param_prompt_Q: ray_queue.Queue,
     inference_results_Q: ray_queue.Queue,
     actor_manager: ray.actor.ActorHandle,
-    args: grpo_fast.Args,
+    args: grpo_utils.ExperimentConfig,
     streaming_config: data_loader.StreamingDataLoaderConfig,
     vllm_config: data_loader.VLLMConfig,
     model_config: model_utils.ModelConfig,
@@ -670,7 +670,7 @@ def main() -> None:
     # Parse arguments using ArgumentParserPlus
     parser = utils.ArgumentParserPlus(
         (
-            grpo_fast.Args,
+            grpo_utils.ExperimentConfig,
             dataset_transformation.TokenizerConfig,
             model_utils.ModelConfig,
             data_loader.StreamingDataLoaderConfig,
@@ -680,7 +680,7 @@ def main() -> None:
 
     args, tokenizer_config, model_config, streaming_config, vllm_config = cast(
         tuple[
-            grpo_fast.Args,
+            grpo_utils.ExperimentConfig,
             dataset_transformation.TokenizerConfig,
             model_utils.ModelConfig,
             data_loader.StreamingDataLoaderConfig,

diff --git a/open_instruct/ground_truth_utils.py b/open_instruct/ground_truth_utils.py
@@ -1055,7 +1055,7 @@ class RewardConfig:
     apply_r1_style_format_reward: bool = False
     r1_style_format_reward: float = 1.0
     apply_verifiable_reward: bool = True
-    verification_reward: int = 10
+    verification_reward: float = 10.0
     non_stop_penalty: bool = False
     non_stop_penalty_value: float = -10.0
     only_reward_good_outputs: bool = False