[GRPO] generate with prompt containing the first <think> tag #283

Open · wants to merge 11 commits into base: main
2 changes: 2 additions & 0 deletions src/open_r1/grpo.py
@@ -163,10 +163,12 @@ def main(script_args, training_args, model_args):

     # Format into conversation
     def make_conversation(example):
+        # start the assistant with a <think> tag
         return {
             "prompt": [
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": example["problem"]},
+                {"role": "assistant", "content": "Let me solve this step by step.\n<think>"},
             ],
         }

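For context: ending the prompt with a partial assistant turn only has the intended effect if the chat template is rendered so that the model continues that turn instead of opening a new one. Below is a minimal sketch of what the rendered prompt looks like, assuming transformers' continue_final_message flag and an instruct model such as Qwen/Qwen2.5-1.5B-Instruct (both are assumptions for illustration, not part of this PR):

# Illustrative sketch only: render a prompt whose final (partial) assistant
# message ends with "<think>", so generation continues inside the think block.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

prompt = [
    {"role": "system", "content": "You are a helpful assistant."},  # stand-in for SYSTEM_PROMPT
    {"role": "user", "content": "What is 2 + 2?"},
    {"role": "assistant", "content": "Let me solve this step by step.\n<think>"},
]

# continue_final_message=True keeps the last assistant message open rather than
# appending an end-of-turn marker, so sampling picks up right after "<think>".
text = tokenizer.apply_chat_template(prompt, tokenize=False, continue_final_message=True)
print(text)
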
2 changes: 1 addition & 1 deletion src/open_r1/rewards.py
@@ -51,7 +51,7 @@ def accuracy_reward(completions, solution, **kwargs):

 def format_reward(completions, **kwargs):
     """Reward function that checks if the completion has a specific format."""
-    pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
+    pattern = r"^.+(?:<think>.*?</think>\s*)?<answer>.*?</answer>$"
     completion_contents = [completion[0]["content"] for completion in completions]
     matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents]
     return [1.0 if match else 0.0 for match in matches]
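
A quick standalone check of what the relaxed pattern accepts, using the same completion styles exercised in tests/test_rewards.py (old_pattern and new_pattern are local names for illustration only):

# Compare the previous and the proposed format patterns on sample completions.
import re

old_pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
new_pattern = r"^.+(?:<think>.*?</think>\s*)?<answer>.*?</answer>$"

samples = [
    "<think>Some reasoning</think><answer>The answer</answer>",
    "Some reasoning</think><answer>The answer</answer>",  # no opening <think>, as produced with the new prompt
    "<think><think>Some reasoning</think><answer>The answer</answer>",
]

for sample in samples:
    old_ok = bool(re.match(old_pattern, sample, re.DOTALL | re.MULTILINE))
    new_ok = bool(re.match(new_pattern, sample, re.DOTALL | re.MULTILINE))
    print(f"old={old_ok} new={new_ok}: {sample!r}")
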
20 changes: 16 additions & 4 deletions tests/test_rewards.py
@@ -28,9 +28,15 @@ def test_accuracy_reward_wrong_answer(self):

     def test_format_reward_correct(self):
         """Test format_reward with correct format."""
-        completion = [[{"content": "<think>Some reasoning</think><answer>The answer</answer>"}]]
-        rewards = format_reward(completion)
-        self.assertEqual(rewards[0], 1.0)
+        formats = [
+            "<think>Some reasoning</think><answer>The answer</answer>",
+            "Some reasoning</think><answer>The answer</answer>",
+            "<think><think>Some reasoning</think><answer>The answer</answer>",
+        ]
+        for fmt in formats:
+            completion = [[{"content": fmt}]]
+            rewards = format_reward(completion)
+            self.assertEqual(rewards[0], 1.0, msg=f"Expected format reward of 1.0 for {fmt}")

     def test_format_reward_incorrect(self):
         """Test format_reward with incorrect format."""
@@ -45,7 +45,7 @@ def test_format_reward_incorrect(self):
         for fmt in incorrect_formats:
             completion = [[{"content": fmt}]]
             rewards = format_reward(completion)
-            self.assertEqual(rewards[0], 0.0)
+            self.assertEqual(rewards[0], 0.0, msg=f"Expected format reward of 0.0 for {fmt}")

     def test_reasoning_steps_reward(self):
         """Test reasoning_steps_reward with various formats."""
@@ -118,6 +124,12 @@ def test_positive_max_penalty_raises_value_error(self):
         with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"):
             get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5)

+    def test_zero_max_penalty_returns_zero(self):
+        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=0.0)
+        completions = [[{"content": "this is a test sentence"}]]
+        rewards = reward_fn(completions)
+        self.assertEqual(rewards, [0.0])
+
     def test_no_repetition(self):
         reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
         completions = [[{"content": "this is a test sentence"}]]
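
Taken together, these tests pin down the expected behaviour of the repetition-penalty reward: a positive max_penalty raises a ValueError, a max_penalty of 0 yields a reward of 0.0, and a completion with no repeated n-grams is not penalised. A minimal sketch consistent with those expectations follows; this is an illustration only, not the implementation in src/open_r1/rewards.py:

# Sketch of a repetition-penalty reward matching the behaviour the tests expect.
# The real get_repetition_penalty_reward may differ in its scaling details.
def get_repetition_penalty_reward(ngram_size: int, max_penalty: float):
    if max_penalty > 0:
        raise ValueError(f"max_penalty {max_penalty} should not be positive")

    def repetition_penalty_reward(completions, **kwargs):
        rewards = []
        for completion in completions:
            words = completion[0]["content"].split()
            if max_penalty == 0 or len(words) < ngram_size:
                rewards.append(0.0)
                continue
            ngrams = [tuple(words[i : i + ngram_size]) for i in range(len(words) - ngram_size + 1)]
            # Fraction of duplicated n-grams scales the (non-positive) penalty.
            duplication = 1.0 - len(set(ngrams)) / len(ngrams)
            rewards.append(duplication * max_penalty)
        return rewards

    return repetition_penalty_reward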