Skip to content

Commit 9c82596

Browse files
author
The tunix Authors
committed
Merge pull request #936 from google:lance-dataset
PiperOrigin-RevId: 848295475
2 parents e165692 + 8201a54 commit 9c82596

File tree

7 files changed

+424
-111
lines changed

7 files changed

+424
-111
lines changed

.github/workflows/tpu-tests.yml

Lines changed: 8 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -166,36 +166,15 @@ jobs:
166166
env:
167167
HF_TOKEN: ${{ secrets.HF_TOKEN }}
168168
run: |
169-
170-
# Download GSM8K dataset
171-
mkdir -p /tmp/grpo_test/rl/grpo/data
172-
python3 -c "
173-
from datasets import load_dataset
174-
import json
175-
176-
# Download and save GSM8K train split
177-
dataset = load_dataset('openai/gsm8k', 'main', split='train')
178-
train_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
179-
with open('/tmp/grpo_test/rl/grpo/data/gsm8k_train.json', 'w') as f:
180-
json.dump(train_data, f)
181-
182-
# Download and save GSM8K test split
183-
dataset = load_dataset('openai/gsm8k', 'main', split='test')
184-
test_data = [{'question': item['question'], 'answer': item['answer']} for item in dataset]
185-
with open('/tmp/grpo_test/rl/grpo/data/gsm8k_test.json', 'w') as f:
186-
json.dump(test_data, f)
187-
188-
print('GSM8K dataset downloaded successfully')
189-
"
190-
191-
# TODO(lancewang): Re-enable this test once the segfault is fixed.
192169
# Run GRPO demo script with minimal configuration
193-
# python3 scripts/grpo_demo_llama3_qwen2.py \
194-
# --root-dir=/tmp/grpo_test \
195-
# --model-version=Qwen/Qwen2.5-0.5B-Instruct \
196-
# --num-batches=1 \
197-
# --num-test-batches=1 \
198-
# --rollout-engine=vanilla
170+
python3 scripts/grpo_demo_llama3_qwen2.py \
171+
--root-dir=/tmp/grpo_test \
172+
--num-batches=2 \
173+
--num-test-batches=1 \
174+
--global-batch-size=2 \
175+
--train-mini-batch-size=2 \
176+
--train-micro-batch-size=2 \
177+
--rollout-engine=vanilla
199178
- name: Run vllm tests
200179
env:
201180
HF_TOKEN: ${{ secrets.HF_TOKEN }}

scripts/grpo_demo_llama3_qwen2.py

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import qwix
4141
from tqdm.auto import tqdm
4242
import transformers
43+
from tunix.cli.utils import data as data_lib
4344
from tunix.examples.data import math_dataset
4445
from tunix.models.llama3 import model as llama_lib
4546
from tunix.models.llama3 import params as llama_params
@@ -573,37 +574,37 @@ def extract_hash_answer(text: str) -> str | None:
573574
dataset = create_dataset(
574575
args.data_source,
575576
args.dataset if args.data_source == "tfds" else LOCAL_TRAIN_DATA_DIR,
576-
args.global_batch_size,
577-
NUM_BATCHES,
577+
tokenizer=model_tokenizer,
578578
tfds_download=True,
579+
split="train",
579580
)
580581

581-
if TRAIN_FRACTION == 1.0:
582-
train_dataset = dataset.repeat(NUM_EPOCHS)
583-
val_dataset = None
584-
else:
585-
train_dataset = dataset[: int(len(dataset) * TRAIN_FRACTION)]
586-
train_dataset = train_dataset.repeat(NUM_EPOCHS)
587-
588-
val_dataset = dataset[int(len(dataset) * TRAIN_FRACTION) :].repeat(NUM_EPOCHS)
582+
train_dataset, val_dataset = data_lib.post_init_dataset(
583+
dataset,
584+
model_tokenizer,
585+
batch_size=args.global_batch_size,
586+
num_batches=NUM_BATCHES,
587+
max_prompt_length=MAX_PROMPT_LENGTH,
588+
fraction=TRAIN_FRACTION,
589+
num_epochs=NUM_EPOCHS,
590+
)
589591

590592
test_dataset = create_dataset(
591593
args.data_source,
592594
args.dataset if args.data_source == "tfds" else LOCAL_TRAIN_DATA_DIR,
593-
args.global_batch_size,
594-
NUM_TEST_BATCHES,
595+
tokenizer=model_tokenizer,
595596
tfds_download=True,
597+
split="test",
596598
)
597599

598-
print(
599-
f"train_dataset size: {len(train_dataset)}, val_dataset size:"
600-
f"{len(val_dataset) if val_dataset is not None else 0},"
601-
f"test_dataset size: {len(test_dataset)}"
600+
test_dataset, _ = data_lib.post_init_dataset(
601+
test_dataset,
602+
model_tokenizer,
603+
batch_size=args.global_batch_size,
604+
num_batches=NUM_TEST_BATCHES,
605+
max_prompt_length=MAX_PROMPT_LENGTH,
602606
)
603607

604-
for ele in train_dataset[:1]:
605-
pprint.pprint(ele)
606-
607608
MODEL_CONFIG = {
608609
"meta-llama/Llama-3.2-1B-Instruct": llama_lib.ModelConfig.llama3p2_1b,
609610
"meta-llama/Llama-3.2-3B-Instruct": llama_lib.ModelConfig.llama3p2_3b,
@@ -774,8 +775,7 @@ def check_answer(prompts, completions, answer, **kargs): # pylint: disable=unus
774775
responses = completions
775776

776777
extracted_responses = [
777-
guess.group(1) if (guess := match_format.search(r)) is not None else None
778-
for r in responses
778+
(m[-1] if (m := match_numbers.findall(r)) else None) for r in responses
779779
]
780780

781781
scores = []
@@ -808,7 +808,8 @@ def check_answer(prompts, completions, answer, **kargs): # pylint: disable=unus
808808

809809

810810
match_numbers = re.compile(
811-
rf"{solution_start}.*?([\d\.]{{1,}})", flags=re.MULTILINE | re.DOTALL
811+
rf"{solution_start}.*?([+-]?(?:\d[\d,]*)(?:\.\d+)?|[+-]?\.\d+)",
812+
flags=re.MULTILINE | re.DOTALL,
812813
)
813814
match_numbers.findall(f"{solution_start} 0.34 {solution_end}")
814815

@@ -829,8 +830,7 @@ def check_numbers(prompts, completions, answer, **kargs): # pylint: disable=unu
829830
responses = completions
830831

831832
extracted_responses = [
832-
guess.group(1) if (guess := match_numbers.search(r)) is not None else None
833-
for r in responses
833+
(m[-1] if (m := match_numbers.findall(r)) else None) for r in responses
834834
]
835835

836836
scores = []
@@ -846,8 +846,8 @@ def check_numbers(prompts, completions, answer, **kargs): # pylint: disable=unu
846846
continue
847847
# Convert to numbers
848848
try:
849-
true_answer = float(true_answer.strip())
850-
guess = float(guess.strip())
849+
true_answer = float(true_answer.replace(",", "").strip())
850+
guess = float(guess.replace(",", "").strip())
851851
scores.append(1.5 if guess == true_answer else 0.0)
852852
except Exception: # pylint: disable=broad-except
853853
scores.append(0)
@@ -938,20 +938,20 @@ def evaluate(
938938
partially_corr_per_question = 0
939939
corr_format_per_question = 0
940940
for response in multiple_call_response:
941-
extracted_response = (
942-
guess.group(1)
943-
if (guess := match_numbers.search(response)) is not None
944-
else "-1000000"
945-
)
941+
# Grab the last matched number from this response (not a generator)
942+
matches = match_numbers.findall(response)
943+
extracted_response = matches[-1] if matches else "-1000000"
946944
try:
947-
if float(extracted_response.strip()) == float(answer.strip()):
945+
response_num = float(extracted_response.replace(",", "").strip())
946+
answer_num = float(answer.replace(",", "").strip())
947+
if response_num == answer_num:
948948
corr_ctr_per_question += 1
949949

950-
ratio = float(extracted_response.strip()) / float(answer.strip())
950+
ratio = response_num / answer_num
951951
if ratio >= 0.9 and ratio <= 1.1:
952952
partially_corr_per_question += 1
953-
except (ValueError, ZeroDivisionError):
954-
print("SKIPPED")
953+
except (ValueError, ZeroDivisionError) as e:
954+
print(f"SKIPPED: {e}")
955955

956956
# check format
957957
if match_format.search(response) is not None:

tests/cli/utils/data_test.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Tests for tunix.cli.utils.data.post_init_dataset."""
16+
17+
from __future__ import annotations
18+
19+
from absl.testing import absltest
20+
from tunix.cli.utils import data as data_lib
21+
22+
23+
class _FakeTokenizer:
24+
25+
def tokenize(self, text: str):
26+
# Simple tokenization: one token per whitespace-separated chunk
27+
return text.split()
28+
29+
30+
class _BaseDataset:
31+
"""Minimal dataset to mimic grain interfaces used in post_init_dataset."""
32+
33+
def __init__(self, records):
34+
self._records = list(records)
35+
36+
def __len__(self):
37+
return len(self._records)
38+
39+
def __getitem__(self, idx):
40+
if isinstance(idx, slice):
41+
return _BaseDataset(self._records[idx])
42+
return self._records[idx]
43+
44+
def filter(self, fn):
45+
return _BaseDataset([x for x in self._records if fn(x)])
46+
47+
def repeat(self, n):
48+
return _RepeatDataset(self, n)
49+
50+
def to_iter_dataset(self):
51+
return _IterDataset(self._records)
52+
53+
def map(self, fn): # Not used in tests, but kept for fidelity.
54+
return _BaseDataset([fn(x) for x in self._records])
55+
56+
57+
class _RepeatDataset:
58+
59+
def __init__(self, base: _BaseDataset, n: int):
60+
self._base = base
61+
self._n = n
62+
63+
def __len__(self):
64+
return len(self._base) * self._n
65+
66+
def to_iter_dataset(self):
67+
return _IterDataset(self._base._records * self._n)
68+
69+
70+
class _IterDataset:
71+
72+
def __init__(self, records):
73+
self._records = list(records)
74+
75+
def batch(self, batch_size: int):
76+
return _BatchedDataset(self._records, batch_size)
77+
78+
79+
class _BatchedDataset:
80+
81+
def __init__(self, records, batch_size: int):
82+
self._records = records
83+
self._batch_size = batch_size
84+
85+
def __iter__(self):
86+
for i in range(0, len(self._records), self._batch_size):
87+
yield self._records[i : i + self._batch_size]
88+
89+
90+
class PostInitDatasetTest(absltest.TestCase):
  """Exercises data_lib.post_init_dataset against lightweight fake datasets."""

  def test_filters_by_prompt_length(self):
    tok = _FakeTokenizer()
    ds = _BaseDataset([
        {"prompts": "short", "answer": 1},
        {"prompts": "this is too long", "answer": 2},
    ])

    train, val = data_lib.post_init_dataset(
        ds,
        tokenizer=tok,
        batch_size=2,
        num_batches=None,
        max_prompt_length=2,  # only the one-token record fits under the cap
    )

    self.assertIsNone(val)
    train_batches = list(train)
    self.assertLen(train_batches, 1)
    self.assertEqual(train_batches[0], [{"prompts": "short", "answer": 1}])

  def test_limits_num_batches(self):
    tok = _FakeTokenizer()
    ds = _BaseDataset([{"prompts": f"p{i}", "answer": i} for i in range(10)])

    train, _ = data_lib.post_init_dataset(
        ds,
        tokenizer=tok,
        batch_size=3,
        num_batches=2,  # cap: 2 batches * batch_size 3 = 6 examples
        max_prompt_length=None,
    )

    train_batches = list(train)
    self.assertLen(train_batches, 2)
    self.assertEqual([len(b) for b in train_batches], [3, 3])
    self.assertEqual(train_batches[0][0]["prompts"], "p0")
    self.assertEqual(train_batches[-1][-1]["prompts"], "p5")

  def test_fraction_split_and_repeat(self):
    tok = _FakeTokenizer()
    ds = _BaseDataset([{"prompts": f"p{i}", "answer": i} for i in range(8)])

    train, val = data_lib.post_init_dataset(
        ds,
        tokenizer=tok,
        batch_size=2,
        num_batches=None,
        max_prompt_length=None,
        fraction=0.5,
        num_epochs=1,
    )

    train_batches = list(train)
    val_batches = list(val)

    # 8 records split 50/50 -> 4 per side; batch_size 2 -> 2 batches each.
    self.assertLen(train_batches, 2)
    self.assertLen(val_batches, 2)
    self.assertEqual(train_batches[0][0]["prompts"], "p0")
    self.assertEqual(val_batches[-1][-1]["prompts"], "p7")

  def test_num_epochs_repeats_dataset(self):
    tok = _FakeTokenizer()
    ds = _BaseDataset(
        [{"prompts": "p0", "answer": 0}, {"prompts": "p1", "answer": 1}]
    )

    train, val = data_lib.post_init_dataset(
        ds,
        tokenizer=tok,
        batch_size=1,
        num_batches=None,
        max_prompt_length=None,
        num_epochs=3,
    )

    self.assertIsNone(val)
    train_batches = list(train)
    # Two records over three epochs -> six size-1 batches, in epoch order.
    self.assertLen(train_batches, 6)
    self.assertEqual(
        [b[0]["prompts"] for b in train_batches],
        ["p0", "p1", "p0", "p1", "p0", "p1"],
    )
178+
179+
180+
# Allow running this test module directly (e.g. `python data_test.py`).
if __name__ == "__main__":
  absltest.main()

0 commit comments

Comments (0)