|
| 1 | +import json |
| 2 | +import os |
| 3 | +import tempfile |
| 4 | +import unittest |
| 5 | +from typing import List, Tuple |
| 6 | + |
| 7 | +from sglang.test.ci.ci_register import register_cpu_ci |
| 8 | +from sglang.test.simple_eval_gsm8k import get_one_example |
| 9 | +from sglang.test.simple_eval_mixed_prefix_gsm8k import MixedPrefixGSM8KEval |
| 10 | +from sglang.test.test_utils import CustomTestCase |
| 11 | + |
# Register this test file with the CPU CI suite "base-b-test-cpu".
# NOTE(review): est_time=5 is presumably the estimated runtime used for CI
# scheduling — confirm the unit against sglang.test.ci.ci_register.
register_cpu_ci(est_time=5, suite="base-b-test-cpu")
| 13 | + |
| 14 | + |
def _write_synthetic_dataset(path: str, n: int) -> None:
    """Write ``n`` synthetic question/answer records to ``path`` as JSON Lines.

    Record ``i`` is a JSON object with a ``"question"`` asking for ``i + i``
    and an ``"answer"`` whose text ends in ``#### <2 * i>``. One record is
    written per line; an existing file at ``path`` is overwritten.

    Args:
        path: Destination file path.
        n: Number of records to write. ``0`` produces an empty file.
    """
    # Pin the encoding so the bytes on disk never depend on the platform's
    # locale default (the content is ASCII, so readers are unaffected).
    with open(path, "w", encoding="utf-8") as f:
        for i in range(n):
            record = {
                "question": f"Synthetic question {i}: what is {i} + {i}?",
                "answer": f"The answer is {2 * i}. #### {2 * i}",
            }
            f.write(json.dumps(record) + "\n")
| 27 | + |
| 28 | + |
class TestMixedPrefixGSM8KEval(CustomTestCase):
    """Structural checks on the prefixes built by ``MixedPrefixGSM8KEval``.

    Every test constructs the evaluator over a synthetic JSONL dataset written
    once per class, then inspects ``_build_prefix(i)`` for the first
    ``NUM_EXAMPLES`` indices.
    """

    NUM_SHOTS = 4
    SECONDARY_POOL_SIZE = 12
    NUM_EXAMPLES = 40

    @classmethod
    def setUpClass(cls):
        """Create a temporary directory holding a 100-record synthetic dataset."""
        cls._tmpdir = tempfile.TemporaryDirectory()
        cls._data_path = os.path.join(cls._tmpdir.name, "synthetic.jsonl")
        _write_synthetic_dataset(cls._data_path, 100)

    @classmethod
    def tearDownClass(cls):
        """Remove the temporary directory created in ``setUpClass``."""
        cls._tmpdir.cleanup()

    def _make_eval(self, seed: int = 42, num_examples=None):
        """Build an evaluator over the synthetic dataset.

        ``num_examples=None`` falls back to ``NUM_EXAMPLES``.
        """
        if num_examples is None:
            num_examples = self.NUM_EXAMPLES
        return MixedPrefixGSM8KEval(
            num_examples=num_examples,
            num_threads=1,
            num_shots=self.NUM_SHOTS,
            secondary_pool_size=self.SECONDARY_POOL_SIZE,
            data_path=self._data_path,
            seed=seed,
        )

    def _primary_lines(self, evaluator) -> List[str]:
        """Render each of the first ``NUM_SHOTS`` primary shots, each followed by a blank line."""
        rendered: List[str] = []
        for shot_idx in range(self.NUM_SHOTS):
            shot_text = get_one_example(
                evaluator._primary_shots, shot_idx, include_answer=True
            )
            rendered.append(shot_text + "\n\n")
        return rendered

    def _decompose(self, evaluator, prefix: str) -> Tuple[int, List[str]]:
        """Split ``prefix`` into its primary depth and the remaining question texts.

        Returns a pair ``(depth, questions)``: ``depth`` is the number of
        leading primary shots that ``prefix`` starts with, in order, and
        ``questions`` holds the question text of every ``"Question: "`` chunk
        left after those shots are stripped.
        """
        depth = 0
        rest = prefix
        for shot_text in self._primary_lines(evaluator):
            # Stop at the first primary shot that is not the next thing in the prefix.
            if not rest.startswith(shot_text):
                break
            rest = rest[len(shot_text) :]
            depth += 1

        marker = "Question: "
        # An empty remainder splits into [""], which never matches the marker.
        questions: List[str] = [
            chunk[len(marker) :].split("\nAnswer:")[0]
            for chunk in rest.split("\n\n")
            if chunk.startswith(marker)
        ]
        return depth, questions

    def test_primary_segment_is_strict_prefix_of_primary_shots(self):
        """The primary depth of every prefix lies in ``[0, NUM_SHOTS]``."""
        evaluator = self._make_eval()
        for idx in range(self.NUM_EXAMPLES):
            depth = self._decompose(evaluator, evaluator._build_prefix(idx))[0]
            self.assertGreaterEqual(depth, 0)
            self.assertLessEqual(depth, self.NUM_SHOTS)

    def test_remainder_questions_come_from_secondary_pool(self):
        """Every non-primary question in a prefix belongs to the secondary pool."""
        evaluator = self._make_eval()
        allowed = set()
        for item in evaluator._secondary_pool:
            allowed.add(item["question"])
        for idx in range(self.NUM_EXAMPLES):
            questions = self._decompose(evaluator, evaluator._build_prefix(idx))[1]
            for question in questions:
                self.assertIn(question, allowed)

    def test_remainder_no_duplicates_within_one_query(self):
        """No secondary question appears twice within a single prefix."""
        evaluator = self._make_eval()
        for idx in range(self.NUM_EXAMPLES):
            questions = self._decompose(evaluator, evaluator._build_prefix(idx))[1]
            total = len(questions)
            distinct = len(set(questions))
            self.assertEqual(
                total, distinct, f"query {idx} has duplicate secondary samples"
            )

    def test_remainder_size_within_secondary_pool_bound(self):
        """The number of secondary questions never exceeds ``SECONDARY_POOL_SIZE``."""
        evaluator = self._make_eval()
        for idx in range(self.NUM_EXAMPLES):
            questions = self._decompose(evaluator, evaluator._build_prefix(idx))[1]
            count = len(questions)
            self.assertGreaterEqual(count, 0)
            self.assertLessEqual(count, self.SECONDARY_POOL_SIZE)

    def test_primary_depth_takes_multiple_values(self):
        """More than two distinct primary depths occur across the queries."""
        evaluator = self._make_eval()
        depths = set()
        for idx in range(self.NUM_EXAMPLES):
            depths.add(self._decompose(evaluator, evaluator._build_prefix(idx))[0])
        self.assertGreater(len(depths), 2, f"k values seen: {depths}")

    def test_secondary_size_takes_multiple_values(self):
        """More than two distinct secondary-segment sizes occur across the queries."""
        evaluator = self._make_eval()
        counts = set()
        for idx in range(self.NUM_EXAMPLES):
            questions = self._decompose(evaluator, evaluator._build_prefix(idx))[1]
            counts.add(len(questions))
        self.assertGreater(len(counts), 2, f"sizes seen: {counts}")

    def test_two_queries_share_min_primary_prefix(self):
        """Any two prefixes both start with the primary shots up to the smaller depth."""
        evaluator = self._make_eval()
        shot_texts = self._primary_lines(evaluator)
        built = [evaluator._build_prefix(idx) for idx in range(self.NUM_EXAMPLES)]
        depths = [self._decompose(evaluator, text)[0] for text in built]
        for first in range(self.NUM_EXAMPLES):
            for second in range(first + 1, self.NUM_EXAMPLES):
                common_depth = min(depths[first], depths[second])
                common = "".join(shot_texts[:common_depth])
                self.assertTrue(built[first].startswith(common))
                self.assertTrue(built[second].startswith(common))

    def test_build_prefix_is_deterministic(self):
        """Two evaluators built with the same seed produce identical prefixes."""
        first = self._make_eval(seed=42)
        second = self._make_eval(seed=42)
        for idx in range(self.NUM_EXAMPLES):
            self.assertEqual(first._build_prefix(idx), second._build_prefix(idx))

    def test_seed_actually_matters(self):
        """Different seeds change the prefix for more than half of the queries."""
        first = self._make_eval(seed=42)
        second = self._make_eval(seed=43)
        differing = 0
        for idx in range(self.NUM_EXAMPLES):
            if first._build_prefix(idx) != second._build_prefix(idx):
                differing += 1
        self.assertGreater(differing, self.NUM_EXAMPLES // 2)

    def test_pools_and_test_lines_pairwise_disjoint(self):
        """Primary shots, secondary pool and test lines share no question."""
        evaluator = self._make_eval(num_examples=None)
        primary = {item["question"] for item in evaluator._primary_shots}
        secondary = {item["question"] for item in evaluator._secondary_pool}
        held_out = {item["question"] for item in evaluator._lines}
        self.assertEqual(primary.intersection(secondary), set())
        self.assertEqual(primary.intersection(held_out), set())
        self.assertEqual(secondary.intersection(held_out), set())

    def test_insufficient_dataset_raises(self):
        """Constructing the evaluator over a 5-record dataset raises ``ValueError``."""
        tiny_path = os.path.join(self._tmpdir.name, "tiny.jsonl")
        _write_synthetic_dataset(tiny_path, n=5)
        with self.assertRaises(ValueError):
            MixedPrefixGSM8KEval(
                num_examples=1,
                num_threads=1,
                num_shots=self.NUM_SHOTS,
                secondary_pool_size=self.SECONDARY_POOL_SIZE,
                data_path=tiny_path,
                seed=42,
            )
| 174 | + |
| 175 | + |
# When this file is executed directly as a script, hand control to unittest's
# command-line runner.
if __name__ == "__main__":
    unittest.main()
0 commit comments