Fix post-processing of the AIME dataset

arekay-nv · arekay-nv · commit 4727204f847b · 2026-01-08T12:39:29.000-08:00
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
diff --git a/examples/07_GPT-OSS-120B_SGLang_Example/run.py b/examples/07_GPT-OSS-120B_SGLang_Example/run.py
@@ -34,7 +34,7 @@
 from inference_endpoint import metrics
 from inference_endpoint.config.runtime_settings import RuntimeSettings
 from inference_endpoint.config.schema import LoadPattern, LoadPatternType
-from inference_endpoint.dataset_manager.dataset import Dataset
+from inference_endpoint.dataset_manager import Dataset, EmptyDataset
 from inference_endpoint.dataset_manager.predefined.aime25 import AIME25, AIME_MLPerf
 from inference_endpoint.dataset_manager.predefined.gpqa import GPQA, GPQA_MLPerf
 from inference_endpoint.endpoint_client.configs import (
@@ -57,7 +57,7 @@
 
 # Configuration for SGLang server
 SGLANG_SERVER_HOST = "localhost"
-SGLANG_SERVER_PORT = 3000
+SGLANG_SERVER_PORT = 30000
 SGLANG_ENDPOINT = f"http://{SGLANG_SERVER_HOST}:{SGLANG_SERVER_PORT}/generate"
 
 
@@ -102,19 +102,6 @@ def create_sglang_client(tmp_dir: Path) -> HTTPEndpointClient:
     return client
 
 
-class EmptyDataset(Dataset):
-    """Empty dataset for performance run."""
-
-    def __init__(self):
-        super().__init__(None)
-
-    def load_sample(self, index: int):
-        return None
-
-    def num_samples(self):
-        return 0
-
-
 def run_benchmark_session(
     accuracy_datasets: list[Dataset], issuer: HttpClientSampleIssuer, args
 ):
diff --git a/src/inference_endpoint/dataset_manager/__init__.py b/src/inference_endpoint/dataset_manager/__init__.py
@@ -19,7 +19,7 @@
 This module handles dataset loading, preprocessing, and management.
 """
 
-from .dataset import Dataset
+from .dataset import Dataset, EmptyDataset
 from .factory import DataLoaderFactory
 from .transforms import (
     AddStaticColumns,
@@ -33,6 +33,7 @@
 
 __all__ = [
     "Dataset",
+    "EmptyDataset",
     "DataLoaderFactory",
     "ColumnNameRemap",
     "AddStaticColumns",
diff --git a/src/inference_endpoint/dataset_manager/dataset.py b/src/inference_endpoint/dataset_manager/dataset.py
@@ -323,12 +323,6 @@ def load_from_huggingface(
     load_options = load_options or {}
     cache_options = cache_options or {}
 
-    # if cache_dir is not None and cache_dir.exists():
-    #     try:
-    #         ds = load_from_disk(str(cache_dir), **cache_options)
-    #         return ds[split].to_pandas()
-    #     except Exception as e:
-    #         logger.warning(f"Error loading dataset from cache: {e}")
     ds = load_dataset(dataset_path, dataset_name, **load_options)
 
     if cache_dir is not None:
@@ -450,3 +444,16 @@ def load_sample(self, index: int) -> Any:
 
     def num_samples(self) -> int:
         return len(self.data)
+
+
+class EmptyDataset(Dataset):
+    """Empty dataset for performance run."""
+
+    def __init__(self):
+        super().__init__(None)
+
+    def load_sample(self, index: int):
+        return None
+
+    def num_samples(self):
+        return 0
diff --git a/src/inference_endpoint/dataset_manager/predefined/aime25/__init__.py b/src/inference_endpoint/dataset_manager/predefined/aime25/__init__.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import random
+import re
 from logging import getLogger
 from pathlib import Path
 
@@ -30,6 +31,16 @@
 logger = getLogger(__name__)
 
 
+def normalize_number(s):
+    """Normalize a number string to an integer.
+    Reference https://github.com/openai/gpt-oss/blob/48db88d8e29f48493fe75f084a8c9bd900a2b92f/gpt_oss/evals/aime_eval.py#L20
+    """
+    match = re.match(r"\d+", s)  # match digits from the start
+    if not match:
+        return None
+    return int(match.group(0))
+
+
 class AIME25(
     Dataset,
     dataset_id="aime25",
@@ -110,12 +121,15 @@ def generate(
 
         processed_rows = []
         for _, row in df.iterrows():
-            correct_answer = row["answer"]
-
+            correct_answer = (
+                normalize_number(row["answer"])
+                if isinstance(row["answer"], str)
+                else row["answer"]
+            )
             # Create processed row
             processed_row = {
                 "question": row["question"],  # Original question
-                "answer": correct_answer,
+                "answer": str(correct_answer),
             }
 
             processed_rows.append(processed_row)
@@ -126,21 +140,6 @@ def generate(
         logger.info(f"Saved {len(df)} samples to {dst_path}")
         return df
 
-    # @classmethod
-    # def generate_aime25_dataset(
-    #     cls,
-    #     datasets_dir: Path,
-    #     max_samples: int | None = None,
-    #     force: bool = False,
-    # ) -> pd.DataFrame:
-    #     """Generate the AIME25 dataset to a file."""
-    #     df = AIME25.generate(
-    #         datasets_dir=Path(datasets_dir),
-    #         max_samples=max_samples,
-    #         force=force,
-    #     )
-    #     return df
-
 
 class AIME_MLPerf(AIME25):
     """AIME_MLPerf: AIME 2025 MLPerf Dataset
diff --git a/src/inference_endpoint/evaluation/extractor.py b/src/inference_endpoint/evaluation/extractor.py
@@ -165,11 +165,23 @@ def extract(cls, text: str) -> str | None:
 
 
 class BoxedMathExtractor(Extractor):
-    """Extract boxed math answer from response text."""
+    """Extract boxed math answer from response text.
+    Based on OpenAI's extract_boxed_math function from GPT-OSS.
+    https://github.com/openai/gpt-oss/blob/main/gpt_oss/evals/aime_eval.py
+    """
 
     @classmethod
     def extract(cls, text: str) -> str | None:
-        matches = re.findall(r"\\boxed\{([^}]+)\}", text)
+        pattern = r"boxed{(.*?)}|framebox{(.*?)}"
+        matches = re.findall(pattern, text, re.DOTALL)
+        if matches:
+            for match in matches[::-1]:
+                for group in match:
+                    if group != "":
+                        retval = group.split(",")[-1].strip()
+                        return retval
+        pattern = r"\d+"  # get the last integer if no pattern found
+        matches = re.findall(pattern, text, re.DOTALL)
         if matches:
             return matches[-1]
         return None