refactor: Import _to_number from answer_utils and use gold_from_gsm8k for GSM8K gold answers in dataset_handler, extend load_generation_model parameters in core.py, and replace load_policy with load_generation_model in run_mcts.py

milad1378yz · milad1378yz · commit 418ea07dab7c · 2026-04-21T17:31:37.000-07:00
diff --git a/src/dataset_handler.py b/src/dataset_handler.py
@@ -3,6 +3,8 @@
 from fractions import Fraction
 import random
 
+from answer_utils import _to_number
+
 
 def load_hard_dataset(name, split, n, seed):
     if name == "competition_math":
@@ -32,9 +34,7 @@ def load_hard_dataset(name, split, n, seed):
     elif name == "gsm8k":
         ds = load_dataset("gsm8k", "main")
         data = ds[split].shuffle(seed=seed).select(range(n)) if n else ds[split]
-        gold_fn = lambda ex: _to_number(
-            re.findall(r"####\s*([-\$]?\s*\d[\d,]*(?:\.\d+)?(?:\s+\w+)?)", ex["answer"])[0]
-        )
+        gold_fn = lambda ex: gold_from_gsm8k(ex["answer"])
         q_fn = lambda ex: ex["question"]
 
     elif name == "svamp":
@@ -276,26 +276,6 @@ def _norm(s):
     return re.sub(r"\s+", " ", str(s)).strip().lower()
 
 
-def _to_number(s):
-    if s is None:
-        return None
-    s = s.strip()
-    s = s.replace(",", "")
-    s = re.sub(r"^\$", "", s)
-    s = re.sub(r"\s+(dollars?|tickets?|units?|boxes?|people|students?)$", "", s, flags=re.I)
-    if re.fullmatch(r"-?\d+/\d+", s):
-        return float(Fraction(s))
-    if s.endswith("%"):
-        try:
-            return float(s[:-1]) / 100.0
-        except:
-            return None
-    try:
-        return float(s)
-    except:
-        return None
-
-
 def _gpqa_perm(ex):
     """
     Deterministically shuffle the 4 answer options so that:
diff --git a/src/experiments/core.py b/src/experiments/core.py
@@ -560,6 +560,11 @@ def load_generation_model(
     base_model_id: str,
     tokenizer: Optional[AutoTokenizer] = None,
     torch_dtype: torch.dtype = torch.float16,
+    *,
+    device_map: Optional[str] = "auto",
+    load_in_4bit: bool = False,
+    attn_impl: str = "sdpa",
+    compile_model: bool = False,
 ):
     """
     Loads a Causal LM for generation. If `tokenizer` provided, embeddings are resized accordingly.
@@ -568,18 +573,42 @@ def load_generation_model(
         tokenizer = AutoTokenizer.from_pretrained(
             base_model_id, use_fast=True, trust_remote_code=True
         )
-        if tokenizer.pad_token is None and tokenizer.eos_token is not None:
-            tokenizer.pad_token = tokenizer.eos_token
+    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    kwargs: Dict[str, Any] = {
+        "torch_dtype": torch_dtype,
+        "trust_remote_code": True,
+    }
+    if device_map is not None:
+        kwargs["device_map"] = device_map
+    if attn_impl:
+        kwargs["attn_implementation"] = attn_impl
+    if load_in_4bit:
+        compute_dtype = (
+            torch_dtype if torch_dtype in (torch.float16, torch.bfloat16) else torch.float16
+        )
+        kwargs["quantization_config"] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_use_double_quant=True,
+        )
+        kwargs["torch_dtype"] = compute_dtype
 
     model = AutoModelForCausalLM.from_pretrained(
         base_model_id,
-        torch_dtype=torch_dtype,
-        device_map="auto",
-        trust_remote_code=True,
+        **kwargs,
     )
     if len(tokenizer) != model.get_input_embeddings().weight.size(0):
         model.resize_token_embeddings(len(tokenizer))
     model.eval()
+
+    if compile_model:
+        try:
+            model = torch.compile(model, mode="reduce-overhead", fullgraph=False)
+        except Exception as exc:
+            print(f"[warn] torch.compile failed: {exc}. Continuing without compile().")
+
     return model, tokenizer
 
 
diff --git a/src/run_mcts.py b/src/run_mcts.py
@@ -8,14 +8,15 @@
 import warnings
 
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer
 from transformers.utils import logging as hf_logging
 import yaml
 
 from answer_utils import is_correct
 from mas import build_mas_from_specs
 from mcts import Node, MAS_MCTS
 from dataset_handler import load_hard_dataset
+from experiments.core import load_generation_model
 from show_tree import build_graph, draw_tree
 
 import ray
@@ -30,47 +31,6 @@
 )
 
 
-def load_policy(
-    model_id: str,
-    device_map: str,
-    load_in_4bit: bool = False,
-    attn_impl: str = "sdpa",
-    compile_model: bool = True,
-):
-    """
-    Loads tokenizer and model with either fp16/fp32 or 4-bit quantization.
-    device_map is now provided by argparse for flexibility.
-    """
-    kwargs = dict(device_map=device_map)
-    kwargs["attn_implementation"] = attn_impl
-    if load_in_4bit:
-        from transformers import BitsAndBytesConfig
-
-        kwargs["quantization_config"] = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True,
-        )
-        kwargs["torch_dtype"] = torch.float16
-    else:
-        kwargs["torch_dtype"] = torch.float16 if torch.cuda.is_available() else torch.float32
-
-    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
-    mdl = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
-
-    # tok = AutoTokenizer.from_pretrained(model_id, use_fast=True, local_files_only=True)
-    # mdl = AutoModelForCausalLM.from_pretrained(model_id, local_files_only=True, **kwargs)
-
-    mdl.eval()
-    if compile_model:
-        try:
-            mdl = torch.compile(mdl, mode="reduce-overhead", fullgraph=False)
-        except Exception as e:
-            print(f"[warn] torch.compile failed: {e}. Continuing without compile().")
-
-    return tok, mdl
-
-
 def node_to_dict(node: Node, max_children: int = 8) -> Dict[str, Any]:
     return {
         "steps": node.steps,
@@ -187,7 +147,7 @@ def __init__(
             self.mdl = None
         else:
             self.client = None
-            self.tok, self.mdl = load_policy(
+            self.mdl, self.tok = load_generation_model(
                 model_id,
                 device_map=device_map,
                 load_in_4bit=load_in_4bit,
@@ -428,9 +388,9 @@ def main():
             openai_model=args.model_id,
         )
     else:
-        tok, mdl = load_policy(
+        mdl, tok = load_generation_model(
             args.model_id,
-            args.device_map,
+            device_map=args.device_map,
             load_in_4bit=args.load_in_4bit,
             attn_impl=args.attn_impl,
             compile_model=not args.no_compile,