Add use_slow_tokenizer option to server load_engine; rename torch_dtype to dtype in from_pretrained calls

pythongiant · pythongiant · commit 6ae9ec573764 · 2026-05-08T10:25:42.000+05:30
diff --git a/benchmarks_and_experiments/important/accuracy_benchmark.py b/benchmarks_and_experiments/important/accuracy_benchmark.py
@@ -497,7 +497,7 @@ def _run_baseline(
     device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
     tokenizer = AutoTokenizer.from_pretrained(model)
     hf_model = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16 if device in ("cuda", "mps") else torch.float32
+        model, dtype=torch.float16 if device in ("cuda", "mps") else torch.float32
     ).to(device)
     hf_model.eval()
 
diff --git a/benchmarks_and_experiments/important/comparison_benchmark.py b/benchmarks_and_experiments/important/comparison_benchmark.py
@@ -423,7 +423,7 @@ def _run_baseline(
         tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         hf_model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
-            torch_dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
+            dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
         ).to(device)
         hf_model.eval()
 
diff --git a/benchmarks_and_experiments/important/latency_benchmark.py b/benchmarks_and_experiments/important/latency_benchmark.py
@@ -455,7 +455,7 @@ def _measure_baseline(
     tokenizer = AutoTokenizer.from_pretrained(model)
     hf_model = AutoModelForCausalLM.from_pretrained(
         model,
-        torch_dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
+        dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
     ).to(device)
     hf_model.eval()
     n = len(samples)
diff --git a/benchmarks_and_experiments/important/memory_benchmark.py b/benchmarks_and_experiments/important/memory_benchmark.py
@@ -475,7 +475,7 @@ def _measure_baseline(
     tokenizer = AutoTokenizer.from_pretrained(model)
     hf_model = AutoModelForCausalLM.from_pretrained(
         model,
-        torch_dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
+        dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
     ).to(device)
     hf_model.eval()
     weights_mb = _get_model_weights_mb()
diff --git a/src/kvboost/cpu_paged/cpu_engine.py b/src/kvboost/cpu_paged/cpu_engine.py
@@ -133,7 +133,7 @@ def from_pretrained(
         device = "cpu"
         model = AutoModelForCausalLM.from_pretrained(
             model_name_or_path,
-            torch_dtype=torch.float16,
+            dtype=torch.float16,
             device_map=device,
         )
         tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
diff --git a/src/kvboost/server/__main__.py b/src/kvboost/server/__main__.py
@@ -149,7 +149,7 @@ def load_engine(args):
         "bfloat16": torch.bfloat16,
         "float32": torch.float32,
     }
-    torch_dtype = dtype_map[args.dtype]
+    dtype = dtype_map[args.dtype]
 
     log.info("Loading model %s ...", args.model)
     if args.gguf_file:
@@ -170,7 +170,7 @@ def load_engine(args):
             quant_config = BitsAndBytesConfig(
                 load_in_4bit=True,
                 bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch_dtype,
+                bnb_4bit_compute_dtype=dtype,
                 bnb_4bit_use_double_quant=True,
             )
         else:
@@ -198,7 +198,11 @@ def load_engine(args):
         max_memory = {(int(k) if k.isdigit() else k): v for k, v in raw.items()}
         log.info("CPU/GPU offload max_memory=%s", max_memory)
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model, **gguf_kwargs)
+    tokenizer_kwargs = dict(**gguf_kwargs)
+    if args.use_slow_tokenizer:
+        tokenizer_kwargs["use_fast"] = False
+        log.info("Loading slow (SentencePiece) tokenizer.")
+    tokenizer = AutoTokenizer.from_pretrained(args.model, **tokenizer_kwargs)
 
     if args.backend == "cpu-paged":
         if args.gguf_file:
@@ -241,11 +245,11 @@ def load_engine(args):
             target = device if ":" in device or device in ("cpu", "mps") else f"{device}:0"
             from_pretrained_kwargs["device_map"] = {"": target}
         if quant_config is not None:
-            # bnb/HQQ set compute dtype themselves; passing torch_dtype here
+            # bnb/HQQ set compute dtype themselves; passing dtype here
             # is ignored (and would warn), so omit it.
             from_pretrained_kwargs["quantization_config"] = quant_config
         else:
-            from_pretrained_kwargs["torch_dtype"] = torch_dtype
+            from_pretrained_kwargs["dtype"] = dtype
         model = AutoModelForCausalLM.from_pretrained(
             args.model,
             **from_pretrained_kwargs,
diff --git a/tests/test_flash_attn_real_model.py b/tests/test_flash_attn_real_model.py
@@ -46,7 +46,7 @@ def tinyllama():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.float32,   # float32 so CPU attention is numerically stable
+        dtype=torch.float32,   # float32 so CPU attention is numerically stable
         low_cpu_mem_usage=True,
     )
     model = model.to("cpu")

Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,7 @@ def from_pretrained(`
`133`	`133`	`device = "cpu"`
`134`	`134`	`model = AutoModelForCausalLM.from_pretrained(`
`135`	`135`	`model_name_or_path,`
`136`		`- torch_dtype=torch.float16,`
	`136`	`+ dtype=torch.float16,`
`137`	`137`	`device_map=device,`
`138`	`138`	`)`
`139`	`139`	`tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)`
Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@ def tinyllama():`
`46`	`46`	`tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)`
`47`	`47`	`model = AutoModelForCausalLM.from_pretrained(`
`48`	`48`	`MODEL_ID,`
`49`		`- torch_dtype=torch.float32, # float32 so CPU attention is numerically stable`
	`49`	`+ dtype=torch.float32, # float32 so CPU attention is numerically stable`
`50`	`50`	`low_cpu_mem_usage=True,`
`51`	`51`	`)`
`52`	`52`	`model = model.to("cpu")`