Skip to content

Commit 6ae9ec5

Browse files
committed
slow tokenizer
1 parent d8d4717 commit 6ae9ec5

7 files changed

Lines changed: 15 additions & 11 deletions

File tree

benchmarks_and_experiments/important/accuracy_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -497,7 +497,7 @@ def _run_baseline(
497497
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
498498
tokenizer = AutoTokenizer.from_pretrained(model)
499499
hf_model = AutoModelForCausalLM.from_pretrained(
500-
model, torch_dtype=torch.float16 if device in ("cuda", "mps") else torch.float32
500+
model, dtype=torch.float16 if device in ("cuda", "mps") else torch.float32
501501
).to(device)
502502
hf_model.eval()
503503

benchmarks_and_experiments/important/comparison_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -423,7 +423,7 @@ def _run_baseline(
423423
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
424424
hf_model = AutoModelForCausalLM.from_pretrained(
425425
self.model_name,
426-
torch_dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
426+
dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
427427
).to(device)
428428
hf_model.eval()
429429

benchmarks_and_experiments/important/latency_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -455,7 +455,7 @@ def _measure_baseline(
455455
tokenizer = AutoTokenizer.from_pretrained(model)
456456
hf_model = AutoModelForCausalLM.from_pretrained(
457457
model,
458-
torch_dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
458+
dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
459459
).to(device)
460460
hf_model.eval()
461461
n = len(samples)

benchmarks_and_experiments/important/memory_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -475,7 +475,7 @@ def _measure_baseline(
475475
tokenizer = AutoTokenizer.from_pretrained(model)
476476
hf_model = AutoModelForCausalLM.from_pretrained(
477477
model,
478-
torch_dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
478+
dtype=torch.float16 if device in ("cuda", "mps") else torch.float32,
479479
).to(device)
480480
hf_model.eval()
481481
weights_mb = _get_model_weights_mb()

src/kvboost/cpu_paged/cpu_engine.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -133,7 +133,7 @@ def from_pretrained(
133133
device = "cpu"
134134
model = AutoModelForCausalLM.from_pretrained(
135135
model_name_or_path,
136-
torch_dtype=torch.float16,
136+
dtype=torch.float16,
137137
device_map=device,
138138
)
139139
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

src/kvboost/server/__main__.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,7 @@ def load_engine(args):
149149
"bfloat16": torch.bfloat16,
150150
"float32": torch.float32,
151151
}
152-
torch_dtype = dtype_map[args.dtype]
152+
dtype = dtype_map[args.dtype]
153153

154154
log.info("Loading model %s ...", args.model)
155155
if args.gguf_file:
@@ -170,7 +170,7 @@ def load_engine(args):
170170
quant_config = BitsAndBytesConfig(
171171
load_in_4bit=True,
172172
bnb_4bit_quant_type="nf4",
173-
bnb_4bit_compute_dtype=torch_dtype,
173+
bnb_4bit_compute_dtype=dtype,
174174
bnb_4bit_use_double_quant=True,
175175
)
176176
else:
@@ -198,7 +198,11 @@ def load_engine(args):
198198
max_memory = {(int(k) if k.isdigit() else k): v for k, v in raw.items()}
199199
log.info("CPU/GPU offload max_memory=%s", max_memory)
200200

201-
tokenizer = AutoTokenizer.from_pretrained(args.model, **gguf_kwargs)
201+
tokenizer_kwargs = dict(**gguf_kwargs)
202+
if args.use_slow_tokenizer:
203+
tokenizer_kwargs["use_fast"] = False
204+
log.info("Loading slow (SentencePiece) tokenizer.")
205+
tokenizer = AutoTokenizer.from_pretrained(args.model, **tokenizer_kwargs)
202206

203207
if args.backend == "cpu-paged":
204208
if args.gguf_file:
@@ -241,11 +245,11 @@ def load_engine(args):
241245
target = device if ":" in device or device in ("cpu", "mps") else f"{device}:0"
242246
from_pretrained_kwargs["device_map"] = {"": target}
243247
if quant_config is not None:
244-
# bnb/HQQ set compute dtype themselves; passing torch_dtype here
248+
# bnb/HQQ set compute dtype themselves; passing dtype here
245249
# is ignored (and would warn), so omit it.
246250
from_pretrained_kwargs["quantization_config"] = quant_config
247251
else:
248-
from_pretrained_kwargs["torch_dtype"] = torch_dtype
252+
from_pretrained_kwargs["dtype"] = dtype
249253
model = AutoModelForCausalLM.from_pretrained(
250254
args.model,
251255
**from_pretrained_kwargs,

tests/test_flash_attn_real_model.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ def tinyllama():
4646
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
4747
model = AutoModelForCausalLM.from_pretrained(
4848
MODEL_ID,
49-
torch_dtype=torch.float32, # float32 so CPU attention is numerically stable
49+
dtype=torch.float32, # float32 so CPU attention is numerically stable
5050
low_cpu_mem_usage=True,
5151
)
5252
model = model.to("cpu")

0 commit comments

Comments
 (0)