@@ -149,7 +149,7 @@ def load_engine(args):
149149 "bfloat16" : torch .bfloat16 ,
150150 "float32" : torch .float32 ,
151151 }
152- torch_dtype = dtype_map [args .dtype ]
152+ dtype = dtype_map [args .dtype ]
153153
154154 log .info ("Loading model %s ..." , args .model )
155155 if args .gguf_file :
@@ -170,7 +170,7 @@ def load_engine(args):
170170 quant_config = BitsAndBytesConfig (
171171 load_in_4bit = True ,
172172 bnb_4bit_quant_type = "nf4" ,
173- bnb_4bit_compute_dtype = torch_dtype ,
173+ bnb_4bit_compute_dtype = dtype ,
174174 bnb_4bit_use_double_quant = True ,
175175 )
176176 else :
@@ -198,7 +198,11 @@ def load_engine(args):
198198 max_memory = {(int (k ) if k .isdigit () else k ): v for k , v in raw .items ()}
199199 log .info ("CPU/GPU offload max_memory=%s" , max_memory )
200200
201- tokenizer = AutoTokenizer .from_pretrained (args .model , ** gguf_kwargs )
201+ tokenizer_kwargs = dict (** gguf_kwargs )
202+ if args .use_slow_tokenizer :
203+ tokenizer_kwargs ["use_fast" ] = False
204+ log .info ("Loading slow (SentencePiece) tokenizer." )
205+ tokenizer = AutoTokenizer .from_pretrained (args .model , ** tokenizer_kwargs )
202206
203207 if args .backend == "cpu-paged" :
204208 if args .gguf_file :
@@ -241,11 +245,11 @@ def load_engine(args):
241245 target = device if ":" in device or device in ("cpu" , "mps" ) else f"{ device } :0"
242246 from_pretrained_kwargs ["device_map" ] = {"" : target }
243247 if quant_config is not None :
244- # bnb/HQQ set compute dtype themselves; passing torch_dtype here
248+ # bnb/HQQ set compute dtype themselves; passing dtype here
245249 # is ignored (and would warn), so omit it.
246250 from_pretrained_kwargs ["quantization_config" ] = quant_config
247251 else :
248- from_pretrained_kwargs ["torch_dtype " ] = torch_dtype
252+ from_pretrained_kwargs ["dtype " ] = dtype
249253 model = AutoModelForCausalLM .from_pretrained (
250254 args .model ,
251255 ** from_pretrained_kwargs ,
0 commit comments