@@ -232,6 +232,73 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
232232 f'{ ", " .join ([value + " for " + key for key , value in execution_providers .items ()])} .' ,
233233 )
234234
235+ parser .add_argument (
236+ "--packed-const" ,
237+ action = "store_true" ,
238+ default = False ,
239+ help = "[model-generate] Pass this if packed constants are\n "
240+ "required (packed constants)." ,
241+ )
242+
243+ parser .add_argument (
244+ "--script-option" ,
245+ choices = ["jit_npu" , "non_jit" ],
246+ default = None ,
247+ help = "[model-generate] Script variant: jit_npu (hybrid),\n "
248+ "non_jit (NPU basic) (default depends on device)" ,
249+ )
250+
251+ parser .add_argument (
252+ "--optimize" ,
253+ choices = [
254+ "prefill" ,
255+ "prefill_llama3" ,
256+ "decode" ,
257+ "full_fusion" ,
258+ "full_fusion_llama3" ,
259+ ],
260+ default = None ,
261+ help = "[model-generate] Optimization: prefill(_llama3) (hybrid),\n "
262+ "decode/full_fusion(_llama3) (NPU basic)" ,
263+ )
264+
265+ parser .add_argument (
266+ "--max-seq-len" ,
267+ default = None ,
268+ type = int ,
269+ help = "[model-generate] Max sequence length for prefill\n "
270+ "fusion (default: 4096)" ,
271+ )
272+
273+ parser .add_argument (
274+ "--npu-op-version" ,
275+ choices = ["v1" , "v2" ],
276+ default = None ,
277+ help = "[model-generate] NPU LLM op version (v1 / v2)" ,
278+ )
279+
280+ parser .add_argument (
281+ "--npu-basic" ,
282+ action = "store_true" ,
283+ default = False ,
284+ help = "[model-generate] Use basic NPU flow with matmulnbits pass file" ,
285+ )
286+
287+ parser .add_argument (
288+ "--npu-use-ep" ,
289+ action = "store_true" ,
290+ default = False ,
291+ help = "[model-generate] Use EP (Execution Provider) flow\n "
292+ "(only applies to --npu --optimize decode)" ,
293+ )
294+
295+ parser .add_argument (
296+ "--no-prune-logits" ,
297+ action = "store_true" ,
298+ default = False ,
299+ help = "[model-generate] Disable logits pruning by setting prune_logits=false" ,
300+ )
301+
235302 return parser
236303
237304 @staticmethod
@@ -340,7 +407,7 @@ def _setup_model_dependencies(full_model_path, device, ryzenai_version, oga_path
340407 3. Check NPU driver version if required for device and ryzenai_version.
341408 """
342409
343- # For RyzenAI 1.6 .0, check NPU driver version for NPU and hybrid devices
410+ # For RyzenAI 1.7.0, check NPU driver version for NPU and hybrid devices
344411 if device in ["npu" , "hybrid" ]:
345412 required_driver_version = REQUIRED_NPU_DRIVER_VERSION
346413
@@ -378,24 +445,6 @@ def _setup_model_dependencies(full_model_path, device, ryzenai_version, oga_path
378445 dll_source_path = os .path .join (
379446 env_path , "Lib" , "site-packages" , "onnxruntime_genai"
380447 )
381- required_dlls = ["libutf8_validity.dll" , "abseil_dll.dll" ]
382-
383- # Validate that all required DLLs exist in the source directory
384- missing_dlls = []
385-
386- for dll_name in required_dlls :
387- dll_source = os .path .join (dll_source_path , dll_name )
388- if not os .path .exists (dll_source ):
389- missing_dlls .append (dll_source )
390-
391- if missing_dlls :
392- dll_list = "\n - " .join (missing_dlls )
393- raise RuntimeError (
394- f"Required DLLs not found for { device } inference:\n - { dll_list } \n "
395- f"Please ensure your RyzenAI installation is complete and supports { device } .\n "
396- "See installation instructions at:\n "
397- "https://github.com/lemonade-sdk/lemonade-eval#installation\n "
398- )
399448
400449 # Add the DLL source directory to PATH
401450 current_path = os .environ .get ("PATH" , "" )
@@ -543,7 +592,22 @@ def _cleanup_environment(saved_state):
543592 os .chdir (saved_state ["cwd" ])
544593 os .environ ["PATH" ] = saved_state ["path" ]
545594
546- def _generate_model_for_oga (self , output_model_path , device , input_model_path ):
595+ def _generate_model_for_oga (
596+ self ,
597+ output_model_path ,
598+ device ,
599+ input_model_path ,
600+ packed_const = False ,
601+ script_option = None ,
602+ optimize = None ,
603+ max_seq_len = None ,
604+ npu_op_version = None ,
605+ npu_basic = False ,
606+ npu_use_ep = False ,
607+ no_prune_logits = False ,
608+ dml_only = False ,
609+ cpu_only = False ,
610+ ):
547611 """
548612 Uses the model_generate tool to generate the model for OGA hybrid or npu targets.
549613 """
@@ -569,18 +633,30 @@ def _generate_model_for_oga(self, output_model_path, device, input_model_path):
569633
570634 try :
571635 if device_flag == "npu" :
636+ script_opt = script_option if script_option is not None else "non_jit"
572637 model_generate .generate_npu_model (
573638 input_model = input_model_path ,
574639 output_dir = output_model_path ,
575- packed_const = False ,
640+ packed_const = packed_const ,
641+ script_option = script_opt ,
642+ optimize = optimize ,
643+ max_seq_len = max_seq_len ,
644+ npu_op_version = npu_op_version ,
645+ basic = npu_basic ,
646+ use_ep = npu_use_ep ,
647+ no_prune_logits = no_prune_logits ,
648+ cpu_only = cpu_only ,
576649 )
577650 else : # hybrid
651+ script_opt = script_option if script_option is not None else "jit_npu"
578652 model_generate .generate_hybrid_model (
579653 input_model = input_model_path ,
580654 output_dir = output_model_path ,
581- script_option = "jit_npu" ,
582- mode = "bf16" ,
583- dml_only = False ,
655+ script_option = script_opt ,
656+ optimize = optimize ,
657+ max_seq_len = max_seq_len ,
658+ no_prune_logits = no_prune_logits ,
659+ dml_only = dml_only ,
584660 )
585661 except Exception as e :
586662 raise RuntimeError (
@@ -600,6 +676,16 @@ def run(
600676 trust_remote_code = False ,
601677 subfolder : str = None ,
602678 do_not_upgrade : bool = False ,
679+ packed_const : bool = False ,
680+ script_option : str = None ,
681+ optimize : str = None ,
682+ max_seq_len : int = None ,
683+ npu_op_version : str = None ,
684+ npu_basic : bool = False ,
685+ npu_use_ep : bool = False ,
686+ no_prune_logits : bool = False ,
687+ dml_only : bool = False ,
688+ cpu_only : bool = False ,
603689 ) -> State :
604690 from lemonade .common .network import (
605691 custom_snapshot_download ,
@@ -714,28 +800,23 @@ def run(
714800 "It does not contain ONNX or safetensors files."
715801 )
716802 if device in ["npu" , "hybrid" ]:
803+ needs_generation = False
717804 if is_onnx_model :
718805 if is_preoptimized_onnx :
719806 # Use HuggingFace cache path as it is
720807 full_model_path = input_model_path
721808 else :
722809 # If ONNX but not modified yet for Hybrid or NPU,
723810 # needs further optimization
724- self ._generate_model_for_oga (
725- full_model_path ,
726- device ,
727- input_model_path ,
728- )
811+ needs_generation = True
729812 elif is_safetensors_model :
730813 config_path = os .path .join (input_model_path , "config.json" )
731814 if os .path .exists (config_path ):
732815 with open (config_path , "r" , encoding = "utf-8" ) as f :
733816 config = json .load (f )
734817 if "quantization_config" in config :
734817 # If quantized, flag the model so it is generated for the target device below
736- self ._generate_model_for_oga (
737- full_model_path , device , input_model_path
738- )
819+ needs_generation = True
739820 else :
740821 raise ValueError (
741822 f"The safetensors model { checkpoint } is not quantized. "
@@ -750,6 +831,23 @@ def run(
750831 raise ValueError (
751832 f"Unsupported model type for checkpoint: { checkpoint } "
752833 )
834+
835+ if needs_generation :
836+ self ._generate_model_for_oga (
837+ full_model_path ,
838+ device ,
839+ input_model_path ,
840+ packed_const ,
841+ script_option ,
842+ optimize ,
843+ max_seq_len ,
844+ npu_op_version ,
845+ npu_basic ,
846+ npu_use_ep ,
847+ no_prune_logits ,
848+ dml_only ,
849+ cpu_only ,
850+ )
753851 else :
754852 if is_onnx_model :
755853 # Use HuggingFace cache path as it is
0 commit comments