Skip to content
2 changes: 1 addition & 1 deletion .github/workflows/test_lemonade_eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -200,4 +200,4 @@ jobs:
Write-Host "Server stopped." -ForegroundColor Green

# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD
# Modifications Copyright (c) 2025 AMD
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ pip install -e .[oga-cpu]
# For RyzenAI NPU support (Windows + Python 3.12 only):
pip install -e .[oga-ryzenai] --extra-index-url=https://pypi.amd.com/simple

# For model generation/export (Windows + Python 3.12 only):
# For model generation/custom export (Windows + Python 3.12 only):
pip install -e .[oga-ryzenai,model-generate] --extra-index-url=https://pypi.amd.com/simple
```

Expand Down Expand Up @@ -171,15 +171,15 @@ See the [Models List](https://lemonade-server.ai/docs/server/server_models/) for

## OGA-Load for Model Preparation

The `oga-load` tool is for preparing custom OGA (ONNX Runtime GenAI) models. It can build and quantize models from Hugging Face for use on NPU, iGPU, or CPU.

The `oga-load` tool is for preparing custom OGA (ONNX Runtime GenAI) models. It can build quark-quantized models from Hugging Face for use on NPU, iGPU, or CPU.
Check out the official [Ryzen AI Model Preparation guide](https://ryzenai.docs.amd.com/en/latest/oga_model_prepare.html) for more details.
> **Note**: For running pre-built NPU/Hybrid models, use the server-based workflow above with `-NPU` or `-Hybrid` models. The `oga-load` tool is primarily for model preparation and testing custom checkpoints.

### Usage

```bash
# Prepare and test a model on CPU
lemonade-eval -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4 llm-prompt -p "Hello!"
lemonade-eval -i amd/Llama-3.2-1B-Instruct-awq-uint4-asym-g128-bf16-lmhead oga-load --device hybrid --dtype int4 llm-prompt -p "Alice and Bob" --max-new-tokens 10
```

### Installation for OGA
Expand Down
8 changes: 5 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,17 @@
extras_require={
# Extras for specific backends
"oga-ryzenai": [
"onnxruntime-genai-directml-ryzenai==0.9.2.1",
"onnxruntime-genai-directml-ryzenai==0.11.2",
"protobuf>=6.30.1",
],
"oga-cpu": [
"onnxruntime-genai==0.9.2",
"onnxruntime-genai==0.11.2",
"onnxruntime >=1.22.0",
],
"model-generate": [
"model-generate==1.5.0; platform_system=='Windows' and python_version=='3.10'",
"model-generate==1.7.0; platform_system=='Windows' and python_version=='3.12'",
"numpy<2",
"onnx_ir",
],
},
classifiers=[],
Expand Down
162 changes: 130 additions & 32 deletions src/lemonade/tools/oga/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,73 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
f'{", ".join([value + " for " + key for key, value in execution_providers.items()])}.',
)

parser.add_argument(
"--packed-const",
action="store_true",
default=False,
help="[model-generate] Pass this if packed constants are\n"
"required (packed constants).",
)

parser.add_argument(
"--script-option",
choices=["jit_npu", "non_jit"],
default=None,
help="[model-generate] Script variant: jit_npu (hybrid),\n"
"non_jit (NPU basic) (default depends on device)",
)

parser.add_argument(
"--optimize",
choices=[
"prefill",
"prefill_llama3",
"decode",
"full_fusion",
"full_fusion_llama3",
],
default=None,
help="[model-generate] Optimization: prefill(_llama3) (hybrid),\n"
"decode/full_fusion(_llama3) (NPU basic)",
)

parser.add_argument(
"--max-seq-len",
default=None,
type=int,
help="[model-generate] Max sequence length for prefill\n"
"fusion (default: 4096)",
)

parser.add_argument(
"--npu-op-version",
choices=["v1", "v2"],
default=None,
help="[model-generate] NPU LLM op version (v1 / v2)",
)

parser.add_argument(
"--npu-basic",
action="store_true",
default=False,
help="[model-generate] Use basic NPU flow with matmulnbits pass file",
)

parser.add_argument(
"--npu-use-ep",
action="store_true",
default=False,
help="[model-generate] Use EP (Execution Provider) flow\n"
"(only applies to --npu --optimize decode)",
)

parser.add_argument(
"--no-prune-logits",
action="store_true",
default=False,
help="[model-generate] Disable logits pruning by setting prune_logits=false",
)

return parser

@staticmethod
Expand Down Expand Up @@ -340,7 +407,7 @@ def _setup_model_dependencies(full_model_path, device, ryzenai_version, oga_path
3. Check NPU driver version if required for device and ryzenai_version.
"""

# For RyzenAI 1.6.0, check NPU driver version for NPU and hybrid devices
# For RyzenAI 1.7.0, check NPU driver version for NPU and hybrid devices
if device in ["npu", "hybrid"]:
required_driver_version = REQUIRED_NPU_DRIVER_VERSION

Expand Down Expand Up @@ -378,24 +445,6 @@ def _setup_model_dependencies(full_model_path, device, ryzenai_version, oga_path
dll_source_path = os.path.join(
env_path, "Lib", "site-packages", "onnxruntime_genai"
)
required_dlls = ["libutf8_validity.dll", "abseil_dll.dll"]

# Validate that all required DLLs exist in the source directory
missing_dlls = []

for dll_name in required_dlls:
dll_source = os.path.join(dll_source_path, dll_name)
if not os.path.exists(dll_source):
missing_dlls.append(dll_source)

if missing_dlls:
dll_list = "\n - ".join(missing_dlls)
raise RuntimeError(
f"Required DLLs not found for {device} inference:\n - {dll_list}\n"
f"Please ensure your RyzenAI installation is complete and supports {device}.\n"
"See installation instructions at:\n"
"https://github.com/lemonade-sdk/lemonade-eval#installation\n"
)

# Add the DLL source directory to PATH
current_path = os.environ.get("PATH", "")
Expand Down Expand Up @@ -543,7 +592,22 @@ def _cleanup_environment(saved_state):
os.chdir(saved_state["cwd"])
os.environ["PATH"] = saved_state["path"]

def _generate_model_for_oga(self, output_model_path, device, input_model_path):
def _generate_model_for_oga(
self,
output_model_path,
device,
input_model_path,
packed_const=False,
script_option=None,
optimize=None,
max_seq_len=None,
npu_op_version=None,
npu_basic=False,
npu_use_ep=False,
no_prune_logits=False,
dml_only=False,
cpu_only=False,
):
"""
Uses the model_generate tool to generate the model for OGA hybrid or npu targets.
"""
Expand All @@ -569,18 +633,30 @@ def _generate_model_for_oga(self, output_model_path, device, input_model_path):

try:
if device_flag == "npu":
script_opt = script_option if script_option is not None else "non_jit"
model_generate.generate_npu_model(
input_model=input_model_path,
output_dir=output_model_path,
packed_const=False,
packed_const=packed_const,
script_option=script_opt,
optimize=optimize,
max_seq_len=max_seq_len,
npu_op_version=npu_op_version,
basic=npu_basic,
use_ep=npu_use_ep,
no_prune_logits=no_prune_logits,
cpu_only=cpu_only,
)
else: # hybrid
script_opt = script_option if script_option is not None else "jit_npu"
model_generate.generate_hybrid_model(
input_model=input_model_path,
output_dir=output_model_path,
script_option="jit_npu",
mode="bf16",
dml_only=False,
script_option=script_opt,
optimize=optimize,
max_seq_len=max_seq_len,
no_prune_logits=no_prune_logits,
dml_only=dml_only,
)
except Exception as e:
raise RuntimeError(
Expand All @@ -600,6 +676,16 @@ def run(
trust_remote_code=False,
subfolder: str = None,
do_not_upgrade: bool = False,
packed_const: bool = False,
script_option: str = None,
optimize: str = None,
max_seq_len: int = None,
npu_op_version: str = None,
npu_basic: bool = False,
npu_use_ep: bool = False,
no_prune_logits: bool = False,
dml_only: bool = False,
cpu_only: bool = False,
) -> State:
from lemonade.common.network import (
custom_snapshot_download,
Expand Down Expand Up @@ -714,28 +800,23 @@ def run(
"It does not contain ONNX or safetensors files."
)
if device in ["npu", "hybrid"]:
needs_generation = False
if is_onnx_model:
if is_preoptimized_onnx:
# Use HuggingFace cache path as it is
full_model_path = input_model_path
else:
# If ONNX but not modified yet for Hybrid or NPU,
# needs further optimization
self._generate_model_for_oga(
full_model_path,
device,
input_model_path,
)
needs_generation = True
elif is_safetensors_model:
config_path = os.path.join(input_model_path, "config.json")
if os.path.exists(config_path):
with open(config_path, "r", encoding="utf-8") as f:
config = json.load(f)
if "quantization_config" in config:
# If quantized, use subprocess to generate the model
self._generate_model_for_oga(
full_model_path, device, input_model_path
)
needs_generation = True
else:
raise ValueError(
f"The safetensors model {checkpoint} is not quantized. "
Expand All @@ -750,6 +831,23 @@ def run(
raise ValueError(
f"Unsupported model type for checkpoint: {checkpoint}"
)

if needs_generation:
self._generate_model_for_oga(
full_model_path,
device,
input_model_path,
packed_const,
script_option,
optimize,
max_seq_len,
npu_op_version,
npu_basic,
npu_use_ep,
no_prune_logits,
dml_only,
cpu_only,
)
else:
if is_onnx_model:
# Use HuggingFace cache path as it is
Expand Down
8 changes: 4 additions & 4 deletions src/lemonade/tools/oga/ryzenai.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,17 @@ def get_ryzenai_version_info():

if Version(og.__version__) >= Version("0.7.0"):
oga_path = os.path.dirname(og.__file__)
if og.__version__ in ("0.9.2", "0.9.2.1"):
return "1.6.0", oga_path
if og.__version__ in ("0.11.2", "0.11.2.1"):
return "1.7.0", oga_path
else:
raise ValueError(
f"Unsupported onnxruntime-genai-directml-ryzenai version: {og.__version__}\n"
"Only RyzenAI 1.6.0 is currently supported.\n"
"Only RyzenAI 1.7.0 is currently supported.\n"
"See installation instructions at: https://github.com/lemonade-sdk/lemonade-eval#installation" # pylint: disable=line-too-long
)
else:
raise ValueError(
"Legacy RyzenAI installation detected (version < 0.7.0).\n"
"RyzenAI 1.4.0 and 1.5.0 are no longer supported. Please upgrade to 1.6.0.\n"
"RyzenAI 1.4.0, 1.5.0 and 1.6.0 are no longer supported. Please upgrade to 1.7.0.\n"
"See installation instructions at: https://github.com/lemonade-sdk/lemonade-eval#installation" # pylint: disable=line-too-long
)
61 changes: 61 additions & 0 deletions test/oga_hybrid_model_prep_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import unittest
import shutil
import os
from lemonade.state import State
import lemonade.common.test_helpers as common
from lemonade.common.build import builds_dir
from lemonade.tools.prompt import LLMPrompt
from lemonade.tools.oga.load import OgaLoad
import sys

# Truthy when running under CI (the env var value is a string when set,
# False otherwise). NOTE(review): unused in this module — presumably kept
# for consistency with sibling test modules; confirm before removing.
ci_mode = os.getenv("LEMONADE_CI_MODE", False)

# AWQ uint4-quantized checkpoint (per its name) prepared for the RyzenAI flow.
checkpoint = "amd/Llama-3.2-1B-Instruct-awq-uint4-asym-g128-bf16-lmhead"
device = "hybrid"  # target the NPU+iGPU hybrid execution path
dtype = "int4"  # quantized weight dtype passed to OgaLoad
force = False  # do not force a rebuild if a cached model exists
prompt = "Alice and Bob"  # short prompt used for the smoke generation


class Testing(unittest.TestCase):
    """End-to-end smoke test: OGA hybrid model preparation plus prompting.

    NOTE(review): relies on the module-level ``cache_dir`` that is assigned
    inside the ``__main__`` guard below, so this module is meant to be run
    as a script rather than collected by a test runner — confirm intent.
    """

    def setUp(self) -> None:
        # Wipe any previous builds so stale artifacts cannot affect the run.
        shutil.rmtree(builds_dir(cache_dir), ignore_errors=True)

    def test_001_oga_model_prep_hybrid(self):
        """Prepare a hybrid-device model via OgaLoad, then prompt it."""
        state = State(cache_dir=cache_dir, build_name="test")

        # Exercise the model-generation path (dml_only keeps the build on
        # the DirectML flow), then ask the prepared model for a completion.
        loaded = OgaLoad().run(
            state,
            input=checkpoint,
            device=device,
            dtype=dtype,
            force=force,
            dml_only=True,
        )
        final = LLMPrompt().run(loaded, prompt=prompt, max_new_tokens=10)

        # A non-empty response proves the prepared model actually generated.
        assert len(final.response) > 0, final.response


if __name__ == "__main__":
    # Create a dedicated cache directory for this test module; the tests
    # above read the resulting module-level ``cache_dir``.
    cache_dir, _ = common.create_test_dir(
        "lemonade_oga_hybrid_model_prep_api", base_dir=os.path.abspath(".")
    )

    # Build and execute the suite explicitly so we control the exit code.
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    suite.addTests(loader.loadTestsFromTestCase(Testing))
    result = unittest.TextTestRunner().run(suite)

    # Propagate failure to the caller via a non-zero process exit code.
    if not result.wasSuccessful():
        sys.exit(1)

# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD