4 changes: 4 additions & 0 deletions runtime/onert/api/python/package/common/basesession.py
@@ -90,6 +90,8 @@ def set_inputs(self, size, inputs_array=[]):
             raise ValueError(
                 "Session is not initialized with a model. Please compile with a model before setting inputs."
             )
+
+        self.inputs = []
         for i in range(size):
             input_tensorinfo = self.session.input_tensorinfo(i)

@@ -115,6 +117,8 @@ def set_outputs(self, size):
             raise ValueError(
                 "Session is not initialized with a model. Please compile a model before setting outputs."
             )
+
+        self.outputs = []
         for i in range(size):
             output_tensorinfo = self.session.output_tensorinfo(i)
             output_array = np.zeros((num_elems(output_tensorinfo)),
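The two new list resets keep a reused session from accumulating stale buffers: infer() repopulates self.inputs on every call, so without the reset a second call would append to the buffers left by the first. A minimal sketch of the reuse pattern this protects (model path and input shape are hypothetical, and the append-per-tensor behavior is assumed from the surrounding loop):

    from onert import infer
    import numpy as np

    sess = infer.session(path="model.nnpackage", backends="cpu")
    x = np.zeros((1, 224, 224, 3), dtype=np.float32)

    first = sess.infer([x])   # populates self.inputs/self.outputs
    second = sess.infer([x])  # the reset keeps len(self.inputs) == input_size
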
45 changes: 37 additions & 8 deletions runtime/onert/api/python/package/infer/session.py
@@ -1,5 +1,7 @@
-from typing import List, Any
+from typing import List, Union, Tuple, Dict
 import numpy as np
+import time
+from contextlib import contextmanager
 
 from ..native.libnnfw_api_pybind import infer, tensorinfo
 from ..common.basesession import BaseSession
@@ -57,7 +59,12 @@ def update_inputs_tensorinfo(self, new_infos: List[tensorinfo]) -> None:
f"{info.dims[:info.rank]}")
self.session.set_input_tensorinfo(i, info)

def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:
def infer(
self,
inputs_array: List[np.ndarray],
*,
measure: bool = False
) -> Union[List[np.ndarray], Tuple[List[np.ndarray], Dict[str, float]]]:
"""
Run a complete inference cycle:
- If the session has not been prepared or outputs have not been set, call prepare() and set_outputs().
@@ -72,15 +79,22 @@ def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:

         Args:
             inputs_array (list[np.ndarray]): List of numpy arrays representing the input data.
+            measure (bool): If True, measure prepare/io/run latencies (ms).
 
         Returns:
             list[np.ndarray]: A list containing the output numpy arrays.
+            OR
+            (outputs, metrics): Tuple where metrics is a dict with keys
+            'prepare_time_ms', 'io_time_ms', 'run_time_ms'
         """
+        metrics: Dict[str, float] = {}
+
         # Check if the session is prepared. If not, call prepare() and set_outputs() once.
         if not self._prepared:
-            self.session.prepare()
-            self.set_outputs(self.session.output_size())
-            self._prepared = True
+            with self._time_block(metrics, 'prepare_time_ms', measure):
+                self.session.prepare()
+                self.set_outputs(self.session.output_size())
+                self._prepared = True
 
         # Verify that the number of provided inputs matches the session's expected input count.
         expected_input_size: int = self.session.input_size()
@@ -90,8 +104,23 @@ def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:
             )
 
         # Configure input buffers using the current session's input size and provided data.
-        self.set_inputs(expected_input_size, inputs_array)
+        with self._time_block(metrics, 'io_time_ms', measure):
+            self.set_inputs(expected_input_size, inputs_array)
 
         # Execute the inference.
-        self.session.run()
+        with self._time_block(metrics, 'run_time_ms', measure):
+            self.session.run()
+
+        # TODO: Support dynamic shapes for outputs.
 
         # Return the output buffers.
-        return self.outputs
+        return (self.outputs.copy(), metrics) if measure else self.outputs.copy()
+
+    @contextmanager
+    def _time_block(self, metrics: Dict[str, float], key: str, measure: bool):
+        if measure:
+            start = time.perf_counter()
+            yield
+            metrics[key] = (time.perf_counter() - start) * 1000
+        else:
+            yield
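
With the keyword-only flag, callers opt into timing without changing the default return type. A usage sketch (model path and input shape are hypothetical):

    from onert import infer
    import numpy as np

    sess = infer.session(path="mobilenet.nnpackage", backends="cpu")
    x = np.random.rand(1, 224, 224, 3).astype(np.float32)

    outputs = sess.infer([x])                        # default: outputs only
    outputs, metrics = sess.infer([x], measure=True)

    # 'prepare_time_ms' is present only on the call that actually prepared the
    # session; 'io_time_ms' and 'run_time_ms' appear on every measured call.
    print(metrics["io_time_ms"], metrics["run_time_ms"])
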
128 changes: 128 additions & 0 deletions runtime/onert/sample/minimal-python/src/inference_benchmark.py
@@ -0,0 +1,128 @@
import argparse
import numpy as np
import psutil
import os
from typing import List
from onert import infer
# TODO: Import tensorinfo from onert
from onert.native.libnnfw_api_pybind import tensorinfo


def get_memory_usage_mb() -> float:
    """Get current process memory usage in MB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024)


def parse_shapes(shape_strs: List[str]) -> List[List[int]]:
    shapes = []
    for s in shape_strs:
        try:
            shapes.append([int(dim) for dim in s.strip().split(",")])
        except Exception:
            raise ValueError(f"Invalid shape string: '{s}' (expected: 1,224,224,3 ...)")
    return shapes


def apply_static_shapes(sess: infer.session,
                        static_shapes: List[List[int]]) -> List[tensorinfo]:
    original_infos = sess.get_inputs_tensorinfo()
    if len(static_shapes) != len(original_infos):
        raise ValueError(
            f"Input count mismatch: model expects {len(original_infos)} inputs, but got {len(static_shapes)} shapes"
        )

    updated_infos: List[tensorinfo] = []

    for i, info in enumerate(original_infos):
        shape = static_shapes[i]
        if info.rank != len(shape):
            raise ValueError(
                f"Rank mismatch for input {i}: expected rank {info.rank}, got {len(shape)}"
            )
        info.dims = shape
        info.rank = len(shape)
        updated_infos.append(info)

    sess.update_inputs_tensorinfo(updated_infos)
    return updated_infos


def benchmark_inference(nnpackage_path: str, backends: str, input_shapes: List[List[int]],
                        repeat: int):
    mem_before_kb = get_memory_usage_mb() * 1024

    sess = infer.session(path=nnpackage_path, backends=backends)
    model_load_kb = get_memory_usage_mb() * 1024 - mem_before_kb

    input_infos = apply_static_shapes(
        sess, input_shapes) if input_shapes else sess.get_inputs_tensorinfo()

    # Create dummy input arrays
    dummy_inputs = []
    for info in input_infos:
        shape = tuple(info.dims[:info.rank])
        dummy_inputs.append(np.random.rand(*shape).astype(info.dtype))

    prepare = total_io = total_run = 0.0

    # Warmup runs
    prepare_kb = 0
    for _ in range(3):
        outputs, metrics = sess.infer(dummy_inputs, measure=True)
        del outputs
        if "prepare_time_ms" in metrics:
            prepare = metrics["prepare_time_ms"]
            prepare_kb = get_memory_usage_mb() * 1024 - mem_before_kb

    # Benchmark runs
    for _ in range(repeat):
        outputs, metrics = sess.infer(dummy_inputs, measure=True)
        del outputs
        total_io += metrics["io_time_ms"]
        total_run += metrics["run_time_ms"]

    execute_kb = get_memory_usage_mb() * 1024 - mem_before_kb

    print("======= Inference Benchmark =======")
    print("- Warmup runs   : 3")
    print(f"- Measured runs : {repeat}")
    print(f"- Prepare       : {prepare:.3f} ms")
    print(f"- Avg I/O       : {total_io / repeat:.3f} ms")
    print(f"- Avg Run       : {total_run / repeat:.3f} ms")
    print("===================================")
    print("RSS")
    print(f"- MODEL_LOAD : {model_load_kb:.0f} KB")
    print(f"- PREPARE    : {prepare_kb:.0f} KB")
    print(f"- EXECUTE    : {execute_kb:.0f} KB")
    print(f"- PEAK       : {max(model_load_kb, prepare_kb, execute_kb):.0f} KB")
    print("===================================")


# TODO: Support dynamic (on-the-fly) shapes
def main():
    parser = argparse.ArgumentParser(description="ONERT Inference Benchmark")
    parser.add_argument("nnpackage", type=str, help="Path to .nnpackage directory")
    parser.add_argument("--backends",
                        type=str,
                        default="cpu",
                        help="Backends to use (default: cpu)")
    parser.add_argument("--input-shape",
                        nargs="+",
                        help="Input shapes for each input (e.g. 1,224,224,3 1,10)")
    parser.add_argument("--repeat",
                        type=int,
                        default=5,
                        help="Number of measured inference repetitions")

    args = parser.parse_args()
    shapes = parse_shapes(args.input_shape) if args.input_shape else None

    benchmark_inference(nnpackage_path=args.nnpackage,
                        backends=args.backends,
                        input_shapes=shapes,
                        repeat=args.repeat)


if __name__ == "__main__":
    main()
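
From the command line the script would be invoked as, e.g., python3 inference_benchmark.py /path/to/model.nnpackage --backends cpu --input-shape 1,224,224,3 --repeat 10 (path and shape illustrative). The same run can be driven from Python; this sketch assumes the script is importable as a module:

    from inference_benchmark import benchmark_inference

    # Hypothetical nnpackage path; the shape list pins each input before prepare().
    benchmark_inference(nnpackage_path="/path/to/model.nnpackage",
                        backends="cpu",
                        input_shapes=[[1, 224, 224, 3]],
                        repeat=10)
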