4 changes: 4 additions & 0 deletions runtime/onert/api/python/package/common/basesession.py
@@ -90,6 +90,8 @@ def set_inputs(self, size, inputs_array=[]):
raise ValueError(
"Session is not initialized with a model. Please compile with a model before setting inputs."
)

self.inputs = []
for i in range(size):
input_tensorinfo = self.session.input_tensorinfo(i)

@@ -115,6 +117,8 @@ def set_outputs(self, size):
raise ValueError(
"Session is not initialized with a model. Please compile a model before setting outputs."
)

self.outputs = []
for i in range(size):
output_tensorinfo = self.session.output_tensorinfo(i)
output_array = np.zeros((num_elems(output_tensorinfo)),
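Note on the two added lines: `set_inputs()` and `set_outputs()` build `self.inputs`/`self.outputs` one buffer per tensor inside the loop, so a session reused across inference cycles would otherwise keep growing those lists. A minimal sketch of the failure mode this fixes, assuming the lists are populated with append() in the per-tensor loop as the surrounding context suggests:

# Simplified model of the pre-fix behavior (hypothetical buffers, not the real API):
inputs = []                       # lived across calls on the same session
for call in range(2):             # two inference cycles
    for i in range(3):            # three model inputs per cycle
        inputs.append(f"buf_{call}_{i}")
print(len(inputs))                # 6 -- the second cycle sees 3 stale buffers
# With the added reset (inputs = [] at the top of each cycle) the length stays 3.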
45 changes: 37 additions & 8 deletions runtime/onert/api/python/package/infer/session.py
@@ -1,5 +1,7 @@
from typing import List, Any
from typing import List, Union, Tuple, Dict
import numpy as np
import time
from contextlib import contextmanager

from ..native.libnnfw_api_pybind import infer, tensorinfo
from ..common.basesession import BaseSession
@@ -57,7 +59,12 @@ def update_inputs_tensorinfo(self, new_infos: List[tensorinfo]) -> None:
f"{info.dims[:info.rank]}")
self.session.set_input_tensorinfo(i, info)

def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:
def infer(
self,
inputs_array: List[np.ndarray],
*,
measure: bool = False
) -> Union[List[np.ndarray], Tuple[List[np.ndarray], Dict[str, float]]]:
"""
Run a complete inference cycle:
- If the session has not been prepared or outputs have not been set, call prepare() and set_outputs().
@@ -72,15 +79,22 @@ def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:

Args:
inputs_array (list[np.ndarray]): List of numpy arrays representing the input data.
measure (bool): If True, measure prepare/io/run latencies (ms).

Returns:
list[np.ndarray]: A list containing the output numpy arrays.
OR
(outputs, metrics): Tuple where metrics is a dict with keys
'prepare_time_ms', 'io_time_ms', 'run_time_ms'
"""
metrics: Dict[str, float] = {}

# Check if the session is prepared. If not, call prepare() and set_outputs() once.
if not self._prepared:
self.session.prepare()
self.set_outputs(self.session.output_size())
self._prepared = True
with self._time_block(metrics, 'prepare_time_ms', measure):
self.session.prepare()
self.set_outputs(self.session.output_size())
self._prepared = True

# Verify that the number of provided inputs matches the session's expected input count.
expected_input_size: int = self.session.input_size()
@@ -90,8 +104,23 @@ def infer(self, inputs_array: List[np.ndarray]) -> List[np.ndarray]:
)

# Configure input buffers using the current session's input size and provided data.
self.set_inputs(expected_input_size, inputs_array)
with self._time_block(metrics, 'io_time_ms', measure):
self.set_inputs(expected_input_size, inputs_array)

# Execute the inference.
self.session.run()
with self._time_block(metrics, 'run_time_ms', measure):
self.session.run()

# TODO: Support dynamic shapes for outputs.

# Return the output buffers.
return self.outputs
return (self.outputs.copy(), metrics) if measure else self.outputs.copy()

@contextmanager
def _time_block(self, metrics: Dict[str, float], key: str, measure: bool):
    if measure:
start = time.perf_counter()
yield
metrics[key] = (time.perf_counter() - start) * 1000
else:
yield
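A minimal usage sketch of the new `measure` flag; the model path and input construction below are placeholders for illustration, not part of this PR:

import numpy as np
from onert import infer

sess = infer.session(path="model.nnpackage", backends="cpu")  # placeholder path
infos = sess.get_inputs_tensorinfo()
inputs = [np.random.rand(*info.dims[:info.rank]).astype(info.dtype) for info in infos]

outputs = sess.infer(inputs)                     # unchanged behavior: outputs only
outputs, metrics = sess.infer(inputs, measure=True)
# 'prepare_time_ms' appears only on the call that actually runs prepare();
# subsequent calls report just 'io_time_ms' and 'run_time_ms'.
print(metrics)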
128 changes: 128 additions & 0 deletions runtime/onert/sample/minimal-python/src/inference_benchmark.py
@@ -0,0 +1,128 @@
import argparse
import numpy as np
import psutil
import os
from typing import List
from onert import infer
# TODO: Import tensorinfo from onert
from onert.native.libnnfw_api_pybind import tensorinfo


def get_memory_usage_mb() -> float:
"""Get current process memory usage in MB."""
process = psutil.Process(os.getpid())
return process.memory_info().rss / (1024 * 1024)


def parse_shapes(shape_strs: List[str]) -> List[List[int]]:
shapes = []
for s in shape_strs:
try:
shapes.append([int(dim) for dim in s.strip().split(",")])
except Exception:
raise ValueError(f"Invalid shape string: '{s}' (expected: 1,224,224,3 ...)")
return shapes


def apply_static_shapes(sess: infer.session,
static_shapes: List[List[int]]) -> List[tensorinfo]:
original_infos = sess.get_inputs_tensorinfo()
if len(static_shapes) != len(original_infos):
raise ValueError(
f"Input count mismatch: model expects {len(original_infos)} inputs, but got {len(static_shapes)} shapes"
)

updated_infos: List[tensorinfo] = []

for i, info in enumerate(original_infos):
shape = static_shapes[i]
if info.rank != len(shape):
raise ValueError(
f"Rank mismatch for input {i}: expected rank {info.rank}, got {len(shape)}"
)
info.dims = shape
info.rank = len(shape)
updated_infos.append(info)

sess.update_inputs_tensorinfo(updated_infos)
return updated_infos


def benchmark_inference(nnpackage_path: str, backends: str, input_shapes: List[List[int]],
repeat: int):
mem_before_kb = get_memory_usage_mb() * 1024

sess = infer.session(path=nnpackage_path, backends=backends)
model_load_kb = get_memory_usage_mb() * 1024 - mem_before_kb

input_infos = apply_static_shapes(
sess, input_shapes) if input_shapes else sess.get_inputs_tensorinfo()

# Create dummy input arrays
dummy_inputs = []
for info in input_infos:
shape = tuple(info.dims[:info.rank])
dummy_inputs.append(np.random.rand(*shape).astype(info.dtype))

prepare = total_io = total_run = 0.0

# Warmup runs
prepare_kb = 0
for _ in range(3):
outputs, metrics = sess.infer(dummy_inputs, measure=True)
del outputs
if "prepare_time_ms" in metrics:
prepare = metrics["prepare_time_ms"]
prepare_kb = get_memory_usage_mb() * 1024 - mem_before_kb

# Benchmark runs
for _ in range(repeat):
outputs, metrics = sess.infer(dummy_inputs, measure=True)
del outputs
total_io += metrics["io_time_ms"]
total_run += metrics["run_time_ms"]

execute_kb = get_memory_usage_mb() * 1024 - mem_before_kb  # delta from baseline, consistent with MODEL_LOAD and PREPARE

print("======= Inference Benchmark =======")
print(f"- Warmup runs : 3")
print(f"- Measured runs : {repeat}")
print(f"- Prepare : {prepare:.3f} ms")
print(f"- Avg I/O : {total_io / repeat:.3f} ms")
print(f"- Avg Run : {total_run / repeat:.3f} ms")
print("===================================")
print("RSS")
print(f"- MODEL_LOAD : {model_load_kb:.0f} KB")
print(f"- PREPARE : {prepare_kb:.0f} KB")
print(f"- EXECUTE : {execute_kb:.0f} KB")
print(f"- PEAK : {max(model_load_kb, prepare_kb, execute_kb):.0f} KB")
print("===================================")


# TODO: Support dynamic(on-the-fly) shape
def main():
parser = argparse.ArgumentParser(description="ONERT Inference Benchmark")
parser.add_argument("nnpackage", type=str, help="Path to .nnpackage directory")
parser.add_argument("--backends",
type=str,
default="cpu",
help="Backends to use (default: cpu)")
parser.add_argument("--input-shape",
nargs="+",
help="Input shapes for each input (e.g. 1,224,224,3 1,10)")
parser.add_argument("--repeat",
type=int,
default=5,
help="Number of measured inference repetitions")

args = parser.parse_args()
shapes = parse_shapes(args.input_shape) if args.input_shape else None

benchmark_inference(nnpackage_path=args.nnpackage,
backends=args.backends,
input_shapes=shapes,
repeat=args.repeat)


if __name__ == "__main__":
main()
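For reference, a hedged example of driving the benchmark; the nnpackage path and shape are placeholders:

# Shell equivalent:
#   python inference_benchmark.py model.nnpackage --backends cpu \
#       --input-shape 1,224,224,3 --repeat 10
from inference_benchmark import benchmark_inference

benchmark_inference(nnpackage_path="model.nnpackage",  # placeholder path
                    backends="cpu",
                    input_shapes=[[1, 224, 224, 3]],
                    repeat=10)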