Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 33 additions & 18 deletions python/utils/hostruntime/xrtruntime/hostruntime.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ def run(
kernel_handle.insts,
flags=pyxrt.bo.cacheable,
group_id=kernel_handle.kernel.group_id(1),
xrt_device=self._device,
).buffer_object()

start = time.time_ns()
Expand Down Expand Up @@ -330,19 +331,21 @@ def __init__(self, kernel, xclbin, context, insts, insts_bo=None):

def invalidate(self):
    """
    Invalidate the handle and release resources in dependency order.

    Marks the handle as invalid and then drops every resource reference it
    still holds. Attributes that are already missing are skipped, so calling
    this method again on an invalidated handle does nothing further.
    """
    self._is_valid = False
    # Instruction BOs and kernels depend on the hardware context. Those must
    # be released before dropping the handle's context reference.
    for attr in ("insts_bo", "kernel", "context", "xclbin", "insts"):
        if hasattr(self, attr):
            delattr(self, attr)


class CachedXRTRuntime(XRTHostRuntime):
Expand Down Expand Up @@ -391,14 +394,21 @@ def __init__(self):

def cleanup(self):
    """
    Clean up cached XRT resources in dependency order.

    Evicts every entry of the instruction cache first, then every entry of
    the context cache, and finally forces a garbage-collection pass.
    """
    # Instruction BOs are evicted before the contexts so that no cached
    # instruction BO is still held when a context is released.
    while self._insts_cache:
        self._evict_insts()
    while self._context_cache:
        self._evict()
    gc.collect()  # Make sure contexts are garbage collected.

def _cleanup_entry_insts(self, entry):
    """
    Release the cached instruction BOs registered under a context entry.

    Args:
        entry (dict): Context cache entry. Its optional "insts_keys"
            collection names the instruction-cache entries to release.
    """
    # Work on a snapshot of the keys: each key is handed to
    # _cleanup_insts_entry, so the live collection is never iterated while
    # cleanup is in progress.
    owned_keys = list(entry.get("insts_keys", ()))
    for key in owned_keys:
        cached = self._insts_cache.pop(key, None)
        if cached is None:
            # Not present in the instruction cache; nothing to release.
            continue
        self._cleanup_insts_entry(key, cached)

def _cleanup_entry(self, entry):
handles = entry["handles"]

Expand All @@ -408,6 +418,8 @@ def _cleanup_entry(self, entry):
if handle:
handle.invalidate()

self._cleanup_entry_insts(entry)

# Clear kernel cache so pyxrt.kernel objects are released with the context
entry["kernels"].clear()

Expand All @@ -422,14 +434,18 @@ def _evict(self):
# Pop the oldest item
key, entry = self._context_cache.popitem(last=False)
self._cleanup_entry(entry)
gc.collect()

def _cleanup_insts_entry(self, insts_key, entry):
    """
    Release the instruction BO held by an instruction-cache entry.

    Args:
        insts_key: Key the entry was cached under. When the entry records an
            owner, the key is discarded from that owner's "insts_keys" set.
        entry (dict): Instruction-cache entry holding "insts_bo" and,
            optionally, "owner_entry".

    Raises:
        KeyError: If the entry has no "insts_bo" key.
    """
    owner_entry = entry.get("owner_entry")
    if owner_entry is not None:
        owner_entry.get("insts_keys", set()).discard(insts_key)
    # Delete the key (not a local copy) so the refcount drops here.
    del entry["insts_bo"]

def _evict_insts(self):
    """
    Evict one entry from the front of the instruction cache.

    Pops the entry with ``popitem(last=False)`` and releases it exactly once,
    passing the cache key along with the entry.

    Raises:
        KeyError: If the instruction cache is empty.
    """
    key, entry = self._insts_cache.popitem(last=False)
    self._cleanup_insts_entry(key, entry)

def run(
self,
Expand Down Expand Up @@ -533,15 +549,10 @@ def load(
try:
context = pyxrt.hw_context(self._device, xclbin_uuid)
except RuntimeError as e:
# If we hit a resource limit (err=-2 usually means EMFILE/ENFILE or similar resource exhaustion)
# and we have items in the cache, try evicting.
if (
"No such file or directory" in str(e)
and self._context_cache
and retries < max_retries
):
# Context-slot exhaustion is reported differently across XRT backends.
# Evict cached contexts and retry, but only while cached entries remain.
if self._context_cache and retries < max_retries:
self._evict()
gc.collect() # Make sure contexts are garbage collected.
retries += 1
else:
raise e
Expand All @@ -551,6 +562,7 @@ def load(
"xclbin": xclbin,
"kernels": {}, # kernel_name -> pyxrt.kernel (strong ref, tied to context)
"handles": [],
"insts_keys": set(),
"uuid": xclbin_uuid,
}
self._context_cache[context_key] = entry
Expand Down Expand Up @@ -599,12 +611,15 @@ def load(
insts,
flags=pyxrt.bo.cacheable,
group_id=group_id,
xrt_device=self._device,
).buffer_object()

insts_entry = {
"insts_bo": insts_bo,
"owner_entry": entry,
}
self._insts_cache[insts_key] = insts_entry
entry["insts_keys"].add(insts_key)

kernel_handle = CachedXRTKernelHandle(
kernel, xclbin, context, insts, insts_bo
Expand Down
8 changes: 5 additions & 3 deletions python/utils/hostruntime/xrtruntime/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(
device="npu",
flags=xrt.bo.host_only,
group_id=0,
xrt_device=None,
):
"""
Initialize the XRTTensor.
Expand All @@ -41,10 +42,11 @@ def __init__(
device (str, optional): Device string identifier. Defaults to 'npu'.
flags (optional): XRT buffer object flags. Defaults to xrt.bo.host_only.
group_id (int, optional): XRT buffer object group ID. Defaults to 0.
xrt_device (optional): Existing PyXRT device handle to use for BO allocation.
When omitted, a new handle for device index 0 is opened for this tensor.
"""
super().__init__(shape_or_data, dtype=dtype, device=device)
device_index = 0
self.xrt_device = xrt.device(device_index)
self.xrt_device = xrt_device if xrt_device is not None else xrt.device(0)

# Extract the shape
if isinstance(shape_or_data, tuple):
Expand All @@ -58,7 +60,7 @@ def __init__(
else:
# TODO(efficiency): Extra data copy here (when necessary)
# so we can borrow verification of array-like things from numpy.
np_data = np.array(shape_or_data, dtype=dtype, copy=False)
np_data = np.asarray(shape_or_data, dtype=dtype)
self._shape = np_data.shape

# Ideally, we use xrt::ext::bo host-only BO but there are no bindings for that currently.
Expand Down
169 changes: 169 additions & 0 deletions test/npu-xrt/xrt_handle_lifetime/aie.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2022-2026 Advanced Micro Devices, Inc. or its affiliates
// Copyright (C) 2020-2022, Xilinx Inc.
//
//===----------------------------------------------------------------------===//

// Test design: 64 i32 values travel from the host through the shim tile (0, 0)
// and the mem tile (0, 1) to the core on tile (0, 2), which adds 1 to every
// element; the results return along the reverse path.
module {
// NPUDEVICE is a placeholder; the lit RUN lines of this test substitute a
// concrete device name (npu1_1col or npu2_1col) with sed before compiling.
aie.device(NPUDEVICE) {
%tile_0_0 = aie.tile(0, 0)
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)

// Two input and two output buffers of 8 x i32 on tile (0, 2); the core and the
// tile DMA alternate between buffer 0 and buffer 1 of each pair.
%objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi32>
%objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi32>
%objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<8xi32>
%objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<8xi32>

// Producer/consumer lock pairs for the buffers above. Producer locks start at 2
// and consumer locks at 0.
%objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"}
%objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"}
%objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"}
%objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"}

// Routing: (0,0) -> (0,1) -> (0,2) for input, (0,2) -> (0,1) -> (0,0) for output.
aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0)
aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1)

// Core program: the outer loop runs 4 times (0 to 8, step 2) and each iteration
// processes buffer 0 and then buffer 1, i.e. 4 * 2 * 8 = 64 elements in total.
%core_0_2 = aie.core(%tile_0_2) {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c1_i32 = arith.constant 1 : i32
%c2 = arith.constant 2 : index
scf.for %arg0 = %c0 to %c8 step %c2 {
// Wait for a filled input buffer and a free output buffer (buffer 0 of each pair).
aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1)
aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1)

// out[i] = in[i] + 1 for the 8 elements of buffer 0.
scf.for %arg1 = %c0 to %c8 step %c1 {
%0 = memref.load %objFifo_in1_cons_buff_0[%arg1] : memref<8xi32>
%1 = arith.addi %0, %c1_i32 : i32
memref.store %1, %objFifo_out1_buff_0[%arg1] : memref<8xi32>
}

// Hand the input buffer back to its producer and publish the output buffer.
aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1)
aie.use_lock(%objFifo_out1_cons_lock, Release, 1)

// Same acquire / compute / release sequence for buffer 1 of each pair.
aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1)
aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1)

scf.for %arg1 = %c0 to %c8 step %c1 {
%0 = memref.load %objFifo_in1_cons_buff_1[%arg1] : memref<8xi32>
%1 = arith.addi %0, %c1_i32 : i32
memref.store %1, %objFifo_out1_buff_1[%arg1] : memref<8xi32>
}

aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1)
aie.use_lock(%objFifo_out1_cons_lock, Release, 1)
}
aie.end
}

// Shim DMA channel used for host-to-device input (MM2S channel 0 on tile (0, 0)).
aie.shim_dma_allocation @objFifo_in0 (%tile_0_0, MM2S, 0)

// Host-side sequence: send 64 i32 from %arg0 through @objFifo_in0, receive 64 i32
// into %arg2 through @objFifo_out0, then wait for the output transfer to finish.
// %arg1 is not referenced by this sequence.
aie.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
%c0_i64 = arith.constant 0 : i64
%c1_i64 = arith.constant 1 : i64
%c64_i64 = arith.constant 64 : i64
aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32>
aiex.npu.dma_wait {symbol = @objFifo_out0}
}

// Mem tile (0, 1) DMA program. It stages data in 16 x i32 buffers in both
// directions, using four channels that each cycle over a pair of buffers.
%memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
%objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi32>
%objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi32>
%objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_0"} : memref<16xi32>
%objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_1"} : memref<16xi32>

%objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"}
%objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"}
%objFifo_out0_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out0_prod_lock"}
%objFifo_out0_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_lock"}
// S2MM channel 0: receive input from the shim tile into the in0 buffers.
%0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
^bb1: // 2 preds: ^bb0, ^bb2
aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16)
aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1)
aie.next_bd ^bb2
^bb2: // pred: ^bb1
aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16)
aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1)
aie.next_bd ^bb1
^bb3: // pred: ^bb0
// MM2S channel 0: forward the in0 buffers towards tile (0, 2).
%1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
^bb4: // 2 preds: ^bb3, ^bb5
aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16)
aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1)
aie.next_bd ^bb5
^bb5: // pred: ^bb4
aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16)
aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1)
aie.next_bd ^bb4
^bb6: // pred: ^bb3
// MM2S channel 1: send the out0 buffers back to the shim tile.
%2 = aie.dma_start(MM2S, 1, ^bb7, ^bb9)
^bb7: // 2 preds: ^bb6, ^bb8
aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>, 0, 16)
aie.use_lock(%objFifo_out0_prod_lock, Release, 1)
aie.next_bd ^bb8
^bb8: // pred: ^bb7
aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>, 0, 16)
aie.use_lock(%objFifo_out0_prod_lock, Release, 1)
aie.next_bd ^bb7
^bb9: // pred: ^bb6
// S2MM channel 1: receive results from tile (0, 2) into the out0 buffers.
%3 = aie.dma_start(S2MM, 1, ^bb10, ^bb12)
^bb10: // 2 preds: ^bb9, ^bb11
aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>, 0, 16)
aie.use_lock(%objFifo_out0_cons_lock, Release, 1)
aie.next_bd ^bb11
^bb11: // pred: ^bb10
aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>, 0, 16)
aie.use_lock(%objFifo_out0_cons_lock, Release, 1)
aie.next_bd ^bb10
^bb12: // pred: ^bb9
aie.end
}

// Shim DMA channel used for device-to-host output (S2MM channel 0 on tile (0, 0)).
aie.shim_dma_allocation @objFifo_out0 (%tile_0_0, S2MM, 0)

// Tile (0, 2) DMA program: S2MM channel 0 fills the core's input buffers and
// MM2S channel 0 drains its output buffers, 8 x i32 per buffer descriptor.
%mem_0_2 = aie.mem(%tile_0_2) {
%0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
^bb1: // 2 preds: ^bb0, ^bb2
aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<8xi32>, 0, 8)
aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1)
aie.next_bd ^bb2
^bb2: // pred: ^bb1
aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<8xi32>, 0, 8)
aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1)
aie.next_bd ^bb1
^bb3: // pred: ^bb0
%1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
^bb4: // 2 preds: ^bb3, ^bb5
aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_out1_buff_0 : memref<8xi32>, 0, 8)
aie.use_lock(%objFifo_out1_prod_lock, Release, 1)
aie.next_bd ^bb5
^bb5: // pred: ^bb4
aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%objFifo_out1_buff_1 : memref<8xi32>, 0, 8)
aie.use_lock(%objFifo_out1_prod_lock, Release, 1)
aie.next_bd ^bb4
^bb6: // pred: ^bb3
aie.end
}
}
}
12 changes: 12 additions & 0 deletions test/npu-xrt/xrt_handle_lifetime/run_ordered.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// (c) Copyright 2026 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
//
// RUN: cp %S/aie.mlir aie_arch.mlir
// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode ordered
// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode ordered
13 changes: 13 additions & 0 deletions test/npu-xrt/xrt_handle_lifetime/run_stale_instr_bo.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// (c) Copyright 2026 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
// XFAIL: system-windows
//
// RUN: cp %S/aie.mlir aie_arch.mlir
// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-instr-bo
// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-instr-bo
12 changes: 12 additions & 0 deletions test/npu-xrt/xrt_handle_lifetime/run_stale_io_bos.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// (c) Copyright 2026 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
//
// RUN: cp %S/aie.mlir aie_arch.mlir
// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-io-bos
// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-io-bos
Loading
Loading