diff --git a/python/utils/hostruntime/xrtruntime/hostruntime.py b/python/utils/hostruntime/xrtruntime/hostruntime.py index 8266acd1a7a..f06887612bb 100644 --- a/python/utils/hostruntime/xrtruntime/hostruntime.py +++ b/python/utils/hostruntime/xrtruntime/hostruntime.py @@ -268,6 +268,7 @@ def run( kernel_handle.insts, flags=pyxrt.bo.cacheable, group_id=kernel_handle.kernel.group_id(1), + xrt_device=self._device, ).buffer_object() start = time.time_ns() @@ -330,19 +331,21 @@ def __init__(self, kernel, xclbin, context, insts, insts_bo=None): def invalidate(self): """ - Invalidate the handle and release resources. + Invalidate the handle and release resources in dependency order. """ self._is_valid = False - if hasattr(self, "context"): - del self.context + # Instruction BOs and kernels depend on the hardware context. Those must + # be released before dropping the handle's context reference. + if hasattr(self, "insts_bo"): + del self.insts_bo if hasattr(self, "kernel"): del self.kernel + if hasattr(self, "context"): + del self.context if hasattr(self, "xclbin"): del self.xclbin if hasattr(self, "insts"): del self.insts - if hasattr(self, "insts_bo"): - del self.insts_bo class CachedXRTRuntime(XRTHostRuntime): @@ -391,14 +394,21 @@ def __init__(self): def cleanup(self): """ - Clean up the cache by evicting all entries. + Clean up cached XRT resources in dependency order. """ - while self._context_cache: - self._evict() while self._insts_cache: self._evict_insts() + while self._context_cache: + self._evict() gc.collect() # Make sure contexts are garbage collected. + def _cleanup_entry_insts(self, entry): + """Release instruction BOs owned by a cached context entry.""" + for insts_key in list(entry.get("insts_keys", ())): + insts_entry = self._insts_cache.pop(insts_key, None) + if insts_entry is not None: + self._cleanup_insts_entry(insts_key, insts_entry) + def _cleanup_entry(self, entry): handles = entry["handles"] @@ -408,6 +418,8 @@ def _cleanup_entry(self, entry): if handle: handle.invalidate() + self._cleanup_entry_insts(entry) + # Clear kernel cache so pyxrt.kernel objects are released with the context entry["kernels"].clear() @@ -422,14 +434,18 @@ def _evict(self): # Pop the oldest item key, entry = self._context_cache.popitem(last=False) self._cleanup_entry(entry) + gc.collect() - def _cleanup_insts_entry(self, entry): + def _cleanup_insts_entry(self, insts_key, entry): + owner_entry = entry.get("owner_entry") + if owner_entry is not None: + owner_entry.get("insts_keys", set()).discard(insts_key) # Delete the key (not a local copy) so the refcount drops here. del entry["insts_bo"] def _evict_insts(self): key, entry = self._insts_cache.popitem(last=False) - self._cleanup_insts_entry(entry) + self._cleanup_insts_entry(key, entry) def run( self, @@ -533,15 +549,10 @@ def load( try: context = pyxrt.hw_context(self._device, xclbin_uuid) except RuntimeError as e: - # If we hit a resource limit (err=-2 usually means EMFILE/ENFILE or similar resource exhaustion) - # and we have items in the cache, try evicting. - if ( - "No such file or directory" in str(e) - and self._context_cache - and retries < max_retries - ): + # Context-slot exhaustion is reported differently across XRT backends. + # Evict cached contexts and retry, but only while cached entries remain. + if self._context_cache and retries < max_retries: self._evict() - gc.collect() # Make sure contexts are garbage collected. retries += 1 else: raise e @@ -551,6 +562,7 @@ def load( "xclbin": xclbin, "kernels": {}, # kernel_name -> pyxrt.kernel (strong ref, tied to context) "handles": [], + "insts_keys": set(), "uuid": xclbin_uuid, } self._context_cache[context_key] = entry @@ -599,12 +611,15 @@ def load( insts, flags=pyxrt.bo.cacheable, group_id=group_id, + xrt_device=self._device, ).buffer_object() insts_entry = { "insts_bo": insts_bo, + "owner_entry": entry, } self._insts_cache[insts_key] = insts_entry + entry["insts_keys"].add(insts_key) kernel_handle = CachedXRTKernelHandle( kernel, xclbin, context, insts, insts_bo diff --git a/python/utils/hostruntime/xrtruntime/tensor.py b/python/utils/hostruntime/xrtruntime/tensor.py index 3c803a613f0..f5613a8f271 100644 --- a/python/utils/hostruntime/xrtruntime/tensor.py +++ b/python/utils/hostruntime/xrtruntime/tensor.py @@ -29,6 +29,7 @@ def __init__( device="npu", flags=xrt.bo.host_only, group_id=0, + xrt_device=None, ): """ Initialize the XRTTensor. @@ -41,10 +42,11 @@ def __init__( device (str, optional): Device string identifier. Defaults to 'npu'. flags (optional): XRT buffer object flags. Defaults to xrt.bo.host_only. group_id (int, optional): XRT buffer object group ID. Defaults to 0. + xrt_device (optional): Existing PyXRT device handle to use for BO allocation. + When omitted, a new handle for device index 0 is opened for this tensor. """ super().__init__(shape_or_data, dtype=dtype, device=device) - device_index = 0 - self.xrt_device = xrt.device(device_index) + self.xrt_device = xrt_device if xrt_device is not None else xrt.device(0) # Extract the shape if isinstance(shape_or_data, tuple): @@ -58,7 +60,7 @@ def __init__( else: # TODO(efficiency): Extra data copy here (when necessary) # so we can borrow verification of array-like things from numpy. - np_data = np.array(shape_or_data, dtype=dtype, copy=False) + np_data = np.asarray(shape_or_data, dtype=dtype) self._shape = np_data.shape # Ideally, we use xrt::ext::bo host-only BO but there are no bindings for that currently. diff --git a/test/npu-xrt/xrt_handle_lifetime/aie.mlir b/test/npu-xrt/xrt_handle_lifetime/aie.mlir new file mode 100644 index 00000000000..0694b134c4f --- /dev/null +++ b/test/npu-xrt/xrt_handle_lifetime/aie.mlir @@ -0,0 +1,169 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2022-2026 Advanced Micro Devices, Inc. or its affiliates +// Copyright (C) 2020-2022, Xilinx Inc. +// +//===----------------------------------------------------------------------===// + +module { + aie.device(NPUDEVICE) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + + %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi32> + %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi32> + %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<8xi32> + %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<8xi32> + + %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"} + %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"} + %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"} + %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"} + + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) + + %core_0_2 = aie.core(%tile_0_2) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_i32 = arith.constant 1 : i32 + %c2 = arith.constant 2 : index + scf.for %arg0 = %c0 to %c8 step %c2 { + aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + + scf.for %arg1 = %c0 to %c8 step %c1 { + %0 = memref.load %objFifo_in1_cons_buff_0[%arg1] : memref<8xi32> + %1 = arith.addi %0, %c1_i32 : i32 + memref.store %1, %objFifo_out1_buff_0[%arg1] : memref<8xi32> + } + + aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + + aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + + scf.for %arg1 = %c0 to %c8 step %c1 { + %0 = memref.load %objFifo_in1_cons_buff_1[%arg1] : memref<8xi32> + %1 = arith.addi %0, %c1_i32 : i32 + memref.store %1, %objFifo_out1_buff_1[%arg1] : memref<8xi32> + } + + aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + } + aie.end + } + + aie.shim_dma_allocation @objFifo_in0 (%tile_0_0, MM2S, 0) + + aie.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c64_i64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> + aiex.npu.dma_wait {symbol = @objFifo_out0} + } + + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi32> + %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi32> + %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_0"} : memref<16xi32> + %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_1"} : memref<16xi32> + + %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"} + %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"} + %objFifo_out0_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out0_prod_lock"} + %objFifo_out0_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_lock"} + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb2 + aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16) + aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1) + aie.next_bd ^bb2 + ^bb2: // pred: ^bb1 + aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16) + aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1) + aie.next_bd ^bb1 + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) + ^bb4: // 2 preds: ^bb3, ^bb5 + aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16) + aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1) + aie.next_bd ^bb5 + ^bb5: // pred: ^bb4 + aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16) + aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1) + aie.next_bd ^bb4 + ^bb6: // pred: ^bb3 + %2 = aie.dma_start(MM2S, 1, ^bb7, ^bb9) + ^bb7: // 2 preds: ^bb6, ^bb8 + aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>, 0, 16) + aie.use_lock(%objFifo_out0_prod_lock, Release, 1) + aie.next_bd ^bb8 + ^bb8: // pred: ^bb7 + aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>, 0, 16) + aie.use_lock(%objFifo_out0_prod_lock, Release, 1) + aie.next_bd ^bb7 + ^bb9: // pred: ^bb6 + %3 = aie.dma_start(S2MM, 1, ^bb10, ^bb12) + ^bb10: // 2 preds: ^bb9, ^bb11 + aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>, 0, 16) + aie.use_lock(%objFifo_out0_cons_lock, Release, 1) + aie.next_bd ^bb11 + ^bb11: // pred: ^bb10 + aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>, 0, 16) + aie.use_lock(%objFifo_out0_cons_lock, Release, 1) + aie.next_bd ^bb10 + ^bb12: // pred: ^bb9 + aie.end + } + + aie.shim_dma_allocation @objFifo_out0 (%tile_0_0, S2MM, 0) + + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb2 + aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<8xi32>, 0, 8) + aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + aie.next_bd ^bb2 + ^bb2: // pred: ^bb1 + aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<8xi32>, 0, 8) + aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + aie.next_bd ^bb1 + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) + ^bb4: // 2 preds: ^bb3, ^bb5 + aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out1_buff_0 : memref<8xi32>, 0, 8) + aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + aie.next_bd ^bb5 + ^bb5: // pred: ^bb4 + aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out1_buff_1 : memref<8xi32>, 0, 8) + aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + aie.next_bd ^bb4 + ^bb6: // pred: ^bb3 + aie.end + } + } +} diff --git a/test/npu-xrt/xrt_handle_lifetime/run_ordered.lit b/test/npu-xrt/xrt_handle_lifetime/run_ordered.lit new file mode 100644 index 00000000000..db69d86f10a --- /dev/null +++ b/test/npu-xrt/xrt_handle_lifetime/run_ordered.lit @@ -0,0 +1,12 @@ +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: cp %S/aie.mlir aie_arch.mlir +// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir +// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir +// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir +// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags +// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode ordered +// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode ordered diff --git a/test/npu-xrt/xrt_handle_lifetime/run_stale_instr_bo.lit b/test/npu-xrt/xrt_handle_lifetime/run_stale_instr_bo.lit new file mode 100644 index 00000000000..8b68cd5c361 --- /dev/null +++ b/test/npu-xrt/xrt_handle_lifetime/run_stale_instr_bo.lit @@ -0,0 +1,13 @@ +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// XFAIL: system-windows +// +// RUN: cp %S/aie.mlir aie_arch.mlir +// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir +// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir +// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir +// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags +// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-instr-bo +// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-instr-bo diff --git a/test/npu-xrt/xrt_handle_lifetime/run_stale_io_bos.lit b/test/npu-xrt/xrt_handle_lifetime/run_stale_io_bos.lit new file mode 100644 index 00000000000..46818bc2f6a --- /dev/null +++ b/test/npu-xrt/xrt_handle_lifetime/run_stale_io_bos.lit @@ -0,0 +1,12 @@ +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: cp %S/aie.mlir aie_arch.mlir +// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir +// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir +// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir +// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags +// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-io-bos +// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-io-bos diff --git a/test/npu-xrt/xrt_handle_lifetime/test.cpp b/test/npu-xrt/xrt_handle_lifetime/test.cpp new file mode 100644 index 00000000000..f858b03a529 --- /dev/null +++ b/test/npu-xrt/xrt_handle_lifetime/test.cpp @@ -0,0 +1,193 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxxopts.hpp" +#include "test_utils.h" +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int IN_SIZE = 64; +constexpr int OUT_SIZE = 64; + +struct Buffers { + std::unique_ptr instr; + std::unique_ptr in_a; + std::unique_ptr in_b; + std::unique_ptr out; +}; + +static std::string find_kernel_name(xrt::xclbin &xclbin, + const std::string &kernel_prefix) { + auto xkernels = xclbin.get_kernels(); + auto it = std::find_if(xkernels.begin(), xkernels.end(), + [&](xrt::xclbin::kernel &kernel) { + auto name = kernel.get_name(); + std::cout << "Name: " << name << "\n"; + return name.rfind(kernel_prefix, 0) == 0; + }); + if (it == xkernels.end()) + throw std::runtime_error("kernel not found: " + kernel_prefix); + return it->get_name(); +} + +static Buffers make_buffers(xrt::device &instr_device, xrt::device &io_device, + xrt::kernel &kernel, + const std::vector &instr_v) { + Buffers buffers; + buffers.instr = + std::make_unique(instr_device, instr_v.size() * sizeof(uint32_t), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + buffers.in_a = + std::make_unique(io_device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + buffers.in_b = + std::make_unique(io_device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + buffers.out = + std::make_unique(io_device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + return buffers; +} + +static void initialize_buffers(Buffers &buffers, + const std::vector &instr_v) { + auto *buf_in_a = buffers.in_a->map(); + std::vector src_vec_a; + for (int i = 0; i < IN_SIZE; i++) + src_vec_a.push_back(i + 1); + std::memcpy(buf_in_a, src_vec_a.data(), src_vec_a.size() * sizeof(uint32_t)); + + void *buf_instr = buffers.instr->map(); + std::memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(uint32_t)); + + buffers.instr->sync(XCL_BO_SYNC_BO_TO_DEVICE); + buffers.in_a->sync(XCL_BO_SYNC_BO_TO_DEVICE); +} + +static int run_and_check(xrt::kernel &kernel, Buffers &buffers, + std::size_t instr_word_count) { + auto run = kernel(3, *buffers.instr, instr_word_count, *buffers.in_a, + *buffers.in_b, *buffers.out); + ert_cmd_state state = run.wait(); + if (state != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << state << "\n"; + return 1; + } + + buffers.out->sync(XCL_BO_SYNC_BO_FROM_DEVICE); + auto *buf_out = buffers.out->map(); + + int errors = 0; + for (uint32_t i = 0; i < OUT_SIZE; i++) { + uint32_t ref = i + 2; + if (*(buf_out + i) != ref) { + std::cout << "Error in output " << *(buf_out + i) << " != " << ref + << "\n"; + errors++; + } + } + + if (errors) { + std::cout << "failed.\n"; + return 1; + } + + std::cout << "PASS!\n"; + return 0; +} + +static void destroy_io_bos(Buffers &buffers) { + buffers.in_a.reset(); + buffers.in_b.reset(); + buffers.out.reset(); +} + +static int run_mode(const cxxopts::ParseResult &vm, + const std::vector &instr_v, + const std::string &mode) { + auto device = xrt::device(0); + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + std::string kernel_name = + find_kernel_name(xclbin, vm["kernel"].as()); + + device.register_xclbin(xclbin); + auto context = std::make_unique(device, xclbin.get_uuid()); + auto kernel = std::make_unique(*context, kernel_name); + + if (mode == "ordered") { + auto buffers = make_buffers(device, device, *kernel, instr_v); + initialize_buffers(buffers, instr_v); + return run_and_check(*kernel, buffers, instr_v.size()); + } + + if (mode == "stale-instr-bo") { + auto instr_device = xrt::device(0); + auto buffers = make_buffers(instr_device, device, *kernel, instr_v); + initialize_buffers(buffers, instr_v); + + int result = run_and_check(*kernel, buffers, instr_v.size()); + if (result != 0) + return result; + + destroy_io_bos(buffers); + kernel.reset(); + context.reset(); + std::cout << "Destroying instruction BO after kernel/context.\n"; + buffers.instr.reset(); + return 0; + } + + if (mode == "stale-io-bos") { + auto io_device = xrt::device(0); + auto buffers = make_buffers(device, io_device, *kernel, instr_v); + initialize_buffers(buffers, instr_v); + + int result = run_and_check(*kernel, buffers, instr_v.size()); + if (result != 0) + return result; + + buffers.instr.reset(); + kernel.reset(); + context.reset(); + std::cout << "Destroying input/output BOs after kernel/context.\n"; + destroy_io_bos(buffers); + return 0; + } + + throw std::runtime_error("unknown mode: " + mode); +} + +int main(int argc, const char *argv[]) { + cxxopts::Options options("xrt_handle_lifetime"); + test_utils::add_default_options(options); + options.add_options()( + "mode", "Test mode", + cxxopts::value()->default_value("ordered")); + + cxxopts::ParseResult vm; + test_utils::parse_options(argc, argv, options, vm); + + std::string mode = vm["mode"].as(); + auto instr_v = test_utils::load_instr_binary(vm["instr"].as()); + std::cout << "mode=" << mode << "\n"; + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + return run_mode(vm, instr_v, mode); +} diff --git a/test/python/npu-xrt/test_cached_xrt_runtime.py b/test/python/npu-xrt/test_cached_xrt_runtime.py index 99fa6df0ee4..d44b44b8cbb 100644 --- a/test/python/npu-xrt/test_cached_xrt_runtime.py +++ b/test/python/npu-xrt/test_cached_xrt_runtime.py @@ -178,29 +178,36 @@ def test_runtime_eviction_logic(runtime): def test_runtime_cache_fill(runtime): - """Test filling the cache to its capacity.""" + """Test filling the Python-side cache to its configured capacity.""" - # Ensure cache is empty - runtime.cleanup() + # Use a deliberately small artificial capacity. On Windows, reserved + # resources make the practical limit lower than the nominal cache size. + original_size = runtime._cache_size + runtime._cache_size = min(original_size, 8) - input_tensor = iron.arange(32, dtype=np.int32) + try: + runtime.cleanup() - # Load kernels up to capacity + 1 - limit = runtime._cache_size - first_key = None + input_tensor = iron.arange(32, dtype=np.int32) + + # Load kernels up to the artificial capacity + 1. + limit = runtime._cache_size + first_key = None - for i in range(limit + 1): - transform(input_tensor, input_tensor, lambda x, val=i: x + val) + for i in range(limit + 1): + transform(input_tensor, input_tensor, lambda x, val=i: x + val) - if i == 0: - first_key = list(runtime._context_cache.keys())[0] + if i == 0: + first_key = list(runtime._context_cache.keys())[0] - # Check size - expected_size = min(i + 1, limit) - assert len(runtime._context_cache) == expected_size + expected_size = min(i + 1, limit) + assert len(runtime._context_cache) == expected_size - # Verify the first one was evicted (since we went to limit + 1) - assert first_key not in runtime._context_cache + # Verify the first one was evicted (since we went to limit + 1). + assert first_key not in runtime._context_cache + finally: + runtime.cleanup() + runtime._cache_size = original_size def test_runtime_mtime_sensitivity(runtime):