diff --git a/python/utils/hostruntime/xrtruntime/hostruntime.py b/python/utils/hostruntime/xrtruntime/hostruntime.py
index 8266acd1a7a..f06887612bb 100644
--- a/python/utils/hostruntime/xrtruntime/hostruntime.py
+++ b/python/utils/hostruntime/xrtruntime/hostruntime.py
@@ -268,6 +268,7 @@ def run(
                         kernel_handle.insts,
                         flags=pyxrt.bo.cacheable,
                         group_id=kernel_handle.kernel.group_id(1),
+                        xrt_device=self._device,
                     ).buffer_object()
 
             start = time.time_ns()
@@ -330,19 +331,21 @@ def __init__(self, kernel, xclbin, context, insts, insts_bo=None):
 
     def invalidate(self):
         """
-        Invalidate the handle and release resources.
+        Mark the handle invalid and delete its attributes in dependency order.
         """
         self._is_valid = False
-        if hasattr(self, "context"):
-            del self.context
+        # The instruction BO and kernel depend on the hardware context, so delete
+        # them first; the context reference is dropped only after both are gone.
+        if hasattr(self, "insts_bo"):
+            del self.insts_bo
         if hasattr(self, "kernel"):
             del self.kernel
+        if hasattr(self, "context"):
+            del self.context
         if hasattr(self, "xclbin"):
             del self.xclbin
         if hasattr(self, "insts"):
             del self.insts
-        if hasattr(self, "insts_bo"):
-            del self.insts_bo
 
 
 class CachedXRTRuntime(XRTHostRuntime):
@@ -391,14 +394,21 @@ def __init__(self):
 
     def cleanup(self):
         """
-        Clean up the cache by evicting all entries.
+        Evict every cached instruction BO, then every cached context, then run GC.
         """
-        while self._context_cache:
-            self._evict()
         while self._insts_cache:
             self._evict_insts()
+        while self._context_cache:
+            self._evict()
         gc.collect()  # Make sure contexts are garbage collected.
 
+    def _cleanup_entry_insts(self, entry):
+        """Evict and release the instruction BOs recorded in entry["insts_keys"]."""
+        for insts_key in list(entry.get("insts_keys", ())):
+            insts_entry = self._insts_cache.pop(insts_key, None)
+            if insts_entry is not None:
+                self._cleanup_insts_entry(insts_key, insts_entry)
+
     def _cleanup_entry(self, entry):
         handles = entry["handles"]
 
@@ -408,6 +418,8 @@ def _cleanup_entry(self, entry):
             if handle:
                 handle.invalidate()
 
+        self._cleanup_entry_insts(entry)
+
         # Clear kernel cache so pyxrt.kernel objects are released with the context
         entry["kernels"].clear()
 
@@ -422,14 +434,18 @@ def _evict(self):
         # Pop the oldest item
         key, entry = self._context_cache.popitem(last=False)
         self._cleanup_entry(entry)
+        gc.collect()
 
-    def _cleanup_insts_entry(self, entry):
+    def _cleanup_insts_entry(self, insts_key, entry):
+        owner_entry = entry.get("owner_entry")
+        if owner_entry is not None:
+            owner_entry.get("insts_keys", set()).discard(insts_key)
         # Delete the key (not a local copy) so the refcount drops here.
         del entry["insts_bo"]
 
     def _evict_insts(self):
         key, entry = self._insts_cache.popitem(last=False)
-        self._cleanup_insts_entry(entry)
+        self._cleanup_insts_entry(key, entry)
 
     def run(
         self,
@@ -533,15 +549,10 @@ def load(
                     try:
                         context = pyxrt.hw_context(self._device, xclbin_uuid)
                     except RuntimeError as e:
-                        # If we hit a resource limit (err=-2 usually means EMFILE/ENFILE or similar resource exhaustion)
-                        # and we have items in the cache, try evicting.
-                        if (
-                            "No such file or directory" in str(e)
-                            and self._context_cache
-                            and retries < max_retries
-                        ):
+                        # Context-slot exhaustion is reported differently across XRT backends.
+                        # Evict cached contexts and retry, but only while cached entries remain.
+                        if self._context_cache and retries < max_retries:
                             self._evict()
-                            gc.collect()  # Make sure contexts are garbage collected.
                             retries += 1
                         else:
                             raise e
@@ -551,6 +562,7 @@ def load(
                     "xclbin": xclbin,
                     "kernels": {},  # kernel_name -> pyxrt.kernel (strong ref, tied to context)
                     "handles": [],
+                    "insts_keys": set(),
                     "uuid": xclbin_uuid,
                 }
                 self._context_cache[context_key] = entry
@@ -599,12 +611,15 @@ def load(
                         insts,
                         flags=pyxrt.bo.cacheable,
                         group_id=group_id,
+                        xrt_device=self._device,
                     ).buffer_object()
 
                     insts_entry = {
                         "insts_bo": insts_bo,
+                        "owner_entry": entry,
                     }
                     self._insts_cache[insts_key] = insts_entry
+                    entry["insts_keys"].add(insts_key)
 
             kernel_handle = CachedXRTKernelHandle(
                 kernel, xclbin, context, insts, insts_bo
diff --git a/python/utils/hostruntime/xrtruntime/tensor.py b/python/utils/hostruntime/xrtruntime/tensor.py
index 3c803a613f0..f5613a8f271 100644
--- a/python/utils/hostruntime/xrtruntime/tensor.py
+++ b/python/utils/hostruntime/xrtruntime/tensor.py
@@ -29,6 +29,7 @@ def __init__(
         device="npu",
         flags=xrt.bo.host_only,
         group_id=0,
+        xrt_device=None,
     ):
         """
         Initialize the XRTTensor.
@@ -41,10 +42,11 @@ def __init__(
             device (str, optional): Device string identifier. Defaults to 'npu'.
             flags (optional): XRT buffer object flags. Defaults to xrt.bo.host_only.
             group_id (int, optional): XRT buffer object group ID. Defaults to 0.
+            xrt_device (optional): Existing PyXRT device handle to use for BO allocation.
+                When omitted, a new handle for device index 0 is opened for this tensor.
         """
         super().__init__(shape_or_data, dtype=dtype, device=device)
-        device_index = 0
-        self.xrt_device = xrt.device(device_index)
+        self.xrt_device = xrt_device if xrt_device is not None else xrt.device(0)
 
         # Extract the shape
         if isinstance(shape_or_data, tuple):
@@ -58,7 +60,7 @@ def __init__(
         else:
             # TODO(efficiency): Extra data copy here (when necessary)
             # so we can borrow verification of array-like things from numpy.
-            np_data = np.array(shape_or_data, dtype=dtype, copy=False)
+            np_data = np.asarray(shape_or_data, dtype=dtype)
             self._shape = np_data.shape
 
         # Ideally, we use xrt::ext::bo host-only BO but there are no bindings for that currently.
diff --git a/test/npu-xrt/xrt_handle_lifetime/aie.mlir b/test/npu-xrt/xrt_handle_lifetime/aie.mlir
new file mode 100644
index 00000000000..0694b134c4f
--- /dev/null
+++ b/test/npu-xrt/xrt_handle_lifetime/aie.mlir
@@ -0,0 +1,169 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2022-2026 Advanced Micro Devices, Inc. or its affiliates
+// Copyright (C) 2020-2022, Xilinx Inc.
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(NPUDEVICE) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_1 = aie.tile(0, 1)
+    %tile_0_2 = aie.tile(0, 2)
+
+    %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi32>
+    %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi32>
+    %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<8xi32>
+    %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<8xi32>
+
+    %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"}
+    %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"}
+    %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"}
+    %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"}
+
+    aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
+    aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
+    aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0)
+    aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1)
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c8 = arith.constant 8 : index
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1_i32 = arith.constant 1 : i32
+      %c2 = arith.constant 2 : index
+      scf.for %arg0 = %c0 to %c8 step %c2 {
+        aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1)
+        aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1)
+
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          %0 = memref.load %objFifo_in1_cons_buff_0[%arg1] : memref<8xi32>
+          %1 = arith.addi %0, %c1_i32 : i32
+          memref.store %1, %objFifo_out1_buff_0[%arg1] : memref<8xi32>
+        }
+
+        aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1)
+        aie.use_lock(%objFifo_out1_cons_lock, Release, 1)
+
+        aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1)
+        aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1)
+
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          %0 = memref.load %objFifo_in1_cons_buff_1[%arg1] : memref<8xi32>
+          %1 = arith.addi %0, %c1_i32 : i32
+          memref.store %1, %objFifo_out1_buff_1[%arg1] : memref<8xi32>
+        }
+
+        aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1)
+        aie.use_lock(%objFifo_out1_cons_lock, Release, 1)
+      }
+      aie.end
+    }
+
+    aie.shim_dma_allocation @objFifo_in0 (%tile_0_0, MM2S, 0)
+
+    aie.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
+      %c0_i64 = arith.constant 0 : i64
+      %c1_i64 = arith.constant 1 : i64
+      %c64_i64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32>
+      aiex.npu.dma_wait {symbol = @objFifo_out0}
+    }
+
+    %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
+      %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi32>
+      %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi32>
+      %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_0"} : memref<16xi32>
+      %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_1"} : memref<16xi32>
+
+      %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"}
+      %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"}
+      %objFifo_out0_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out0_prod_lock"}
+      %objFifo_out0_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_lock"}
+      %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb1
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
+    ^bb4:  // 2 preds: ^bb3, ^bb5
+      aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1)
+      aie.next_bd ^bb5
+    ^bb5:  // pred: ^bb4
+      aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1)
+      aie.next_bd ^bb4
+    ^bb6:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 1, ^bb7, ^bb9)
+    ^bb7:  // 2 preds: ^bb6, ^bb8
+      aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_out0_prod_lock, Release, 1)
+      aie.next_bd ^bb8
+    ^bb8:  // pred: ^bb7
+      aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_out0_prod_lock, Release, 1)
+      aie.next_bd ^bb7
+    ^bb9:  // pred: ^bb6
+      %3 = aie.dma_start(S2MM, 1, ^bb10, ^bb12)
+    ^bb10:  // 2 preds: ^bb9, ^bb11
+      aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_out0_cons_lock, Release, 1)
+      aie.next_bd ^bb11
+    ^bb11:  // pred: ^bb10
+      aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_out0_cons_lock, Release, 1)
+      aie.next_bd ^bb10
+    ^bb12:  // pred: ^bb9
+      aie.end
+    }
+
+    aie.shim_dma_allocation @objFifo_out0 (%tile_0_0, S2MM, 0)
+
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<8xi32>, 0, 8)
+      aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<8xi32>, 0, 8)
+      aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb1
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
+    ^bb4:  // 2 preds: ^bb3, ^bb5
+      aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out1_buff_0 : memref<8xi32>, 0, 8)
+      aie.use_lock(%objFifo_out1_prod_lock, Release, 1)
+      aie.next_bd ^bb5
+    ^bb5:  // pred: ^bb4
+      aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out1_buff_1 : memref<8xi32>, 0, 8)
+      aie.use_lock(%objFifo_out1_prod_lock, Release, 1)
+      aie.next_bd ^bb4
+    ^bb6:  // pred: ^bb3
+      aie.end
+    }
+  }
+}
diff --git a/test/npu-xrt/xrt_handle_lifetime/run_ordered.lit b/test/npu-xrt/xrt_handle_lifetime/run_ordered.lit
new file mode 100644
index 00000000000..db69d86f10a
--- /dev/null
+++ b/test/npu-xrt/xrt_handle_lifetime/run_ordered.lit
@@ -0,0 +1,12 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode ordered
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode ordered
diff --git a/test/npu-xrt/xrt_handle_lifetime/run_stale_instr_bo.lit b/test/npu-xrt/xrt_handle_lifetime/run_stale_instr_bo.lit
new file mode 100644
index 00000000000..8b68cd5c361
--- /dev/null
+++ b/test/npu-xrt/xrt_handle_lifetime/run_stale_instr_bo.lit
@@ -0,0 +1,13 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+// XFAIL: system-windows
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-instr-bo
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-instr-bo
diff --git a/test/npu-xrt/xrt_handle_lifetime/run_stale_io_bos.lit b/test/npu-xrt/xrt_handle_lifetime/run_stale_io_bos.lit
new file mode 100644
index 00000000000..46818bc2f6a
--- /dev/null
+++ b/test/npu-xrt/xrt_handle_lifetime/run_stale_io_bos.lit
@@ -0,0 +1,12 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-io-bos
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-io-bos
diff --git a/test/npu-xrt/xrt_handle_lifetime/test.cpp b/test/npu-xrt/xrt_handle_lifetime/test.cpp
new file mode 100644
index 00000000000..f858b03a529
--- /dev/null
+++ b/test/npu-xrt/xrt_handle_lifetime/test.cpp
@@ -0,0 +1,193 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "cxxopts.hpp"
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+struct Buffers {
+  std::unique_ptr<xrt::bo> instr;
+  std::unique_ptr<xrt::bo> in_a;
+  std::unique_ptr<xrt::bo> in_b;
+  std::unique_ptr<xrt::bo> out;
+};
+
+static std::string find_kernel_name(xrt::xclbin &xclbin,
+                                    const std::string &kernel_prefix) {
+  auto xkernels = xclbin.get_kernels();
+  auto it = std::find_if(xkernels.begin(), xkernels.end(),
+                         [&](xrt::xclbin::kernel &kernel) {
+                           auto name = kernel.get_name();
+                           std::cout << "Name: " << name << "\n";
+                           return name.rfind(kernel_prefix, 0) == 0;
+                         });
+  if (it == xkernels.end())
+    throw std::runtime_error("kernel not found: " + kernel_prefix);
+  return it->get_name();
+}
+
+static Buffers make_buffers(xrt::device &instr_device, xrt::device &io_device,
+                            xrt::kernel &kernel,
+                            const std::vector<uint32_t> &instr_v) {
+  Buffers buffers;
+  buffers.instr =
+      std::make_unique<xrt::bo>(instr_device, instr_v.size() * sizeof(uint32_t),
+                                XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  buffers.in_a =
+      std::make_unique<xrt::bo>(io_device, IN_SIZE * sizeof(int32_t),
+                                XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  buffers.in_b =
+      std::make_unique<xrt::bo>(io_device, IN_SIZE * sizeof(int32_t),
+                                XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  buffers.out =
+      std::make_unique<xrt::bo>(io_device, OUT_SIZE * sizeof(int32_t),
+                                XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+  return buffers;
+}
+
+static void initialize_buffers(Buffers &buffers,
+                               const std::vector<uint32_t> &instr_v) {
+  auto *buf_in_a = buffers.in_a->map<uint32_t *>();
+  std::vector<uint32_t> src_vec_a;
+  for (int i = 0; i < IN_SIZE; i++)
+    src_vec_a.push_back(i + 1);
+  std::memcpy(buf_in_a, src_vec_a.data(), src_vec_a.size() * sizeof(uint32_t));
+
+  void *buf_instr = buffers.instr->map<void *>();
+  std::memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  buffers.instr->sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  buffers.in_a->sync(XCL_BO_SYNC_BO_TO_DEVICE);
+}
+
+static int run_and_check(xrt::kernel &kernel, Buffers &buffers,
+                         std::size_t instr_word_count) {
+  auto run = kernel(3, *buffers.instr, instr_word_count, *buffers.in_a,
+                    *buffers.in_b, *buffers.out);
+  ert_cmd_state state = run.wait();
+  if (state != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << state << "\n";
+    return 1;
+  }
+
+  buffers.out->sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  auto *buf_out = buffers.out->map<uint32_t *>();
+
+  int errors = 0;
+  for (uint32_t i = 0; i < OUT_SIZE; i++) {
+    uint32_t ref = i + 2;
+    if (*(buf_out + i) != ref) {
+      std::cout << "Error in output " << *(buf_out + i) << " != " << ref
+                << "\n";
+      errors++;
+    }
+  }
+
+  if (errors) {
+    std::cout << "failed.\n";
+    return 1;
+  }
+
+  std::cout << "PASS!\n";
+  return 0;
+}
+
+static void destroy_io_bos(Buffers &buffers) {
+  buffers.in_a.reset();
+  buffers.in_b.reset();
+  buffers.out.reset();
+}
+
+static int run_mode(const cxxopts::ParseResult &vm,
+                    const std::vector<uint32_t> &instr_v,
+                    const std::string &mode) {
+  auto device = xrt::device(0);
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+  std::string kernel_name =
+      find_kernel_name(xclbin, vm["kernel"].as<std::string>());
+
+  device.register_xclbin(xclbin);
+  auto context = std::make_unique<xrt::hw_context>(device, xclbin.get_uuid());
+  auto kernel = std::make_unique<xrt::kernel>(*context, kernel_name);
+
+  if (mode == "ordered") {
+    auto buffers = make_buffers(device, device, *kernel, instr_v);
+    initialize_buffers(buffers, instr_v);
+    return run_and_check(*kernel, buffers, instr_v.size());
+  }
+
+  if (mode == "stale-instr-bo") {
+    auto instr_device = xrt::device(0);
+    auto buffers = make_buffers(instr_device, device, *kernel, instr_v);
+    initialize_buffers(buffers, instr_v);
+
+    int result = run_and_check(*kernel, buffers, instr_v.size());
+    if (result != 0)
+      return result;
+
+    destroy_io_bos(buffers);
+    kernel.reset();
+    context.reset();
+    std::cout << "Destroying instruction BO after kernel/context.\n";
+    buffers.instr.reset();
+    return 0;
+  }
+
+  if (mode == "stale-io-bos") {
+    auto io_device = xrt::device(0);
+    auto buffers = make_buffers(device, io_device, *kernel, instr_v);
+    initialize_buffers(buffers, instr_v);
+
+    int result = run_and_check(*kernel, buffers, instr_v.size());
+    if (result != 0)
+      return result;
+
+    buffers.instr.reset();
+    kernel.reset();
+    context.reset();
+    std::cout << "Destroying input/output BOs after kernel/context.\n";
+    destroy_io_bos(buffers);
+    return 0;
+  }
+
+  throw std::runtime_error("unknown mode: " + mode);
+}
+
+int main(int argc, const char *argv[]) {
+  cxxopts::Options options("xrt_handle_lifetime");
+  test_utils::add_default_options(options);
+  options.add_options()(
+      "mode", "Test mode",
+      cxxopts::value<std::string>()->default_value("ordered"));
+
+  cxxopts::ParseResult vm;
+  test_utils::parse_options(argc, argv, options, vm);
+
+  std::string mode = vm["mode"].as<std::string>();
+  auto instr_v = test_utils::load_instr_binary(vm["instr"].as<std::string>());
+  std::cout << "mode=" << mode << "\n";
+  std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  return run_mode(vm, instr_v, mode);
+}
diff --git a/test/python/npu-xrt/test_cached_xrt_runtime.py b/test/python/npu-xrt/test_cached_xrt_runtime.py
index 99fa6df0ee4..d44b44b8cbb 100644
--- a/test/python/npu-xrt/test_cached_xrt_runtime.py
+++ b/test/python/npu-xrt/test_cached_xrt_runtime.py
@@ -178,29 +178,36 @@ def test_runtime_eviction_logic(runtime):
 
 
 def test_runtime_cache_fill(runtime):
-    """Test filling the cache to its capacity."""
+    """Test filling the Python-side cache to its configured capacity."""
 
-    # Ensure cache is empty
-    runtime.cleanup()
+    # Use a deliberately small artificial capacity. On Windows, reserved
+    # resources make the practical limit lower than the nominal cache size.
+    original_size = runtime._cache_size
+    runtime._cache_size = min(original_size, 8)
 
-    input_tensor = iron.arange(32, dtype=np.int32)
+    try:
+        runtime.cleanup()
 
-    # Load kernels up to capacity + 1
-    limit = runtime._cache_size
-    first_key = None
+        input_tensor = iron.arange(32, dtype=np.int32)
+
+        # Load kernels up to the artificial capacity + 1.
+        limit = runtime._cache_size
+        first_key = None
 
-    for i in range(limit + 1):
-        transform(input_tensor, input_tensor, lambda x, val=i: x + val)
+        for i in range(limit + 1):
+            transform(input_tensor, input_tensor, lambda x, val=i: x + val)
 
-        if i == 0:
-            first_key = list(runtime._context_cache.keys())[0]
+            if i == 0:
+                first_key = list(runtime._context_cache.keys())[0]
 
-        # Check size
-        expected_size = min(i + 1, limit)
-        assert len(runtime._context_cache) == expected_size
+            expected_size = min(i + 1, limit)
+            assert len(runtime._context_cache) == expected_size
 
-    # Verify the first one was evicted (since we went to limit + 1)
-    assert first_key not in runtime._context_cache
+        # Verify the first one was evicted (since we went to limit + 1).
+        assert first_key not in runtime._context_cache
+    finally:
+        runtime._cache_size = original_size  # restore even if cleanup raises
+        runtime.cleanup()
 
 
 def test_runtime_mtime_sensitivity(runtime):