Xilinx · thomthehound · May 20, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
@@ -268,6 +268,7 @@ def run(
                         kernel_handle.insts,
                         flags=pyxrt.bo.cacheable,
                         group_id=kernel_handle.kernel.group_id(1),
+                        xrt_device=self._device,
                     ).buffer_object()
 
             start = time.time_ns()
@@ -330,19 +331,21 @@ def __init__(self, kernel, xclbin, context, insts, insts_bo=None):
 
     def invalidate(self):
         """
-        Invalidate the handle and release resources.
+        Invalidate the handle and release resources in dependency order.
         """
         self._is_valid = False
-        if hasattr(self, "context"):
-            del self.context
+        # Instruction BOs and kernels depend on the hardware context, so they must
+        # be released before dropping the handle's context reference.
+        if hasattr(self, "insts_bo"):
+            del self.insts_bo
         if hasattr(self, "kernel"):
             del self.kernel
+        if hasattr(self, "context"):
+            del self.context
         if hasattr(self, "xclbin"):
             del self.xclbin
         if hasattr(self, "insts"):
             del self.insts
-        if hasattr(self, "insts_bo"):
-            del self.insts_bo
 
 
 class CachedXRTRuntime(XRTHostRuntime):
@@ -391,14 +394,21 @@ def __init__(self):
 
     def cleanup(self):
         """
-        Clean up the cache by evicting all entries.
+        Clean up cached XRT resources in dependency order.
         """
-        while self._context_cache:
-            self._evict()
         while self._insts_cache:
             self._evict_insts()
+        while self._context_cache:
+            self._evict()
         gc.collect()  # Make sure contexts are garbage collected.
 
+    def _cleanup_entry_insts(self, entry):
+        """Release instruction BOs owned by a cached context entry."""
+        for insts_key in list(entry.get("insts_keys", ())):
+            insts_entry = self._insts_cache.pop(insts_key, None)
+            if insts_entry is not None:
+                self._cleanup_insts_entry(insts_key, insts_entry)
+
     def _cleanup_entry(self, entry):
         handles = entry["handles"]
 
@@ -408,6 +418,8 @@ def _cleanup_entry(self, entry):
             if handle:
                 handle.invalidate()
 
+        self._cleanup_entry_insts(entry)
+
         # Clear kernel cache so pyxrt.kernel objects are released with the context
         entry["kernels"].clear()
 
@@ -422,14 +434,18 @@ def _evict(self):
         # Pop the oldest item
         key, entry = self._context_cache.popitem(last=False)
         self._cleanup_entry(entry)
+        gc.collect()
 
-    def _cleanup_insts_entry(self, entry):
+    def _cleanup_insts_entry(self, insts_key, entry):
+        owner_entry = entry.get("owner_entry")
+        if owner_entry is not None:
+            owner_entry.get("insts_keys", set()).discard(insts_key)
         # Delete the key (not a local copy) so the refcount drops here.
         del entry["insts_bo"]
 
     def _evict_insts(self):
         key, entry = self._insts_cache.popitem(last=False)
-        self._cleanup_insts_entry(entry)
+        self._cleanup_insts_entry(key, entry)
 
     def run(
         self,
@@ -533,15 +549,10 @@ def load(
                     try:
                         context = pyxrt.hw_context(self._device, xclbin_uuid)
                     except RuntimeError as e:
-                        # If we hit a resource limit (err=-2 usually means EMFILE/ENFILE or similar resource exhaustion)
-                        # and we have items in the cache, try evicting.
-                        if (
-                            "No such file or directory" in str(e)
-                            and self._context_cache
-                            and retries < max_retries
-                        ):
+                        # Context-slot exhaustion is reported differently across XRT backends, so treat
+                        # any RuntimeError as exhaustion: evict and retry while cached entries remain.
+                        if self._context_cache and retries < max_retries:
                             self._evict()
-                            gc.collect()  # Make sure contexts are garbage collected.
                             retries += 1
                         else:
                             raise e
@@ -551,6 +562,7 @@ def load(
                     "xclbin": xclbin,
                     "kernels": {},  # kernel_name -> pyxrt.kernel (strong ref, tied to context)
                     "handles": [],
+                    "insts_keys": set(),
                     "uuid": xclbin_uuid,
                 }
                 self._context_cache[context_key] = entry
@@ -599,12 +611,15 @@ def load(
                         insts,
                         flags=pyxrt.bo.cacheable,
                         group_id=group_id,
+                        xrt_device=self._device,
                     ).buffer_object()
 
                     insts_entry = {
                         "insts_bo": insts_bo,
+                        "owner_entry": entry,
                     }
                     self._insts_cache[insts_key] = insts_entry
+                    entry["insts_keys"].add(insts_key)
 
             kernel_handle = CachedXRTKernelHandle(
                 kernel, xclbin, context, insts, insts_bo

@@ -29,6 +29,7 @@ def __init__(
         device="npu",
         flags=xrt.bo.host_only,
         group_id=0,
+        xrt_device=None,
     ):
         """
         Initialize the XRTTensor.
@@ -41,10 +42,11 @@ def __init__(
             device (str, optional): Device string identifier. Defaults to 'npu'.
             flags (optional): XRT buffer object flags. Defaults to xrt.bo.host_only.
             group_id (int, optional): XRT buffer object group ID. Defaults to 0.
+            xrt_device (optional): Existing PyXRT device handle to use for BO allocation.
+                When omitted, a new handle for device index 0 is opened for this tensor.
         """
         super().__init__(shape_or_data, dtype=dtype, device=device)
-        device_index = 0
-        self.xrt_device = xrt.device(device_index)
+        self.xrt_device = xrt_device if xrt_device is not None else xrt.device(0)
 
         # Extract the shape
         if isinstance(shape_or_data, tuple):
@@ -58,7 +60,7 @@ def __init__(
         else:
             # TODO(efficiency): Extra data copy here (when necessary)
             # so we can borrow verification of array-like things from numpy.
-            np_data = np.array(shape_or_data, dtype=dtype, copy=False)
+            np_data = np.asarray(shape_or_data, dtype=dtype)
             self._shape = np_data.shape
 
         # Ideally, we use xrt::ext::bo host-only BO but there are no bindings for that currently.

@@ -0,0 +1,169 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2022-2026 Advanced Micro Devices, Inc. or its affiliates
+// Copyright (C) 2020-2022, Xilinx Inc.
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(NPUDEVICE) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_1 = aie.tile(0, 1)
+    %tile_0_2 = aie.tile(0, 2)
+
+    %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi32>
+    %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi32>
+    %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<8xi32>
+    %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<8xi32>
+
+    %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"}
+    %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"}
+    %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"}
+    %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"}
+
+    aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
+    aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
+    aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0)
+    aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1)
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c8 = arith.constant 8 : index
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1_i32 = arith.constant 1 : i32
+      %c2 = arith.constant 2 : index
+      scf.for %arg0 = %c0 to %c8 step %c2 {
+        aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1)
+        aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1)
+
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          %0 = memref.load %objFifo_in1_cons_buff_0[%arg1] : memref<8xi32>
+          %1 = arith.addi %0, %c1_i32 : i32
+          memref.store %1, %objFifo_out1_buff_0[%arg1] : memref<8xi32>
+        }
+
+        aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1)
+        aie.use_lock(%objFifo_out1_cons_lock, Release, 1)
+
+        aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1)
+        aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1)
+
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          %0 = memref.load %objFifo_in1_cons_buff_1[%arg1] : memref<8xi32>
+          %1 = arith.addi %0, %c1_i32 : i32
+          memref.store %1, %objFifo_out1_buff_1[%arg1] : memref<8xi32>
+        }
+
+        aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1)
+        aie.use_lock(%objFifo_out1_cons_lock, Release, 1)
+      }
+      aie.end
+    }
+
+    aie.shim_dma_allocation @objFifo_in0 (%tile_0_0, MM2S, 0)
+
+    aie.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
+      %c0_i64 = arith.constant 0 : i64
+      %c1_i64 = arith.constant 1 : i64
+      %c64_i64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32>
+      aiex.npu.dma_wait {symbol = @objFifo_out0}
+    }
+
+    %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
+      %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi32>
+      %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi32>
+      %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_0"} : memref<16xi32>
+      %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_1"} : memref<16xi32>
+
+      %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"}
+      %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"}
+      %objFifo_out0_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out0_prod_lock"}
+      %objFifo_out0_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_lock"}
+      %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb1
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
+    ^bb4:  // 2 preds: ^bb3, ^bb5
+      aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1)
+      aie.next_bd ^bb5
+    ^bb5:  // pred: ^bb4
+      aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1)
+      aie.next_bd ^bb4
+    ^bb6:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 1, ^bb7, ^bb9)
+    ^bb7:  // 2 preds: ^bb6, ^bb8
+      aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_out0_prod_lock, Release, 1)
+      aie.next_bd ^bb8
+    ^bb8:  // pred: ^bb7
+      aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_out0_prod_lock, Release, 1)
+      aie.next_bd ^bb7
+    ^bb9:  // pred: ^bb6
+      %3 = aie.dma_start(S2MM, 1, ^bb10, ^bb12)
+    ^bb10:  // 2 preds: ^bb9, ^bb11
+      aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_out0_cons_lock, Release, 1)
+      aie.next_bd ^bb11
+    ^bb11:  // pred: ^bb10
+      aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%objFifo_out0_cons_lock, Release, 1)
+      aie.next_bd ^bb10
+    ^bb12:  // pred: ^bb9
+      aie.end
+    }
+
+    aie.shim_dma_allocation @objFifo_out0 (%tile_0_0, S2MM, 0)
+
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<8xi32>, 0, 8)
+      aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<8xi32>, 0, 8)
+      aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb1
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
+    ^bb4:  // 2 preds: ^bb3, ^bb5
+      aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out1_buff_0 : memref<8xi32>, 0, 8)
+      aie.use_lock(%objFifo_out1_prod_lock, Release, 1)
+      aie.next_bd ^bb5
+    ^bb5:  // pred: ^bb4
+      aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%objFifo_out1_buff_1 : memref<8xi32>, 0, 8)
+      aie.use_lock(%objFifo_out1_prod_lock, Release, 1)
+      aie.next_bd ^bb4
+    ^bb6:  // pred: ^bb3
+      aie.end
+    }
+  }
+}
@@ -0,0 +1,12 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode ordered
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode ordered
@@ -0,0 +1,13 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+// XFAIL: system-windows
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-instr-bo
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-instr-bo
@@ -0,0 +1,12 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %aiecc %backend_flags --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: %host_clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags %host_link_flags %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-io-bos
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin --mode stale-io-bos