# Source: mlir-aie/python/utils/hostruntime/xrtruntime/hostruntime.py at 0645b4901465b55d87adc3e1cf4a1817ef43189f (Xilinx/mlir-aie, GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
# SPDX-FileCopyrightText: Copyright (C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
XRT-based implementation of the HostRuntime
"""

import atexit
import logging
from collections import OrderedDict
import os
import shutil
import time
import weakref
import gc
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
import numpy as np
import pyxrt

from ..hostruntime import HostRuntime, HostRuntimeError, KernelHandle, KernelResult

if TYPE_CHECKING:
    from aie.iron.device import Device
from .tensor import XRTTensor

logger = logging.getLogger(__name__)


# XRTKernelHandle(kernel, xclbin, context, insts, insts_bo=None)
class XRTKernelHandle(KernelHandle):
    """
    Bundle of the XRT objects that together describe one loaded kernel.
    """

    def __init__(self, kernel, xclbin, context, insts, insts_bo=None):
        """
        Store the XRT objects that belong to a loaded kernel.

        Args:
            kernel: The XRT kernel object.
            xclbin: The XRT xclbin object the kernel was loaded from.
            context: The XRT hardware context the kernel lives in.
            insts: The instructions for the kernel.
            insts_bo (optional): A pre-built instruction buffer object.
                Defaults to None.
        """
        # Pure attribute storage: nothing is validated or copied here.
        self.kernel, self.xclbin, self.context = kernel, xclbin, context
        self.insts, self.insts_bo = insts, insts_bo


class XRTKernelResult(KernelResult):
    """Outcome of a single kernel execution performed through the PyXRT runtime."""

    def __init__(
        self,
        ret: pyxrt.ert_cmd_state,
        npu_time: int,
        trace_data: XRTTensor | None = None,
    ):
        """
        Record the command state alongside the generic kernel result data.

        Args:
            ret: The XRT command state returned by waiting on the run.
            npu_time: Elapsed time reported for the run; forwarded to the
                base class unchanged.
            trace_data (optional): Trace tensor produced by the run, if any.
                Defaults to None.
        """
        super().__init__(npu_time, trace_data)
        self.ret = ret

    def is_success(self) -> bool:
        """Return True when the recorded command state is ERT_CMD_STATE_COMPLETED."""
        completed_state = pyxrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED
        return self.ret == completed_state


class XRTHostRuntime(HostRuntime):
    """Singleton manager for AIE XRT resources.

    Opens XRT device 0 on construction, classifies it as ``npu1`` or ``npu2``
    from its reported name, and provides ``load``/``run`` for kernels
    described by an xclbin plus an instruction file.
    """

    # TODO: this is duplicated from the LIT helpers.
    # NPU Model mappings - centralized for easy updates
    # Maps generation name to list of model strings that may appear in xrt-smi
    NPU_MODELS = {
        "npu1": ["npu1", "Phoenix"],
        "npu2": ["npu4", "Strix", "npu5", "Strix Halo", "npu6", "Krackan"],
    }
    _tensor_class = XRTTensor

    def __init__(self):
        """
        Initialize the XRTHostRuntime.

        Opens XRT device 0 (retrying on ``RuntimeError``) and derives
        ``self.npu_str`` from the device name.

        Raises:
            RuntimeError: If the device cannot be opened after all attempts,
                or if the device name matches no entry in ``NPU_MODELS``.
        """
        # Retry logic for device acquisition to handle transient failures
        max_retries = 5
        for attempt in range(max_retries):
            try:
                self._device = pyxrt.device(0)
                break
            except RuntimeError as e:
                logger.warning(
                    "XRTHostRuntime: Failed to acquire device (attempt %d/%d): %s",
                    attempt + 1,
                    max_retries,
                    e,
                )
                self._log_device_diagnostics()

                if attempt == max_retries - 1:
                    # Out of attempts: propagate the original failure.
                    raise

                gc.collect()  # Make sure contexts are garbage collected.
                # Linear backoff: wait 1s, 2s, 3s, ... between attempts.
                time.sleep(1.0 * (attempt + 1))

        self._device_type_str = self._device.get_info(pyxrt.xrt_info_device.name)

        # The first generation (in NPU_MODELS order) with a model string that
        # occurs in the device name wins.
        self.npu_str = None
        for generation, model_names in self.NPU_MODELS.items():
            if any(model in self._device_type_str for model in model_names):
                self.npu_str = generation
                break
        if not self.npu_str:
            raise RuntimeError(f"Unknown device type: {self._device_type_str}")

    @staticmethod
    def _log_device_diagnostics():
        """Log best-effort debug information after a failed device open.

        Reports whether ``/dev/accel/accel0`` exists (with its stat result)
        and, if an ``xrt-smi`` binary can be located, the output of
        ``xrt-smi examine``. Any failure while collecting this information is
        logged at debug level and otherwise ignored, so diagnostics can never
        mask the original device error.
        """
        try:
            if os.path.exists("/dev/accel/accel0"):
                logger.debug("/dev/accel/accel0 exists")
                # Stat it
                st = os.stat("/dev/accel/accel0")
                logger.debug("Stat: %s", st)
            else:
                logger.debug("/dev/accel/accel0 does not exist")

            # Try running xrt-smi examine; fall back to the XRT install
            # directory when the tool is not on PATH.
            xrt_bin = shutil.which("xrt-smi")
            if xrt_bin is None:
                xrt_base = os.environ.get("XILINX_XRT", "/opt/xilinx/xrt")
                xrt_bin = xrt_base + "/bin/xrt-smi"
            if os.path.exists(xrt_bin):
                logger.debug("Running %s examine", xrt_bin)
                result = subprocess.run(
                    [xrt_bin, "examine"],
                    timeout=5,
                    capture_output=True,
                    text=True,
                )
                logger.debug("xrt-smi stdout:\n%s", result.stdout)
                logger.debug("xrt-smi stderr:\n%s", result.stderr)
        except Exception as debug_e:
            # Deliberately broad: this is diagnostics only.
            logger.debug("Failed to run debug checks: %s", debug_e)

    @classmethod
    def read_insts(cls, insts_path: Path):
        """
        Reads instructions from the given file, with XRT-specific handling for ELF files.

        Args:
            insts_path (Path): Path to the instruction file.

        Returns:
            A ``pyxrt.module`` for ``.elf`` files when this pyxrt build exposes
            ``module``; otherwise whatever the base class ``read_insts`` returns.
        """
        # Overload the function in the generic class so we can use xrt-specific handling of elf files.
        ext = insts_path.suffix.lower()
        if ext == ".elf" and hasattr(pyxrt, "module"):
            elf = pyxrt.elf(str(insts_path))
            return pyxrt.module(elf)
        return super().read_insts(insts_path)

    def load(
        self,
        npu_kernel,
        **kwargs,
    ) -> XRTKernelHandle:
        """
        Load an NPU kernel into the XRT runtime.

        Args:
            npu_kernel: The NPU kernel to load. Its ``xclbin_path``,
                ``insts_path`` and ``kernel_name`` attributes are read.
            **kwargs: Additional arguments for loading (unused here).

        Returns:
            XRTKernelHandle: A handle to the loaded kernel.

        Raises:
            HostRuntimeError: If xclbin or insts files do not exist, or if the
                requested kernel name is not found in the xclbin.
            RuntimeError: If no kernel name was given and the xclbin contains
                no kernels.
        """
        self.check_device_consistency()
        xclbin_path = Path(npu_kernel.xclbin_path).resolve()
        insts_path = Path(npu_kernel.insts_path).resolve()
        kernel_name = npu_kernel.kernel_name

        if not xclbin_path.exists() or not xclbin_path.is_file():
            raise HostRuntimeError(
                f"xclbin {xclbin_path} does not exist or is not a file."
            )
        if not insts_path.exists() or not insts_path.is_file():
            raise HostRuntimeError(
                f"insts {insts_path} does not exist or is not a file."
            )

        xclbin = pyxrt.xclbin(str(xclbin_path))
        self._device.register_xclbin(xclbin)
        xclbin_uuid = xclbin.get_uuid()
        context = pyxrt.hw_context(self._device, xclbin_uuid)

        if kernel_name is None:
            # No name requested: default to the first kernel in the xclbin.
            kernels = xclbin.get_kernels()
            if not kernels:
                raise RuntimeError("No kernels found in xclbin")
            kernel_name = kernels[0].get_name()
        else:
            available_kernels = [k.get_name() for k in xclbin.get_kernels()]
            if kernel_name not in available_kernels:
                raise HostRuntimeError(
                    f"Kernel {kernel_name} not found in xclbin (kernels found: {available_kernels})"
                )

        insts = self.read_insts(insts_path)
        # ELF instructions come back as a pyxrt.module and need the extended
        # kernel constructor; raw instruction data uses the plain one.
        if hasattr(pyxrt, "module") and isinstance(insts, pyxrt.module):
            kernel = pyxrt.ext.kernel(context, insts, kernel_name)
        else:
            kernel = pyxrt.kernel(context, kernel_name)

        return XRTKernelHandle(kernel, xclbin, context, insts)

    def run(
        self,
        kernel_handle: XRTKernelHandle,
        args,
        trace_config=None,
        fail_on_error: bool = True,
        **kwargs,
    ) -> XRTKernelResult:
        """
        Run a loaded XRT kernel.

        Args:
            kernel_handle (XRTKernelHandle): The handle to the loaded kernel.
            args: Arguments to pass to the kernel. Callable entries are
                dropped; every remaining entry must be a ``_tensor_class``.
            trace_config (optional): Configuration for tracing. Defaults to
                None. Not used by this implementation.
            fail_on_error (bool, optional): Whether to raise an exception on kernel failure. Defaults to True.
            **kwargs: Additional arguments (unused here).

        Returns:
            XRTKernelResult: The command state and the wall-clock duration
            (``time.time_ns()`` difference) of launching and waiting on the run.

        Raises:
            HostRuntimeError: If arguments are invalid or kernel execution fails (and fail_on_error is True).
        """
        self.check_device_consistency()
        # Filter out callable functions and check arg types
        args = [a for a in args if not callable(a)]
        if not all(isinstance(a, self._tensor_class) for a in args):
            raise HostRuntimeError(
                f"The {self.__class__.__name__} can only take {self._tensor_class.__name__} as arguments, but got: {args}"
            )
        # Move every tensor to the NPU before collecting its buffer object.
        for a in args:
            a.to("npu")
        buffers = [a.buffer_object() for a in args]

        insts_bo = None
        insts_bytes = 0
        try:
            is_module = hasattr(pyxrt, "module") and isinstance(
                kernel_handle.insts, pyxrt.module
            )
            if not is_module:
                # Raw instructions need a buffer object: reuse the one stored
                # on the handle if present, else build a temporary one.
                insts_bytes = kernel_handle.insts.nbytes
                if kernel_handle.insts_bo:
                    insts_bo = kernel_handle.insts_bo
                else:
                    insts_bo = self._tensor_class(
                        kernel_handle.insts,
                        flags=pyxrt.bo.cacheable,
                        group_id=kernel_handle.kernel.group_id(1),
                        xrt_device=self._device,
                    ).buffer_object()

            start = time.time_ns()
            # NOTE(review): the leading 3 is presumably the kernel opcode -
            # confirm against the XRT/NPU calling convention.
            h = kernel_handle.kernel(3, insts_bo, insts_bytes, *buffers)
            r = h.wait()
            stop = time.time_ns()

            if fail_on_error and r != pyxrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED:
                raise HostRuntimeError(f"Kernel returned {str(r)}")
        finally:
            # delete insts buffer if it was created locally
            if insts_bo and not kernel_handle.insts_bo:
                del insts_bo

        return XRTKernelResult(r, stop - start)

    def device(self) -> "Device":
        """
        Get the device associated with this runtime.

        Returns:
            Device: A new device object (NPU1 or NPU2) matching ``self.npu_str``.

        Raises:
            HostRuntimeError: If the device string is unknown.
        """
        from aie.iron.device import NPU1, NPU2

        # Map to the classes and instantiate only the one that is needed.
        device_classes = {
            "npu1": NPU1,
            "npu2": NPU2,
        }

        device_cls = device_classes.get(self.npu_str)
        if device_cls is None:
            raise HostRuntimeError(
                f"Unknown device string: {self.npu_str}: expected npu1 or npu2"
            )
        return device_cls()


class CachedXRTKernelHandle(XRTKernelHandle):
    """
    Kernel handle handed out by a caching runtime; it can be invalidated
    when the cache drops the resources behind it.
    """

    def __init__(self, kernel, xclbin, context, insts, insts_bo=None):
        """
        Build a valid handle around the given XRT objects.

        Args:
            kernel: The XRT kernel object.
            xclbin: The XRT xclbin object.
            context: The XRT context object.
            insts: The instructions for the kernel.
            insts_bo (optional): The instruction buffer object. Defaults to None.
        """
        super().__init__(kernel, xclbin, context, insts, insts_bo)
        self._is_valid = True

    def invalidate(self):
        """
        Mark the handle unusable and drop its resource references in
        dependency order.
        """
        self._is_valid = False
        # Release order matters: the instruction BO and the kernel depend on
        # the hardware context, so they go first; the context reference is
        # dropped only after them, followed by the xclbin and instructions.
        release_order = ("insts_bo", "kernel", "context", "xclbin", "insts")
        for attr_name in release_order:
            # Tolerate repeated invalidation: skip attributes already removed.
            if hasattr(self, attr_name):
                delattr(self, attr_name)


class CachedXRTRuntime(XRTHostRuntime):
    """
    A cached version of XRTHostRuntime that caches up to n contexts,
    depending on the type of NPU.
    It reuses contexts for the same xclbin (identified by path and mtime).

    Instruction buffer objects are cached separately, keyed by instruction
    file path, mtime and kernel group id. The per-NPU default for ``n`` can
    be overridden with the ``XRT_CONTEXT_CACHE_SIZE`` environment variable.
    """

    # I got these values through experimentation on two machines
    # These values are primarily determined by the hardware/driver, and could change
    # in the future. But currently, if you exceed these sizes, you will fail to be
    # able to create a new context. At the driver level, the cached contexts are
    # a system-wide constrained resource, so caching on systems with many concurrent
    # processes trying to create contexts (as in parallel CI jobs) can be flaky.
    # TODO: use some sort of file system artifact or figure out how to query the driver
    # for the state of the cache, and how to make loading operations atomic between processes.
    NPU_CONTEXT_CACHE_SIZE = {
        "npu1": 6,
        "npu2": 32,
    }

    def __init__(self):
        """
        Initialize the CachedXRTRuntime.

        Raises:
            HostRuntimeError: If ``XRT_CONTEXT_CACHE_SIZE`` is set to a value
                that is not a positive integer, or if it is unset and no
                default cache size is known for the detected NPU.
        """
        super().__init__()
        # OrderedDict keeps entries ordered from least to most recently used:
        # hits are moved to the end and eviction pops from the front.
        self._context_cache = OrderedDict()
        self._insts_cache = OrderedDict()

        # Set default from dict if present
        cache_size = self.NPU_CONTEXT_CACHE_SIZE.get(self.npu_str)

        # Environment variable always override default values
        # TODO: should probably emit warning if exceeds recorded max size.
        env_cache_size = os.environ.get("XRT_CONTEXT_CACHE_SIZE")
        if env_cache_size is not None:
            # Environment values are strings; the size is compared against
            # len(cache) in load(), so it must be converted to an int here.
            try:
                cache_size = int(env_cache_size)
            except ValueError as err:
                raise HostRuntimeError(
                    f"XRT_CONTEXT_CACHE_SIZE must be an integer, got {env_cache_size!r}"
                ) from err

        # Error if no default and no env var
        if cache_size is None:
            raise HostRuntimeError(f"No known cache size for {self.npu_str}")
        # A size below 1 would make load() try to evict from an empty cache.
        if cache_size < 1:
            raise HostRuntimeError(
                f"Context cache size must be at least 1, got {cache_size}"
            )
        self._cache_size = cache_size

        # Release cached XRT resources in a controlled order at interpreter exit.
        atexit.register(self.cleanup)

    def cleanup(self):
        """
        Clean up cached XRT resources in dependency order.

        Instruction buffer objects are released first, then the cached
        contexts, followed by a garbage collection pass.
        """
        while self._insts_cache:
            self._evict_insts()
        while self._context_cache:
            self._evict()
        gc.collect()  # Make sure contexts are garbage collected.

    def _cleanup_entry_insts(self, entry):
        """Release instruction BOs owned by a cached context entry."""
        # Iterate over a copy: _cleanup_insts_entry removes keys from the
        # entry's "insts_keys" set while we are looping.
        for insts_key in list(entry.get("insts_keys", ())):
            insts_entry = self._insts_cache.pop(insts_key, None)
            if insts_entry is not None:
                self._cleanup_insts_entry(insts_key, insts_entry)

    def _cleanup_entry(self, entry):
        """Invalidate a context entry's handles and release its XRT objects.

        Args:
            entry: A context cache entry dict as built by ``load``. The caller
                is expected to have removed it from ``_context_cache`` already.
        """
        handles = entry["handles"]

        # Invalidate all handles that are still alive (weak references)
        for ref in handles:
            handle = ref()
            if handle:
                handle.invalidate()

        self._cleanup_entry_insts(entry)

        # Clear kernel cache so pyxrt.kernel objects are released with the context
        entry["kernels"].clear()

        # Release the hw_context by removing its strong reference from the entry dict.
        # Simply assigning a local `context = entry["context"]` and then `del context`
        # only removes the local name — entry["context"] would keep the object alive for
        # as long as any caller holds a reference to the entry dict (e.g. tests, or the
        # exception handler).  Deleting the key guarantees the refcount drops here.
        del entry["context"]

    def _evict(self):
        """Evict the least recently used context entry and release it."""
        # Pop the oldest item
        _, entry = self._context_cache.popitem(last=False)
        self._cleanup_entry(entry)
        gc.collect()

    def _cleanup_insts_entry(self, insts_key, entry):
        """Release one instruction-cache entry.

        Args:
            insts_key: The key the entry was stored under in ``_insts_cache``.
            entry: The instruction cache entry dict (already removed from the
                cache by the caller).
        """
        # Unlink the key from the context entry that created this BO.
        owner_entry = entry.get("owner_entry")
        if owner_entry is not None:
            owner_entry.get("insts_keys", set()).discard(insts_key)
        # Delete the key (not a local copy) so the refcount drops here.
        del entry["insts_bo"]

    def _evict_insts(self):
        """Evict the least recently used instruction BO entry."""
        key, entry = self._insts_cache.popitem(last=False)
        self._cleanup_insts_entry(key, entry)

    def run(
        self,
        kernel_handle: XRTKernelHandle,
        args,
        trace_config=None,
        fail_on_error: bool = True,
        only_if_loaded: bool = False,
        **kwargs,
    ) -> XRTKernelResult:
        """
        Run a loaded XRT kernel.

        Args:
            kernel_handle (XRTKernelHandle): The handle to the loaded kernel.
            args: Arguments to pass to the kernel.
            trace_config (optional): Configuration for tracing. Defaults to None.
            fail_on_error (bool, optional): Whether to raise an exception on kernel failure. Defaults to True.
            only_if_loaded (bool, optional): If True, only run if the kernel is currently loaded in the cache. Defaults to False.
            **kwargs: Additional arguments.

        Returns:
            XRTKernelResult: The result of the kernel execution.

        Raises:
            HostRuntimeError: If arguments are invalid, kernel execution fails, or kernel is not loaded (if only_if_loaded=True).
        """
        # The validity flag is only consulted when the caller asks for it.
        if (
            only_if_loaded
            and isinstance(kernel_handle, CachedXRTKernelHandle)
            and not kernel_handle._is_valid
        ):
            raise HostRuntimeError("Kernel not loaded (evicted from cache)")

        return super().run(kernel_handle, args, trace_config, fail_on_error, **kwargs)

    def load(
        self,
        npu_kernel,
        retry: bool = True,
        **kwargs,
    ) -> XRTKernelHandle:
        """
        Load an NPU kernel into the cached XRT runtime.

        A hardware context is reused when the same xclbin (path and mtime)
        was loaded before; otherwise a new one is created, evicting the least
        recently used entries if needed.

        Args:
            npu_kernel: The NPU kernel to load. Its ``xclbin_path``,
                ``insts_path`` and ``kernel_name`` attributes are read.
            retry (bool, optional): Whether to retry loading if context creation fails due to resource limits. Defaults to True.
            **kwargs: Additional arguments for loading (unused here).

        Returns:
            XRTKernelHandle: A handle to the loaded kernel.

        Raises:
            HostRuntimeError: If xclbin or insts files do not exist, or if kernel is not found.
            RuntimeError: If the xclbin contains no kernels, or if context
                creation still fails once the permitted evictions are used up.
        """
        self.check_device_consistency()
        xclbin_path = Path(npu_kernel.xclbin_path).resolve()
        insts_path = Path(npu_kernel.insts_path).resolve()
        kernel_name = npu_kernel.kernel_name

        if not xclbin_path.exists() or not xclbin_path.is_file():
            raise HostRuntimeError(
                f"xclbin {xclbin_path} does not exist or is not a file."
            )
        if not insts_path.exists() or not insts_path.is_file():
            raise HostRuntimeError(
                f"insts {insts_path} does not exist or is not a file."
            )

        # mtimes are part of the cache keys so rebuilt files miss the cache.
        xclbin_mtime = xclbin_path.stat().st_mtime
        insts_mtime = insts_path.stat().st_mtime

        # Context Cache Lookup
        context_key = (str(xclbin_path), xclbin_mtime)

        try:
            if context_key in self._context_cache:
                entry = self._context_cache[context_key]
                # Mark as most recently used.
                self._context_cache.move_to_end(context_key)
                context = entry["context"]
                xclbin = entry["xclbin"]
                # Clean up dead handles
                entry["handles"] = [
                    ref for ref in entry["handles"] if ref() is not None
                ]
            else:
                xclbin = pyxrt.xclbin(str(xclbin_path))
                xclbin_uuid = xclbin.get_uuid()

                # Make room before creating a new context.
                if len(self._context_cache) >= self._cache_size:
                    self._evict()

                self._device.register_xclbin(xclbin)

                # Try to create context, evicting if necessary
                context = None
                retries = 0
                # At most one retry per entry cached at this point.
                max_retries = len(self._context_cache) if retry else 0
                while context is None:
                    try:
                        context = pyxrt.hw_context(self._device, xclbin_uuid)
                    except RuntimeError:
                        # Context-slot exhaustion is reported differently across XRT backends.
                        # Evict cached contexts and retry, but only while cached entries remain.
                        if self._context_cache and retries < max_retries:
                            self._evict()
                            retries += 1
                        else:
                            raise

                entry = {
                    "context": context,
                    "xclbin": xclbin,
                    "kernels": {},  # kernel_name -> pyxrt.kernel (strong ref, tied to context)
                    "handles": [],
                    "insts_keys": set(),
                    "uuid": xclbin_uuid,
                }
                self._context_cache[context_key] = entry

            # Kernel Name Resolution
            if kernel_name is None:
                # No name requested: default to the first kernel in the xclbin.
                kernels = xclbin.get_kernels()
                if not kernels:
                    raise RuntimeError("No kernels found in xclbin")
                kernel_name = kernels[0].get_name()
            else:
                available_kernels = [k.get_name() for k in xclbin.get_kernels()]
                if kernel_name not in available_kernels:
                    raise HostRuntimeError(
                        f"Kernel {kernel_name} not found in xclbin (kernels found: {available_kernels})"
                    )

            insts = self.read_insts(insts_path)
            insts_bo = None
            if hasattr(pyxrt, "module") and isinstance(insts, pyxrt.module):
                # ELF flow: the kernel is built from the module, so the kernel
                # cache key includes the instruction file identity and no
                # separate instruction BO is created.
                ext_kernel_key = (kernel_name, str(insts_path), insts_mtime)
                if ext_kernel_key not in entry["kernels"]:
                    entry["kernels"][ext_kernel_key] = pyxrt.ext.kernel(
                        context, insts, kernel_name
                    )
                kernel = entry["kernels"][ext_kernel_key]
            else:
                if kernel_name not in entry["kernels"]:
                    entry["kernels"][kernel_name] = pyxrt.kernel(context, kernel_name)
                kernel = entry["kernels"][kernel_name]

                # Magic number for RyzenAI group id that will be fixed in the future. See same code at XRT:
                # https://github.com/Xilinx/XRT/blob/56222ed5cfd119dff0d5bd920735b87024e8c829/src/runtime_src/core/common/api/xrt_module.cpp#L1621
                group_id = kernel.group_id(1)
                insts_key = (str(insts_path), insts_mtime, group_id)

                if insts_key in self._insts_cache:
                    insts_entry = self._insts_cache[insts_key]
                    # Mark as most recently used.
                    self._insts_cache.move_to_end(insts_key)
                    insts_bo = insts_entry["insts_bo"]
                else:
                    if len(self._insts_cache) >= self._cache_size:
                        self._evict_insts()

                    insts_bo = self._tensor_class(
                        insts,
                        flags=pyxrt.bo.cacheable,
                        group_id=group_id,
                        xrt_device=self._device,
                    ).buffer_object()

                    # Record which context entry created the BO so it can be
                    # released together with that context.
                    insts_entry = {
                        "insts_bo": insts_bo,
                        "owner_entry": entry,
                    }
                    self._insts_cache[insts_key] = insts_entry
                    entry["insts_keys"].add(insts_key)

            kernel_handle = CachedXRTKernelHandle(
                kernel, xclbin, context, insts, insts_bo
            )
            # Weak reference: the cache must not keep handles alive, but needs
            # to invalidate live ones on eviction.
            entry["handles"].append(weakref.ref(kernel_handle))

            return kernel_handle

        except Exception:
            # On failure, drop the context entry again unless a live handle
            # still uses it, then re-raise the original error.
            if context_key in self._context_cache:
                entry = self._context_cache[context_key]
                # Clean up dead handles
                entry["handles"] = [
                    ref for ref in entry["handles"] if ref() is not None
                ]
                if not entry["handles"]:
                    del self._context_cache[context_key]
                    self._cleanup_entry(entry)
            raise