6 changes: 6 additions & 0 deletions .github/workflows/build-test-linux-x86_64.yml
@@ -459,6 +459,12 @@ jobs:
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml automatic_plugin/test_automatic_plugin_with_attrs.py
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml automatic_plugin/test_flashinfer_rmsnorm.py
popd
pushd .
# cuda-python is an optional runtime dep for the torch_tensorrt.annotation QDP layer.
python -m pip install cuda-python
cd tests/py/annotation
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_annotation_test_results.xml .
popd
L2-torchscript-tests:
name: ${{ matrix.display-name }}
156 changes: 156 additions & 0 deletions docsrc/py_api/annotation.rst
@@ -0,0 +1,156 @@
.. _torch_tensorrt_annotation_py:

torch_tensorrt.annotation
==========================

Collaborator:

    I think this should be called ``torch_tensorrt.kernels``.

Collaborator (Author):

    Hey @narendasan, should we just deprecate the naming related to annotation? I'm a bit confused here, since this annotation is really opaque to users and there is no annotation module right now.

.. currentmodule:: torch_tensorrt.annotation

.. automodule:: torch_tensorrt.annotation

.. note::

This module is **experimental**. It requires ``cuda-python`` at runtime
and TensorRT ``>=10.7.0`` (excluding the ``10.14.x`` series) for Quick
Deployable Plugin (QDP) support. Install ``cuda-python`` with
``pip install cuda-python``.
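The version constraints above can be expressed as a small guard. The bounds (``>=10.7.0``, excluding ``10.14.x``) come from the note; the helper itself is illustrative, not part of the module:

```python
def qdp_supported(trt_version: str) -> bool:
    """Illustrative check for the documented QDP constraints:
    TensorRT >= 10.7.0, excluding the 10.14.x series."""
    major, minor = (int(p) for p in trt_version.split(".")[:2])
    if (major, minor) < (10, 7):
        return False
    if (major, minor) == (10, 14):
        return False
    return True
```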

Overview
--------

The ``annotation`` module registers NVRTC-compiled CUDA C++ kernels as
TensorRT Quick Deployable Plugins with full Ahead-of-Time (AOT)
compilation support. It offers two entry points: start with the
declarative :func:`auto_cuda_kernel_plugin`, and drop down to
:func:`manual_cuda_kernel_plugin` only when your kernel falls outside
the declarative DSL:

.. list-table::
:header-rows: 1
:widths: 28 36 36

* - Entry point
- What you provide
- What you get for free
* - :func:`auto_cuda_kernel_plugin`
- A :class:`KernelSpec` dataclass (source, inputs, outputs, extras,
geometry)
- Meta, eager, and AOT functions and the PyTorch schema, all derived automatically
* - :func:`manual_cuda_kernel_plugin`
- ``aot_fn`` + ``eager_fn`` + a meta function decorated with the
one-shot decorator
- PyTorch op + TRT plugin + converter, registered together

For unary-pointwise kernels, :func:`pointwise_aot` and
:func:`pointwise_eager` produce the two callables so users can plug them
directly into :func:`manual_cuda_kernel_plugin`.
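As a rough mental model only (plain Python lists stand in for tensors; the real helpers operate on torch tensors and NVRTC-compiled kernels), a pointwise eager reference simply applies a scalar function to every element:

```python
def make_pointwise_eager(scalar_fn):
    """Toy analogue of a pointwise eager reference implementation:
    apply scalar_fn elementwise. Illustrative, not the real helper."""
    def eager(values):
        return [scalar_fn(v) for v in values]
    return eager

# Hypothetical usage for a ReLU-style unary op
relu_eager = make_pointwise_eager(lambda x: max(x, 0.0))
```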

Declarative entry point
-----------------------

.. autofunction:: auto_cuda_kernel_plugin

KernelSpec DSL
^^^^^^^^^^^^^^

.. autoclass:: KernelSpec
:members:

.. autoclass:: InputDecl
:members:

.. autoclass:: ScalarInput
:members:

.. autoclass:: OutputDecl
:members:

Shape relations
"""""""""""""""

.. autoclass:: SameAs
:members:

.. autoclass:: ReduceDims
:members:
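A sketch of the shape arithmetic a reduction relation implies, assuming reduced dimensions are dropped from the output shape (whether the real :class:`ReduceDims` instead keeps them as size 1 is not specified here):

```python
def reduced_shape(input_shape, dims):
    """Output shape after reducing `dims` of the referenced input,
    assuming reduced axes are dropped. Illustrative only."""
    drop = set(dims)
    if any(d < 0 or d >= len(input_shape) for d in drop):
        raise ValueError(
            f"reduction dims {sorted(drop)} out of range for rank {len(input_shape)}"
        )
    return tuple(s for i, s in enumerate(input_shape) if i not in drop)
```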

Extra scalar args
"""""""""""""""""

Extras are passed to the kernel between the input and output pointer
lists in :class:`KernelSpec` order.

.. autoclass:: Numel
:members:

.. autoclass:: DimSize
:members:

Launch geometry
"""""""""""""""

.. autoclass:: Elementwise
:members:

.. autoclass:: Reduction
:members:

.. autoclass:: Custom
:members:
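The geometry classes above boil down to grid/block arithmetic. A minimal sketch of the flat elementwise case, one thread per element (the 256-thread default block size is an assumption for illustration, not the module's documented default):

```python
import math

def flat_elementwise_geometry(numel: int, block: int = 256):
    """Grid/block dims for a 1-D ('flat') elementwise launch."""
    if block <= 0 or numel < 0:
        raise ValueError("block must be positive and numel non-negative")
    grid = max(1, math.ceil(numel / block))
    return (grid, 1, 1), (block, 1, 1)
```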

One-shot hand-written entry point
---------------------------------

.. autofunction:: manual_cuda_kernel_plugin

Lower-level building blocks
---------------------------

.. autofunction:: cuda_python

.. autofunction:: custom_plugin

Spec class
^^^^^^^^^^

.. autoclass:: CudaPythonSpec
:members:

Pointwise helpers
-----------------

.. autofunction:: pointwise_aot

.. autofunction:: pointwise_eager

Kernel signature convention
---------------------------

All entry points assume the ``__global__`` kernel takes its arguments in
the fixed order::

(input_ptrs..., extras..., output_ptrs...)

Pointers are ``void*`` cast to the appropriate element type. Extras
follow the order declared in :attr:`KernelSpec.extras` for the
declarative path, or the order your ``aot_fn`` builds for the manual
path.
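For example, a unary-pointwise kernel following this convention takes one input pointer, one scalar extra (here an element count), and one output pointer. The source is held as a Python string, as the NVRTC path consumes it; the kernel and the packing helper are illustrative, not part of the module:

```python
RELU_SOURCE = r"""
extern "C" __global__
void relu_kernel(const void* in_raw, int numel, void* out_raw) {
    // Fixed order: (input_ptrs..., extras..., output_ptrs...)
    const float* in = static_cast<const float*>(in_raw);
    float* out = static_cast<float*>(out_raw);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < numel) out[i] = in[i] > 0.0f ? in[i] : 0.0f;
}
"""

def pack_launch_args(input_ptrs, extras, output_ptrs):
    """Assemble launch arguments in the documented fixed order."""
    return [*input_ptrs, *extras, *output_ptrs]
```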

Error behavior
--------------

:func:`auto_cuda_kernel_plugin` validates the :class:`KernelSpec` at decorator
time and raises :class:`ValueError` for the common authoring mistakes:

- Empty or duplicate-named ``inputs`` / ``outputs``.
- ``ReduceDims(input_idx=N)`` or ``SameAs(input_idx=N)`` where ``N`` is
out of range.
- ``Numel`` / ``DimSize`` referencing a name that is not an input.
- ``dtype_from`` pointing at an unknown input.
- ``Elementwise(layout='flat')`` with a multi-dimensional block tuple.
- Invalid block sizes, ``block_size`` in :class:`Reduction`, or a
non-callable :attr:`Custom.fn`.
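Two of the checks above can be sketched in a few lines (illustrative, not the module's actual validation code):

```python
def check_unique_names(inputs, outputs):
    """Reject empty or duplicate-named inputs/outputs at decorator time."""
    if not inputs or not outputs:
        raise ValueError("inputs and outputs must both be non-empty")
    seen = set()
    for name in [*inputs, *outputs]:
        if name in seen:
            raise ValueError(f"duplicate tensor name: {name!r}")
        seen.add(name)

def check_input_idx(idx, num_inputs):
    """SameAs/ReduceDims must reference an existing input."""
    if not 0 <= idx < num_inputs:
        raise ValueError(f"input_idx {idx} out of range for {num_inputs} inputs")
```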

Shape-dependent errors, such as ``Elementwise(layout='nd', block=(16, 16))``
invoked against a 1-D output, are raised at launch time with a clear
``ValueError``, because the offending ranks are only known once concrete
tensors arrive.
1 change: 1 addition & 0 deletions docsrc/py_api/index.rst
@@ -13,6 +13,7 @@ Core
dynamo
logging
runtime
annotation
../cli/torchtrtc
../indices/supported_ops
