Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions runtime/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ set(COMMON_RUNTIME_SRC
SampleResult.cpp
ServerHelper.cpp
Trace.cpp
CompiledKernel.cpp
)

# Create the cudaq-common library
Expand Down
36 changes: 36 additions & 0 deletions runtime/common/CompiledKernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*******************************************************************************
* Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

#include "CompiledKernel.h"

namespace cudaq {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
namespace cudaq {

We should move to the LLVM style so as to better catch bugs, etc.

Copy link
Collaborator Author

@lmondada lmondada Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understood, I'm removing the namespace {} block.


/// Construct a compiled kernel. Takes ownership of the type-erased JIT engine;
/// \p entryPoint must be a symbol resolved from that same engine so it stays
/// valid for the lifetime of this object.
CompiledKernel::CompiledKernel(OpaquePtr<JitEngine> engine,
                               std::string kernelName, void (*entryPoint)(),
                               bool hasResult)
    : engine(std::move(engine)), name(std::move(kernelName)),
      entryPoint(entryPoint), hasResult(hasResult) {}

/// Execute the JIT-ed kernel.
///
/// If the kernel has a result, the caller must have appended the result buffer
/// as the last element of \p rawArgs; the thunk marshals the return value into
/// that buffer.
KernelThunkResultType
CompiledKernel::execute(const std::vector<void *> &rawArgs) const {
  auto funcPtr = getEntryPoint();
  if (hasResult) {
    // The thunk has signature (resultBuffer, client_server). `rawArgs.back()`
    // already yields a `void *` value, so no const_cast is needed.
    void *buff = rawArgs.back();
    return reinterpret_cast<KernelThunkResultType (*)(void *, bool)>(funcPtr)(
        buff, /*client_server=*/false);
  }
  // No result: the entry point already has the void(void) signature, so it can
  // be called directly without a cast.
  funcPtr();
  return {nullptr, 0};
}

// Raw entry point resolved from the JIT engine; trailing-return syntax keeps
// the function-pointer return type readable.
auto CompiledKernel::getEntryPoint() const -> void (*)() { return entryPoint; }

const JitEngine &CompiledKernel::getEngine() const {
  // Expose the underlying engine without transferring ownership.
  return *this->engine;
}

} // namespace cudaq
64 changes: 64 additions & 0 deletions runtime/common/CompiledKernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/****************************************************************-*- C++ -*-****
* Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/
#pragma once

#include "common/ThunkInterface.h"
#include <memory>
#include <string>
#include <vector>

namespace cudaq {

class JitEngine;

/// A unique_ptr whose deleter is a plain function pointer, allowing
/// type-erased ownership of forward-declared (incomplete) types.
template <typename T>
using OpaquePtr = std::unique_ptr<T, void (*)(T *)>;

/// Heap-construct a \c T from \p args and wrap it in an OpaquePtr whose
/// deleter captures the complete type at the point of construction.
template <typename T, typename... Args>
OpaquePtr<T> makeOpaquePtr(Args &&...args) {
  auto *object = new T(std::forward<Args>(args)...);
  // Unary plus converts the captureless lambda to the required plain
  // function pointer.
  return OpaquePtr<T>(object, +[](T *ptr) { delete ptr; });
}

/// @brief A compiled, ready-to-execute kernel.
///
/// This type does not have a dependency on MLIR (or LLVM) as it only keeps
/// type-erased pointers to JIT-related types.
///
/// The constructor is private; use the factory function in
/// `runtime/common/JIT.h` to construct instances.
class CompiledKernel {
public:
/// @brief Execute the JIT-ed kernel.
///
/// If the kernel has a return type, the caller must have appended a result
/// buffer as the last element of \p rawArgs.
KernelThunkResultType execute(const std::vector<void *> &rawArgs) const;

void (*getEntryPoint() const)();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm dubious of this. CUDA-Q does not define entry point kernels as always having void(void) signatures. So this seems like a cul-de-sac.


const JitEngine &getEngine() const;

private:
CompiledKernel(OpaquePtr<JitEngine> engine, std::string kernelName,
void (*entryPoint)(), bool hasResult);

// Use the following factory function in JIT.h to construct CompiledKernels.
friend CompiledKernel createCompiledKernel(JitEngine engine,
std::string kernelName,
bool hasResult);

OpaquePtr<JitEngine> engine;
std::string name;
void (*entryPoint)();
bool hasResult;
};

} // namespace cudaq
10 changes: 10 additions & 0 deletions runtime/common/JIT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,16 @@ cudaq::JitEngine cudaq::createQIRJITEngine(mlir::ModuleOp &moduleOp,
return JitEngine(std::move(jitOrError.get()));
}

/// Factory for CompiledKernel: resolves the kernel's entry point in \p engine
/// and hands ownership of the engine to the returned kernel.
cudaq::CompiledKernel cudaq::createCompiledKernel(JitEngine engine,
                                                  std::string kernelName,
                                                  bool hasResult) {
  // Kernels with a result are entered through the generated `.thunk` symbol
  // (which marshals the return value into a caller-provided buffer); kernels
  // without a result use the plain prefixed entry point.
  std::string entryName =
      hasResult ? kernelName + ".thunk"
                : cudaq::runtime::cudaqGenPrefixName + kernelName;
  void (*entryPoint)() = engine.lookupRawNameOrFail(entryName);
  // `entryName` was built above, so moving `engine` and `kernelName` here is
  // safe; moving the engine avoids an unnecessary copy into the OpaquePtr.
  return cudaq::CompiledKernel(
      cudaq::makeOpaquePtr<JitEngine>(std::move(engine)),
      std::move(kernelName), entryPoint, hasResult);
}

namespace cudaq {
class JitEngine::Impl {
public:
Expand Down
4 changes: 4 additions & 0 deletions runtime/common/JIT.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
******************************************************************************/
#pragma once

#include "CompiledKernel.h"
#include <cstddef>
#include <cstdint>
#include <functional>
Expand Down Expand Up @@ -56,4 +57,7 @@ class JitEngine {
JitEngine createQIRJITEngine(mlir::ModuleOp &moduleOp,
llvm::StringRef convertTo);

CompiledKernel createCompiledKernel(JitEngine engine, std::string kernelName,
bool hasResult);

} // namespace cudaq
99 changes: 17 additions & 82 deletions runtime/cudaq/platform/default/python/QPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,84 +187,22 @@ static void cacheJITForPerformance(cudaq::JitEngine jit) {

namespace {
struct PythonLauncher : public cudaq::ModuleLauncher {
cudaq::KernelThunkResultType launchModule(const std::string &name,
ModuleOp module,
const std::vector<void *> &rawArgs,
Type resultTy) override {
// In this launch scenario, we have a ModuleOp that has the entry-point
// kernel, but needs to be merged with anything else it may call. The
// merging of modules mirrors the late binding and dynamic scoping of the
// host language (Python).
ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule");
cudaq::CompiledKernel compileModule(const std::string &name, ModuleOp module,
const std::vector<void *> &rawArgs,
Type resultTy,
bool isEntryPoint) override {
// Check the ExecutionContext JIT cache first (used by cudaq.sample to
// avoid recompiling on every shot).
if (auto jit = alreadyBuiltJITCode())
return cudaq::createCompiledKernel(*jit, name, /*hasResult=*/!!resultTy);

ScopedTraceWithContext(cudaq::TIMING_LAUNCH,
"PythonLauncher::compileModule");
const bool enablePythonCodegenDump =
cudaq::getEnvBool("CUDAQ_PYTHON_CODEGEN_DUMP", false);

std::string fullName = cudaq::runtime::cudaqGenPrefixName + name;
cudaq::KernelThunkResultType result{nullptr, 0};
auto jit = alreadyBuiltJITCode();
if (!jit) {
// 1. Check that this call is sane.
if (enablePythonCodegenDump)
module.dump();
auto funcOp = module.lookupSymbol<func::FuncOp>(fullName);
if (!funcOp)
throw std::runtime_error("no kernel named " + name +
" found in module");

// 2. Merge other modules (e.g., if there are device kernel calls).
cudaq::detail::mergeAllCallableClosures(module, name, rawArgs);

// Mark all newly merged kernels private.
for (auto &op : module)
if (auto f = dyn_cast<func::FuncOp>(op))
if (f != funcOp)
f.setPrivate();

updateExecutionContext(module);

// 3. LLVM JIT the code so we can execute it.
CUDAQ_INFO("Run Argument Synth.\n");
if (enablePythonCodegenDump)
module.dump();
specializeKernel(name, module, rawArgs, resultTy,
enablePythonCodegenDump);

// 4. Execute the code right here, right now.
jit = cudaq::createQIRJITEngine(module, "qir:");
}

if (resultTy) {
// Proceed to call the .thunk function so that the result value will be
// properly marshaled into the buffer we allocated in
// appendTheResultBuffer().
// FIXME: Python ought to set up the call stack so that a legit C++ entry
// point can be called instead of winging it and duplicating what the core
// compiler already does.
auto funcPtr = jit->lookupRawNameOrFail(name + ".thunk");
void *buff = const_cast<void *>(rawArgs.back());
result = reinterpret_cast<cudaq::KernelThunkResultType (*)(void *, bool)>(
*funcPtr)(buff, /*client_server=*/false);
} else {
jit->run(name);
}
cacheJITForPerformance(jit.value());
// FIXME: actually handle results
return result;
}

void *specializeModule(const std::string &name, ModuleOp module,
const std::vector<void *> &rawArgs, Type resultTy,
std::optional<cudaq::JitEngine> &cachedEngine,
bool isEntryPoint) override {
// In this launch scenario, we have a ModuleOp that has the entry-point
// kernel, but needs to be merged with anything else it may call. The
// merging of modules mirrors the late binding and dynamic scoping of the
// host language (Python).
ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule");
const bool enablePythonCodegenDump =
cudaq::getEnvBool("CUDAQ_PYTHON_CODEGEN_DUMP", false);

std::string fullName = cudaq::runtime::cudaqGenPrefixName + name;
// 1. Check that this call is sane.
if (enablePythonCodegenDump)
module.dump();
Expand All @@ -283,23 +221,20 @@ struct PythonLauncher : public cudaq::ModuleLauncher {

updateExecutionContext(module);

// 3. LLVM JIT the code so we can execute it.
// 3. Specialize the kernel (argument synthesis, optimization).
CUDAQ_INFO("Run Argument Synth.\n");
if (enablePythonCodegenDump)
module.dump();
specializeKernel(name, module, rawArgs, resultTy, enablePythonCodegenDump,
isEntryPoint);

// 4. Execute the code right here, right now.
// 4. Lower to QIR and JIT compile.
auto jit = cudaq::createQIRJITEngine(module, "qir:");
if (cachedEngine)
throw std::runtime_error("cache must not be populated");
cachedEngine = jit;
cacheJITForPerformance(jit);

std::string entryName =
(resultTy && isEntryPoint) ? name + ".thunk" : fullName;
auto funcPtr = jit.lookupRawNameOrFail(entryName);
return reinterpret_cast<void *>(funcPtr);
return cudaq::createCompiledKernel(jit, name,
/*hasResult=*/!!resultTy &&
isEntryPoint);
}
};
} // namespace
Expand Down
12 changes: 9 additions & 3 deletions runtime/cudaq/platform/qpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ cudaq::QPU::launchModule(const std::string &name, mlir::ModuleOp module,
"No ModuleLauncher registered with name 'default'. This may be a "
"result of attempting to use `launchModule` outside Python.");
ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule", name);
return launcher->launchModule(name, module, rawArgs, resultTy);
auto compiled =
launcher->compileModule(name, module, rawArgs, resultTy, true);
return compiled.execute(rawArgs);
}

void *cudaq::QPU::specializeModule(
Expand All @@ -34,6 +36,10 @@ void *cudaq::QPU::specializeModule(
"No ModuleLauncher registered with name 'default'. This may be a "
"result of attempting to use `specializeModule` outside Python.");
ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::specializeModule", name);
return launcher->specializeModule(name, module, rawArgs, resultTy,
cachedEngine, isEntryPoint);
auto compiled =
launcher->compileModule(name, module, rawArgs, resultTy, isEntryPoint);
if (cachedEngine)
throw std::runtime_error("cache must not be populated");
cachedEngine = compiled.getEngine();
return reinterpret_cast<void *>(compiled.getEntryPoint());
}
17 changes: 8 additions & 9 deletions runtime/cudaq/platform/qpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include "QuantumExecutionQueue.h"
#include "common/CompiledKernel.h"
#include "common/Registry.h"
#include "common/ThunkInterface.h"
#include "common/Timing.h"
Expand Down Expand Up @@ -221,15 +222,13 @@ class QPU : public registry::RegisteredType<QPU> {
struct ModuleLauncher : public registry::RegisteredType<ModuleLauncher> {
virtual ~ModuleLauncher() = default;

virtual KernelThunkResultType launchModule(const std::string &name,
mlir::ModuleOp module,
const std::vector<void *> &rawArgs,
mlir::Type resultTy) = 0;
virtual void *specializeModule(const std::string &name, mlir::ModuleOp module,
const std::vector<void *> &rawArgs,
mlir::Type resultTy,
std::optional<cudaq::JitEngine> &cachedEngine,
bool isEntryPoint) = 0;
/// Compile (specialize + JIT) a kernel module and return a ready-to-execute
/// CompiledKernel.
virtual CompiledKernel compileModule(const std::string &name,
mlir::ModuleOp module,
const std::vector<void *> &rawArgs,
mlir::Type resultTy,
bool isEntryPoint) = 0;
};

} // namespace cudaq
Loading