From 284ff7335608f87c4165862ced39277b6577c3a4 Mon Sep 17 00:00:00 2001
From: Luca Mondada <luca@mondada.net>
Date: Tue, 10 Mar 2026 23:33:08 +0000
Subject: [PATCH 1/2] Create executeKernel function in PythonLauncher

Signed-off-by: Luca Mondada <luca@mondada.net>
---
 runtime/cudaq/platform/default/python/QPU.cpp | 136 ++++++++++--------
 1 file changed, 77 insertions(+), 59 deletions(-)
diff --git a/runtime/cudaq/platform/default/python/QPU.cpp b/runtime/cudaq/platform/default/python/QPU.cpp
index 7daccf784ef..23d11910d82 100644
--- a/runtime/cudaq/platform/default/python/QPU.cpp
+++ b/runtime/cudaq/platform/default/python/QPU.cpp
@@ -180,16 +180,65 @@ static void updateExecutionContext(ModuleOp module) {
   }
 }
 
-static std::optional<cudaq::JitEngine> alreadyBuiltJITCode() {
+static std::optional<cudaq::JitEngine>
+alreadyBuiltJITCode(const std::string &name,
+                    const std::vector<void *> &rawArgs) {
   auto *currentExecCtx = cudaq::getExecutionContext();
   if (!currentExecCtx || !currentExecCtx->allowJitEngineCaching)
     return std::nullopt;
-  if (currentExecCtx->jitEng)
+  // Reuse the engine cached on the execution context, if there is one.
+  auto jit = currentExecCtx->jitEng;
+  if (jit && cudaq::compiler_artifact::isPersistingJITEngine()) {
     CUDAQ_INFO("Loading previously compiled JIT engine for {}. This will "
                "re-run the previous job, discarding any changes to the kernel, "
                "arguments or launch configuration.",
                currentExecCtx->kernelName);
-  return currentExecCtx->jitEng;
+    // Ensure the arguments are the same as the previous launch. The thunk
+    // looks up the kernel's `.argsCreator` symbol only when it is invoked.
+    auto argsCreatorThunk = [&jit, &name]() {
+      return (void *)jit->lookupRawNameOrFail(name + ".argsCreator");
+    };
+    cudaq::compiler_artifact::checkArtifactReuse(name, rawArgs, jit.value(),
+                                                 argsCreatorThunk);
+  }
+
+  return jit;
+}
+
+/// Run kernel `name` via `.thunk` if there is an argument/result buffer, else
+/// via `jit.run`. The thunk result is returned only when `hasResult` is set.
+static cudaq::KernelThunkResultType
+executeKernel(cudaq::JitEngine jit, const std::string &name,
+              const std::vector<void *> &rawArgs, bool hasResult,
+              bool hasVariationalArgs) {
+  cudaq::KernelThunkResultType result{nullptr, 0};
+  void *buff = nullptr;
+  if (hasResult) {
+    buff = const_cast<void *>(rawArgs.back());
+  } else if (hasVariationalArgs) {
+    auto argsCreatorFn = reinterpret_cast<int64_t (*)(const void *, void **)>(
+        jit.lookupRawNameOrFail(name + ".argsCreator"));
+    argsCreatorFn(static_cast<const void *>(rawArgs.data()), &buff);
+  }
+
+  if (buff) {
+    // Call .thunk so the result is marshaled into the buffer allocated in
+    // appendTheResultBuffer(). FIXME: Python ought to set up the call stack
+    // so that a legit C++ entry point can be called instead.
+    auto funcPtr = jit.lookupRawNameOrFail(name + ".thunk");
+    result = reinterpret_cast<cudaq::KernelThunkResultType (*)(void *, bool)>(
+        funcPtr)(buff, /*client_server=*/false);
+  } else {
+    jit.run(name);
+  }
+
+  // `rawArgs.back()` is caller-owned: free only the `.argsCreator` buffer.
+  if (!hasResult && hasVariationalArgs) {
+    std::free(buff);
+    return {nullptr, 0};
+  }
+
+  return result;
 }
 
 /// In a sample launch context, the (`JIT` compiled) execution engine may be
@@ -218,7 +267,6 @@ struct PythonLauncher : public cudaq::ModuleLauncher {
         cudaq::getEnvBool("CUDAQ_PYTHON_CODEGEN_DUMP", false);
 
     std::string fullName = cudaq::runtime::cudaqGenPrefixName + name;
-    cudaq::KernelThunkResultType result{nullptr, 0};
 
     auto funcOp = module.lookupSymbol<func::FuncOp>(fullName);
     if (!funcOp)
@@ -249,70 +297,40 @@ struct PythonLauncher : public cudaq::ModuleLauncher {
         varArgIndices.clear();
     }
     const bool hasVariationalArgs = !varArgIndices.empty();
+    const bool hasResult = !!resultTy;
 
-    auto jit = alreadyBuiltJITCode();
-    if (!jit) {
-      // 1. Check that this call is sane.
-      if (enablePythonCodegenDump)
-        module.dump();
+    if (auto jit = alreadyBuiltJITCode(name, rawArgs)) {
+      return executeKernel(*jit, name, rawArgs, hasResult, hasVariationalArgs);
+    }
 
-      // 2. Merge other modules (e.g., if there are device kernel calls).
-      cudaq::detail::mergeAllCallableClosures(module, name, rawArgs);
+    // 1. Check that this call is sane.
+    if (enablePythonCodegenDump)
+      module.dump();
 
-      // Mark all newly merged kernels private.
-      for (auto &op : module)
-        if (auto f = dyn_cast<func::FuncOp>(op))
-          if (f != funcOp)
-            f.setPrivate();
+    // 2. Merge other modules (e.g., if there are device kernel calls).
+    cudaq::detail::mergeAllCallableClosures(module, name, rawArgs);
 
-      updateExecutionContext(module);
+    // Mark all newly merged kernels private.
+    for (auto &op : module)
+      if (auto f = dyn_cast<func::FuncOp>(op))
+        if (f != funcOp)
+          f.setPrivate();
 
-      // 3. LLVM JIT the code so we can execute it.
-      CUDAQ_INFO("Run Argument Synth.\n");
-      if (enablePythonCodegenDump)
-        module.dump();
-      specializeKernel(name, module, rawArgs, resultTy, enablePythonCodegenDump,
-                       /*isEntryPoint=*/true, varArgIndices);
+    updateExecutionContext(module);
 
-      // 4. Execute the code right here, right now.
-      jit = cudaq::createQIRJITEngine(module, "qir:");
-    }
+    // 3. LLVM JIT the code so we can execute it.
+    CUDAQ_INFO("Run Argument Synth.\n");
+    if (enablePythonCodegenDump)
+      module.dump();
+    specializeKernel(name, module, rawArgs, resultTy, enablePythonCodegenDump,
+                     /*isEntryPoint=*/true, varArgIndices);
 
-    if (cudaq::compiler_artifact::isPersistingJITEngine()) {
-      auto argsCreatorThunk = [&jit, &name]() {
-        return (void *)jit->lookupRawNameOrFail(name + ".argsCreator");
-      };
-      cudaq::compiler_artifact::checkArtifactReuse(name, rawArgs, jit.value(),
-                                                   argsCreatorThunk);
-    }
+    auto jit = cudaq::createQIRJITEngine(module, "qir:");
+    cacheJITForPerformance(jit);
 
-    if (resultTy) {
-      // Proceed to call the .thunk function so that the result value will be
-      // properly marshaled into the buffer we allocated in
-      // appendTheResultBuffer().
-      // FIXME: Python ought to set up the call stack so that a legit C++ entry
-      // point can be called instead of winging it and duplicating what the core
-      // compiler already does.
-      auto funcPtr = jit->lookupRawNameOrFail(name + ".thunk");
-      void *buff = const_cast<void *>(rawArgs.back());
-      result = reinterpret_cast<cudaq::KernelThunkResultType (*)(void *, bool)>(
-          *funcPtr)(buff, /*client_server=*/false);
-    } else if (hasVariationalArgs) {
-      auto argsCreatorFn = reinterpret_cast<int64_t (*)(const void *, void **)>(
-          *jit->lookupRawNameOrFail(name + ".argsCreator"));
-      void *argsBuffer = nullptr;
-      argsCreatorFn(static_cast<const void *>(rawArgs.data()), &argsBuffer);
-      auto thunkFn =
-          reinterpret_cast<cudaq::KernelThunkResultType (*)(void *, bool)>(
-              *jit->lookupRawNameOrFail(name + ".thunk"));
-      thunkFn(argsBuffer, /*client_server=*/false);
-      std::free(argsBuffer);
-    } else {
-      jit->run(name);
-    }
-    cacheJITForPerformance(jit.value());
     // FIXME: actually handle results
-    return result;
+    // 4. Execute the code right here, right now.
+    return executeKernel(jit, name, rawArgs, hasResult, hasVariationalArgs);
   }
 
   void *specializeModule(const std::string &name, ModuleOp module,

From 890416ca419456cdb95913ea5cb06cedb915921c Mon Sep 17 00:00:00 2001
From: Luca Mondada <luca@mondada.net>
Date: Wed, 11 Mar 2026 17:19:45 +0000
Subject: [PATCH 2/2] add saveArtifact

Signed-off-by: Luca Mondada <luca@mondada.net>
---
 runtime/common/ExecutionContext.cpp           | 34 +++++++++++++++----
 runtime/common/ExecutionContext.h             |  4 +++
 runtime/cudaq/platform/default/python/QPU.cpp |  5 +++
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/runtime/common/ExecutionContext.cpp b/runtime/common/ExecutionContext.cpp
index 3bfe725c819..3691900eaa8 100644
--- a/runtime/common/ExecutionContext.cpp
+++ b/runtime/common/ExecutionContext.cpp
@@ -27,18 +27,29 @@ thread_local bool reuseArtifact = false;
 
 class SavedCompilerArtifact {
 public:
+  /// Save the artifact; throws std::runtime_error if one is already saved.
+  void saveArtifact(const std::string &kernelName,
+                    const std::vector<void *> &args,
+                    const cudaq::JitEngine &engine,
+                    std::function<void *()> argsCreatorThunk) {
+    if (jitEng)
+      throw std::runtime_error(
+          "Attempted to overwrite saved compiler artifact.");
+    using ArgsCreatorFn = int64_t (*)(const void *, void **);
+    this->jitEng = engine;
+    this->argsCreator = reinterpret_cast<ArgsCreatorFn>(argsCreatorThunk());
+    this->kernelName = kernelName;
+    auto [processedSize, processedBuffer] = processArgs(args);
+    this->argSize = processedSize;
+    this->argBuff = std::move(processedBuffer);
+  }
+
   void checkArtifactReuse(const std::string &kernelName,
                           const std::vector<void *> &args,
                           const cudaq::JitEngine &engine,
                           std::function<void *()> argsCreatorThunk) {
     if (!jitEng.has_value()) {
-      jitEng = engine;
-      this->argsCreator = reinterpret_cast<int64_t (*)(const void *, void **)>(
-          argsCreatorThunk());
-      this->kernelName = kernelName;
-      auto [resSize, scopedArgBuffer] = processArgs(args);
-      this->argSize = resSize;
-      this->argBuff = std::move(scopedArgBuffer);
+      saveArtifact(kernelName, args, engine, argsCreatorThunk);
       return;
     }
 
@@ -126,6 +137,15 @@ void checkArtifactReuse(const std::string kernelName,
 
   savedArtifact.checkArtifactReuse(kernelName, args, jit, argsCreatorThunk);
 }
+
+/// Forward to the saved artifact; does nothing unless `reuseArtifact` is set.
+void saveArtifact(const std::string kernelName, const std::vector<void *> &args,
+                  const JitEngine jit,
+                  std::function<void *()> argsCreatorThunk) {
+  // `reuseArtifact` is thread_local, so the check is per calling thread.
+  if (reuseArtifact)
+    savedArtifact.saveArtifact(kernelName, args, jit, argsCreatorThunk);
+}
 } // namespace compiler_artifact
 
 ExecutionContext *getExecutionContext() { return currentExecutionContext; }
diff --git a/runtime/common/ExecutionContext.h b/runtime/common/ExecutionContext.h
index 34e56800199..3307032d3b0 100644
--- a/runtime/common/ExecutionContext.h
+++ b/runtime/common/ExecutionContext.h
@@ -225,5 +225,9 @@ bool isPersistingJITEngine();
 void checkArtifactReuse(const std::string kernelName,
                         const std::vector<void *> &args, const JitEngine jit,
                         std::function<void *()> argsCreatorThunk);
+
+void saveArtifact(const std::string kernelName, const std::vector<void *> &args,
+                  const JitEngine jit,
+                  std::function<void *()> argsCreatorThunk);
 }; // namespace compiler_artifact
 } // namespace cudaq
diff --git a/runtime/cudaq/platform/default/python/QPU.cpp b/runtime/cudaq/platform/default/python/QPU.cpp
index 23d11910d82..9e91fc659a8 100644
--- a/runtime/cudaq/platform/default/python/QPU.cpp
+++ b/runtime/cudaq/platform/default/python/QPU.cpp
@@ -327,6 +327,11 @@ struct PythonLauncher : public cudaq::ModuleLauncher {
 
     auto jit = cudaq::createQIRJITEngine(module, "qir:");
     cacheJITForPerformance(jit);
+    auto argsCreatorThunk = [&jit, &name]() {
+      return (void *)jit.lookupRawNameOrFail(name + ".argsCreator");
+    };
+    cudaq::compiler_artifact::saveArtifact(name, rawArgs, jit,
+                                           argsCreatorThunk);
 
     // FIXME: actually handle results
     // 4. Execute the code right here, right now.