NVIDIA · Renaud-K · Feb 17, 2026 · Feb 17, 2026
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
@@ -14,6 +14,7 @@
 #include "common/ExecutionContext.h"
 #include "common/Executor.h"
 #include "common/ExtraPayloadProvider.h"
+#include "common/JIT.h"
 #include "common/Resources.h"
 #include "cudaq.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"

diff --git a/runtime/common/BaseRemoteSimulatorQPU.h b/runtime/common/BaseRemoteSimulatorQPU.h
@@ -10,6 +10,7 @@
 
 #include "common/ArgumentConversion.h"
 #include "common/ExecutionContext.h"
+#include "common/JIT.h"
 #include "common/RemoteKernelExecutor.h"
 #include "common/Resources.h"
 #include "common/RuntimeMLIR.h"

diff --git a/runtime/common/JIT.cpp b/runtime/common/JIT.cpp
@@ -7,31 +7,40 @@
  ******************************************************************************/
 
 #include "JIT.h"
-#include "ExecutionContext.h"
+#include "common/Environment.h"
+#include "common/Timing.h"
+#include "cudaq/Frontend/nvqpp/AttributeNames.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/ExecutionEngine/ObjectCache.h"
-#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "cudaq/Optimizer/CodeGen/Passes.h"
+#include "cudaq/Optimizer/CodeGen/QIRAttributeNames.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
+#include "cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/runtime/logger/logger.h"
 #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
-#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
-#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/raw_ostream.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/ExecutionEngine/ExecutionEngine.h"
-#include <cudaq/platform.h>
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Target/LLVMIR/Export.h"
+#include <cassert>
 #include <cxxabi.h>
-#include <llvm/Support/Error.h>
+#include <iterator>
 #include <memory>
+#include <stdexcept>
 #include <tuple>
 
 #define DEBUG_TYPE "cudaq-qpud"
@@ -148,6 +157,174 @@ cudaq::createWrappedKernel(std::string_view irString,
   return std::make_tuple(std::move(jit), callable);
 }
 
+namespace {
+/// For each entry-point LLVM function of \p module (must be a `ModuleOp`)
+/// with a required-qubits attribute: on entry, save the dynamic qubit
+/// management mode, allocate the qubits and turn that mode off; before every
+/// `llvm.return`, release the qubits, restore the mode, clear result maps.
+void insertSetupAndCleanupOperations(mlir::Operation *module) {
+  auto moduleOp = mlir::cast<mlir::ModuleOp>(module);
+  mlir::OpBuilder modBuilder(module);
+  auto *context = module->getContext();
+  auto arrayQubitTy = cudaq::opt::getArrayType(context);
+  auto voidTy = mlir::LLVM::LLVMVoidType::get(context);
+  auto boolTy = modBuilder.getI1Type();
+  mlir::FlatSymbolRefAttr allocateSymbol =
+      cudaq::opt::factory::createLLVMFunctionSymbol(
+          cudaq::opt::QIRArrayQubitAllocateArray, arrayQubitTy,
+          {modBuilder.getI64Type()}, moduleOp);
+  mlir::FlatSymbolRefAttr releaseSymbol =
+      cudaq::opt::factory::createLLVMFunctionSymbol(
+          cudaq::opt::QIRArrayQubitReleaseArray, {voidTy}, {arrayQubitTy},
+          moduleOp);
+  mlir::FlatSymbolRefAttr isDynamicSymbol =
+      cudaq::opt::factory::createLLVMFunctionSymbol(
+          cudaq::opt::QIRisDynamicQubitManagement, {boolTy}, {}, moduleOp);
+  mlir::FlatSymbolRefAttr setDynamicSymbol =
+      cudaq::opt::factory::createLLVMFunctionSymbol(
+          cudaq::opt::QIRsetDynamicQubitManagement, {voidTy}, {boolTy},
+          moduleOp);
+  mlir::FlatSymbolRefAttr clearResultMapsSymbol =
+      cudaq::opt::factory::createLLVMFunctionSymbol(
+          cudaq::opt::QIRClearResultMaps, {voidTy}, {}, moduleOp);
+
+  // Snapshot the functions first, then rewrite them one by one.
+  mlir::SmallVector<mlir::LLVM::LLVMFuncOp> funcs;
+  module->walk([&](mlir::LLVM::LLVMFuncOp func) { funcs.push_back(func); });
+  for (auto &func : funcs) {
+    if (!func->hasAttr(cudaq::entryPointAttrName))
+      continue;
+    // Stays negative (function skipped) when neither attribute is present.
+    std::int64_t num_qubits = -1;
+    if (auto requiredQubits = func->getAttrOfType<mlir::StringAttr>(
+            cudaq::opt::qir0_1::RequiredQubitsAttrName))
+      requiredQubits.strref().getAsInteger(10, num_qubits);
+    else if (auto requiredQubits = func->getAttrOfType<mlir::StringAttr>(
+                 cudaq::opt::qir1_0::RequiredQubitsAttrName))
+      requiredQubits.strref().getAsInteger(10, num_qubits);
+    if (func.getBlocks().empty() || num_qubits < 0)
+      continue;
+
+    mlir::Block &entry = func.getBlocks().front();
+    mlir::OpBuilder builder(&entry, entry.begin());
+    auto loc = builder.getUnknownLoc();
+    auto emitVoidCall = [&](mlir::FlatSymbolRefAttr callee,
+                            mlir::ValueRange operands) {
+      builder.create<mlir::LLVM::CallOp>(loc, mlir::TypeRange{voidTy}, callee,
+                                         operands);
+    };
+    auto origMode = builder.create<mlir::LLVM::CallOp>(
+        loc, mlir::TypeRange{boolTy}, isDynamicSymbol, mlir::ValueRange{});
+    auto numQubitsVal =
+        cudaq::opt::factory::genLlvmI64Constant(loc, builder, num_qubits);
+    // The attribute is typed i1 so that it matches the constant's result type.
+    auto falseVal = builder.create<mlir::LLVM::ConstantOp>(
+        loc, boolTy, builder.getIntegerAttr(boolTy, 0));
+    auto qubitAlloc = builder.create<mlir::LLVM::CallOp>(
+        loc, mlir::TypeRange{arrayQubitTy}, allocateSymbol,
+        mlir::ValueRange{numQubitsVal.getResult()});
+    emitVoidCall(setDynamicSymbol, mlir::ValueRange{falseVal.getResult()});
+
+    // Clean up on every return path, not only at the end of the last block.
+    mlir::SmallVector<mlir::LLVM::ReturnOp> returns;
+    func.walk([&](mlir::LLVM::ReturnOp ret) { returns.push_back(ret); });
+    for (auto ret : returns) {
+      builder.setInsertionPoint(ret);
+      emitVoidCall(releaseSymbol, mlir::ValueRange{qubitAlloc.getResult()});
+      emitVoidCall(setDynamicSymbol, mlir::ValueRange{origMode.getResult()});
+      emitVoidCall(clearResultMapsSymbol, mlir::ValueRange{});
+    }
+  }
+}
+} // namespace
+
+/// Lower \p moduleOp to QIR and LLVM IR and wrap it in a JIT engine;
+/// \p convertTo is handed to the wireset-to-profile QIR pipeline.
+/// \throws std::runtime_error if lowering or engine creation fails.
+cudaq::JitEngine cudaq::createQIRJITEngine(mlir::ModuleOp &moduleOp,
+                                           llvm::StringRef convertTo) {
+  ScopedTraceWithContext(cudaq::TIMING_JIT, "createQIRJITEngine");
+  // The "fast" instruction selection compilation algorithm is actually very
+  // slow for large quantum circuits. Disable that here.
+  const char *argv[] = {"", "-fast-isel=0", nullptr};
+  llvm::cl::ParseCommandLineOptions(2, argv);
+
+  mlir::ExecutionEngineOptions opts;
+  opts.transformer = [](llvm::Module *m) { return llvm::ErrorSuccess(); };
+  opts.jitCodeGenOptLevel = llvm::CodeGenOpt::None;
+  opts.llvmModuleBuilder =
+      [convertTo = convertTo.str()](
+          mlir::Operation *module,
+          llvm::LLVMContext &llvmContext) -> std::unique_ptr<llvm::Module> {
+    ScopedTraceWithContext(cudaq::TIMING_JIT,
+                           "createQIRJITEngine::llvmModuleBuilder");
+    llvmContext.setOpaquePointers(false);
+    auto *context = module->getContext();
+    mlir::PassManager pm(context);
+    bool containsWireSet =
+        module
+            ->walk<mlir::WalkOrder::PreOrder>([](quake::WireSetOp wireSetOp) {
+              return mlir::WalkResult::interrupt();
+            })
+            .wasInterrupted();
+
+    // Even though we're not lowering all the way to a real QIR profile for
+    // this emulated path, we need to pass in `convertTo` to mimic the
+    // non-emulated path.
+    if (containsWireSet)
+      cudaq::opt::addWiresetToProfileQIRPipeline(pm, convertTo);
+    else
+      cudaq::opt::addAOTPipelineConvertToQIR(pm);
+
+    if (getEnvBool("CUDAQ_MLIR_PRINT_EACH_PASS", false)) {
+      context->disableMultithreading();
+      pm.enableIRPrinting();
+    }
+
+    // Accumulate error diagnostics so the exception below can report them.
+    std::string error_msg;
+    mlir::DiagnosticEngine &engine = context->getDiagEngine();
+    auto handlerId = engine.registerHandler(
+        [&error_msg](mlir::Diagnostic &diag) -> mlir::LogicalResult {
+          if (diag.getSeverity() != mlir::DiagnosticSeverity::Error)
+            return mlir::failure();
+          error_msg += diag.str();
+          return mlir::success();
+        });
+    mlir::DefaultTimingManager tm;
+    tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
+    auto timingScope = tm.getRootScope(); // starts the timer
+    pm.enableTiming(timingScope);         // do this right before pm.run
+    const bool loweringFailed = mlir::failed(pm.run(module));
+    timingScope.stop();
+    engine.eraseHandler(handlerId);
+    if (loweringFailed)
+      throw std::runtime_error("[createQIRJITEngine] Lowering to QIR for "
+                               "remote emulation failed.\n" +
+                               error_msg);
+
+    // Insert necessary calls to qubit allocations and qubit releases if the
+    // original module contained WireSetOp's.
+    if (containsWireSet)
+      insertSetupAndCleanupOperations(module);
+
+    auto llvmModule = translateModuleToLLVMIR(module, llvmContext);
+    if (!llvmModule)
+      throw std::runtime_error(
+          "[createQIRJITEngine] Lowering to LLVM IR failed.");
+    mlir::ExecutionEngine::setupTargetTriple(llvmModule.get());
+    return llvmModule;
+  };
+
+  auto jitOrError = mlir::ExecutionEngine::create(moduleOp, opts);
+  // Checked at run time: an `assert` is compiled out under NDEBUG.
+  if (!jitOrError)
+    throw std::runtime_error(
+        "[createQIRJITEngine] ExecutionEngine creation failed: " +
+        llvm::toString(jitOrError.takeError()));
+  return JitEngine(std::move(jitOrError.get()));
+}
+
 namespace cudaq {
 class JitEngine::Impl {
 public:

diff --git a/runtime/common/JIT.h b/runtime/common/JIT.h
@@ -13,13 +13,17 @@
 #include <memory>
 #include <string>
 
-namespace llvm::orc {
+namespace llvm {
+class StringRef;
+namespace orc {
 class LLJIT;
 }
+} // namespace llvm
 
 namespace mlir {
 class ExecutionEngine;
-}
+class ModuleOp;
+} // namespace mlir
 
 namespace cudaq {
 
@@ -47,4 +51,9 @@ class JitEngine {
   class Impl;
   std::shared_ptr<Impl> impl;
 };
+
+/// Lower \p moduleOp to QIR/LLVM IR and JIT it; throws if lowering fails.
+JitEngine createQIRJITEngine(mlir::ModuleOp &moduleOp,
+                             llvm::StringRef convertTo);
+
 } // namespace cudaq