diff --git a/runtime/common/CMakeLists.txt b/runtime/common/CMakeLists.txt
index 54068ee5198..ab171ab57c6 100644
--- a/runtime/common/CMakeLists.txt
+++ b/runtime/common/CMakeLists.txt
@@ -26,6 +26,7 @@ set(COMMON_RUNTIME_SRC
   SampleResult.cpp
   ServerHelper.cpp
   Trace.cpp
+  CompiledKernel.cpp
 )
 
 # Create the cudaq-common library
diff --git a/runtime/common/CompiledKernel.cpp b/runtime/common/CompiledKernel.cpp
new file mode 100644
index 00000000000..978224bdb49
--- /dev/null
+++ b/runtime/common/CompiledKernel.cpp
@@ -0,0 +1,35 @@
+/*******************************************************************************
+ * Copyright (c) 2026 NVIDIA Corporation & Affiliates.                         *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#include "CompiledKernel.h"
+
+namespace cudaq {
+
+CompiledKernel::CompiledKernel(JitEngine engine, std::string kernelName,
+                               void (*entryPoint)(), bool hasResult)
+    : engine(engine), name(std::move(kernelName)), entryPoint(entryPoint),
+      hasResult(hasResult) {}
+
+KernelThunkResultType
+CompiledKernel::execute(const std::vector<void *> &rawArgs) const {
+  auto funcPtr = getEntryPoint();
+  if (hasResult) {
+    void *buff = const_cast<void *>(rawArgs.back());
+    return reinterpret_cast<KernelThunkResultType (*)(void *, bool)>(funcPtr)(
+        buff, /*client_server=*/false);
+  } else {
+    reinterpret_cast<void (*)()>(funcPtr)();
+    return {nullptr, 0};
+  }
+}
+
+void (*CompiledKernel::getEntryPoint() const)() { return entryPoint; }
+
+const JitEngine CompiledKernel::getEngine() const { return engine; }
+
+} // namespace cudaq
diff --git a/runtime/common/CompiledKernel.h b/runtime/common/CompiledKernel.h
new file mode 100644
index 00000000000..9578865ca25
--- /dev/null
+++ b/runtime/common/CompiledKernel.h
@@ -0,0 +1,55 @@
+/****************************************************************-*- C++ -*-****
+ *
Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates.                    *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+#pragma once
+
+#include "common/JIT.h"
+#include "common/ThunkInterface.h"
+#include <string>
+#include <vector>
+
+namespace cudaq {
+
+/// @brief A compiled, ready-to-execute kernel.
+///
+/// This type does not have a dependency on MLIR (or LLVM) as it only keeps
+/// type-erased pointers to JIT-related types.
+///
+/// The constructor is private; use the factory function in
+/// `runtime/common/JIT.h` to construct instances.
+class CompiledKernel {
+public:
+  /// @brief Execute the JIT-ed kernel.
+  ///
+  /// If the kernel has a return type, the caller must have appended a result
+  /// buffer as the last element of \p rawArgs.
+  KernelThunkResultType execute(const std::vector<void *> &rawArgs) const;
+
+  // TODO: remove these two methods once the CompiledKernel is returned to
+  // Python.
+  void (*getEntryPoint() const)();
+  const JitEngine getEngine() const;
+
+private:
+  CompiledKernel(JitEngine engine, std::string kernelName,
+                 void (*entryPoint)(), bool hasResult);
+
+  // Use the following factory function (compiled into cudaq-mlir-runtime) to
+  // construct CompiledKernels.
+ friend CompiledKernel createCompiledKernel(JitEngine engine, + std::string kernelName, + bool hasResult); + + JitEngine engine; + std::string name; + void (*entryPoint)(); + bool hasResult; +}; + +CompiledKernel createCompiledKernel(JitEngine engine, std::string kernelName, + bool hasResult); +} // namespace cudaq diff --git a/runtime/common/JIT.cpp b/runtime/common/JIT.cpp index 49d4c1384b6..4d7ea189f5f 100644 --- a/runtime/common/JIT.cpp +++ b/runtime/common/JIT.cpp @@ -7,6 +7,7 @@ ******************************************************************************/ #include "JIT.h" +#include "CompiledKernel.h" #include "common/Environment.h" #include "common/Timing.h" #include "cudaq/Frontend/nvqpp/AttributeNames.h" @@ -325,6 +326,16 @@ cudaq::JitEngine cudaq::createQIRJITEngine(mlir::ModuleOp &moduleOp, return JitEngine(std::move(jitOrError.get())); } +cudaq::CompiledKernel cudaq::createCompiledKernel(JitEngine engine, + std::string kernelName, + bool hasResult) { + std::string fullName = cudaq::runtime::cudaqGenPrefixName + kernelName; + std::string entryName = hasResult ? 
kernelName + ".thunk" : fullName;
+  void (*entryPoint)() = engine.lookupRawNameOrFail(entryName);
+  return cudaq::CompiledKernel(engine, std::move(kernelName), entryPoint,
+                               hasResult);
+}
+
 namespace cudaq {
 class JitEngine::Impl {
 public:
diff --git a/runtime/cudaq/platform/default/python/QPU.cpp b/runtime/cudaq/platform/default/python/QPU.cpp
index 9c9653529fe..2d5cac5f62a 100644
--- a/runtime/cudaq/platform/default/python/QPU.cpp
+++ b/runtime/cudaq/platform/default/python/QPU.cpp
@@ -187,84 +187,22 @@ static void cacheJITForPerformance(cudaq::JitEngine jit) {
 namespace {
 struct PythonLauncher : public cudaq::ModuleLauncher {
-  cudaq::KernelThunkResultType launchModule(const std::string &name,
-                                            ModuleOp module,
-                                            const std::vector<void *> &rawArgs,
-                                            Type resultTy) override {
-    // In this launch scenario, we have a ModuleOp that has the entry-point
-    // kernel, but needs to be merged with anything else it may call. The
-    // merging of modules mirrors the late binding and dynamic scoping of the
-    // host language (Python).
-    ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule");
+  cudaq::CompiledKernel compileModule(const std::string &name, ModuleOp module,
+                                      const std::vector<void *> &rawArgs,
+                                      Type resultTy,
+                                      bool isEntryPoint) override {
+    // Check the ExecutionContext JIT cache first (used by cudaq.sample to
+    // avoid recompiling on every shot).
+    if (auto jit = alreadyBuiltJITCode())
+      return cudaq::createCompiledKernel(*jit, name, /*hasResult=*/!!resultTy);
+
+    ScopedTraceWithContext(cudaq::TIMING_LAUNCH,
+                           "PythonLauncher::compileModule");
     const bool enablePythonCodegenDump =
         cudaq::getEnvBool("CUDAQ_PYTHON_CODEGEN_DUMP", false);
 
     std::string fullName = cudaq::runtime::cudaqGenPrefixName + name;
-    cudaq::KernelThunkResultType result{nullptr, 0};
-    auto jit = alreadyBuiltJITCode();
-    if (!jit) {
-      // 1. Check that this call is sane.
-      if (enablePythonCodegenDump)
-        module.dump();
-      auto funcOp = module.lookupSymbol<func::FuncOp>(fullName);
-      if (!funcOp)
-        throw std::runtime_error("no kernel named " + name +
-                                 " found in module");
-      // 2. Merge other modules (e.g., if there are device kernel calls).
-      cudaq::detail::mergeAllCallableClosures(module, name, rawArgs);
-
-      // Mark all newly merged kernels private.
-      for (auto &op : module)
-        if (auto f = dyn_cast<func::FuncOp>(op))
-          if (f != funcOp)
-            f.setPrivate();
-
-      updateExecutionContext(module);
-
-      // 3. LLVM JIT the code so we can execute it.
-      CUDAQ_INFO("Run Argument Synth.\n");
-      if (enablePythonCodegenDump)
-        module.dump();
-      specializeKernel(name, module, rawArgs, resultTy,
-                       enablePythonCodegenDump);
-
-      // 4. Execute the code right here, right now.
-      jit = cudaq::createQIRJITEngine(module, "qir:");
-    }
-
-    if (resultTy) {
-      // Proceed to call the .thunk function so that the result value will be
-      // properly marshaled into the buffer we allocated in
-      // appendTheResultBuffer().
-      // FIXME: Python ought to set up the call stack so that a legit C++ entry
-      // point can be called instead of winging it and duplicating what the core
-      // compiler already does.
-      auto funcPtr = jit->lookupRawNameOrFail(name + ".thunk");
-      void *buff = const_cast<void *>(rawArgs.back());
-      result = reinterpret_cast<cudaq::KernelThunkResultType (*)(void *, bool)>(
-          *funcPtr)(buff, /*client_server=*/false);
-    } else {
-      jit->run(name);
-    }
-    cacheJITForPerformance(jit.value());
-    // FIXME: actually handle results
-    return result;
-  }
-
-  void *specializeModule(const std::string &name, ModuleOp module,
-                         const std::vector<void *> &rawArgs, Type resultTy,
-                         std::optional<cudaq::JitEngine> &cachedEngine,
-                         bool isEntryPoint) override {
-    // In this launch scenario, we have a ModuleOp that has the entry-point
-    // kernel, but needs to be merged with anything else it may call. The
-    // merging of modules mirrors the late binding and dynamic scoping of the
-    // host language (Python).
-    ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule");
-    const bool enablePythonCodegenDump =
-        cudaq::getEnvBool("CUDAQ_PYTHON_CODEGEN_DUMP", false);
-
-    std::string fullName = cudaq::runtime::cudaqGenPrefixName + name;
     // 1. Check that this call is sane.
     if (enablePythonCodegenDump)
       module.dump();
@@ -283,23 +221,20 @@ struct PythonLauncher : public cudaq::ModuleLauncher {
 
     updateExecutionContext(module);
 
-    // 3. LLVM JIT the code so we can execute it.
+    // 3. Specialize the kernel (argument synthesis, optimization).
     CUDAQ_INFO("Run Argument Synth.\n");
     if (enablePythonCodegenDump)
       module.dump();
     specializeKernel(name, module, rawArgs, resultTy, enablePythonCodegenDump,
                      isEntryPoint);
 
-    // 4. Execute the code right here, right now.
+    // 4. Lower to QIR and JIT compile.
     auto jit = cudaq::createQIRJITEngine(module, "qir:");
-    if (cachedEngine)
-      throw std::runtime_error("cache must not be populated");
-    cachedEngine = jit;
+    cacheJITForPerformance(jit);
 
-    std::string entryName =
-        (resultTy && isEntryPoint) ? name + ".thunk" : fullName;
-    auto funcPtr = jit.lookupRawNameOrFail(entryName);
-    return reinterpret_cast<void *>(funcPtr);
+    return cudaq::createCompiledKernel(jit, name,
+                                       /*hasResult=*/!!resultTy &&
+                                           isEntryPoint);
   }
 };
 } // namespace
diff --git a/runtime/cudaq/platform/qpu.cpp b/runtime/cudaq/platform/qpu.cpp
index 55a1c162683..fc2572f1d32 100644
--- a/runtime/cudaq/platform/qpu.cpp
+++ b/runtime/cudaq/platform/qpu.cpp
@@ -21,7 +21,9 @@ cudaq::QPU::launchModule(const std::string &name, mlir::ModuleOp module,
         "No ModuleLauncher registered with name 'default'.
 This may be a "
         "result of attempting to use `launchModule` outside Python.");
   ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule", name);
-  return launcher->launchModule(name, module, rawArgs, resultTy);
+  auto compiled =
+      launcher->compileModule(name, module, rawArgs, resultTy, true);
+  return compiled.execute(rawArgs);
 }
 
 void *cudaq::QPU::specializeModule(
@@ -34,6 +36,10 @@ void *cudaq::QPU::specializeModule(
         "No ModuleLauncher registered with name 'default'. This may be a "
         "result of attempting to use `specializeModule` outside Python.");
   ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::specializeModule", name);
-  return launcher->specializeModule(name, module, rawArgs, resultTy,
-                                    cachedEngine, isEntryPoint);
+  auto compiled =
+      launcher->compileModule(name, module, rawArgs, resultTy, isEntryPoint);
+  if (cachedEngine)
+    throw std::runtime_error("cache must not be populated");
+  cachedEngine = compiled.getEngine();
+  return reinterpret_cast<void *>(compiled.getEntryPoint());
 }
diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h
index 6904cdd3c0a..275830295d6 100644
--- a/runtime/cudaq/platform/qpu.h
+++ b/runtime/cudaq/platform/qpu.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "QuantumExecutionQueue.h"
+#include "common/CompiledKernel.h"
 #include "common/Registry.h"
 #include "common/ThunkInterface.h"
 #include "common/Timing.h"
@@ -221,15 +222,13 @@ class QPU : public registry::RegisteredType<QPU> {
 
 struct ModuleLauncher : public registry::RegisteredType<ModuleLauncher> {
   virtual ~ModuleLauncher() = default;
-  virtual KernelThunkResultType launchModule(const std::string &name,
-                                             mlir::ModuleOp module,
-                                             const std::vector<void *> &rawArgs,
-                                             mlir::Type resultTy) = 0;
-  virtual void *specializeModule(const std::string &name, mlir::ModuleOp module,
-                                 const std::vector<void *> &rawArgs,
-                                 mlir::Type resultTy,
-                                 std::optional<JitEngine> &cachedEngine,
-                                 bool isEntryPoint) = 0;
+  /// Compile (specialize + JIT) a kernel module and return a ready-to-execute
+  /// CompiledKernel.
+  virtual CompiledKernel compileModule(const std::string &name,
+                                       mlir::ModuleOp module,
+                                       const std::vector<void *> &rawArgs,
+                                       mlir::Type resultTy,
+                                       bool isEntryPoint) = 0;
 };
 } // namespace cudaq