Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions runtime/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ set(COMMON_RUNTIME_SRC
SampleResult.cpp
ServerHelper.cpp
Trace.cpp
CompiledKernel.cpp
)

# Create the cudaq-common library
Expand Down
36 changes: 36 additions & 0 deletions runtime/common/CompiledKernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*******************************************************************************
* Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

#include "CompiledKernel.h"

namespace cudaq {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
namespace cudaq {

We should move to the LLVM style so as to better catch bugs, etc.

Copy link
Collaborator Author

@lmondada lmondada Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understood, I'm removing the namespace {} block.


/// Construct a compiled kernel. Takes ownership of the type-erased JIT engine;
/// \p entryPoint must be a symbol resolved from that same engine so it stays
/// valid for the lifetime of this object.
CompiledKernel::CompiledKernel(OpaquePtr<JitEngine> engine,
                               std::string kernelName, void (*entryPoint)(),
                               bool hasResult)
    : engine(std::move(engine)), name(std::move(kernelName)),
      entryPoint(entryPoint), hasResult(hasResult) {}

/// Execute the JIT-ed kernel.
///
/// If the kernel has a result, the caller must have appended the result buffer
/// as the last element of \p rawArgs; the thunk marshals the return value into
/// that buffer.
KernelThunkResultType
CompiledKernel::execute(const std::vector<void *> &rawArgs) const {
  auto funcPtr = getEntryPoint();
  if (hasResult) {
    // The thunk has signature (resultBuffer, client_server). `rawArgs.back()`
    // already yields a `void *` value, so no const_cast is needed.
    void *buff = rawArgs.back();
    return reinterpret_cast<KernelThunkResultType (*)(void *, bool)>(funcPtr)(
        buff, /*client_server=*/false);
  }
  // No result: the entry point already has the void(void) signature, so it can
  // be called directly without a cast.
  funcPtr();
  return {nullptr, 0};
}

// Raw entry point resolved from the JIT engine; trailing-return syntax keeps
// the function-pointer return type readable.
auto CompiledKernel::getEntryPoint() const -> void (*)() { return entryPoint; }

const JitEngine &CompiledKernel::getEngine() const {
  // Expose the underlying engine without transferring ownership.
  return *this->engine;
}

} // namespace cudaq
64 changes: 64 additions & 0 deletions runtime/common/CompiledKernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/****************************************************************-*- C++ -*-****
* Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/
#pragma once

#include "common/ThunkInterface.h"
#include <memory>
#include <string>
#include <vector>

namespace cudaq {

class JitEngine;

/// A unique_ptr whose deleter is a plain function pointer, allowing
/// type-erased ownership of forward-declared (incomplete) types.
template <typename T>
using OpaquePtr = std::unique_ptr<T, void (*)(T *)>;

/// Heap-construct a \c T from \p args and wrap it in an OpaquePtr whose
/// deleter captures the complete type at the point of construction.
template <typename T, typename... Args>
OpaquePtr<T> makeOpaquePtr(Args &&...args) {
  auto *object = new T(std::forward<Args>(args)...);
  // Unary plus converts the captureless lambda to the required plain
  // function pointer.
  return OpaquePtr<T>(object, +[](T *ptr) { delete ptr; });
}

/// @brief A compiled, ready-to-execute kernel.
///
/// This type does not have a dependency on MLIR (or LLVM) as it only keeps
/// type-erased pointers to JIT-related types.
///
/// The constructor is private; use the factory function in
/// `runtime/common/JIT.h` to construct instances.
class CompiledKernel {
public:
/// @brief Execute the JIT-ed kernel.
///
/// If the kernel has a return type, the caller must have appended a result
/// buffer as the last element of \p rawArgs.
KernelThunkResultType execute(const std::vector<void *> &rawArgs) const;

void (*getEntryPoint() const)();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm dubious of this. CUDA-Q does not define entry point kernels as always having void(void) signatures. So this seems like a cul-de-sac.


const JitEngine &getEngine() const;

private:
CompiledKernel(OpaquePtr<JitEngine> engine, std::string kernelName,
void (*entryPoint)(), bool hasResult);

// Use the following factory function in JIT.h to construct CompiledKernels.
friend CompiledKernel createCompiledKernel(JitEngine engine,
std::string kernelName,
bool hasResult);

OpaquePtr<JitEngine> engine;
std::string name;
void (*entryPoint)();
bool hasResult;
};

} // namespace cudaq
10 changes: 10 additions & 0 deletions runtime/common/JIT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,16 @@ cudaq::JitEngine cudaq::createQIRJITEngine(mlir::ModuleOp &moduleOp,
return JitEngine(std::move(jitOrError.get()));
}

/// Factory for CompiledKernel: resolves the kernel's entry point in \p engine
/// and hands ownership of the engine to the returned kernel.
cudaq::CompiledKernel cudaq::createCompiledKernel(JitEngine engine,
                                                  std::string kernelName,
                                                  bool hasResult) {
  // Kernels with a result are entered through the generated `.thunk` symbol
  // (which marshals the return value into a caller-provided buffer); kernels
  // without a result use the plain prefixed entry point.
  std::string entryName =
      hasResult ? kernelName + ".thunk"
                : cudaq::runtime::cudaqGenPrefixName + kernelName;
  void (*entryPoint)() = engine.lookupRawNameOrFail(entryName);
  // `entryName` was built above, so moving `engine` and `kernelName` here is
  // safe; moving the engine avoids an unnecessary copy into the OpaquePtr.
  return cudaq::CompiledKernel(
      cudaq::makeOpaquePtr<JitEngine>(std::move(engine)),
      std::move(kernelName), entryPoint, hasResult);
}

namespace cudaq {
class JitEngine::Impl {
public:
Expand Down
4 changes: 4 additions & 0 deletions runtime/common/JIT.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
******************************************************************************/
#pragma once

#include "CompiledKernel.h"
#include <cstddef>
#include <cstdint>
#include <functional>
Expand Down Expand Up @@ -56,4 +57,7 @@ class JitEngine {
JitEngine createQIRJITEngine(mlir::ModuleOp &moduleOp,
llvm::StringRef convertTo);

CompiledKernel createCompiledKernel(JitEngine engine, std::string kernelName,
bool hasResult);

} // namespace cudaq
99 changes: 17 additions & 82 deletions runtime/cudaq/platform/default/python/QPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,84 +187,22 @@ static void cacheJITForPerformance(cudaq::JitEngine jit) {

namespace {
struct PythonLauncher : public cudaq::ModuleLauncher {
cudaq::KernelThunkResultType launchModule(const std::string &name,
ModuleOp module,
const std::vector<void *> &rawArgs,
Type resultTy) override {
// In this launch scenario, we have a ModuleOp that has the entry-point
// kernel, but needs to be merged with anything else it may call. The
// merging of modules mirrors the late binding and dynamic scoping of the
// host language (Python).
ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule");
cudaq::CompiledKernel compileModule(const std::string &name, ModuleOp module,
const std::vector<void *> &rawArgs,
Type resultTy,
bool isEntryPoint) override {
// Check the ExecutionContext JIT cache first (used by cudaq.sample to
// avoid recompiling on every shot).
if (auto jit = alreadyBuiltJITCode())
return cudaq::createCompiledKernel(*jit, name, /*hasResult=*/!!resultTy);

ScopedTraceWithContext(cudaq::TIMING_LAUNCH,
"PythonLauncher::compileModule");
const bool enablePythonCodegenDump =
cudaq::getEnvBool("CUDAQ_PYTHON_CODEGEN_DUMP", false);

std::string fullName = cudaq::runtime::cudaqGenPrefixName + name;
cudaq::KernelThunkResultType result{nullptr, 0};
auto jit = alreadyBuiltJITCode();
if (!jit) {
// 1. Check that this call is sane.
if (enablePythonCodegenDump)
module.dump();
auto funcOp = module.lookupSymbol<func::FuncOp>(fullName);
if (!funcOp)
throw std::runtime_error("no kernel named " + name +
" found in module");

// 2. Merge other modules (e.g., if there are device kernel calls).
cudaq::detail::mergeAllCallableClosures(module, name, rawArgs);

// Mark all newly merged kernels private.
for (auto &op : module)
if (auto f = dyn_cast<func::FuncOp>(op))
if (f != funcOp)
f.setPrivate();

updateExecutionContext(module);

// 3. LLVM JIT the code so we can execute it.
CUDAQ_INFO("Run Argument Synth.\n");
if (enablePythonCodegenDump)
module.dump();
specializeKernel(name, module, rawArgs, resultTy,
enablePythonCodegenDump);

// 4. Execute the code right here, right now.
jit = cudaq::createQIRJITEngine(module, "qir:");
}

if (resultTy) {
// Proceed to call the .thunk function so that the result value will be
// properly marshaled into the buffer we allocated in
// appendTheResultBuffer().
// FIXME: Python ought to set up the call stack so that a legit C++ entry
// point can be called instead of winging it and duplicating what the core
// compiler already does.
auto funcPtr = jit->lookupRawNameOrFail(name + ".thunk");
void *buff = const_cast<void *>(rawArgs.back());
result = reinterpret_cast<cudaq::KernelThunkResultType (*)(void *, bool)>(
*funcPtr)(buff, /*client_server=*/false);
} else {
jit->run(name);
}
cacheJITForPerformance(jit.value());
// FIXME: actually handle results
return result;
}

void *specializeModule(const std::string &name, ModuleOp module,
const std::vector<void *> &rawArgs, Type resultTy,
std::optional<cudaq::JitEngine> &cachedEngine,
bool isEntryPoint) override {
// In this launch scenario, we have a ModuleOp that has the entry-point
// kernel, but needs to be merged with anything else it may call. The
// merging of modules mirrors the late binding and dynamic scoping of the
// host language (Python).
ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule");
const bool enablePythonCodegenDump =
cudaq::getEnvBool("CUDAQ_PYTHON_CODEGEN_DUMP", false);

std::string fullName = cudaq::runtime::cudaqGenPrefixName + name;
// 1. Check that this call is sane.
if (enablePythonCodegenDump)
module.dump();
Expand All @@ -283,23 +221,20 @@ struct PythonLauncher : public cudaq::ModuleLauncher {

updateExecutionContext(module);

// 3. LLVM JIT the code so we can execute it.
// 3. Specialize the kernel (argument synthesis, optimization).
CUDAQ_INFO("Run Argument Synth.\n");
if (enablePythonCodegenDump)
module.dump();
specializeKernel(name, module, rawArgs, resultTy, enablePythonCodegenDump,
isEntryPoint);

// 4. Execute the code right here, right now.
// 4. Lower to QIR and JIT compile.
auto jit = cudaq::createQIRJITEngine(module, "qir:");
if (cachedEngine)
throw std::runtime_error("cache must not be populated");
cachedEngine = jit;
cacheJITForPerformance(jit);

std::string entryName =
(resultTy && isEntryPoint) ? name + ".thunk" : fullName;
auto funcPtr = jit.lookupRawNameOrFail(entryName);
return reinterpret_cast<void *>(funcPtr);
return cudaq::createCompiledKernel(jit, name,
/*hasResult=*/!!resultTy &&
isEntryPoint);
}
};
} // namespace
Expand Down
12 changes: 9 additions & 3 deletions runtime/cudaq/platform/qpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ cudaq::QPU::launchModule(const std::string &name, mlir::ModuleOp module,
"No ModuleLauncher registered with name 'default'. This may be a "
"result of attempting to use `launchModule` outside Python.");
ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::launchModule", name);
return launcher->launchModule(name, module, rawArgs, resultTy);
auto compiled =
launcher->compileModule(name, module, rawArgs, resultTy, true);
return compiled.execute(rawArgs);
}

void *cudaq::QPU::specializeModule(
Expand All @@ -34,6 +36,10 @@ void *cudaq::QPU::specializeModule(
"No ModuleLauncher registered with name 'default'. This may be a "
"result of attempting to use `specializeModule` outside Python.");
ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::specializeModule", name);
return launcher->specializeModule(name, module, rawArgs, resultTy,
cachedEngine, isEntryPoint);
auto compiled =
launcher->compileModule(name, module, rawArgs, resultTy, isEntryPoint);
if (cachedEngine)
throw std::runtime_error("cache must not be populated");
cachedEngine = compiled.getEngine();
return reinterpret_cast<void *>(compiled.getEntryPoint());
}
17 changes: 8 additions & 9 deletions runtime/cudaq/platform/qpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include "QuantumExecutionQueue.h"
#include "common/CompiledKernel.h"
#include "common/Registry.h"
#include "common/ThunkInterface.h"
#include "common/Timing.h"
Expand Down Expand Up @@ -221,15 +222,13 @@ class QPU : public registry::RegisteredType<QPU> {
struct ModuleLauncher : public registry::RegisteredType<ModuleLauncher> {
virtual ~ModuleLauncher() = default;

virtual KernelThunkResultType launchModule(const std::string &name,
mlir::ModuleOp module,
const std::vector<void *> &rawArgs,
mlir::Type resultTy) = 0;
virtual void *specializeModule(const std::string &name, mlir::ModuleOp module,
const std::vector<void *> &rawArgs,
mlir::Type resultTy,
std::optional<cudaq::JitEngine> &cachedEngine,
bool isEntryPoint) = 0;
/// Compile (specialize + JIT) a kernel module and return a ready-to-execute
/// CompiledKernel.
virtual CompiledKernel compileModule(const std::string &name,
mlir::ModuleOp module,
const std::vector<void *> &rawArgs,
mlir::Type resultTy,
bool isEntryPoint) = 0;
};

} // namespace cudaq
Loading