From ac01dd197f9fafb0301b0432da90698cd7e9517a Mon Sep 17 00:00:00 2001
From: Ben Howe <bhowe@nvidia.com>
Date: Fri, 11 Oct 2024 16:52:57 +0000
Subject: [PATCH 01/54] DCO Remediation Commit for Ben Howe <bhowe@nvidia.com>

I, Ben Howe <bhowe@nvidia.com>, hereby add my Signed-off-by to this commit: 86681ef67d3b76c0e468f6595e2c2524cf9b4b6c

Signed-off-by: Ben Howe <bhowe@nvidia.com>
Signed-off-by: Anna Gringauze <agringauze@nvidia.com>

From 21a87c1646f168a6465c3e51dc4fc510c1de9c43 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 17 Sep 2024 14:40:45 -0700
Subject: [PATCH 02/54] State pointer synthesis for quantum hardware

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Builder/Intrinsics.h  |   4 +
 include/cudaq/Optimizer/Transforms/Passes.td  |  38 ++++
 lib/Optimizer/Builder/Intrinsics.cpp          |   4 +
 lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp    |   3 +-
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 lib/Optimizer/Transforms/LiftArrayAlloc.cpp   |  11 +-
 .../Transforms/StateInitialization.cpp        | 146 +++++++++++++++
 python/runtime/cudaq/algorithms/py_state.cpp  |   5 +-
 .../cudaq/platform/py_alt_launch_kernel.cpp   |   2 +-
 runtime/common/ArgumentConversion.cpp         | 167 ++++++++++++++++--
 runtime/common/ArgumentConversion.h           |  22 ++-
 runtime/common/BaseRemoteRESTQPU.h            |  33 ++--
 runtime/common/BaseRestRemoteClient.h         |   4 +-
 runtime/common/CMakeLists.txt                 |   2 +-
 runtime/common/SimulationState.h              |  11 ++
 runtime/cudaq/CMakeLists.txt                  |   1 +
 runtime/cudaq/algorithms/get_state.h          |  12 ++
 .../rest/helpers/quantinuum/quantinuum.yml    |   2 +
 runtime/cudaq/qis/quantum_state.cpp           | 113 ++++++++++++
 runtime/cudaq/qis/quantum_state.h             | 151 ++++++++++++++++
 runtime/cudaq/qis/remote_state.cpp            |   2 +-
 runtime/cudaq/qis/remote_state.h              |   3 +-
 .../Remote-Sim/qvector_init_from_state.cpp    |  16 ++
 .../execution/qvector_init_from_state.cpp     | 147 +++++++++++++++
 targettests/execution/state_init.cpp          |   2 +-
 test/Quake/arg_subst-5.txt                    |  15 ++
 test/Quake/arg_subst-6.txt                    |  11 ++
 test/Quake/arg_subst_func.qke                 |  37 +++-
 test/Quake/state_init.qke                     |  37 ++++
 test/Quake/state_prep.qke                     |   2 +-
 tpls/Stim                                     |   2 +-
 31 files changed, 955 insertions(+), 51 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StateInitialization.cpp
 create mode 100644 runtime/cudaq/qis/quantum_state.cpp
 create mode 100644 runtime/cudaq/qis/quantum_state.h
 create mode 100644 targettests/execution/qvector_init_from_state.cpp
 create mode 100644 test/Quake/arg_subst-5.txt
 create mode 100644 test/Quake/arg_subst-6.txt
 create mode 100644 test/Quake/state_init.qke

diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index 30ab0e696a1..c05021b879f 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -55,6 +55,10 @@ static constexpr const char createCudaqStateFromDataFP32[] =
 // Delete a state created by the runtime functions above.
 static constexpr const char deleteCudaqState[] = "__nvqpp_cudaq_state_delete";
 
+// Placeholder that stands for the state produced by a kernel. Calls to it
+// are expected to always be replaced by the optimization passes.
+static constexpr const char getCudaqState[] = "__nvqpp_cudaq_state_get";
+
 /// Builder for lowering the clang AST to an IR for CUDA-Q. Lowering includes
 /// the transformation of both quantum and classical computation. Different
 /// features of the CUDA-Q programming model are lowered into different dialects
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 9ca3810f395..66eb4cfcb0d 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -779,6 +779,44 @@ def DeleteStates : Pass<"delete-states", "mlir::ModuleOp"> {
   }];
 }
 
+def StateInitialization : Pass<"state-initialization", "mlir::ModuleOp"> {
+  let summary =
+    "Replace `quake.init_state` instructions with a call to the kernel generating the state";
+  let description = [{
+    Argument synthesis of state pointers for quantum devices substitutes each state
+    argument with a new state created by the `__nvqpp_cudaq_state_get` intrinsic, which
+    in turn takes the name of the synthesized kernel that generated the state.
+
+    This pass completes the replacement of the `quake.init_state` instruction by:
+
+    - Replacing `quake.init_state` with a call to the kernel that the `__nvqpp_cudaq_state_get` call names.
+    - Removing all instructions that are no longer needed.
+
+    For example:
+
+    Before StateInitialization (state-initialization):
+    ```
+    func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+      %0 = cc.string_literal "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.array<i8 x 45>>
+      %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 45>>) -> !cc.ptr<i8>
+      %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+      %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
+      %4 = quake.alloca !quake.veq<?>[%3 : i64]
+      %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+      return
+    }
+    ```
+
+    After StateInitialization (state-initialization):
+    ```
+    func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+      %5 = call @__nvqpp__mlirgen__test_init_state.modified_0() : () -> !quake.veq<?>
+      return
+    }
+    ```
+  }];
+}
+
 def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
   let summary =
     "Convert state vector data into gates";
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index 12e430dc031..57c636e31dd 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -261,6 +261,10 @@ static constexpr IntrinsicCode intrinsicTable[] = {
 
     {cudaq::deleteCudaqState, {}, R"#(
   func.func private @__nvqpp_cudaq_state_delete(%p : !cc.ptr<!cc.state>) -> ()
+  )#"},
+
+    {cudaq::getCudaqState, {}, R"#(
+  func.func private @__nvqpp_cudaq_state_get(%p : !cc.ptr<i8>) -> !cc.ptr<!cc.state>
   )#"},
 
     {cudaq::getNumQubitsFromCudaqState, {}, R"#(
diff --git a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
index 4de20fd7bef..04eac5b06f7 100644
--- a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
+++ b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
@@ -49,7 +49,8 @@ struct VerifyNVQIRCallOpsPass
           cudaq::getNumQubitsFromCudaqState,
           cudaq::createCudaqStateFromDataFP32,
           cudaq::createCudaqStateFromDataFP64,
-          cudaq::deleteCudaqState};
+          cudaq::deleteCudaqState,
+          cudaq::getCudaqState};
       // It must be either NVQIR extension functions or in the allowed list.
       return std::find(NVQIR_FUNCS.begin(), NVQIR_FUNCS.end(), functionName) !=
                  NVQIR_FUNCS.end() ||
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index a6b94d9a596..f107d78bde6 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -50,6 +50,7 @@ add_cudaq_library(OptTransforms
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
+  StateInitialization.cpp
   StatePreparation.cpp
   UnitarySynthesis.cpp
   WiresToWiresets.cpp
diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
index 9328b78896d..8cf6a019f8b 100644
--- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
+++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp
@@ -170,9 +170,10 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
         if (auto load = dyn_cast<cudaq::cc::LoadOp>(useuser)) {
           rewriter.setInsertionPointAfter(useuser);
           LLVM_DEBUG(llvm::dbgs() << "replaced load\n");
-          rewriter.replaceOpWithNewOp<cudaq::cc::ExtractValueOp>(
-              load, eleTy, conArr,
-              ArrayRef<cudaq::cc::ExtractValueArg>{offset});
+          auto extract = rewriter.create<cudaq::cc::ExtractValueOp>(
+              loc, eleTy, conArr, ArrayRef<cudaq::cc::ExtractValueArg>{offset});
+          rewriter.replaceAllUsesWith(load, extract);
+          toErase.push_back(load);
           continue;
         }
         if (isa<cudaq::cc::StoreOp>(useuser))
@@ -199,8 +200,10 @@ class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> {
       toErase.push_back(alloc);
     }
 
-    for (auto *op : toErase)
+    for (auto *op : toErase) {
+      op->dropAllUses();
       rewriter.eraseOp(op);
+    }
 
     return success();
   }
diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/StateInitialization.cpp
new file mode 100644
index 00000000000..3a122f02a7b
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateInitialization.cpp
@@ -0,0 +1,146 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+#include <span>
+
+namespace cudaq::opt {
+#define GEN_PASS_DEF_STATEINITIALIZATION
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
+#define DEBUG_TYPE "state-initialization"
+
+using namespace mlir;
+
+namespace {
+
+/// Return true when `callOp` is a non-null `func.call` whose callee symbol
+/// matches one of `names`.
+static bool isCall(Operation *callOp, std::vector<const char *> &&names) {
+  auto call = dyn_cast_or_null<func::CallOp>(callOp);
+  if (!call || !call.getCalleeAttr())
+    return false;
+  const std::string callee = call.getCalleeAttr().getValue().str();
+  return std::find(names.begin(), names.end(), callee) != names.end();
+}
+
+/// Return true when `callOp` is a call to the `cudaq::getCudaqState`
+/// placeholder intrinsic.
+static bool isGetStateCall(Operation *callOp) {
+  return isCall(callOp, {cudaq::getCudaqState});
+}
+
+/// Return true when `callOp` calls `cudaq::getNumQubitsFromCudaqState`.
+static bool isNumberOfQubitsCall(Operation *callOp) {
+  return isCall(callOp, {cudaq::getNumQubitsFromCudaqState});
+}
+
+// clang-format off
+/// Replace `quake.init_state` by a call to a (modified) kernel that produced the state.
+/// ```
+///  %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
+///  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
+///  %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+///  %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
+///  %4 = quake.alloca !quake.veq<?>[%3 : i64]
+///  %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+/// ───────────────────────────────────────────
+/// ...
+///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
+/// ```
+// clang-format on
+class StateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  /// Rewrite `initState` into a call of the kernel whose name is the string
+  /// literal passed (through a `cc.cast`) to `__nvqpp_cudaq_state_get`.
+  /// Returns failure, leaving the IR untouched, for any other operand shape.
+  LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
+                                PatternRewriter &rewriter) const override {
+    // Operands may be block arguments, in which case there is no defining op.
+    Operation *allocaOp = initState.getOperand(0).getDefiningOp();
+    Operation *getStateOp = initState.getOperand(1).getDefiningOp();
+    if (!allocaOp || !isGetStateCall(getStateOp))
+      return failure();
+    auto castOp = getStateOp->getOperand(0).getDefiningOp<cudaq::cc::CastOp>();
+    if (!castOp)
+      return failure();
+    auto literal =
+        castOp.getOperand().getDefiningOp<cudaq::cc::CreateStringLiteralOp>();
+    if (!literal)
+      return failure();
+
+    // A `quake.alloca` without a size operand has no qubit-count producer.
+    Operation *numOfQubits = allocaOp->getNumOperands() > 0
+                                 ? allocaOp->getOperand(0).getDefiningOp()
+                                 : nullptr;
+
+    auto call = rewriter.create<func::CallOp>(
+        initState.getLoc(), initState.getType(), literal.getStringLiteral(),
+        mlir::ValueRange{});
+    rewriter.replaceOp(initState, call.getResults());
+
+    // Erase the producers users-first, and only once nothing else uses them;
+    // dropping live uses would leave other operations with null operands.
+    auto eraseIfDead = [&rewriter](Operation *op) {
+      if (op->use_empty())
+        rewriter.eraseOp(op);
+    };
+    eraseIfDead(allocaOp);
+    if (isNumberOfQubitsCall(numOfQubits))
+      eraseIfDead(numOfQubits);
+    eraseIfDead(getStateOp);
+    eraseIfDead(castOp);
+    eraseIfDead(literal);
+    return success();
+  }
+};
+
+/// Module pass that applies `StateInitPattern` to every `func.func` that is
+/// a direct child of the module.
+class StateInitializationPass
+    : public cudaq::opt::impl::StateInitializationBase<
+          StateInitializationPass> {
+public:
+  using StateInitializationBase::StateInitializationBase;
+
+  void runOnOperation() override {
+    auto *ctx = &getContext();
+    for (Operation &op : *getOperation().getBody()) {
+      auto func = dyn_cast<func::FuncOp>(op);
+      if (!func)
+        continue;
+
+      // The pattern set is moved into the greedy driver below, so a fresh
+      // one is built for each function.
+      RewritePatternSet patterns(ctx);
+      patterns.insert<StateInitPattern>(ctx);
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Before state initialization: " << func << '\n');
+      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
+                                              std::move(patterns))))
+        signalPassFailure();
+      LLVM_DEBUG(llvm::dbgs()
+                 << "After state initialization: " << func << '\n');
+    }
+  }
+};
+} // namespace
diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp
index 77a8e4a36d0..74e098ebbf9 100644
--- a/python/runtime/cudaq/algorithms/py_state.cpp
+++ b/python/runtime/cudaq/algorithms/py_state.cpp
@@ -96,8 +96,9 @@ class PyRemoteSimulationState : public RemoteSimulationState {
     }
   }
 
-  std::pair<std::string, std::vector<void *>> getKernelInfo() const override {
-    return {kernelName, argsData->getArgs()};
+  std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const override {
+    return std::make_pair(kernelName, argsData->getArgs());
   }
 
   std::complex<double> overlap(const cudaq::SimulationState &other) override {
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index b91627de9fc..a7531f9caa1 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -517,7 +517,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   auto isLocalSimulator = platform.is_simulator() && !platform.is_emulated();
   auto isSimulator = isLocalSimulator || isRemoteSimulator;
 
-  cudaq::opt::ArgumentConverter argCon(name, unwrap(module), isSimulator);
+  cudaq::opt::ArgumentConverter argCon(name, unwrap(module));
   argCon.gen(runtimeArgs.getArgs());
   std::string kernName = cudaq::runtime::cudaqGenPrefixName + name;
   SmallVector<StringRef> kernels = {kernName};
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 424cbd8873d..83e4dd3725c 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -10,6 +10,8 @@
 #include "cudaq.h"
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
 #include "cudaq/Todo.h"
 #include "cudaq/qis/pauli_word.h"
 #include "cudaq/utils/registry.h"
@@ -97,11 +99,25 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
                          ModuleOp substMod, llvm::DataLayout &);
 
 static Value genConstant(OpBuilder &builder, const cudaq::state *v,
-                         ModuleOp substMod, llvm::DataLayout &layout,
-                         llvm::StringRef kernelName, bool isSimulator) {
-  if (isSimulator) {
-    // The program is executed remotely, materialize the simulation data
-    // into an array and create a new state from it.
+                         llvm::DataLayout &layout,
+                         cudaq::opt::ArgumentConverter &converter) {
+  auto simState =
+      cudaq::state_helper::getSimulationState(const_cast<cudaq::state *>(v));
+
+  auto kernelName = converter.getKernelName();
+  auto sourceMod = converter.getSourceModule();
+  auto substMod = converter.getSubstitutionModule();
+
+  // If the state has amplitude data, we materialize the data as a state
+  // vector and create a new state from it.
+  // TODO: how to handle density matrices? Should we just inline calls?
+  if (simState->hasData()) {
+    // The call below might cause lazy execution of the state kernel.
+    // TODO: In the lazy execution scenario on remote simulators, the kernel
+    // info is also available on the state before the state kernel has to be
+    // run to compute its data (which might cause significant data transfer).
+    // Investigate whether it is more performant to use the other synthesis
+    // option in that case (see the next `if`).
     auto numQubits = v->get_num_qubits();
 
     // We currently only synthesize small states.
@@ -130,11 +146,11 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
       std::string name =
           kernelName.str() + ".rodata_synth_" + std::to_string(counter++);
       irBuilder.genVectorOfConstants(loc, substMod, name, vec);
-      auto conGlobal = builder.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
-      return builder.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
+
+      return builder.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
     };
 
-    auto conArr = is64Bit ? genConArray.template operator()<double>()
+    auto buffer = is64Bit ? genConArray.template operator()<double>()
                           : genConArray.template operator()<float>();
 
     auto createState = is64Bit ? cudaq::createCudaqStateFromDataFP64
@@ -146,21 +162,111 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto stateTy = cudaq::cc::StateType::get(ctx);
     auto statePtrTy = cudaq::cc::PointerType::get(stateTy);
     auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
-    auto buffer = builder.create<cudaq::cc::AllocaOp>(loc, arrTy);
-    builder.create<cudaq::cc::StoreOp>(loc, conArr, buffer);
 
     auto cast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, buffer);
     auto statePtr = builder
                         .create<func::CallOp>(loc, statePtrTy, createState,
                                               ValueRange{cast, arrSize})
                         .getResult(0);
+    return builder.create<cudaq::cc::CastOp>(loc, statePtrTy, statePtr);
+  }
+
+  // For quantum hardware, replace states with calls to kernels that generated
+  // them.
+  if (simState->getKernelInfo().has_value()) {
+    auto [calleeName, calleeArgs] = simState->getKernelInfo().value();
+
+    std::string calleeKernelName =
+        cudaq::runtime::cudaqGenPrefixName + calleeName;
+
+    auto ctx = builder.getContext();
+    auto loc = builder.getUnknownLoc();
 
-    // TODO: Delete the new state before function exit.
+    auto code = cudaq::get_quake_by_name(calleeName, /*throwException=*/false);
+    assert(!code.empty() && "Quake code not found for callee");
+    auto fromModule = parseSourceString<ModuleOp>(code, ctx);
+
+    static unsigned counter = 0;
+    std::string modifiedCalleeName =
+        calleeName + ".modified_" + std::to_string(counter++);
+    std::string modifiedCalleeKernelName =
+        cudaq::runtime::cudaqGenPrefixName + modifiedCalleeName;
+
+    // Create callee.modified that returns concat of veq allocations.
+    auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
+    assert(calleeFunc && "callee is missing");
+    auto argTypes = calleeFunc.getArgumentTypes();
+    auto retType = quake::VeqType::getUnsized(ctx);
+    auto funcTy = FunctionType::get(ctx, argTypes, {retType});
+
+    {
+      OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToEnd(sourceMod.getBody());
+
+      auto modifiedCalleeFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+      modifiedCalleeFunc.setName(modifiedCalleeKernelName);
+      modifiedCalleeFunc.setType(funcTy);
+      modifiedCalleeFunc.setPrivate();
+
+      OpBuilder modifiedBuilder(ctx);
+      SmallVector<Value> allocations;
+      SmallVector<Operation *> cleanUps;
+      for (auto &op : modifiedCalleeFunc.getOps()) {
+        if (auto alloc = dyn_cast<quake::AllocaOp>(op)) {
+          allocations.push_back(alloc.getResult());
+          // Replace by the result of quake.init_state if used by it
+          for (auto *user : op.getUsers()) {
+            if (auto init = dyn_cast<quake::InitializeStateOp>(*user)) {
+              allocations.pop_back();
+              allocations.push_back(init.getResult());
+            }
+          }
+        }
+        if (auto retOp = dyn_cast<func::ReturnOp>(op)) {
+          if (retOp.getOperands().size() == 0) {
+            modifiedBuilder.setInsertionPointAfter(retOp);
+            assert(allocations.size() > 0 && "No veq allocations found");
+            Value ret = modifiedBuilder.create<quake::ConcatOp>(
+                loc, quake::VeqType::getUnsized(ctx), allocations);
+            modifiedBuilder.create<func::ReturnOp>(loc, ret);
+            cleanUps.push_back(retOp);
+          }
+        }
+      }
+      for (auto *op : cleanUps) {
+        op->dropAllUses();
+        op->erase();
+      }
+    }
+
+    // Create substitutions for the `callee.modified.N`.
+    converter.genCallee(modifiedCalleeName, calleeArgs);
+
+    // Create a subst for state pointer.
+    auto strLitTy = cudaq::cc::PointerType::get(
+        cudaq::cc::ArrayType::get(builder.getContext(), builder.getI8Type(),
+                                  modifiedCalleeKernelName.size() + 1));
+    auto callee = builder.create<cudaq::cc::CreateStringLiteralOp>(
+        loc, strLitTy, builder.getStringAttr(modifiedCalleeKernelName));
+
+    auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
+    auto calleeCast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, callee);
+
+    cudaq::IRBuilder irBuilder(ctx);
+    auto result = irBuilder.loadIntrinsic(substMod, cudaq::getCudaqState);
+    assert(succeeded(result) && "loading intrinsic should never fail");
+
+    auto statePtrTy =
+        cudaq::cc::PointerType::get(cudaq::cc::StateType::get(ctx));
+    auto statePtr =
+        builder
+            .create<func::CallOp>(loc, statePtrTy, cudaq::getCudaqState,
+                                  ValueRange{calleeCast})
+            .getResult(0);
     return builder.create<cudaq::cc::CastOp>(loc, statePtrTy, statePtr);
   }
-  // The program is executed on quantum hardware, state data is not
-  // available and needs to be regenerated.
-  TODO("cudaq::state* argument synthesis for quantum hardware");
+
+  TODO("cudaq::state* argument synthesis for quantum hardware for c functions");
   return {};
 }
 
@@ -326,7 +432,7 @@ cudaq::opt::ArgumentConverter::ArgumentConverter(StringRef kernelName,
                                                  ModuleOp sourceModule,
                                                  bool isSimulator)
     : sourceModule(sourceModule), builder(sourceModule.getContext()),
-      kernelName(kernelName), isSimulator(isSimulator) {
+      kernelName(kernelName) {
   substModule = builder.create<ModuleOp>(builder.getUnknownLoc());
 }
 
@@ -335,7 +441,7 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
   // We should look up the input type signature here.
 
   auto fun = sourceModule.lookupSymbol<func::FuncOp>(
-      cudaq::runtime::cudaqGenPrefixName + kernelName.str());
+      cudaq::runtime::cudaqGenPrefixName + kernelName);
   FunctionType fromFuncTy = fun.getFunctionType();
   for (auto iter :
        llvm::enumerate(llvm::zip(fromFuncTy.getInputs(), arguments))) {
@@ -403,8 +509,7 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
             .Case([&](cc::PointerType ptrTy) -> cc::ArgumentSubstitutionOp {
               if (ptrTy.getElementType() == cc::StateType::get(ctx))
                 return buildSubst(static_cast<const state *>(argPtr),
-                                  substModule, dataLayout, kernelName,
-                                  isSimulator);
+                                  dataLayout, *this);
               return {};
             })
             .Case([&](cc::StdvecType ty) {
@@ -457,3 +562,29 @@ void cudaq::opt::ArgumentConverter::gen_drop_front(
   }
   gen(partialArgs);
 }
+
+/// Collect the prefixed kernel name and the printed substitution module of
+/// this converter, followed by those of its callee converters, recursively.
+std::pair<std::vector<std::string>, std::vector<std::string>>
+cudaq::opt::ArgumentConverter::collectAllSubstitutions() {
+  std::vector<std::string> kernelNames;
+  std::vector<std::string> substTexts;
+
+  // Pre-order walk: a converter's own entry precedes those of its callees.
+  std::function<void(ArgumentConverter &)> visit =
+      [&](ArgumentConverter &converter) {
+        kernelNames.push_back(cudaq::runtime::cudaqGenPrefixName +
+                              converter.getKernelName().str());
+
+        std::string text;
+        llvm::raw_string_ostream os(text);
+        os << converter.getSubstitutionModule();
+        substTexts.push_back(text);
+
+        for (auto &callee : converter.getCalleeConverters())
+          visit(callee);
+      };
+
+  visit(*this);
+  return {kernelNames, substTexts};
+}
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 45e6607b0c9..be438fe66ca 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -14,6 +14,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Types.h"
 #include <unordered_set>
+#include <vector>
 
 namespace cudaq::opt {
 
@@ -47,13 +48,30 @@ class ArgumentConverter {
   /// created.
   mlir::ModuleOp getSubstitutionModule() { return substModule; }
 
+  mlir::ModuleOp getSourceModule() { return sourceModule; }
+
+  mlir::StringRef getKernelName() { return kernelName; }
+
+  void genCallee(std::string &calleeName, std::vector<void *> &args) {
+    auto converter = ArgumentConverter(calleeName, sourceModule);
+    converter.gen(args);
+    calleeConverters.push_back(converter);
+  }
+
+  std::vector<ArgumentConverter> &getCalleeConverters() {
+    return calleeConverters;
+  }
+
+  std::pair<std::vector<std::string>, std::vector<std::string>>
+  collectAllSubstitutions();
+
 private:
   mlir::ModuleOp sourceModule;
   mlir::ModuleOp substModule;
   mlir::OpBuilder builder;
-  mlir::StringRef kernelName;
+  std::string kernelName;
   mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
-  bool isSimulator;
+  std::vector<ArgumentConverter> calleeConverters;
 };
 
 } // namespace cudaq::opt
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 61c26dc791f..41f45b6b759 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -393,15 +393,18 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (!func->hasAttr(cudaq::entryPointAttrName))
       func->setAttr(cudaq::entryPointAttrName, builder.getUnitAttr());
     auto moduleOp = builder.create<mlir::ModuleOp>();
-    moduleOp.push_back(func.clone());
     moduleOp->setAttrs(m_module->getAttrDictionary());
 
     for (auto &op : m_module.getOps()) {
-      // Add any global symbols, including global constant arrays.
-      // Global constant arrays can be created during compilation,
-      // `lift-array-value`, `quake-synthesizer`, and `get-concrete-matrix`
-      // passes.
-      if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
+      if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
+        // Add quantum kernels defined in the module.
+        if (funcOp->hasAttr(cudaq::kernelAttrName) ||
+            funcOp.getName().startswith("__nvqpp__mlirgen__") ||
+            funcOp.getBody().empty())
+          moduleOp.push_back(funcOp.clone());
+      }
+      // Add globals defined in the module.
+      if (auto globalOp = dyn_cast<cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());
     }
 
@@ -428,16 +431,18 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
         cudaq::info("Run Argument Synth.\n");
-        opt::ArgumentConverter argCon(kernelName, moduleOp, false);
+        opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
-        std::string kernName = cudaq::runtime::cudaqGenPrefixName + kernelName;
-        mlir::SmallVector<mlir::StringRef> kernels = {kernName};
-        std::string substBuff;
-        llvm::raw_string_ostream ss(substBuff);
-        ss << argCon.getSubstitutionModule();
-        mlir::SmallVector<mlir::StringRef> substs = {substBuff};
+        auto [kernels, substs] = argCon.collectAllSubstitutions();
         pm.addNestedPass<mlir::func::FuncOp>(
-            opt::createArgumentSynthesisPass(kernels, substs));
+            cudaq::opt::createArgumentSynthesisPass(
+                mlir::SmallVector<mlir::StringRef>{kernels.begin(),
+                                                   kernels.end()},
+                mlir::SmallVector<mlir::StringRef>{substs.begin(),
+                                                   substs.end()}));
+        pm.addPass(mlir::createCanonicalizerPass());
+        pm.addPass(opt::createDeleteStates());
+        pm.addPass(opt::createStateInitialization());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index b938815d926..5384d71008a 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -329,8 +329,8 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (!castedState1 || !castedState2)
         throw std::runtime_error(
             "Invalid execution context: input states are not compatible");
-      auto [kernelName1, args1] = castedState1->getKernelInfo();
-      auto [kernelName2, args2] = castedState2->getKernelInfo();
+      auto [kernelName1, args1] = castedState1->getKernelInfo().value();
+      auto [kernelName2, args2] = castedState2->getKernelInfo().value();
       cudaq::IRPayLoad stateIrPayload1, stateIrPayload2;
 
       stateIrPayload1.entryPoint = kernelName1;
diff --git a/runtime/common/CMakeLists.txt b/runtime/common/CMakeLists.txt
index bb8a5ecaba6..e1a38c4e257 100644
--- a/runtime/common/CMakeLists.txt
+++ b/runtime/common/CMakeLists.txt
@@ -102,7 +102,7 @@ set_source_files_properties(
     JIT.cpp
     Logger.cpp
     RuntimeMLIR.cpp
-  PROPERTIES COMPILE_FLAGS -fno-rtti
+#  PROPERTIES COMPILE_FLAGS -fno-rtti
 )
 
 target_include_directories(cudaq-mlir-runtime
diff --git a/runtime/common/SimulationState.h b/runtime/common/SimulationState.h
index 3ec97f2568f..694770fa482 100644
--- a/runtime/common/SimulationState.h
+++ b/runtime/common/SimulationState.h
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <complex>
 #include <memory>
+#include <optional>
 #include <variant>
 #include <vector>
 
@@ -140,6 +141,16 @@ class SimulationState {
     return createFromSizeAndPtr(size, ptr, data.index());
   }
 
+  /// @brief True if the state's amplitudes or density matrix are available
+  /// or can be computed.
+  virtual bool hasData() const { return true; }
+
+  /// @brief Helper to retrieve (kernel name, `args` pointers)
+  virtual std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const {
+    return std::nullopt;
+  }
+
   /// @brief Return the tensor at the given index. Throws
   /// for an invalid tensor index.
   virtual Tensor getTensor(std::size_t tensorIdx = 0) const = 0;
diff --git a/runtime/cudaq/CMakeLists.txt b/runtime/cudaq/CMakeLists.txt
index 9c08eef3543..2efb8824e7b 100644
--- a/runtime/cudaq/CMakeLists.txt
+++ b/runtime/cudaq/CMakeLists.txt
@@ -20,6 +20,7 @@ add_library(${LIBRARY_NAME}
                 platform/quantum_platform.cpp
                 qis/execution_manager_c_api.cpp
                 qis/execution_manager.cpp
+                qis/quantum_state.cpp
                 qis/remote_state.cpp
                 qis/state.cpp
                 utils/cudaq_utils.cpp
diff --git a/runtime/cudaq/algorithms/get_state.h b/runtime/cudaq/algorithms/get_state.h
index bbb64ebcbfc..a57fa0194e6 100644
--- a/runtime/cudaq/algorithms/get_state.h
+++ b/runtime/cudaq/algorithms/get_state.h
@@ -14,6 +14,7 @@
 #include "cudaq/host_config.h"
 #include "cudaq/platform.h"
 #include "cudaq/platform/QuantumExecutionQueue.h"
+#include "cudaq/qis/quantum_state.h"
 #include "cudaq/qis/remote_state.h"
 #include "cudaq/qis/state.h"
 #include <complex>
@@ -118,6 +119,17 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
     return state(new RemoteSimulationState(std::forward<QuantumKernel>(kernel),
                                            std::forward<Args>(args)...));
   }
+#endif
+#if defined(CUDAQ_QUANTUM_DEVICE)
+  // Quantum hardware: the state is represented by the kernel name and a copy
+  // of the arguments, which requires quake code to be found for the kernel.
+  // Without it (c-like function kernels) the state cannot be synthesized.
+  if (cudaq::get_quake_by_name(cudaq::getKernelName(kernel), false).empty())
+    throw std::runtime_error(
+        "cudaq::state* argument synthesis is not supported for quantum "
+        "hardware for c-like functions, use class kernels instead");
+  return state(new QuantumState(std::forward<QuantumKernel>(kernel),
+                                std::forward<Args>(args)...));
 #endif
   return details::extractState([&]() mutable {
     cudaq::invokeKernel(std::forward<QuantumKernel>(kernel),
diff --git a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
index 21cc45be1e3..0a291a240d2 100644
--- a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/runtime/cudaq/qis/quantum_state.cpp b/runtime/cudaq/qis/quantum_state.cpp
new file mode 100644
index 00000000000..faaae5b510a
--- /dev/null
+++ b/runtime/cudaq/qis/quantum_state.cpp
@@ -0,0 +1,113 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "quantum_state.h"
+#include "common/Logger.h"
+
+namespace cudaq {
+
+QuantumState::~QuantumState() {
+  // Flush any info log cached from the remote execution.
+  if (!platformExecutionLog.empty()) {
+    printf("%s\n", platformExecutionLog.c_str());
+    platformExecutionLog.clear();
+  }
+
+  // Release every stored argument copy with its matching deleter.
+  for (std::size_t idx = 0; idx < args.size(); ++idx)
+    deleters[idx](args[idx]);
+  args.clear();
+  deleters.clear();
+}
+
+std::size_t QuantumState::getNumQubits() const {
+  throw std::runtime_error(
+      "getNumQubits is not implemented for quantum hardware");
+}
+
+cudaq::SimulationState::Tensor
+QuantumState::getTensor(std::size_t tensorIdx) const {
+  throw std::runtime_error("getTensor is not implemented for quantum hardware");
+}
+
+/// @brief Return all tensors that represent this state.
+/// Always throws: not implemented for quantum hardware.
+std::vector<cudaq::SimulationState::Tensor> QuantumState::getTensors() const {
+  throw std::runtime_error(
+      "getTensors is not implemented for quantum hardware");
+}
+
+/// @brief Return the number of tensors that represent this state.
+std::size_t QuantumState::getNumTensors() const {
+  throw std::runtime_error(
+      "getNumTensors is not implemented for quantum hardware");
+}
+
+std::complex<double>
+QuantumState::operator()(std::size_t tensorIdx,
+                         const std::vector<std::size_t> &indices) {
+  throw std::runtime_error(
+      "operator() is not implemented for quantum hardware");
+}
+
+std::unique_ptr<SimulationState>
+QuantumState::createFromSizeAndPtr(std::size_t size, void *ptr, std::size_t) {
+  throw std::runtime_error(
+      "createFromSizeAndPtr is not implemented for quantum hardware");
+}
+
+void QuantumState::dump(std::ostream &os) const {
+  throw std::runtime_error("dump is not implemented for quantum hardware");
+}
+
+cudaq::SimulationState::precision QuantumState::getPrecision() const {
+  throw std::runtime_error(
+      "getPrecision is not implemented for quantum hardware");
+}
+
+void QuantumState::destroyState() {
+  // There is no state data so nothing to destroy.
+}
+
+bool QuantumState::isDeviceData() const {
+  throw std::runtime_error(
+      "isDeviceData is not implemented for quantum hardware");
+}
+
+void QuantumState::toHost(std::complex<double> *clientAllocatedData,
+                          std::size_t numElements) const {
+  throw std::runtime_error("toHost is not implemented for quantum hardware");
+}
+
+void QuantumState::toHost(std::complex<float> *clientAllocatedData,
+                          std::size_t numElements) const {
+  throw std::runtime_error("toHost is not implemented for quantum hardware");
+}
+
+std::optional<std::pair<std::string, std::vector<void *>>>
+QuantumState::getKernelInfo() const {
+  return std::make_pair(kernelName, args);
+}
+
+std::vector<std::complex<double>>
+QuantumState::getAmplitudes(const std::vector<std::vector<int>> &basisStates) {
+  throw std::runtime_error(
+      "getAmplitudes is not implemented for quantum hardware");
+}
+
+std::complex<double>
+QuantumState::getAmplitude(const std::vector<int> &basisState) {
+  throw std::runtime_error(
+      "getAmplitude is not implemented for quantum hardware");
+}
+
+std::complex<double>
+QuantumState::overlap(const cudaq::SimulationState &other) {
+  throw std::runtime_error("overlap is not implemented for quantum hardware");
+}
+} // namespace cudaq
diff --git a/runtime/cudaq/qis/quantum_state.h b/runtime/cudaq/qis/quantum_state.h
new file mode 100644
index 00000000000..63117eb4629
--- /dev/null
+++ b/runtime/cudaq/qis/quantum_state.h
@@ -0,0 +1,151 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "common/SimulationState.h"
+#include "cudaq.h"
+#include "cudaq/utils/cudaq_utils.h"
+
+namespace cudaq {
+/// Implementation of `SimulationState` for quantum device backends.
+/// The state is represented by a quantum kernel: it contains all the
+/// information needed to replicate the call to the kernel that created the
+/// state.
+class QuantumState : public cudaq::SimulationState {
+protected:
+  std::string kernelName;
+  // Lazily-evaluated state data (just keeping the kernel name and arguments).
+  // e.g., to be evaluated at amplitude accessor APIs (const APIs, hence needs
+  // to be mutable) or overlap calculation with another remote state (combining
+  // the IR of both states for remote evaluation)
+  mutable std::unique_ptr<cudaq::SimulationState> state;
+  // Cache log messages from the remote execution.
+  // Mutable to support lazy execution during `const` API calls.
+  mutable std::string platformExecutionLog;
+  using ArgDeleter = std::function<void(void *)>;
+  /// @brief Vector of owned argument copies.
+  /// Note: pointer arguments are stored as a copy of the pointee.
+  std::vector<void *> args;
+  /// @brief Deleters for the arguments: `deleters[i]` frees `args[i]`.
+  std::vector<std::function<void(void *)>> deleters;
+
+public:
+  /// @brief Store an owned copy of `arg` (or of its pointee) and its deleter.
+  /// @throws std::invalid_argument for null pointers and non-copyable types.
+  template <typename T>
+  void addArgument(const T &arg) {
+    // Pointer arguments are stored as a heap copy of the pointee, all other
+    // arguments as a heap copy of the value itself.
+    using ArgT = std::decay_t<T>;
+    constexpr bool isPtr = std::is_pointer_v<ArgT>;
+    using ValueT = std::conditional_t<isPtr, std::remove_pointer_t<ArgT>, ArgT>;
+    if constexpr (std::is_copy_constructible_v<ValueT>) {
+      if constexpr (isPtr) {
+        // Dereferencing a null pointer below would be undefined behavior.
+        if (arg == nullptr)
+          throw std::invalid_argument(
+              "Unsupported argument: null pointers cannot be copied.");
+        args.push_back(new ValueT(*arg));
+      } else {
+        args.push_back(new ValueT(arg));
+      }
+      deleters.push_back([](void *ptr) { delete static_cast<ValueT *>(ptr); });
+    } else {
+      throw std::invalid_argument(
+          "Unsupported argument type: only pointers to copy-constructible "
+          "types and copy-constructible types are supported.");
+    }
+  }
+
+  /// @brief Constructor: records the kernel name and copies the arguments.
+  template <typename QuantumKernel, typename... Args>
+  QuantumState(QuantumKernel &&kernel, Args &&...args) {
+    if constexpr (!has_name<QuantumKernel>::value) {
+      kernelName = cudaq::getKernelName(kernel);
+    } else {
+      // kernel_builder kernel: need to JIT code to get it registered.
+      static_cast<cudaq::details::kernel_builder_base &>(kernel).jitCode();
+      kernelName = kernel.name();
+    }
+    (addArgument(args), ...);
+  }
+  QuantumState() = default;
+  virtual ~QuantumState();
+
+  /// @brief True if the state has amplitudes or density matrix available.
+  virtual bool hasData() const override { return false; }
+
+  /// @brief Helper to retrieve (kernel name, `args` pointers)
+  virtual std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const override;
+
+  /// @brief Return the number of qubits this state represents.
+  std::size_t getNumQubits() const override;
+
+  /// @brief Compute the overlap of this state representation with
+  /// the provided `other` state, e.g. `<this | other>`.
+  std::complex<double> overlap(const cudaq::SimulationState &other) override;
+
+  /// @brief Return the amplitude of the given computational
+  /// basis state.
+  std::complex<double>
+  getAmplitude(const std::vector<int> &basisState) override;
+
+  /// @brief Return the amplitudes of the given list of computational
+  /// basis states.
+  std::vector<std::complex<double>>
+  getAmplitudes(const std::vector<std::vector<int>> &basisState) override;
+
+  /// @brief Return the tensor at the given index. Throws
+  /// for an invalid tensor index.
+  Tensor getTensor(std::size_t tensorIdx = 0) const override;
+
+  /// @brief Return all tensors that represent this state
+  std::vector<Tensor> getTensors() const override;
+
+  /// @brief Return the number of tensors that represent this state.
+  std::size_t getNumTensors() const override;
+
+  /// @brief Return the element from the tensor at the
+  /// given tensor index and at the given indices.
+  std::complex<double>
+  operator()(std::size_t tensorIdx,
+             const std::vector<std::size_t> &indices) override;
+
+  /// @brief Create a new subclass specific SimulationState
+  /// from the user provided data set.
+  std::unique_ptr<SimulationState>
+  createFromSizeAndPtr(std::size_t size, void *ptr, std::size_t) override;
+
+  /// @brief Dump a representation of the state to the
+  /// given output stream.
+  void dump(std::ostream &os) const override;
+
+  /// @brief Return the floating point precision used by the simulation state.
+  precision getPrecision() const override;
+
+  /// @brief Destroy the state representation, frees all associated memory.
+  void destroyState() override;
+
+  /// @brief Return true if this `SimulationState` wraps data on the GPU.
+  bool isDeviceData() const override;
+
+  /// @brief Transfer data from device to host, return the data
+  /// to the pointer provided by the client. Clients must specify the number of
+  /// elements.
+  void toHost(std::complex<double> *clientAllocatedData,
+              std::size_t numElements) const override;
+
+  /// @brief Transfer data from device to host, return the data
+  /// to the pointer provided by the client. Clients must specify the number of
+  /// elements.
+  void toHost(std::complex<float> *clientAllocatedData,
+              std::size_t numElements) const override;
+};
+} // namespace cudaq
diff --git a/runtime/cudaq/qis/remote_state.cpp b/runtime/cudaq/qis/remote_state.cpp
index 713a462e46d..84c9bf94104 100644
--- a/runtime/cudaq/qis/remote_state.cpp
+++ b/runtime/cudaq/qis/remote_state.cpp
@@ -128,7 +128,7 @@ void RemoteSimulationState::toHost(std::complex<float> *clientAllocatedData,
   }
 }
 
-std::pair<std::string, std::vector<void *>>
+std::optional<std::pair<std::string, std::vector<void *>>>
 RemoteSimulationState::getKernelInfo() const {
   return std::make_pair(kernelName, args);
 }
diff --git a/runtime/cudaq/qis/remote_state.h b/runtime/cudaq/qis/remote_state.h
index 878bb098dd8..ba7929dea44 100644
--- a/runtime/cudaq/qis/remote_state.h
+++ b/runtime/cudaq/qis/remote_state.h
@@ -83,7 +83,8 @@ class RemoteSimulationState : public cudaq::SimulationState {
   virtual void execute() const;
 
   /// @brief Helper to retrieve (kernel name, `args` pointers)
-  virtual std::pair<std::string, std::vector<void *>> getKernelInfo() const;
+  virtual std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const override;
 
   /// @brief Return the number of qubits this state represents.
   std::size_t getNumQubits() const override;
diff --git a/targettests/Remote-Sim/qvector_init_from_state.cpp b/targettests/Remote-Sim/qvector_init_from_state.cpp
index 5899c2f5987..1f94b47f06f 100644
--- a/targettests/Remote-Sim/qvector_init_from_state.cpp
+++ b/targettests/Remote-Sim/qvector_init_from_state.cpp
@@ -148,6 +148,22 @@ int main() {
 // CHECK: 10
   // clang-format on
 
+  {
+    std::cout << "Passing state from another kernel as argument"
+                 " with pauli word arg (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state);
+    auto counts = cudaq::sample(test_state_param2, &state, cudaq::pauli_word{"XX"});
+    printCounts(counts);
+  }
+  // clang-format off
+// CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
+// CHECK: 00
+// CHECK: 01
+// CHECK: 10
+// CHECK: 11
+  // clang-format on
+
   {
     std::cout << "Passing state from another kernel as argument iteratively "
                  "with vector args (kernel mode)"
diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
new file mode 100644
index 00000000000..afaba5a2c05
--- /dev/null
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -0,0 +1,147 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// clang-format off
+// RUN: nvq++ %cpp_std --enable-mlir                                     %s -o %t  && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target quantinuum --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// clang-format on
+
+#include <cudaq.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+struct test_init_state {
+  void operator()(int n) __qpu__ {
+    cudaq::qvector q(n);
+    ry(M_PI/2.0, q[0]);
+  }
+};
+
+struct test_state_param {
+  void operator()(cudaq::state *state) __qpu__ {
+    cudaq::qvector q(state);
+    x(q);
+  }
+};
+
+struct test_state_param2 {
+  void operator()(cudaq::state *state, cudaq::pauli_word w) __qpu__ {
+    cudaq::qvector q(state);
+    cudaq::exp_pauli(1.0, q, w);
+  }
+};
+
+// Print the measured bit strings in sorted order, one per line, so the
+// output does not depend on the iteration order of the sample result.
+void printCounts(cudaq::sample_result &result) {
+  std::vector<std::string> sortedBits;
+  for (auto &&[bits, counts] : result)
+    sortedBits.push_back(bits);
+  std::sort(sortedBits.begin(), sortedBits.end());
+
+  for (const auto &bitString : sortedBits)
+    std::cout << bitString << std::endl;
+}
+
+int main() {
+  std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0., 0., 0., 0., 0.};
+  std::vector<cudaq::complex> vec1{0., 0., 0.,        0.,
+                                   0., 0., M_SQRT1_2, M_SQRT1_2};
+  auto state = cudaq::state::from_data(vec);
+  auto state1 = cudaq::state::from_data(vec1);
+  {
+    std::cout << "Passing state created from data as argument (kernel mode)"
+              << std::endl;
+    auto counts = cudaq::sample(test_state_param{}, &state);
+    printCounts(counts);
+
+    counts = cudaq::sample(test_state_param{}, &state1);
+    printCounts(counts);
+  }
+
+  // clang-format off
+// CHECK: Passing state created from data as argument (kernel mode)
+// CHECK: 011
+// CHECK: 111
+
+// CHECK: 000
+// CHECK: 100
+  // clang-format on
+
+  {
+    std::cout << "Passing state from another kernel as argument (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    auto counts = cudaq::sample(test_state_param{}, &state);
+    printCounts(counts);
+  }
+  // clang-format off
+// CHECK: Passing state from another kernel as argument (kernel mode)
+// CHECK: 01
+// CHECK: 11
+  // clang-format on
+
+  {
+    std::cout
+        << "Passing large state from another kernel as argument (kernel mode)"
+        << std::endl;
+    auto largeState = cudaq::get_state(test_init_state{}, 14);
+    auto counts = cudaq::sample(test_state_param{}, &largeState);
+    printCounts(counts);
+  }
+  // clang-format off
+// CHECK: Passing large state from another kernel as argument (kernel mode)
+// CHECK: 01111111111111
+// CHECK: 11111111111111
+  // clang-format on
+
+  {
+    std::cout << "Passing state from another kernel as argument"
+                 " with pauli word arg (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    auto counts = cudaq::sample(test_state_param2{}, &state, cudaq::pauli_word{"XX"});
+    printCounts(counts);
+  }
+  // clang-format off
+// CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
+// CHECK: 00
+// CHECK: 01
+// CHECK: 10
+// CHECK: 11
+  // clang-format on
+
+  {
+    std::cout << "Passing state from another kernel as argument iteratively "
+                 "(kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    for (auto i = 0; i < 4; i++) {
+      auto counts = cudaq::sample(test_state_param{}, &state);
+      std::cout << "Iteration: " << i << std::endl;
+      printCounts(counts);
+      state = cudaq::get_state(test_state_param{}, &state);
+    }
+  }
+  // clang-format off
+// CHECK: Passing state from another kernel as argument iteratively (kernel mode)
+// CHECK: Iteration: 0
+// CHECK: 01
+// CHECK: 11
+// CHECK: Iteration: 1
+// CHECK: 00
+// CHECK: 10
+// CHECK: Iteration: 2
+// CHECK: 01
+// CHECK: 11
+// CHECK: Iteration: 3
+// CHECK: 00
+// CHECK: 10
+  // clang-format on
+}
diff --git a/targettests/execution/state_init.cpp b/targettests/execution/state_init.cpp
index 31e946147dd..e9b8456513d 100644
--- a/targettests/execution/state_init.cpp
+++ b/targettests/execution/state_init.cpp
@@ -40,4 +40,4 @@ int main() {
 }
 
 // CHECK: 00
-// CHECK: 10
+// CHECK: 10
\ No newline at end of file
diff --git a/test/Quake/arg_subst-5.txt b/test/Quake/arg_subst-5.txt
new file mode 100644
index 00000000000..c5e727bb79e
--- /dev/null
+++ b/test/Quake/arg_subst-5.txt
@@ -0,0 +1,15 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+cc.arg_subst[0] {
+  %0 = cc.string_literal "init" : !cc.ptr<!cc.array<i8 x 46>>
+  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 46>>) -> !cc.ptr<i8>
+  %2 = func.call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+  %3 = cc.cast %2 : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+}
+func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
diff --git a/test/Quake/arg_subst-6.txt b/test/Quake/arg_subst-6.txt
new file mode 100644
index 00000000000..4c3a55d883a
--- /dev/null
+++ b/test/Quake/arg_subst-6.txt
@@ -0,0 +1,11 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+cc.arg_subst[0] {
+  %c2_i32 = arith.constant 2 : i32
+}
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index e96e04b63af..4bf6e101556 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt --argument-synthesis=functions=foo:%S/arg_subst.txt,blink:%S/arg_subst.txt,testy1:%S/arg_subst-1.txt,testy2:%S/arg_subst-2.txt,testy3:%S/arg_subst-3.txt,testy4:%S/arg_subst-4.txt --canonicalize %s | FileCheck %s
+// RUN: cudaq-opt --argument-synthesis=functions=foo:%S/arg_subst.txt,blink:%S/arg_subst.txt,testy1:%S/arg_subst-1.txt,testy2:%S/arg_subst-2.txt,testy3:%S/arg_subst-3.txt,testy4:%S/arg_subst-4.txt,testy5:%S/arg_subst-5.txt,init:%S/arg_subst-6.txt --canonicalize %s | FileCheck %s
 
 func.func private @bar(i32)
 func.func private @baz(f32)
@@ -146,3 +146,38 @@ func.func @testy4(%arg0: !cc.stdvec<!cc.struct<{i32, f64, i8, i16}>>) {
 // CHECK:           call @callee4(%[[VAL_32]]) : (!cc.stdvec<!cc.struct<{i32, f64, i8, i16}>>) -> ()
 // CHECK:           return
 // CHECK:         }
+
+func.func @testy5(%arg0: !cc.ptr<!cc.state>) {
+  %3 = call @__nvqpp_cudaq_state_numberOfQubits(%arg0) : (!cc.ptr<!cc.state>) -> i64
+  %4 = quake.alloca !quake.veq<?>[%3 : i64]
+  %5 = quake.init_state %4, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+  return
+}
+
+func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
+func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+
+func.func private @init(%arg0: i32) -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+  %cst = arith.constant 1.5707963267948966 : f64
+  %0 = cc.cast signed %arg0 : (i32) -> i64
+  %1 = quake.alloca !quake.veq<?>[%0 : i64]
+  %2 = quake.concat %1 : (!quake.veq<?>) -> !quake.veq<?>
+  return %2 : !quake.veq<?>
+}
+
+// CHECK-LABEL:   func.func @testy5() {
+// CHECK:           %[[VAL_0:.*]] = cc.string_literal "init" : !cc.ptr<!cc.array<i8 x 46>>
+// CHECK:           %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<i8 x 46>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_2:.*]] = call @__nvqpp_cudaq_state_get(%[[VAL_1]]) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
+// CHECK:           %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_2]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+// CHECK:           return
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
+// CHECK:         func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+// CHECK:         func.func private @init() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           %[[VAL_8:.*]] = quake.relax_size %[[VAL_7:.*]] : (!quake.veq<2>) -> !quake.veq<?>
+// CHECK:           return %[[VAL_8]] : !quake.veq<?>
+// CHECK:         }
diff --git a/test/Quake/state_init.qke b/test/Quake/state_init.qke
new file mode 100644
index 00000000000..9f43a965a4f
--- /dev/null
+++ b/test/Quake/state_init.qke
@@ -0,0 +1,37 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+// RUN: cudaq-opt -state-initialization -canonicalize %s | FileCheck %s
+
+module {
+  func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+    %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
+    %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
+    %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+    %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
+    %4 = quake.alloca !quake.veq<?>[%3 : i64]
+    %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    return
+  }
+
+  func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
+  func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
+
+  func.func private @callee.modified_0() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+    %cst = arith.constant 1.5707963267948966 : f64
+    %0 = quake.alloca !quake.veq<2>
+    %1 = quake.extract_ref %0[0] : (!quake.veq<2>) -> !quake.ref
+    quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+    %2 = quake.relax_size %0 : (!quake.veq<2>) -> !quake.veq<?>
+    return %2 : !quake.veq<?>
+  }
+// CHECK-LABEL:   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = call @callee.modified_0() : () -> !quake.veq<?>
+// CHECK:           return
+// CHECK:         }
+}
diff --git a/test/Quake/state_prep.qke b/test/Quake/state_prep.qke
index 4289571b33c..3072a192187 100644
--- a/test/Quake/state_prep.qke
+++ b/test/Quake/state_prep.qke
@@ -31,7 +31,7 @@ module {
 // CHECK:           return
 // CHECK:         }
 
- func.func @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  func.func @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %0 = cc.address_of @__nvqpp__mlirgen__function_test_real_constant_array._Z24test_real_constant_arrayv.rodata_0 : !cc.ptr<!cc.array<f64 x 4>>
     %1 = quake.alloca !quake.veq<2>
     %2 = quake.init_state %1, %0 : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
diff --git a/tpls/Stim b/tpls/Stim
index 47190f4a3af..b01e4239158 160000
--- a/tpls/Stim
+++ b/tpls/Stim
@@ -1 +1 @@
-Subproject commit 47190f4a3afb104c9f0068d0be9fea87d2894a70
+Subproject commit b01e42391583d03db4266b387d907eda1d7ae488

From 3fc56de6f0c911888fc8f3ae6356b8613653f0f9 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 17 Oct 2024 14:25:47 -0700
Subject: [PATCH 03/54] Merge with main

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/tests/interop/quantum_lib/CMakeLists.txt | 1 +
 runtime/common/BaseRemoteRESTQPU.h              | 7 +++----
 targettests/execution/state_init.cpp            | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/tests/interop/quantum_lib/CMakeLists.txt b/python/tests/interop/quantum_lib/CMakeLists.txt
index 34fb0241880..21bb37a4d7b 100644
--- a/python/tests/interop/quantum_lib/CMakeLists.txt
+++ b/python/tests/interop/quantum_lib/CMakeLists.txt
@@ -11,3 +11,4 @@ set(CMAKE_CXX_COMPILE_OBJECT "<CMAKE_CXX_COMPILER> -fPIC --enable-mlir --disable
 
 # FIXME Error with SHARED, it pulls in all the mlir libraries anyway
 add_library(quantum_lib OBJECT quantum_lib.cpp)
+add_dependencies(quantum_lib nvq++ cudaq-opt cudaq-quake cudaq-translate)
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 41f45b6b759..32a097cfc5d 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -393,14 +393,13 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (!func->hasAttr(cudaq::entryPointAttrName))
       func->setAttr(cudaq::entryPointAttrName, builder.getUnitAttr());
     auto moduleOp = builder.create<mlir::ModuleOp>();
+    moduleOp.push_back(func.clone());
     moduleOp->setAttrs(m_module->getAttrDictionary());
 
     for (auto &op : m_module.getOps()) {
       if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
-        // Add quantum kernels defined in the module.
-        if (funcOp->hasAttr(cudaq::kernelAttrName) ||
-            funcOp.getName().startswith("__nvqpp__mlirgen__") ||
-            funcOp.getBody().empty())
+        // Add function definitions for runtime functions.
+        if (funcOp.getBody().empty())
           moduleOp.push_back(funcOp.clone());
       }
       // Add globals defined in the module.
diff --git a/targettests/execution/state_init.cpp b/targettests/execution/state_init.cpp
index e9b8456513d..31e946147dd 100644
--- a/targettests/execution/state_init.cpp
+++ b/targettests/execution/state_init.cpp
@@ -40,4 +40,4 @@ int main() {
 }
 
 // CHECK: 00
-// CHECK: 10
\ No newline at end of file
+// CHECK: 10

From 7969a755986157cdb04625a8680516432d00e352 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 17 Oct 2024 14:37:56 -0700
Subject: [PATCH 04/54] Merge with main

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 tpls/Stim | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tpls/Stim b/tpls/Stim
index b01e4239158..47190f4a3af 160000
--- a/tpls/Stim
+++ b/tpls/Stim
@@ -1 +1 @@
-Subproject commit b01e42391583d03db4266b387d907eda1d7ae488
+Subproject commit 47190f4a3afb104c9f0068d0be9fea87d2894a70

From 755d0d1971bc489093ab2e541db759352f4506eb Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 17 Oct 2024 15:24:55 -0700
Subject: [PATCH 05/54] Fix test failure on anyon platform

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/BaseRemoteRESTQPU.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 32a097cfc5d..989649d9fa5 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -18,6 +18,7 @@
 #include "common/RuntimeMLIR.h"
 #include "cudaq.h"
 #include "cudaq/Frontend/nvqpp/AttributeNames.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
@@ -398,8 +399,13 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
 
     for (auto &op : m_module.getOps()) {
       if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
-        // Add function definitions for runtime functions.
-        if (funcOp.getBody().empty())
+        // Add function definitions for runtime functions that must
+        // be removed after synthesis in cleanup ops.
+        if (funcOp.getBody().empty() &&
+            (funcOp.getName().equals(cudaq::getNumQubitsFromCudaqState) ||
+             funcOp.getName().equals(cudaq::createCudaqStateFromDataFP64) ||
+             funcOp.getName().equals(cudaq::createCudaqStateFromDataFP32) ||
+             funcOp.getName().equals(cudaq::getCudaqState)))
           moduleOp.push_back(funcOp.clone());
       }
       // Add globals defined in the module.

From 382bc99adda74bcae5cab1965096dac12d6e2b37 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 17 Oct 2024 15:40:34 -0700
Subject: [PATCH 06/54] Make StateInitialization a funcOp pass

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td  |  2 +-
 .../Transforms/StateInitialization.cpp        | 25 ++++++-------------
 runtime/common/BaseRemoteRESTQPU.h            |  2 +-
 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 66eb4cfcb0d..70ae6c71386 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -779,7 +779,7 @@ def DeleteStates : Pass<"delete-states", "mlir::ModuleOp"> {
   }];
 }
 
-def StateInitialization : Pass<"state-initialization", "mlir::ModuleOp"> {
+def StateInitialization : Pass<"state-initialization", "mlir::func::FuncOp"> {
   let summary =
     "Replace `quake.init_state` instructions with call to the kernel generating the state";
   let description = [{
diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/StateInitialization.cpp
index 3a122f02a7b..f641eb04f63 100644
--- a/lib/Optimizer/Transforms/StateInitialization.cpp
+++ b/lib/Optimizer/Transforms/StateInitialization.cpp
@@ -121,26 +121,17 @@ class StateInitializationPass
 
   void runOnOperation() override {
     auto *ctx = &getContext();
-    auto module = getOperation();
-    for (Operation &op : *module.getBody()) {
-      auto func = dyn_cast<func::FuncOp>(op);
-      if (!func)
-        continue;
+    auto func = getOperation();
+    RewritePatternSet patterns(ctx);
+    patterns.insert<StateInitPattern>(ctx);
 
-      std::string funcName = func.getName().str();
-      RewritePatternSet patterns(ctx);
-      patterns.insert<StateInitPattern>(ctx);
+    LLVM_DEBUG(llvm::dbgs() << "Before state initialization: " << func << '\n');
 
-      LLVM_DEBUG(llvm::dbgs()
-                 << "Before state initialization: " << func << '\n');
+    if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
+                                            std::move(patterns))))
+      signalPassFailure();
 
-      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
-                                              std::move(patterns))))
-        signalPassFailure();
-
-      LLVM_DEBUG(llvm::dbgs()
-                 << "After state initialization: " << func << '\n');
-    }
+    LLVM_DEBUG(llvm::dbgs() << "After state initialization: " << func << '\n');
   }
 };
 } // namespace
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 989649d9fa5..a37d5bf7067 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -447,7 +447,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
                                                    substs.end()}));
         pm.addPass(mlir::createCanonicalizerPass());
         pm.addPass(opt::createDeleteStates());
-        pm.addPass(opt::createStateInitialization());
+        pm.addNestedPass<mlir::func::FuncOp>(opt::createStateInitialization());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));

From d3a05d4432d41acaae68fea86eeac6f3e34d4cc7 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 18 Oct 2024 11:09:12 -0700
Subject: [PATCH 07/54] Fix issues and tests for the rest of quantum
 architectures

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td  |  11 ++
 lib/Optimizer/Transforms/CMakeLists.txt       |   1 +
 .../Transforms/StateInitialization.cpp        |  16 +--
 lib/Optimizer/Transforms/StateValidation.cpp  | 130 ++++++++++++++++++
 runtime/common/BaseRemoteRESTQPU.h            |   2 +
 .../default/rest/helpers/anyon/anyon.yml      |   2 +
 .../default/rest/helpers/ionq/ionq.yml        |   2 +
 .../platform/default/rest/helpers/iqm/iqm.yml |   2 +
 .../platform/default/rest/helpers/oqc/oqc.yml |   2 +
 .../execution/qvector_init_from_state.cpp     |  17 ++-
 10 files changed, 174 insertions(+), 11 deletions(-)
 create mode 100644 lib/Optimizer/Transforms/StateValidation.cpp

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 70ae6c71386..aa8f038c410 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -866,6 +866,17 @@ def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
   ];
 }
 
+def StateValidation : Pass<"state-validation", "mlir::ModuleOp"> {
+  let summary =
+    "Make sure MLIR is valid after synthesis for quantum devices";
+  let description = [{
+    Argument synthesis should replace all `quake.init` from state instructions
+    and calls to state-related runtime functions.
+    Make sure none of them left, and remove definitions for state-related
+    runtime functions.
+  }];
+}
+
 def PromoteRefToVeqAlloc : Pass<"promote-qubit-allocation"> {
   let summary = "Promote single qubit allocations.";
   let description = [{
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index f107d78bde6..7eae39e35fe 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -52,6 +52,7 @@ add_cudaq_library(OptTransforms
   RegToMem.cpp
   StateInitialization.cpp
   StatePreparation.cpp
+  StateValidation.cpp
   UnitarySynthesis.cpp
   WiresToWiresets.cpp
 
diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/StateInitialization.cpp
index f641eb04f63..c46273b7476 100644
--- a/lib/Optimizer/Transforms/StateInitialization.cpp
+++ b/lib/Optimizer/Transforms/StateInitialization.cpp
@@ -30,10 +30,10 @@ using namespace mlir;
 
 namespace {
 
-static bool isCall(Operation *callOp, std::vector<const char *> &&names) {
-  if (callOp) {
-    if (auto createStateCall = dyn_cast<func::CallOp>(callOp)) {
-      if (auto calleeAttr = createStateCall.getCalleeAttr()) {
+static bool isCall(Operation *op, std::vector<const char *> &&names) {
+  if (op) {
+    if (auto callOp = dyn_cast<func::CallOp>(op)) {
+      if (auto calleeAttr = callOp.getCalleeAttr()) {
         auto funcName = calleeAttr.getValue().str();
         if (std::find(names.begin(), names.end(), funcName) != names.end())
           return true;
@@ -43,12 +43,12 @@ static bool isCall(Operation *callOp, std::vector<const char *> &&names) {
   return false;
 }
 
-static bool isGetStateCall(Operation *callOp) {
-  return isCall(callOp, {cudaq::getCudaqState});
+static bool isGetStateCall(Operation *op) {
+  return isCall(op, {cudaq::getCudaqState});
 }
 
-static bool isNumberOfQubitsCall(Operation *callOp) {
-  return isCall(callOp, {cudaq::getNumQubitsFromCudaqState});
+static bool isNumberOfQubitsCall(Operation *op) {
+  return isCall(op, {cudaq::getNumQubitsFromCudaqState});
 }
 
 // clang-format off
diff --git a/lib/Optimizer/Transforms/StateValidation.cpp b/lib/Optimizer/Transforms/StateValidation.cpp
new file mode 100644
index 00000000000..be20dd4edef
--- /dev/null
+++ b/lib/Optimizer/Transforms/StateValidation.cpp
@@ -0,0 +1,130 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+
+namespace cudaq::opt {
+#define GEN_PASS_DEF_STATEVALIDATION
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
+#define DEBUG_TYPE "state-validation"
+
+using namespace mlir;
+
+
+/// Validate that quantum code does not contain runtime calls and remove runtime function definitions. 
+namespace {
+
+static bool isRuntimeStateCallName(llvm::StringRef funcName) {
+  static std::vector<const char *> names = {
+    cudaq::getCudaqState,
+    cudaq::createCudaqStateFromDataFP32,
+    cudaq::createCudaqStateFromDataFP64,
+    cudaq::deleteCudaqState,
+    cudaq::getNumQubitsFromCudaqState
+  };
+  if (std::find(names.begin(), names.end(), funcName) != names.end())
+      return true; 
+  return false;
+}
+
+static bool isRuntimeStateCall(Operation *callOp) {
+  if (callOp) {
+    if (auto call = dyn_cast<func::CallOp>(callOp)) {
+      if (auto calleeAttr = call.getCalleeAttr()) {
+        auto funcName = calleeAttr.getValue().str();
+        if (isRuntimeStateCallName(funcName))
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+class ValidateStateCallPattern : public OpRewritePattern<func::CallOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(func::CallOp callOp,
+                                PatternRewriter &rewriter) const override {
+    if (isRuntimeStateCall(callOp)) {
+      auto name = callOp.getCalleeAttr().getValue();
+      callOp.emitError("Unsupported call for quantum platform: " + name);
+    }
+    return failure();
+  }
+};
+
+class ValidateStateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
+                                PatternRewriter &rewriter) const override {
+    auto stateOp = initState.getOperand(1);
+    if (isa<cudaq::cc::StateType>(stateOp.getType())) 
+      initState.emitError("Synthesis did not remove `quake.init_state <state>` instruction");
+    
+    return failure();
+  }
+};
+
+
+class StateValidationPass
+    : public cudaq::opt::impl::StateValidationBase<StateValidationPass> {
+protected:
+public:
+  using StateValidationBase::StateValidationBase;
+
+  mlir::ModuleOp getModule() { return getOperation(); }
+
+  void runOnOperation() override final {
+    auto *ctx = &getContext();
+    auto module = getModule();
+    SmallVector<Operation *> toErase;
+
+    for (Operation &op : *module.getBody()) {
+      auto func = dyn_cast<func::FuncOp>(op);
+      if (!func)
+        continue;
+
+      RewritePatternSet patterns(ctx);
+      patterns.insert<ValidateStateCallPattern, ValidateStateInitPattern>(ctx);
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Before state validation: " << func << '\n');
+
+      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
+                                              std::move(patterns))))
+        signalPassFailure();
+
+      // Delete runtime function definitions.
+      if (func.getBody().empty() && isRuntimeStateCallName(func.getName()))
+        toErase.push_back(func);
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "After state validation: " << func << '\n');
+    }
+
+    for (auto *op : toErase)
+      op->erase();
+  }
+};
+
+} // namespace
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index a37d5bf7067..0eab2c7fbab 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -405,6 +405,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             (funcOp.getName().equals(cudaq::getNumQubitsFromCudaqState) ||
              funcOp.getName().equals(cudaq::createCudaqStateFromDataFP64) ||
              funcOp.getName().equals(cudaq::createCudaqStateFromDataFP32) ||
+             funcOp.getName().equals(cudaq::deleteCudaqState) ||
              funcOp.getName().equals(cudaq::getCudaqState)))
           moduleOp.push_back(funcOp.clone());
       }
@@ -448,6 +449,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         pm.addPass(mlir::createCanonicalizerPass());
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(opt::createStateInitialization());
+        pm.addPass(opt::createStateValidation());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
diff --git a/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml b/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
index 3ecb49f3021..e0fb208f9c9 100644
--- a/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/anyon/anyon.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
index 238d4c33163..802cdc2e0ad 100644
--- a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
index 0e90a1e2afa..2c928bda876 100644
--- a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
index 6a8a46c0667..cde626676cf 100644
--- a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: remote_rest
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the rest-qpu library to the link list
   link-libs: ["-lcudaq-rest-qpu"]
   # Define the lowering pipeline
diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index afaba5a2c05..06c97b1e6a3 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -7,8 +7,16 @@
  ******************************************************************************/
 
 // clang-format off
-// RUN: nvq++ %cpp_std --enable-mlir                                     %s -o %t  && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target quantinuum --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// Simulators
+// RUN: nvq++ %cpp_std --enable-mlir  %s                              -o %t && %t | FileCheck %s
+
+// Quantum emulators
+// RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target ionq                     --emulate %s -o %t && %t | FileCheck %s
+// 2 different IQM machines for 2 different topologies
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // clang-format on
 
 #include <cudaq.h>
@@ -91,7 +99,10 @@ int main() {
     std::cout
         << "Passing large state from another kernel as argument (kernel mode)"
         << std::endl;
-    auto largeState = cudaq::get_state(test_init_state{}, 14);
+    // TODO: State larger than 5 qubits fails on iqm machines with Adonis architecture
+    // TODO: State larger than 8 qubits fails on oqc and anyon
+    // Up to 14 qubits works with quantinuum and ionq
+    auto largeState = cudaq::get_state(test_init_state{}, 5);
     auto counts = cudaq::sample(test_state_param{}, &largeState);
     printCounts(counts);
   }

From 51ef054c14df334252e389e2244d24974486661e Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 18 Oct 2024 15:48:39 -0700
Subject: [PATCH 08/54] Fix failing quantinuum state prep tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../Transforms/StateInitialization.cpp        | 68 ++++++++++---------
 lib/Optimizer/Transforms/StateValidation.cpp  |  7 +-
 2 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/StateInitialization.cpp
index c46273b7476..0ed68676709 100644
--- a/lib/Optimizer/Transforms/StateInitialization.cpp
+++ b/lib/Optimizer/Transforms/StateInitialization.cpp
@@ -73,39 +73,43 @@ class StateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
                                 PatternRewriter &rewriter) const override {
     auto loc = initState.getLoc();
     auto allocaOp = initState.getOperand(0).getDefiningOp();
-    auto getStateOp = initState.getOperand(1).getDefiningOp();
-    auto numOfQubits = allocaOp->getOperand(0).getDefiningOp();
-
-    if (isGetStateCall(getStateOp)) {
-      auto calleeNameOp = getStateOp->getOperand(0);
-      if (auto cast =
-              dyn_cast<cudaq::cc::CastOp>(calleeNameOp.getDefiningOp())) {
-        calleeNameOp = cast.getOperand();
-
-        if (auto literal = dyn_cast<cudaq::cc::CreateStringLiteralOp>(
-                calleeNameOp.getDefiningOp())) {
-          auto calleeName = literal.getStringLiteral();
-
-          Value result =
-              rewriter
-                  .create<func::CallOp>(loc, initState.getType(), calleeName,
-                                        mlir::ValueRange{})
-                  .getResult(0);
-          rewriter.replaceAllUsesWith(initState, result);
-          initState.erase();
-          allocaOp->dropAllUses();
-          rewriter.eraseOp(allocaOp);
-          if (isNumberOfQubitsCall(numOfQubits)) {
-            numOfQubits->dropAllUses();
-            rewriter.eraseOp(numOfQubits);
+    auto stateOp = initState.getOperand(1);
+
+    if (isa<cudaq::cc::StateType>(stateOp.getType())) {
+      auto getStateOp = stateOp.getDefiningOp();
+      auto numOfQubits = allocaOp->getOperand(0).getDefiningOp();
+
+      if (isGetStateCall(getStateOp)) {
+        auto calleeNameOp = getStateOp->getOperand(0);
+        if (auto cast =
+                dyn_cast<cudaq::cc::CastOp>(calleeNameOp.getDefiningOp())) {
+          calleeNameOp = cast.getOperand();
+
+          if (auto literal = dyn_cast<cudaq::cc::CreateStringLiteralOp>(
+                  calleeNameOp.getDefiningOp())) {
+            auto calleeName = literal.getStringLiteral();
+
+            Value result =
+                rewriter
+                    .create<func::CallOp>(loc, initState.getType(), calleeName,
+                                          mlir::ValueRange{})
+                    .getResult(0);
+            rewriter.replaceAllUsesWith(initState, result);
+            initState.erase();
+            allocaOp->dropAllUses();
+            rewriter.eraseOp(allocaOp);
+            if (isNumberOfQubitsCall(numOfQubits)) {
+              numOfQubits->dropAllUses();
+              rewriter.eraseOp(numOfQubits);
+            }
+            getStateOp->dropAllUses();
+            rewriter.eraseOp(getStateOp);
+            cast->dropAllUses();
+            rewriter.eraseOp(cast);
+            literal->dropAllUses();
+            rewriter.eraseOp(literal);
+            return success();
           }
-          getStateOp->dropAllUses();
-          rewriter.eraseOp(getStateOp);
-          cast->dropAllUses();
-          rewriter.eraseOp(cast);
-          literal->dropAllUses();
-          rewriter.eraseOp(literal);
-          return success();
         }
       }
     }
diff --git a/lib/Optimizer/Transforms/StateValidation.cpp b/lib/Optimizer/Transforms/StateValidation.cpp
index f0b25cdc100..c9d301740c6 100644
--- a/lib/Optimizer/Transforms/StateValidation.cpp
+++ b/lib/Optimizer/Transforms/StateValidation.cpp
@@ -62,7 +62,8 @@ class ValidateStateCallPattern : public OpRewritePattern<func::CallOp> {
                                 PatternRewriter &rewriter) const override {
     if (isRuntimeStateCall(callOp)) {
       auto name = callOp.getCalleeAttr().getValue();
-      callOp.emitError("Unsupported call for quantum platform: " + name);
+      callOp.emitError(
+          "Synthesis did not remove func call for quantum platform: " + name);
     }
     return failure();
   }
@@ -77,8 +78,8 @@ class ValidateStateInitPattern
                                 PatternRewriter &rewriter) const override {
     auto stateOp = initState.getOperand(1);
     if (isa<cudaq::cc::StateType>(stateOp.getType()))
-      initState.emitError(
-          "Synthesis did not remove `quake.init_state <state>` instruction");
+      initState.emitError("Synthesis did not remove `quake.init_state <veq> "
+                          "<state>` instruction");
 
     return failure();
   }

From a7f5387e10c181704ff36c37504fea72ea2e3486 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 21 Oct 2024 15:11:34 -0700
Subject: [PATCH 09/54] Address CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td  |  17 +--
 lib/Optimizer/Transforms/CMakeLists.txt       |   3 +-
 ...ization.cpp => ReplaceStateWithKernel.cpp} |  98 +++++++-------
 lib/Optimizer/Transforms/StateValidation.cpp  | 127 ------------------
 runtime/common/BaseRemoteRESTQPU.h            |  10 +-
 ...init.qke => replace_state_with_kernel.qke} |   2 +-
 6 files changed, 64 insertions(+), 193 deletions(-)
 rename lib/Optimizer/Transforms/{StateInitialization.cpp => ReplaceStateWithKernel.cpp} (56%)
 delete mode 100644 lib/Optimizer/Transforms/StateValidation.cpp
 rename test/Quake/{state_init.qke => replace_state_with_kernel.qke} (96%)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index aa8f038c410..ef446a38129 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -779,7 +779,7 @@ def DeleteStates : Pass<"delete-states", "mlir::ModuleOp"> {
   }];
 }
 
-def StateInitialization : Pass<"state-initialization", "mlir::func::FuncOp"> {
+def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::FuncOp"> {
   let summary =
     "Replace `quake.init_state` instructions with call to the kernel generating the state";
   let description = [{
@@ -794,7 +794,7 @@ def StateInitialization : Pass<"state-initialization", "mlir::func::FuncOp"> {
 
     For example:
 
-    Before StateInitialization (state-initialization):
+    Before ReplaceStateWithKernel (replace-state-with-kernel):
     ```
     func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
       %0 = cc.string_literal "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.array<i8 x 45>>
@@ -807,7 +807,7 @@ def StateInitialization : Pass<"state-initialization", "mlir::func::FuncOp"> {
     }
     ```
 
-    After StateInitialization (state-initialization):
+    After ReplaceStateWithKernel (replace-state-with-kernel):
     ```
     func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
       %5 = call @__nvqpp__mlirgen__test_init_state.modified_0() : () -> !quake.veq<?>
@@ -866,17 +866,6 @@ def StatePreparation : Pass<"state-prep", "mlir::ModuleOp"> {
   ];
 }
 
-def StateValidation : Pass<"state-validation", "mlir::ModuleOp"> {
-  let summary =
-    "Make sure MLIR is valid after synthesis for quantum devices";
-  let description = [{
-    Argument synthesis should replace all `quake.init` from state instructions
-    and calls to state-related runtime functions.
-    Make sure none of them left, and remove definitions for state-related
-    runtime functions.
-  }];
-}
-
 def PromoteRefToVeqAlloc : Pass<"promote-qubit-allocation"> {
   let summary = "Promote single qubit allocations.";
   let description = [{
diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt
index 7eae39e35fe..153e095e1fc 100644
--- a/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/lib/Optimizer/Transforms/CMakeLists.txt
@@ -50,9 +50,8 @@ add_cudaq_library(OptTransforms
   QuakeSynthesizer.cpp
   RefToVeqAlloc.cpp
   RegToMem.cpp
-  StateInitialization.cpp
+  ReplaceStateWithKernel.cpp
   StatePreparation.cpp
-  StateValidation.cpp
   UnitarySynthesis.cpp
   WiresToWiresets.cpp
 
diff --git a/lib/Optimizer/Transforms/StateInitialization.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
similarity index 56%
rename from lib/Optimizer/Transforms/StateInitialization.cpp
rename to lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index 0ed68676709..d588f092167 100644
--- a/lib/Optimizer/Transforms/StateInitialization.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -20,11 +20,11 @@
 #include <span>
 
 namespace cudaq::opt {
-#define GEN_PASS_DEF_STATEINITIALIZATION
+#define GEN_PASS_DEF_REPLACESTATEWITHKERNEL
 #include "cudaq/Optimizer/Transforms/Passes.h.inc"
 } // namespace cudaq::opt
 
-#define DEBUG_TYPE "state-initialization"
+#define DEBUG_TYPE "replace-state-with-kernel"
 
 using namespace mlir;
 
@@ -52,7 +52,9 @@ static bool isNumberOfQubitsCall(Operation *op) {
 }
 
 // clang-format off
-/// Replace `quake.init_state` by a call to a (modified) kernel that produced the state.
+/// Replace `quake.init_state` by a call to a (modified) kernel that produced
+/// the state.
+///
 /// ```
 ///  %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
 ///  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
@@ -65,50 +67,54 @@ static bool isNumberOfQubitsCall(Operation *op) {
 ///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
 /// ```
 // clang-format on
-class StateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
+class ReplaceStateWithKernelPattern : public OpRewritePattern<quake::InitializeStateOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
                                 PatternRewriter &rewriter) const override {
-    auto loc = initState.getLoc();
-    auto allocaOp = initState.getOperand(0).getDefiningOp();
+    //auto loc = initState.getLoc();
+    auto *alloca = initState.getOperand(0).getDefiningOp();
     auto stateOp = initState.getOperand(1);
 
-    if (isa<cudaq::cc::StateType>(stateOp.getType())) {
-      auto getStateOp = stateOp.getDefiningOp();
-      auto numOfQubits = allocaOp->getOperand(0).getDefiningOp();
-
-      if (isGetStateCall(getStateOp)) {
-        auto calleeNameOp = getStateOp->getOperand(0);
-        if (auto cast =
-                dyn_cast<cudaq::cc::CastOp>(calleeNameOp.getDefiningOp())) {
-          calleeNameOp = cast.getOperand();
-
-          if (auto literal = dyn_cast<cudaq::cc::CreateStringLiteralOp>(
-                  calleeNameOp.getDefiningOp())) {
-            auto calleeName = literal.getStringLiteral();
-
-            Value result =
-                rewriter
-                    .create<func::CallOp>(loc, initState.getType(), calleeName,
-                                          mlir::ValueRange{})
-                    .getResult(0);
-            rewriter.replaceAllUsesWith(initState, result);
-            initState.erase();
-            allocaOp->dropAllUses();
-            rewriter.eraseOp(allocaOp);
-            if (isNumberOfQubitsCall(numOfQubits)) {
-              numOfQubits->dropAllUses();
-              rewriter.eraseOp(numOfQubits);
+    if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
+      if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
+        auto *getState = stateOp.getDefiningOp();
+        auto *numOfQubits = alloca->getOperand(0).getDefiningOp();
+
+        if (isGetStateCall(getState)) {
+          auto calleeNameOp = getState->getOperand(0);
+          if (auto cast = calleeNameOp.getDefiningOp<cudaq::cc::CastOp>()) {
+            calleeNameOp = cast.getOperand();
+
+            if (auto literal = 
+                    calleeNameOp.getDefiningOp<cudaq::cc::CreateStringLiteralOp>()) {
+              auto calleeName = literal.getStringLiteral();
+              rewriter.replaceOpWithNewOp<func::CallOp>(initState, initState.getType(), calleeName,
+                                            mlir::ValueRange{});
+
+              if (alloca->getUses().empty()) 
+                rewriter.eraseOp(alloca);
+              else  {
+                alloca->emitError("Failed to remove `quake.alloca` in state synthesis");
+                return failure();
+              }
+              if (isNumberOfQubitsCall(numOfQubits)) {
+                if (numOfQubits->getUses().empty())
+                  rewriter.eraseOp(numOfQubits);
+                else  {
+                  numOfQubits->emitError("Failed to remove runtime call to get number of qubits in state synthesis");
+                  return failure();
+                }
+              }
+              if (getState->getUses().empty())
+                rewriter.eraseOp(getState);
+              else  {
+                alloca->emitError("Failed to remove runtime call to get state in state synthesis");
+                return failure();
+              }
+              return success();
             }
-            getStateOp->dropAllUses();
-            rewriter.eraseOp(getStateOp);
-            cast->dropAllUses();
-            rewriter.eraseOp(cast);
-            literal->dropAllUses();
-            rewriter.eraseOp(literal);
-            return success();
           }
         }
       }
@@ -117,25 +123,25 @@ class StateInitPattern : public OpRewritePattern<quake::InitializeStateOp> {
   }
 };
 
-class StateInitializationPass
-    : public cudaq::opt::impl::StateInitializationBase<
-          StateInitializationPass> {
+class ReplaceStateWithKernelPass
+    : public cudaq::opt::impl::ReplaceStateWithKernelBase<
+          ReplaceStateWithKernelPass> {
 public:
-  using StateInitializationBase::StateInitializationBase;
+  using ReplaceStateWithKernelBase::ReplaceStateWithKernelBase;
 
   void runOnOperation() override {
     auto *ctx = &getContext();
     auto func = getOperation();
     RewritePatternSet patterns(ctx);
-    patterns.insert<StateInitPattern>(ctx);
+    patterns.insert<ReplaceStateWithKernelPattern>(ctx);
 
-    LLVM_DEBUG(llvm::dbgs() << "Before state initialization: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs() << "Before replace state with kernel: " << func << '\n');
 
     if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
                                             std::move(patterns))))
       signalPassFailure();
 
-    LLVM_DEBUG(llvm::dbgs() << "After state initialization: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs() << "After replace state with kerenl: " << func << '\n');
   }
 };
 } // namespace
diff --git a/lib/Optimizer/Transforms/StateValidation.cpp b/lib/Optimizer/Transforms/StateValidation.cpp
deleted file mode 100644
index c9d301740c6..00000000000
--- a/lib/Optimizer/Transforms/StateValidation.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#include "PassDetails.h"
-#include "cudaq/Optimizer/Builder/Intrinsics.h"
-#include "cudaq/Optimizer/Dialect/CC/CCOps.h"
-#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
-#include "cudaq/Optimizer/Transforms/Passes.h"
-#include "mlir/Dialect/Complex/IR/Complex.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/Passes.h"
-
-namespace cudaq::opt {
-#define GEN_PASS_DEF_STATEVALIDATION
-#include "cudaq/Optimizer/Transforms/Passes.h.inc"
-} // namespace cudaq::opt
-
-#define DEBUG_TYPE "state-validation"
-
-using namespace mlir;
-
-/// Validate that quantum code does not contain runtime calls and remove runtime
-/// function definitions.
-namespace {
-
-static bool isRuntimeStateCallName(llvm::StringRef funcName) {
-  static std::vector<const char *> names = {
-      cudaq::getCudaqState, cudaq::createCudaqStateFromDataFP32,
-      cudaq::createCudaqStateFromDataFP64, cudaq::deleteCudaqState,
-      cudaq::getNumQubitsFromCudaqState};
-  if (std::find(names.begin(), names.end(), funcName) != names.end())
-    return true;
-  return false;
-}
-
-static bool isRuntimeStateCall(Operation *callOp) {
-  if (callOp) {
-    if (auto call = dyn_cast<func::CallOp>(callOp)) {
-      if (auto calleeAttr = call.getCalleeAttr()) {
-        auto funcName = calleeAttr.getValue().str();
-        if (isRuntimeStateCallName(funcName))
-          return true;
-      }
-    }
-  }
-  return false;
-}
-
-class ValidateStateCallPattern : public OpRewritePattern<func::CallOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(func::CallOp callOp,
-                                PatternRewriter &rewriter) const override {
-    if (isRuntimeStateCall(callOp)) {
-      auto name = callOp.getCalleeAttr().getValue();
-      callOp.emitError(
-          "Synthesis did not remove func call for quantum platform: " + name);
-    }
-    return failure();
-  }
-};
-
-class ValidateStateInitPattern
-    : public OpRewritePattern<quake::InitializeStateOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
-                                PatternRewriter &rewriter) const override {
-    auto stateOp = initState.getOperand(1);
-    if (isa<cudaq::cc::StateType>(stateOp.getType()))
-      initState.emitError("Synthesis did not remove `quake.init_state <veq> "
-                          "<state>` instruction");
-
-    return failure();
-  }
-};
-
-class StateValidationPass
-    : public cudaq::opt::impl::StateValidationBase<StateValidationPass> {
-protected:
-public:
-  using StateValidationBase::StateValidationBase;
-
-  mlir::ModuleOp getModule() { return getOperation(); }
-
-  void runOnOperation() override final {
-    auto *ctx = &getContext();
-    auto module = getModule();
-    SmallVector<Operation *> toErase;
-
-    for (Operation &op : *module.getBody()) {
-      auto func = dyn_cast<func::FuncOp>(op);
-      if (!func)
-        continue;
-
-      RewritePatternSet patterns(ctx);
-      patterns.insert<ValidateStateCallPattern, ValidateStateInitPattern>(ctx);
-
-      LLVM_DEBUG(llvm::dbgs() << "Before state validation: " << func << '\n');
-
-      if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
-                                              std::move(patterns))))
-        signalPassFailure();
-
-      // Delete runtime function definitions.
-      if (func.getBody().empty() && isRuntimeStateCallName(func.getName()))
-        toErase.push_back(func);
-
-      LLVM_DEBUG(llvm::dbgs() << "After state validation: " << func << '\n');
-    }
-
-    for (auto *op : toErase)
-      op->erase();
-  }
-};
-
-} // namespace
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index c467811a666..a9053411fac 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -437,6 +437,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
         cudaq::info("Run Argument Synth.\n");
+        // For quantum hardware, we collect substitutions for the
+        // whole call tree of states, which are treated as calls to
+        // the kernels and their arguments that produced the state.
         opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
         auto [kernels, substs] = argCon.collectAllSubstitutions();
@@ -446,10 +449,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
                                                    kernels.end()},
                 mlir::SmallVector<mlir::StringRef>{substs.begin(),
                                                    substs.end()}));
-        pm.addPass(mlir::createCanonicalizerPass());
         pm.addPass(opt::createDeleteStates());
-        pm.addNestedPass<mlir::func::FuncOp>(opt::createStateInitialization());
-        pm.addPass(opt::createStateValidation());
+        pm.addNestedPass<mlir::func::FuncOp>(
+            opt::createReplaceStateWithKernel());
+        pm.addPass(mlir::createCanonicalizerPass());
+        pm.addPass(mlir::createSymbolDCEPass());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
diff --git a/test/Quake/state_init.qke b/test/Quake/replace_state_with_kernel.qke
similarity index 96%
rename from test/Quake/state_init.qke
rename to test/Quake/replace_state_with_kernel.qke
index 9f43a965a4f..70b04e31030 100644
--- a/test/Quake/state_init.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt -state-initialization -canonicalize %s | FileCheck %s
+// RUN: cudaq-opt -replace-state-with-kernel -canonicalize %s | FileCheck %s
 
 module {
   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {

From 9f0937fcb022663cf1e94216e7acb9bd7c429572 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 21 Oct 2024 15:41:40 -0700
Subject: [PATCH 10/54] Format

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../Transforms/ReplaceStateWithKernel.cpp     | 37 +++++++++++--------
 runtime/common/BaseRemoteRESTQPU.h            |  2 +-
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index d588f092167..5300f574154 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -67,13 +67,13 @@ static bool isNumberOfQubitsCall(Operation *op) {
 ///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
 /// ```
 // clang-format on
-class ReplaceStateWithKernelPattern : public OpRewritePattern<quake::InitializeStateOp> {
+class ReplaceStateWithKernelPattern
+    : public OpRewritePattern<quake::InitializeStateOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
                                 PatternRewriter &rewriter) const override {
-    //auto loc = initState.getLoc();
     auto *alloca = initState.getOperand(0).getDefiningOp();
     auto stateOp = initState.getOperand(1);
 
@@ -87,30 +87,35 @@ class ReplaceStateWithKernelPattern : public OpRewritePattern<quake::InitializeS
           if (auto cast = calleeNameOp.getDefiningOp<cudaq::cc::CastOp>()) {
             calleeNameOp = cast.getOperand();
 
-            if (auto literal = 
-                    calleeNameOp.getDefiningOp<cudaq::cc::CreateStringLiteralOp>()) {
+            if (auto literal =
+                    calleeNameOp
+                        .getDefiningOp<cudaq::cc::CreateStringLiteralOp>()) {
               auto calleeName = literal.getStringLiteral();
-              rewriter.replaceOpWithNewOp<func::CallOp>(initState, initState.getType(), calleeName,
-                                            mlir::ValueRange{});
+              rewriter.replaceOpWithNewOp<func::CallOp>(
+                  initState, initState.getType(), calleeName,
+                  mlir::ValueRange{});
 
-              if (alloca->getUses().empty()) 
+              if (alloca->getUses().empty())
                 rewriter.eraseOp(alloca);
-              else  {
-                alloca->emitError("Failed to remove `quake.alloca` in state synthesis");
+              else {
+                alloca->emitError(
+                    "Failed to remove `quake.alloca` in state synthesis");
                 return failure();
               }
               if (isNumberOfQubitsCall(numOfQubits)) {
                 if (numOfQubits->getUses().empty())
                   rewriter.eraseOp(numOfQubits);
-                else  {
-                  numOfQubits->emitError("Failed to remove runtime call to get number of qubits in state synthesis");
+                else {
+                  numOfQubits->emitError("Failed to remove runtime call to get "
+                                         "number of qubits in state synthesis");
                   return failure();
                 }
               }
               if (getState->getUses().empty())
                 rewriter.eraseOp(getState);
-              else  {
-                alloca->emitError("Failed to remove runtime call to get state in state synthesis");
+              else {
+                alloca->emitError("Failed to remove runtime call to get state "
+                                  "in state synthesis");
                 return failure();
               }
               return success();
@@ -135,13 +140,15 @@ class ReplaceStateWithKernelPass
     RewritePatternSet patterns(ctx);
     patterns.insert<ReplaceStateWithKernelPattern>(ctx);
 
-    LLVM_DEBUG(llvm::dbgs() << "Before replace state with kernel: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs()
+               << "Before replace state with kernel: " << func << '\n');
 
     if (failed(applyPatternsAndFoldGreedily(func.getOperation(),
                                             std::move(patterns))))
       signalPassFailure();
 
-    LLVM_DEBUG(llvm::dbgs() << "After replace state with kerenl: " << func << '\n');
+    LLVM_DEBUG(llvm::dbgs()
+               << "After replace state with kerenl: " << func << '\n');
   }
 };
 } // namespace
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index cd57a245d60..2253b4a996a 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -411,7 +411,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       }
       // Add any global symbols, including global constant arrays.
       // Global constant arrays can be created during compilation,
-      // `lift-array-alloc`, `argument-synthesis`, `quake-synthesizer`, 
+      // `lift-array-alloc`, `argument-synthesis`, `quake-synthesizer`,
       // and `get-concrete-matrix`passes.
       if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());

From 2f3a62327293e5c79b49c2249ecdf241467e6d9b Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 09:54:47 -0700
Subject: [PATCH 11/54] Fix failing test

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 targettests/execution/qvector_init_from_state.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index 06c97b1e6a3..681e42eee07 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -108,8 +108,8 @@ int main() {
   }
   // clang-format off
 // CHECK: Passing large state from another kernel as argument (kernel mode)
-// CHECK: 01111111111111
-// CHECK: 11111111111111
+// CHECK: 01111
+// CHECK: 11111
   // clang-format on
 
   {

From b3813503b148b98f4d7d074075a6a7496b1082c9 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 09:56:28 -0700
Subject: [PATCH 12/54] Change expected CHECK output in qvector_init_from_state test

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 targettests/execution/qvector_init_from_state.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index 681e42eee07..d75a7e30d8d 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -109,7 +109,7 @@ int main() {
   // clang-format off
 // CHECK: Passing large state from another kernel as argument (kernel mode)
 // CHECK: 01111
-// CHECK: 11111
+// CHECK: 111111
   // clang-format on
 
   {

From dc87ca4c9b31d7d1037c5f103adc58a353822135 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 09:57:02 -0700
Subject: [PATCH 13/54] Restore expected CHECK output in qvector_init_from_state test

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 targettests/execution/qvector_init_from_state.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index d75a7e30d8d..681e42eee07 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -109,7 +109,7 @@ int main() {
   // clang-format off
 // CHECK: Passing large state from another kernel as argument (kernel mode)
 // CHECK: 01111
-// CHECK: 111111
+// CHECK: 11111
   // clang-format on
 
   {

From 53a34c97759a619a9298523705392412a2fc7974 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 14:46:03 -0700
Subject: [PATCH 14/54] Replaced getState intrinsic by cc.get_state op

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Builder/Intrinsics.h  |  4 -
 include/cudaq/Optimizer/Dialect/CC/CCOps.td   | 20 +++++
 lib/Optimizer/Builder/Intrinsics.cpp          |  4 -
 .../Transforms/ReplaceStateWithKernel.cpp     | 77 +++++++------------
 runtime/common/ArgumentConversion.cpp         | 21 +----
 runtime/common/BaseRemoteRESTQPU.h            |  1 -
 runtime/test/test_argument_conversion.cpp     | 22 ++----
 test/Quake/replace_state_with_kernel.qke      | 15 ++--
 8 files changed, 63 insertions(+), 101 deletions(-)

diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index d545a576025..fa9ce53097f 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -55,10 +55,6 @@ static constexpr const char createCudaqStateFromDataFP32[] =
 // Delete a state created by the runtime functions above.
 static constexpr const char deleteCudaqState[] = "__nvqpp_cudaq_state_delete";
 
-// Get state of a kernel (placeholder function, calls are always replaced in
-// opts)
-static constexpr const char getCudaqState[] = "__nvqpp_cudaq_state_get";
-
 /// Builder for lowering the clang AST to an IR for CUDA-Q. Lowering includes
 /// the transformation of both quantum and classical computation. Different
 /// features of the CUDA-Q programming model are lowered into different dialects
diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
index a58e3d403d6..cda02c7a23a 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
@@ -898,6 +898,26 @@ def cc_AddressOfOp : CCOp<"address_of", [Pure,
   }];
 }
 
+def cc_GetStateOp : CCOp<"get_state", [Pure] > {
+  let summary = "Get state from kernel with the provided name.";
+  let description = [{
+    This operation is created by argument synthesis of state pointer arguments
+    for quantum devices. It takes a kernel name as ASCIIZ string literal value
+    and returns the kernel's quantum state. The operation is replaced by a call
+    to the kernel with the provided name in the ReplaceStateWithKernel pass.
+
+    ```mlir
+      %0 = cc.get_state "callee" : !cc.ptr<!cc.state>
+    ```
+  }];
+
+  let arguments = (ins StrAttr:$calleeName);
+  let results = (outs cc_PointerType:$result);
+  let assemblyFormat = [{
+     $calleeName `:` qualified(type(results)) attr-dict
+  }];
+}
+
 def cc_GlobalOp : CCOp<"global", [IsolatedFromAbove, Symbol]> {
   let summary = "Create a global constant or variable";
   let description = [{
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index e0ed794264f..315743f057d 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -269,10 +269,6 @@ static constexpr IntrinsicCode intrinsicTable[] = {
 
     {cudaq::deleteCudaqState, {}, R"#(
   func.func private @__nvqpp_cudaq_state_delete(%p : !cc.ptr<!cc.state>) -> ()
-  )#"},
-
-    {cudaq::getCudaqState, {}, R"#(
-  func.func private @__nvqpp_cudaq_state_get(%p : !cc.ptr<i8>) -> !cc.ptr<!cc.state>
   )#"},
 
     {cudaq::getNumQubitsFromCudaqState, {}, R"#(
diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index 5300f574154..80907bfec1d 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -43,10 +43,6 @@ static bool isCall(Operation *op, std::vector<const char *> &&names) {
   return false;
 }
 
-static bool isGetStateCall(Operation *op) {
-  return isCall(op, {cudaq::getCudaqState});
-}
-
 static bool isNumberOfQubitsCall(Operation *op) {
   return isCall(op, {cudaq::getNumQubitsFromCudaqState});
 }
@@ -56,12 +52,10 @@ static bool isNumberOfQubitsCall(Operation *op) {
 /// the state.
 ///
 /// ```
-///  %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
-///  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
-///  %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-///  %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
-///  %4 = quake.alloca !quake.veq<?>[%3 : i64]
-///  %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+///  %0 = cc.get_state "callee.modified_0" : !cc.ptr<!cc.state>
+///  %1 = call @__nvqpp_cudaq_state_numberOfQubits(%0) : (!cc.ptr<!cc.state>) -> i64
+///  %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///  %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
 /// ...
 ///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
@@ -79,49 +73,34 @@ class ReplaceStateWithKernelPattern
 
     if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
       if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-        auto *getState = stateOp.getDefiningOp();
         auto *numOfQubits = alloca->getOperand(0).getDefiningOp();
-
-        if (isGetStateCall(getState)) {
-          auto calleeNameOp = getState->getOperand(0);
-          if (auto cast = calleeNameOp.getDefiningOp<cudaq::cc::CastOp>()) {
-            calleeNameOp = cast.getOperand();
-
-            if (auto literal =
-                    calleeNameOp
-                        .getDefiningOp<cudaq::cc::CreateStringLiteralOp>()) {
-              auto calleeName = literal.getStringLiteral();
-              rewriter.replaceOpWithNewOp<func::CallOp>(
-                  initState, initState.getType(), calleeName,
-                  mlir::ValueRange{});
-
-              if (alloca->getUses().empty())
-                rewriter.eraseOp(alloca);
-              else {
-                alloca->emitError(
-                    "Failed to remove `quake.alloca` in state synthesis");
-                return failure();
-              }
-              if (isNumberOfQubitsCall(numOfQubits)) {
-                if (numOfQubits->getUses().empty())
-                  rewriter.eraseOp(numOfQubits);
-                else {
-                  numOfQubits->emitError("Failed to remove runtime call to get "
-                                         "number of qubits in state synthesis");
-                  return failure();
-                }
-              }
-              if (getState->getUses().empty())
-                rewriter.eraseOp(getState);
-              else {
-                alloca->emitError("Failed to remove runtime call to get state "
-                                  "in state synthesis");
-                return failure();
-              }
-              return success();
+        stateOp.getDefiningOp()->dump();
+
+        if (auto getState = stateOp.getDefiningOp<cudaq::cc::GetStateOp>()) {
+          auto calleeName = getState.getCalleeName();
+          rewriter.replaceOpWithNewOp<func::CallOp>(
+              initState, initState.getType(), calleeName, mlir::ValueRange{});
+
+          if (alloca->getUses().empty())
+            rewriter.eraseOp(alloca);
+          else {
+            alloca->emitError(
+                "Failed to remove `quake.alloca` in state synthesis");
+            return failure();
+          }
+          if (isNumberOfQubitsCall(numOfQubits)) {
+            if (numOfQubits->getUses().empty())
+              rewriter.eraseOp(numOfQubits);
+            else {
+              numOfQubits->emitError("Failed to remove runtime call to get "
+                                     "number of qubits in state synthesis");
+              return failure();
             }
           }
+          return success();
         }
+        numOfQubits->emitError(
+            "Failed to replace `quake.init_state` in state synthesis");
       }
     }
     return failure();
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 42b228dd3bf..c548d235238 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -243,27 +243,10 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     converter.genCallee(modifiedCalleeName, calleeArgs);
 
     // Create a subst for state pointer.
-    auto strLitTy = cudaq::cc::PointerType::get(
-        cudaq::cc::ArrayType::get(builder.getContext(), builder.getI8Type(),
-                                  modifiedCalleeKernelName.size() + 1));
-    auto callee = builder.create<cudaq::cc::CreateStringLiteralOp>(
-        loc, strLitTy, builder.getStringAttr(modifiedCalleeKernelName));
-
-    auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
-    auto calleeCast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, callee);
-
-    cudaq::IRBuilder irBuilder(ctx);
-    auto result = irBuilder.loadIntrinsic(substMod, cudaq::getCudaqState);
-    assert(succeeded(result) && "loading intrinsic should never fail");
-
     auto statePtrTy =
         cudaq::cc::PointerType::get(cudaq::cc::StateType::get(ctx));
-    auto statePtr =
-        builder
-            .create<func::CallOp>(loc, statePtrTy, cudaq::getCudaqState,
-                                  ValueRange{calleeCast})
-            .getResult(0);
-    return builder.create<cudaq::cc::CastOp>(loc, statePtrTy, statePtr);
+    return builder.create<cudaq::cc::GetStateOp>(
+        loc, statePtrTy, builder.getStringAttr(modifiedCalleeKernelName));
   }
 
   TODO("cudaq::state* argument synthesis for quantum hardware for c functions");
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 2c8654d540c..0421cde8774 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -458,7 +458,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(
             opt::createReplaceStateWithKernel());
-        pm.addPass(mlir::createCanonicalizerPass());
         pm.addPass(mlir::createSymbolDCEPass());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 9fe3d92f8fb..93939125c1b 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -380,13 +380,10 @@ void test_state(mlir::MLIRContext *ctx) {
 
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_4]], %[[VAL_2]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
+// CHECK:           %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_3:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_2]], %[[VAL_1]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
 // CHECK:        }
 // CHECK-DAG:    cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
 // CHECK-DAG:    func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
@@ -490,13 +487,10 @@ void test_combinations(mlir::MLIRContext *ctx) {
 // CHECK:         }
 // CHECK-LABEL:   cc.arg_subst[1] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_4]], %[[VAL_2]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
+// CHECK:           %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
+// CHECK:           %[[VAL_3:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_2]], %[[VAL_1]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK-DAG:     cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
 // CHECK-DAG:     func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 70b04e31030..751e29775a9 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -10,18 +10,13 @@
 
 module {
   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-    %0 = cc.string_literal "callee.modified_0" : !cc.ptr<!cc.array<i8 x 27>>
-    %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 27>>) -> !cc.ptr<i8>
-    %2 = call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-    %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
-    %4 = quake.alloca !quake.veq<?>[%3 : i64]
-    %5 = quake.init_state %4, %2 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-    return
+    %0 = cc.get_state "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.state>
+    %1 = call @__nvqpp_cudaq_state_numberOfQubits(%0) : (!cc.ptr<!cc.state>) -> i64
+    %2 = quake.alloca !quake.veq<?>[%1 : i64]
+    %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+return
   }
-
   func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
-  func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-
   func.func private @callee.modified_0() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
     %cst = arith.constant 1.5707963267948966 : f64
     %0 = quake.alloca !quake.veq<2>

From fe6d409ec21b0f72016690213dd5a3781d9c53cc Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 14:47:59 -0700
Subject: [PATCH 15/54] Remove print

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index 80907bfec1d..bdc18982840 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -74,7 +74,6 @@ class ReplaceStateWithKernelPattern
     if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
       if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
         auto *numOfQubits = alloca->getOperand(0).getDefiningOp();
-        stateOp.getDefiningOp()->dump();
 
         if (auto getState = stateOp.getDefiningOp<cudaq::cc::GetStateOp>()) {
           auto calleeName = getState.getCalleeName();

From 48704e3bcb648043ba9c1ccd7ecd056d620e88e6 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 14:50:08 -0700
Subject: [PATCH 16/54] Remove getCudaqState references

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp | 3 +--
 runtime/common/BaseRemoteRESTQPU.h         | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
index 04eac5b06f7..4de20fd7bef 100644
--- a/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
+++ b/lib/Optimizer/CodeGen/VerifyNVQIRCalls.cpp
@@ -49,8 +49,7 @@ struct VerifyNVQIRCallOpsPass
           cudaq::getNumQubitsFromCudaqState,
           cudaq::createCudaqStateFromDataFP32,
           cudaq::createCudaqStateFromDataFP64,
-          cudaq::deleteCudaqState,
-          cudaq::getCudaqState};
+          cudaq::deleteCudaqState};
       // It must be either NVQIR extension functions or in the allowed list.
       return std::find(NVQIR_FUNCS.begin(), NVQIR_FUNCS.end(), functionName) !=
                  NVQIR_FUNCS.end() ||
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 0421cde8774..0d9a5ddbc96 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -408,8 +408,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             (funcOp.getName().equals(cudaq::getNumQubitsFromCudaqState) ||
              funcOp.getName().equals(cudaq::createCudaqStateFromDataFP64) ||
              funcOp.getName().equals(cudaq::createCudaqStateFromDataFP32) ||
-             funcOp.getName().equals(cudaq::deleteCudaqState) ||
-             funcOp.getName().equals(cudaq::getCudaqState)))
+             funcOp.getName().equals(cudaq::deleteCudaqState)))
           moduleOp.push_back(funcOp.clone());
       }
       // Add any global symbols, including global constant arrays.

From 137f621febc0c607dbea69d25eba70e7bcb696ca Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 15:01:25 -0700
Subject: [PATCH 17/54] Minor updates

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/BaseRemoteRESTQPU.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 0d9a5ddbc96..5cf89c0332f 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -403,18 +403,21 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     for (auto &op : m_module.getOps()) {
       if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
         // Add function definitions for runtime functions that must
-        // be removed after synthesis in cleanup ops.
+        // be removed after synthesis in cleanup passes.
+        static const std::vector<llvm::StringRef> stateFuncs = {
+            cudaq::getNumQubitsFromCudaqState,
+            cudaq::createCudaqStateFromDataFP32,
+            cudaq::createCudaqStateFromDataFP64};
+
         if (funcOp.getBody().empty() &&
-            (funcOp.getName().equals(cudaq::getNumQubitsFromCudaqState) ||
-             funcOp.getName().equals(cudaq::createCudaqStateFromDataFP64) ||
-             funcOp.getName().equals(cudaq::createCudaqStateFromDataFP32) ||
-             funcOp.getName().equals(cudaq::deleteCudaqState)))
+            std::find(stateFuncs.begin(), stateFuncs.end(), funcOp.getName()) !=
+                stateFuncs.end())
           moduleOp.push_back(funcOp.clone());
       }
       // Add any global symbols, including global constant arrays.
       // Global constant arrays can be created during compilation,
       // `lift-array-alloc`, `argument-synthesis`, `quake-synthesizer`,
-      // and `get-concrete-matrix`passes.
+      // and `get-concrete-matrix` passes.
       if (auto globalOp = dyn_cast<cudaq::cc::GlobalOp>(op))
         moduleOp.push_back(globalOp.clone());
     }

From ad7c6bcd26a521f4401e4b46e97e09795a4f6333 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 22 Oct 2024 18:05:49 -0700
Subject: [PATCH 18/54] Fix failing quake test

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 test/Quake/replace_state_with_kernel.qke | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 751e29775a9..09570c62907 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -10,7 +10,7 @@
 
 module {
   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-    %0 = cc.get_state "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.state>
+    %0 = cc.get_state "callee.modified_0" : !cc.ptr<!cc.state>
     %1 = call @__nvqpp_cudaq_state_numberOfQubits(%0) : (!cc.ptr<!cc.state>) -> i64
     %2 = quake.alloca !quake.veq<?>[%1 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>

From 78c0a4423cfa5070082a405925420ed0d8f52484 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 4 Nov 2024 16:02:06 -0800
Subject: [PATCH 19/54] Add a few state-related cc ops

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Dialect/CC/CCOps.td  |  62 ++++++++++
 include/cudaq/Optimizer/Transforms/Passes.td |   5 +-
 lib/Frontend/nvqpp/ConvertExpr.cpp           |  13 +--
 lib/Optimizer/CodeGen/QuakeToCodegen.cpp     |  68 ++++++++++-
 lib/Optimizer/Transforms/DeleteStates.cpp    | 112 +++++++------------
 python/cudaq/kernel/ast_bridge.py            |   6 +-
 python/cudaq/kernel/kernel_builder.py        |  10 +-
 runtime/common/ArgumentConversion.cpp        |  23 +---
 runtime/cudaq/builder/kernel_builder.cpp     |  11 +-
 runtime/test/test_argument_conversion.cpp    |  20 +---
 test/AST-Quake/qalloc_state.cpp              |   9 +-
 test/Quake/delete_states.qke                 |  61 ++++------
 12 files changed, 218 insertions(+), 182 deletions(-)

diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
index a58e3d403d6..d58fc6bc335 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
@@ -898,6 +898,68 @@ def cc_AddressOfOp : CCOp<"address_of", [Pure,
   }];
 }
 
+def cc_CreateStateOp : CCOp<"create_state", [Pure] > {
+  let summary = "Create state from data";
+  let description = [{
+    This operation takes a pointer to state data and creates a quantum state.
+    The operation can be optimized away in DeleteStates pass, or replaced
+    by an intrinsic runtime call on simulators.
+
+    ```mlir
+      %0 = cc.create_state %data, %length : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
+    ```
+  }];
+
+  let arguments = (ins
+    AnyPointerType:$data,
+    AnySignlessInteger:$length
+  );
+  let results = (outs AnyPointerType:$result);
+  let assemblyFormat = [{
+      $data `,` $length `:` functional-type(operands, results) attr-dict
+  }];
+}
+
+def cc_GetNumberOfQubitsOp : CCOp<"get_number_of_qubits", [Pure] > {
+  let summary = "Get number of qubits from a quantum state";
+  let description = [{
+    This operation takes a state pointer argument and returns the number of
+    qubits in the state. The operation can be optimized away in some passes
+    like ReplaceStateWithKernel or DeleteStates, or replaced by an intrinsic
+    runtime call on simulators.
+
+    ```mlir
+      %0 = cc.get_number_of_qubits %state : (!cc.ptr<!cc.state>) -> i64
+    ```
+  }];
+
+  let arguments = (ins cc_PointerType:$state);
+  let results = (outs AnySignlessInteger:$result);
+  let assemblyFormat = [{
+      $state `:` functional-type(operands, results) attr-dict
+  }];
+}
+
+def cc_GetStateOp : CCOp<"get_state", [Pure] > {
+  let summary = "Get state from kernel with the provided name.";
+  let description = [{
+    This operation is created by argument synthesis of state pointer arguments
+    for quantum devices. It takes the kernel name as a string attribute
+    and returns the kernel's quantum state. The operation is replaced by a call
+    to the kernel with the provided name in the ReplaceStateWithKernel pass.
+
+    ```mlir
+      %0 = cc.get_state "callee" : !cc.ptr<!cc.state>
+    ```
+  }];
+
+  let arguments = (ins StrAttr:$calleeName);
+  let results = (outs cc_PointerType:$result);
+  let assemblyFormat = [{
+     $calleeName `:` qualified(type(results)) attr-dict
+  }];
+}
+
 def cc_GlobalOp : CCOp<"global", [IsolatedFromAbove, Symbol]> {
   let summary = "Create a global constant or variable";
   let description = [{
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index da6f3163b3e..04964037c18 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -752,9 +752,8 @@ def DeleteStates : Pass<"delete-states", "mlir::ModuleOp"> {
     func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
       %c8_i64 = arith.constant 8 : i64
       %0 = cc.address_of @foo.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-      %3 = cc.cast %0 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-      %4 = call @__nvqpp_cudaq_state_createFromData_fp32(%3, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-      %5 = call @__nvqpp_cudaq_state_numberOfQubits(%4) : (!cc.ptr<!cc.state>) -> i64
+      %4 = cc.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+      %5 = cc.get_number_of_qubits %4 : (!cc.ptr<!cc.state>) -> i64
       %6 = quake.alloca !quake.veq<?>[%5 : i64]
       %7 = quake.init_state %6, %4 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
index e6350d1c5c1..fa0fd326f10 100644
--- a/lib/Frontend/nvqpp/ConvertExpr.cpp
+++ b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -2694,19 +2694,12 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
             initials = load.getPtrvalue();
         }
         if (isStateType(initials.getType())) {
-          IRBuilder irBuilder(builder.getContext());
-          auto mod =
-              builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
-          auto result =
-              irBuilder.loadIntrinsic(mod, getNumQubitsFromCudaqState);
-          assert(succeeded(result) && "loading intrinsic should never fail");
           Value state = initials;
           auto i64Ty = builder.getI64Type();
-          auto numQubits = builder.create<func::CallOp>(
-              loc, i64Ty, getNumQubitsFromCudaqState, ValueRange{state});
+          auto numQubits =
+              builder.create<cudaq::cc::GetNumberOfQubitsOp>(loc, i64Ty, state);
           auto veqTy = quake::VeqType::getUnsized(ctx);
-          Value alloc = builder.create<quake::AllocaOp>(loc, veqTy,
-                                                        numQubits.getResult(0));
+          Value alloc = builder.create<quake::AllocaOp>(loc, veqTy, numQubits);
           return pushValue(builder.create<quake::InitializeStateOp>(
               loc, veqTy, alloc, state));
         }
diff --git a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
index e9e56f8f5fe..6e913a2bec2 100644
--- a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
+++ b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
@@ -8,6 +8,9 @@
 
 #include "QuakeToCodegen.h"
 #include "CodeGenOps.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/CodeGen/Passes.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
@@ -62,10 +65,73 @@ class ExpandComplexCast : public OpRewritePattern<cudaq::cc::CastOp> {
     return success();
   }
 };
+
+/// Lowers `cc.create_state` to a call to the runtime intrinsic that creates a
+/// state from data, picking the FP32 or FP64 variant from the element type.
+class CreateStateOpPattern : public OpRewritePattern<cudaq::cc::CreateStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(cudaq::cc::CreateStateOp createStateOp,
+                                PatternRewriter &rewriter) const override {
+    auto module = createStateOp->getParentOfType<ModuleOp>();
+    auto loc = createStateOp.getLoc();
+    auto ctx = createStateOp.getContext();
+    auto buffer = createStateOp.getOperand(0);
+    auto size = createStateOp.getOperand(1);
+
+    auto ptrTy = cast<cudaq::cc::PointerType>(buffer.getType());
+    auto arrTy = cast<cudaq::cc::ArrayType>(ptrTy.getElementType());
+    auto eleTy = arrTy.getElementType();
+    // For complex data the precision is that of the underlying scalar type.
+    if (auto cTy = dyn_cast<ComplexType>(eleTy))
+      eleTy = cTy.getElementType();
+    auto is64Bit = isa<Float64Type>(eleTy);
+
+    auto createStateFunc = is64Bit ? cudaq::createCudaqStateFromDataFP64
+                                   : cudaq::createCudaqStateFromDataFP32;
+    cudaq::IRBuilder irBuilder(ctx);
+    auto result = irBuilder.loadIntrinsic(module, createStateFunc);
+    assert(succeeded(result) && "loading intrinsic should never fail");
+
+    auto stateTy = cudaq::cc::StateType::get(ctx);
+    auto statePtrTy = cudaq::cc::PointerType::get(stateTy);
+    auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type());
+    auto cast = rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, buffer);
+    rewriter.replaceOpWithNewOp<func::CallOp>(
+        createStateOp, statePtrTy, createStateFunc, ValueRange{cast, size});
+    return success();
+  }
+};
+
+/// Lowers `cc.get_number_of_qubits` to a call to the runtime intrinsic
+/// named by `cudaq::getNumQubitsFromCudaqState`, returning an `i64`.
+class GetNumberOfQubitsOpPattern
+    : public OpRewritePattern<cudaq::cc::GetNumberOfQubitsOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(cudaq::cc::GetNumberOfQubitsOp getNumQubitsOp,
+                                PatternRewriter &rewriter) const override {
+    // Load the intrinsic into the parent module before emitting the call.
+    auto numQubitsFunc = cudaq::getNumQubitsFromCudaqState;
+    cudaq::IRBuilder irBuilder(getNumQubitsOp.getContext());
+    auto loaded = irBuilder.loadIntrinsic(
+        getNumQubitsOp->getParentOfType<ModuleOp>(), numQubitsFunc);
+    assert(succeeded(loaded) && "loading intrinsic should never fail");
+
+    rewriter.replaceOpWithNewOp<func::CallOp>(
+        getNumQubitsOp, rewriter.getI64Type(), numQubitsFunc,
+        getNumQubitsOp.getOperand());
+    return success();
+  }
+};
+
 } // namespace
 
 void cudaq::codegen::populateQuakeToCodegenPatterns(
     mlir::RewritePatternSet &patterns) {
   auto *ctx = patterns.getContext();
-  patterns.insert<CodeGenRAIIPattern, ExpandComplexCast>(ctx);
+  patterns.insert<CodeGenRAIIPattern, ExpandComplexCast, CreateStateOpPattern,
+                  GetNumberOfQubitsOpPattern>(ctx);
 }
diff --git a/lib/Optimizer/Transforms/DeleteStates.cpp b/lib/Optimizer/Transforms/DeleteStates.cpp
index 7cc7bca0444..74b3a432c23 100644
--- a/lib/Optimizer/Transforms/DeleteStates.cpp
+++ b/lib/Optimizer/Transforms/DeleteStates.cpp
@@ -29,104 +29,79 @@ namespace cudaq::opt {
 using namespace mlir;
 
 namespace {
-
-static bool isCall(Operation *callOp, std::vector<const char *> &&names) {
-  if (callOp) {
-    if (auto createStateCall = dyn_cast<func::CallOp>(callOp)) {
-      if (auto calleeAttr = createStateCall.getCalleeAttr()) {
-        auto funcName = calleeAttr.getValue().str();
-        if (std::find(names.begin(), names.end(), funcName) != names.end())
-          return true;
-      }
-    }
-  }
-  return false;
-}
-
-static bool isCreateStateCall(Operation *callOp) {
-  return isCall(callOp, {cudaq::createCudaqStateFromDataFP64,
-                         cudaq::createCudaqStateFromDataFP32});
-}
-
-static bool isNumberOfQubitsCall(Operation *callOp) {
-  return isCall(callOp, {cudaq::getNumQubitsFromCudaqState});
-}
-
-/// For a call to `__nvqpp_cudaq_state_createFromData_fpXX`, get the number of
-/// qubits allocated.
-static std::size_t getStateSize(Operation *callOp) {
-  if (isCreateStateCall(callOp)) {
-    if (auto createStateCall = dyn_cast<func::CallOp>(callOp)) {
-      auto sizeOperand = createStateCall.getOperand(1);
-      auto defOp = sizeOperand.getDefiningOp();
-      while (defOp && !dyn_cast<arith::ConstantIntOp>(defOp))
-        defOp = defOp->getOperand(0).getDefiningOp();
-      if (auto constOp = dyn_cast<arith::ConstantIntOp>(defOp))
-        return constOp.getValue().cast<IntegerAttr>().getInt();
-    }
+/// For a `cc:CreateStateOp`, get the number of qubits allocated.
+static std::size_t getStateSize(Operation *op) {
+  if (auto createStateOp = dyn_cast<cudaq::cc::CreateStateOp>(op)) {
+    auto sizeOperand = createStateOp.getOperand(1);
+    auto defOp = sizeOperand.getDefiningOp();
+    while (defOp && !dyn_cast<arith::ConstantIntOp>(defOp))
+      defOp = defOp->getOperand(0).getDefiningOp();
+    if (auto constOp = dyn_cast<arith::ConstantIntOp>(defOp))
+      return constOp.getValue().cast<IntegerAttr>().getInt();
   }
-  callOp->emitError("Cannot compute number of qubits");
+  op->emitError("Cannot compute number of qubits from createStateOp");
   return 0;
 }
 
 // clang-format off
-/// Remove `__nvqpp_cudaq_state_numberOfQubits` calls.
+/// Replace `cc.get_number_of_qubits` by a constant.
 /// ```
-/// %1 = arith.constant 8 : i64
-/// %2 = call @__nvqpp_cudaq_state_createFromData_fp32(%0, %1) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-/// %3 = call @__nvqpp_cudaq_state_numberOfQubits(%2) : (!cc.ptr<!cc.state>) -> i64
+/// %c8_i64 = arith.constant 8 : i64
+/// %2 = cc.create_state %3, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %3 = cc.get_number_of_qubits %2 : i64
 /// ...
 /// ───────────────────────────────────────────
-/// %1 = arith.constant 8 : i64
-/// %2 = call @__nvqpp_cudaq_state_createFromData_fp32(%0, %1) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-/// %5 = arith.constant 3 : i64
+/// %c8_i64 = arith.constant 8 : i64
+/// %2 = cc.create_state %3, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %3 = arith.constant 3 : i64
 /// ```
 // clang-format on
-class NumberOfQubitsPattern : public OpRewritePattern<func::CallOp> {
+class NumberOfQubitsPattern
+    : public OpRewritePattern<cudaq::cc::GetNumberOfQubitsOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(func::CallOp callOp,
+  LogicalResult matchAndRewrite(cudaq::cc::GetNumberOfQubitsOp op,
                                 PatternRewriter &rewriter) const override {
-    if (isNumberOfQubitsCall(callOp)) {
-      auto createStateOp = callOp.getOperand(0).getDefiningOp();
-      if (isCreateStateCall(createStateOp)) {
-        auto size = getStateSize(createStateOp);
-        rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(
-            callOp, std::countr_zero(size), rewriter.getI64Type());
-        return success();
-      }
+    auto stateOp = op.getOperand();
+    if (auto createStateOp =
+            stateOp.getDefiningOp<cudaq::cc::CreateStateOp>()) {
+      auto size = getStateSize(createStateOp);
+      rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(
+          op, std::countr_zero(size), rewriter.getI64Type());
+      return success();
     }
     return failure();
   }
 };
 
 // clang-format off
-/// Replace calls to `__nvqpp_cudaq_state_numberOfQubits` by a constant.
+/// Remove `cc.create_state` instructions and pass their data directly to
+/// the `quake.init_state` instruction instead.
 /// ```
 /// %2 = cc.cast %1 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-/// %3 = call @__nvqpp_cudaq_state_createFromData_fp32(%2, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+/// %3 = cc.create_state %2, %c8_i64 : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
 /// %4 = quake.alloca !quake.veq<?>[%0 : i64]
 /// %5 = quake.init_state %4, %3 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
 /// ...
-/// %3 = call @__nvqpp_cudaq_state_createFromData_fp32(%2, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
 /// %4 = quake.alloca !quake.veq<?>[%0 : i64]
 /// %5 = quake.init_state %4, %1 : (!quake.veq<?>, !cc.ptr<!cc.array<complex<f32> x 8>>) -> !quake.veq<?>
 /// ```
 // clang-format on
+
 class StateToDataPattern : public OpRewritePattern<quake::InitializeStateOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
                                 PatternRewriter &rewriter) const override {
-    auto stateOp = initState.getOperand(1).getDefiningOp();
+    auto state = initState.getOperand(1);
     auto targets = initState.getTargets();
 
-    if (isCreateStateCall(stateOp)) {
-      auto dataOp = stateOp->getOperand(0);
-      if (auto cast = dyn_cast<cudaq::cc::CastOp>(dataOp.getDefiningOp()))
+    if (auto createStateOp = state.getDefiningOp<cudaq::cc::CreateStateOp>()) {
+      auto dataOp = createStateOp->getOperand(0);
+      if (auto cast = dataOp.getDefiningOp<cudaq::cc::CastOp>())
         dataOp = cast.getOperand();
       rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
           initState, targets.getType(), targets, dataOp);
@@ -163,10 +138,8 @@ class DeleteStatesPass
       llvm::SmallVector<Operation *> usedStates;
 
       func.walk([&](Operation *op) {
-        if (isCreateStateCall(op)) {
-          if (op->getUses().empty())
-            op->erase();
-          else
+        if (isa<cudaq::cc::CreateStateOp>(op)) {
+          if (!op->getUses().empty())
             usedStates.push_back(op);
         }
       });
@@ -178,15 +151,16 @@ class DeleteStatesPass
         func.walk([&](Operation *op) {
           if (isa<func::ReturnOp>(op)) {
             auto loc = op->getLoc();
-            auto deleteState = cudaq::deleteCudaqState;
-            auto result = irBuilder.loadIntrinsic(module, deleteState);
+            auto result =
+                irBuilder.loadIntrinsic(module, cudaq::deleteCudaqState);
             assert(succeeded(result) && "loading intrinsic should never fail");
 
             builder.setInsertionPoint(op);
             for (auto createStateOp : usedStates) {
-              auto results = cast<func::CallOp>(createStateOp).getResults();
-              builder.create<func::CallOp>(loc, std::nullopt, deleteState,
-                                           results);
+              auto result = cast<cudaq::cc::CreateStateOp>(createStateOp);
+              builder.create<func::CallOp>(loc, std::nullopt,
+                                           cudaq::deleteCudaqState,
+                                           mlir::ValueRange{result});
             }
           }
         });
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index cae278143f5..13d694d7bca 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -2246,11 +2246,9 @@ def bodyBuilder(iterVal):
                         # handle `cudaq.qvector(state)`
                         statePtr = self.ifNotPointerThenStore(valueOrPtr)
 
-                        symName = '__nvqpp_cudaq_state_numberOfQubits'
-                        load_intrinsic(self.module, symName)
                         i64Ty = self.getIntegerType()
-                        numQubits = func.CallOp([i64Ty], symName,
-                                                [statePtr]).result
+                        numQubits = cc.GetNumberOfQubitsOp(i64Ty,
+                                                           statePtr).result
 
                         veqTy = quake.VeqType.get(self.ctx)
                         qubits = quake.AllocaOp(veqTy, size=numQubits).result
diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py
index e8d6345ffbc..9f528acfeea 100644
--- a/python/cudaq/kernel/kernel_builder.py
+++ b/python/cudaq/kernel/kernel_builder.py
@@ -777,10 +777,8 @@ def qalloc(self, initializer=None):
             if isinstance(initializer, cudaq_runtime.State):
                 statePtr = self.capturedDataStorage.storeCudaqState(initializer)
 
-                symName = '__nvqpp_cudaq_state_numberOfQubits'
-                load_intrinsic(self.module, symName)
                 i64Ty = self.getIntegerType()
-                numQubits = func.CallOp([i64Ty], symName, [statePtr]).result
+                numQubits = cc.GetNumberOfQubitsOp(i64Ty, statePtr).result
 
                 veqTy = quake.VeqType.get(self.ctx)
                 qubits = quake.AllocaOp(veqTy, size=numQubits).result
@@ -816,11 +814,9 @@ def qalloc(self, initializer=None):
                     if cc.StateType.isinstance(valueTy):
                         statePtr = initializer.mlirValue
 
-                        symName = '__nvqpp_cudaq_state_numberOfQubits'
-                        load_intrinsic(self.module, symName)
                         i64Ty = self.getIntegerType()
-                        numQubits = func.CallOp([i64Ty], symName,
-                                                [statePtr]).result
+                        numQubits = cc.GetNumberOfQubitsOp(i64Ty,
+                                                           statePtr).result
 
                         veqTy = quake.VeqType.get(self.ctx)
                         qubits = quake.AllocaOp(veqTy, size=numQubits).result
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 0de2589752f..09ddb9c74bc 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -130,33 +130,18 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
       std::string name =
           kernelName.str() + ".rodata_synth_" + std::to_string(counter++);
       irBuilder.genVectorOfConstants(loc, substMod, name, vec);
-      auto conGlobal = builder.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
-      return builder.create<cudaq::cc::LoadOp>(loc, arrTy, conGlobal);
+      return builder.create<cudaq::cc::AddressOfOp>(loc, ptrTy, name);
     };
 
-    auto conArr = is64Bit ? genConArray.template operator()<double>()
+    auto buffer = is64Bit ? genConArray.template operator()<double>()
                           : genConArray.template operator()<float>();
 
-    auto createState = is64Bit ? cudaq::createCudaqStateFromDataFP64
-                               : cudaq::createCudaqStateFromDataFP32;
-    auto result = irBuilder.loadIntrinsic(substMod, createState);
-    assert(succeeded(result) && "loading intrinsic should never fail");
-
     auto arrSize = builder.create<arith::ConstantIntOp>(loc, size, 64);
     auto stateTy = cudaq::cc::StateType::get(ctx);
     auto statePtrTy = cudaq::cc::PointerType::get(stateTy);
-    auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
-    auto buffer = builder.create<cudaq::cc::AllocaOp>(loc, arrTy);
-    builder.create<cudaq::cc::StoreOp>(loc, conArr, buffer);
-
-    auto cast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, buffer);
-    auto statePtr = builder
-                        .create<func::CallOp>(loc, statePtrTy, createState,
-                                              ValueRange{cast, arrSize})
-                        .getResult(0);
 
-    // TODO: Delete the new state before function exit.
-    return builder.create<cudaq::cc::CastOp>(loc, statePtrTy, statePtr);
+    return builder.create<cudaq::cc::CreateStateOp>(loc, statePtrTy, buffer,
+                                                    arrSize);
   }
   // The program is executed on quantum hardware, state data is not
   // available and needs to be regenerated.
diff --git a/runtime/cudaq/builder/kernel_builder.cpp b/runtime/cudaq/builder/kernel_builder.cpp
index 6961cc547f7..ebf10a6978f 100644
--- a/runtime/cudaq/builder/kernel_builder.cpp
+++ b/runtime/cudaq/builder/kernel_builder.cpp
@@ -514,16 +514,11 @@ QuakeValue qalloc(ImplicitLocOpBuilder &builder, QuakeValue &sizeOrVec) {
     auto eleTy = statePtrTy.getElementType();
     if (auto stateTy = dyn_cast<cc::StateType>(eleTy)) {
       // get the number of qubits
-      IRBuilder irBuilder(context);
-      auto mod = builder.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
-      auto result = irBuilder.loadIntrinsic(mod, getNumQubitsFromCudaqState);
-      assert(succeeded(result) && "loading intrinsic should never fail");
-      auto numQubits = builder.create<func::CallOp>(
-          builder.getI64Type(), getNumQubitsFromCudaqState, ValueRange{value});
+      auto numQubits = builder.create<cudaq::cc::GetNumberOfQubitsOp>(
+          builder.getI64Type(), value);
       // allocate the number of qubits we need
       auto veqTy = quake::VeqType::getUnsized(context);
-      Value qubits =
-          builder.create<quake::AllocaOp>(veqTy, numQubits.getResult(0));
+      Value qubits = builder.create<quake::AllocaOp>(veqTy, numQubits);
       // Add the initialize state op
       qubits = builder.create<quake::InitializeStateOp>(qubits.getType(),
                                                         qubits, value);
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 9fe3d92f8fb..1326ac4d395 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -380,16 +380,10 @@ void test_state(mlir::MLIRContext *ctx) {
 
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_4]], %[[VAL_2]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
+// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>, i64) -> !cc.ptr<!cc.state>
 // CHECK:        }
 // CHECK-DAG:    cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
-// CHECK-DAG:    func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
   // clang-format on
 }
 
@@ -490,16 +484,10 @@ void test_combinations(mlir::MLIRContext *ctx) {
 // CHECK:         }
 // CHECK-LABEL:   cc.arg_subst[1] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f64> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f64> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f64> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = func.call @__nvqpp_cudaq_state_createFromData_fp64(%[[VAL_4]], %[[VAL_2]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
+// CHECK:           %[[VAL_5:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK-DAG:     cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
-// CHECK-DAG:     func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
 // CHECK-LABEL:   cc.arg_subst[2] {
 // CHECK:           %[[VAL_0:.*]] = cc.alloca !cc.array<!cc.charspan x 2>
 // CHECK:           %[[VAL_1:.*]] = cc.address_of @cstr.585800 : !cc.ptr<!llvm.array<3 x i8>>
diff --git a/test/AST-Quake/qalloc_state.cpp b/test/AST-Quake/qalloc_state.cpp
index 191c9c3a305..822f1e1f567 100644
--- a/test/AST-Quake/qalloc_state.cpp
+++ b/test/AST-Quake/qalloc_state.cpp
@@ -20,7 +20,7 @@ struct Eins {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Eins(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_0]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -34,7 +34,7 @@ struct Zwei {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Zwei(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_0]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -48,7 +48,7 @@ struct Drei {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Drei(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_0]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
@@ -62,8 +62,7 @@ struct Vier {
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__Vier(
 // CHECK-SAME:      %[[VAL_0:.*]]: !cc.ptr<!cc.state>) -> !cc.stdvec<i1>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_0]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = cc.get_number_of_qubits %[[VAL_0]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_0]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 
-// CHECK: func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
diff --git a/test/Quake/delete_states.qke b/test/Quake/delete_states.qke
index caa7cca6218..bc9c3e1d474 100644
--- a/test/Quake/delete_states.qke
+++ b/test/Quake/delete_states.qke
@@ -12,33 +12,24 @@ module {
   func.func @__nvqpp__mlirgen__function_test_state_param._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %c8_i64 = arith.constant 8 : i64
     %0 = cc.address_of @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %1 = cc.load %0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %2 = cc.alloca !cc.array<complex<f32> x 8>
-    cc.store %1, %2 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %3 = cc.cast %2 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-    %4 = call @__nvqpp_cudaq_state_createFromData_fp32(%3, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-    %5 = call @__nvqpp_cudaq_state_numberOfQubits(%4) : (!cc.ptr<!cc.state>) -> i64
-    %6 = quake.alloca !quake.veq<?>[%5 : i64]
-    %7 = quake.init_state %6, %4 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    %1 = cc.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+    %2 = cc.get_number_of_qubits %1 : (!cc.ptr<!cc.state>) -> i64
+    %3 = quake.alloca !quake.veq<?>[%2 : i64]
+    %4 = quake.init_state %3, %1 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
   }
-  func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
-  cc.global constant @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00
-,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
-  func.func private @__nvqpp_cudaq_state_createFromData_fp32(!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
+  cc.global constant @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
 // CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_state_param._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_1:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_2:.*]] = cc.alloca !cc.array<complex<f32> x 8>
-// CHECK:           cc.store %[[VAL_1]], %[[VAL_2]] : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_3:.*]] = quake.alloca !quake.veq<3>
-// CHECK:           %[[VAL_4:.*]] = quake.init_state %[[VAL_3]], %[[VAL_2]] : (!quake.veq<3>, !cc.ptr<!cc.array<complex<f32> x 8>>) -> !quake.veq<3>
+// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<3>
+// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_0]] : (!quake.veq<3>, !cc.ptr<!cc.array<complex<f32> x 8>>) -> !quake.veq<3>
+// CHECK:           return
 // CHECK:         }
-// CHECK-DAG:     cc.global constant @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
+// CHECK-DAG:    cc.global constant @function_test_state_param._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
   func.func @__nvqpp__mlirgen__sub_kernel(%arg : !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
-    %0 = call @__nvqpp_cudaq_state_numberOfQubits(%arg) : (!cc.ptr<!cc.state>) -> i64
+    %0 = cc.get_number_of_qubits %arg : (!cc.ptr<!cc.state>) -> i64
     %1 = quake.alloca !quake.veq<?>[%0 : i64]
     %2 = quake.init_state %1, %arg : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
@@ -47,38 +38,28 @@ module {
   func.func @__nvqpp__mlirgen__function_test_state_param1._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %c8_i64 = arith.constant 8 : i64
     %0 = cc.address_of @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %1 = cc.load %0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %2 = cc.alloca !cc.array<complex<f32> x 8>
-    cc.store %1, %2 : !cc.ptr<!cc.array<complex<f32> x 8>>
-    %3 = cc.cast %2 : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-    %4 = call @__nvqpp_cudaq_state_createFromData_fp32(%3, %c8_i64) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-    call @__nvqpp__mlirgen__sub_kernel(%4) : (!cc.ptr<!cc.state>) -> ()
+    %1 = cc.create_state %0, %c8_i64 : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+    call @__nvqpp__mlirgen__sub_kernel(%1) : (!cc.ptr<!cc.state>) -> ()
     return
   }
 
   cc.global constant @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00
 ,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 
-// CHECK:         func.func @__nvqpp__mlirgen__sub_kernel(%[[VAL_ARG:.*]]: !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_ARG]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:         func.func @__nvqpp__mlirgen__sub_kernel(%arg0: !cc.ptr<!cc.state>) attributes {"cudaq-kernel", no_this} {
+// CHECK:           %[[VAL_0:.*]] = cc.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<?>[%[[VAL_0]] : i64]
-// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %[[VAL_ARG]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 // CHECK:           return
-// CHECK:          }
-
-// CHECK-LABEL:   func.func @__nvqpp__mlirgen__function_test_state_param1._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK:         }
+// CHECK:         func.func @__nvqpp__mlirgen__function_test_state_param1._Z16test_state_paramPN5cudaq5stateE() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 8 : i64
 // CHECK:           %[[VAL_1:.*]] = cc.address_of @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_3:.*]] = cc.alloca !cc.array<complex<f32> x 8>
-// CHECK:           cc.store %[[VAL_2]], %[[VAL_3]] : !cc.ptr<!cc.array<complex<f32> x 8>>
-// CHECK:           %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<complex<f32> x 8>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_5:.*]] = call @__nvqpp_cudaq_state_createFromData_fp32(%[[VAL_4]], %[[VAL_0]]) : (!cc.ptr<i8>, i64) -> !cc.ptr<!cc.state>
-// CHECK:           call @__nvqpp__mlirgen__sub_kernel(%[[VAL_5]]) : (!cc.ptr<!cc.state>) -> ()
-// CHECK:           call @__nvqpp_cudaq_state_delete(%[[VAL_5]]) : (!cc.ptr<!cc.state>) -> ()
+// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_1]], %[[VAL_0]] : (!cc.ptr<!cc.array<complex<f32> x 8>>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           call @__nvqpp__mlirgen__sub_kernel(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> ()
+// CHECK:           call @__nvqpp_cudaq_state_delete(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> ()
 // CHECK:           return
 // CHECK:         }
-// CHECK-DAG:     constant @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
+// CHECK-DAG:     cc.global constant @function_test_state_param1._Z16test_state_paramPN5cudaq5stateE.rodata_synth_0 (dense<[(0.707106769,0.000000e+00), (0.707106769,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f32>>) : !cc.array<complex<f32> x 8>
 // CHECK-DAG:     func.func private @__nvqpp_cudaq_state_delete(!cc.ptr<!cc.state>)
 }
-

From 102f8196fef4393441f42c13a40961c05ba34ea7 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 5 Nov 2024 09:51:04 -0800
Subject: [PATCH 20/54] Fix test_argument_conversion

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/test/test_argument_conversion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 1326ac4d395..7c8e9f42053 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -381,7 +381,7 @@ void test_state(mlir::MLIRContext *ctx) {
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = cc.address_of @[[VAL_GC:.*]] : !cc.ptr<!cc.array<complex<f64> x 8>>
 // CHECK:           %[[VAL_1:.*]] = arith.constant 8 : i64
-// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>, i64) -> !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_2:.*]] = cc.create_state %[[VAL_0]], %[[VAL_1]] : (!cc.ptr<!cc.array<complex<f64> x 8>>, i64) -> !cc.ptr<!cc.state>
 // CHECK:        }
 // CHECK-DAG:    cc.global constant @[[VAL_GC]] (dense<[(0.70710678118654757,0.000000e+00), (0.70710678118654757,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00), (0.000000e+00,0.000000e+00)]> : tensor<8xcomplex<f64>>) : !cc.array<complex<f64> x 8>
   // clang-format on

From 5ea1d973daf78890ee7f4ad2b780f9adca868d42 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 5 Nov 2024 10:00:57 -0800
Subject: [PATCH 21/54] Add printing in failing tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/tests/kernel/test_kernel_qvector_state_init.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/tests/kernel/test_kernel_qvector_state_init.py b/python/tests/kernel/test_kernel_qvector_state_init.py
index 18fa3914b3e..c832cd64836 100644
--- a/python/tests/kernel/test_kernel_qvector_state_init.py
+++ b/python/tests/kernel/test_kernel_qvector_state_init.py
@@ -32,11 +32,18 @@ def test_kernel_synthesis_complex():
     def kernel(vec: cudaq.State):
         q = cudaq.qvector(vec)
 
+    counts = cudaq.sample(kernel, state)
+    print(f"Non-synthesized: ${counts}")
+    assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
+
     synthesized = cudaq.synthesize(kernel, state)
     counts = cudaq.sample(synthesized)
-    print(counts)
-    assert '10' in counts
+    print(f"Synthesized: ${counts}")
     assert '00' in counts
+    assert '10' in counts
+    assert len(counts) == 2
 
 
 # float

From 074c60f778f9dc49995199903d99fe3f83eff41b Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 5 Nov 2024 10:02:38 -0800
Subject: [PATCH 22/54] Add printing in failing tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/tests/kernel/test_kernel_qvector_state_init.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/tests/kernel/test_kernel_qvector_state_init.py b/python/tests/kernel/test_kernel_qvector_state_init.py
index c832cd64836..64c1ef55d4e 100644
--- a/python/tests/kernel/test_kernel_qvector_state_init.py
+++ b/python/tests/kernel/test_kernel_qvector_state_init.py
@@ -34,6 +34,7 @@ def kernel(vec: cudaq.State):
 
     counts = cudaq.sample(kernel, state)
     print(f"Non-synthesized: ${counts}")
+    print(kernel)
     assert '00' in counts
     assert '10' in counts
     assert len(counts) == 2
@@ -41,6 +42,7 @@ def kernel(vec: cudaq.State):
     synthesized = cudaq.synthesize(kernel, state)
     counts = cudaq.sample(synthesized)
     print(f"Synthesized: ${counts}")
+    print(synthesized)
     assert '00' in counts
     assert '10' in counts
     assert len(counts) == 2

From 310f6ca48e0f458b23accbb84125ecca0591b902 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 12 Nov 2024 10:06:42 -0800
Subject: [PATCH 23/54] Fix failing tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 lib/Optimizer/CodeGen/QuakeToCodegen.cpp      |  2 +-
 python/cudaq/kernel/ast_bridge.py             |  3 +++
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  1 +
 .../kernel/test_kernel_qvector_state_init.py  | 24 ++++++++-----------
 runtime/common/BaseRemoteRESTQPU.h            |  1 +
 5 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
index 6e913a2bec2..6774847bf80 100644
--- a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
+++ b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp
@@ -85,7 +85,7 @@ class CreateStateOpPattern : public OpRewritePattern<cudaq::cc::CreateStateOp> {
     auto is64Bit = isa<Float64Type>(eleTy);
 
     if (auto cTy = dyn_cast<ComplexType>(eleTy))
-      is64Bit = isa<Float64Type>(eleTy);
+      is64Bit = isa<Float64Type>(cTy.getElementType());
 
     auto createStateFunc = is64Bit ? cudaq::createCudaqStateFromDataFP64
                                    : cudaq::createCudaqStateFromDataFP32;
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 13d694d7bca..ffd930bf72a 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -3829,6 +3829,9 @@ def visit_Name(self, node):
                 if cc.StdvecType.isinstance(eleTy):
                     self.pushValue(value)
                     return
+                if cc.StateType.isinstance(eleTy):
+                    self.pushValue(value)
+                    return
                 loaded = cc.LoadOp(value).result
                 self.pushValue(loaded)
             elif cc.CallableType.isinstance(
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 90ba42b6171..b995f71f1ac 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -535,6 +535,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   pm.addNestedPass<func::FuncOp>(
       cudaq::opt::createArgumentSynthesisPass(kernels, substs));
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addPass(opt::createDeleteStates());
 
   // Run state preparation for quantum devices (or their emulation) only.
   // Simulators have direct implementation of state initialization
diff --git a/python/tests/kernel/test_kernel_qvector_state_init.py b/python/tests/kernel/test_kernel_qvector_state_init.py
index 64c1ef55d4e..84a3a603f12 100644
--- a/python/tests/kernel/test_kernel_qvector_state_init.py
+++ b/python/tests/kernel/test_kernel_qvector_state_init.py
@@ -33,16 +33,12 @@ def kernel(vec: cudaq.State):
         q = cudaq.qvector(vec)
 
     counts = cudaq.sample(kernel, state)
-    print(f"Non-synthesized: ${counts}")
-    print(kernel)
     assert '00' in counts
     assert '10' in counts
     assert len(counts) == 2
 
     synthesized = cudaq.synthesize(kernel, state)
     counts = cudaq.sample(synthesized)
-    print(f"Synthesized: ${counts}")
-    print(synthesized)
     assert '00' in counts
     assert '10' in counts
     assert len(counts) == 2
@@ -55,7 +51,7 @@ def kernel(vec: cudaq.State):
 def test_kernel_float_params_f64():
 
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     f = np.array([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)], dtype=float)
 
@@ -85,7 +81,7 @@ def test_kernel_float_params_f32():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex_params_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=complex)
@@ -104,7 +100,7 @@ def kernel(vec: cudaq.State):
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex128_params_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=np.complex128)
@@ -123,7 +119,7 @@ def kernel(vec: cudaq.State):
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex64_params_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=np.complex64)
@@ -181,7 +177,7 @@ def test_kernel_complex_params_f32():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=complex)
@@ -200,7 +196,7 @@ def kernel():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex128_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=np.complex128)
@@ -219,7 +215,7 @@ def kernel():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_complex128_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=np.complex64)
@@ -280,7 +276,7 @@ def test_kernel_complex_capture_f32():
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_complex_params_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=cudaq.complex())
@@ -318,7 +314,7 @@ def kernel(vec: cudaq.State):
 @skipIfNvidiaFP64NotInstalled
 def test_kernel_simulation_dtype_capture_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     c = np.array([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)],
                  dtype=cudaq.complex())
@@ -359,7 +355,7 @@ def kernel():
 @skipIfNvidiaFP64NotInstalled
 def test_init_from_other_kernel_state_f64():
     cudaq.reset_target()
-    cudaq.set_target('nvidia-fp64')
+    cudaq.set_target('nvidia', option='fp64')
 
     @cudaq.kernel
     def bell():
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 0834bc7e3e9..84eb527ebb5 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -444,6 +444,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         mlir::SmallVector<mlir::StringRef> substs = {substBuff};
         pm.addNestedPass<mlir::func::FuncOp>(
             opt::createArgumentSynthesisPass(kernels, substs));
+        pm.addPass(opt::createDeleteStates());
       } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));

From 6fdccbadae996044512dc9453b8b94a3323d2f7c Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 12 Nov 2024 14:32:16 -0800
Subject: [PATCH 24/54] Add description for new algorithm for state synthesis

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../Transforms/ReplaceStateWithKernel.cpp     | 23 +----
 runtime/common/ArgumentConversion.cpp         | 83 ++++++++++++++++++-
 test/Quake/replace_state_with_kernel.qke      |  5 +-
 3 files changed, 86 insertions(+), 25 deletions(-)

diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index ec7d5c25f71..c9b46205a02 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -29,31 +29,13 @@ namespace cudaq::opt {
 using namespace mlir;
 
 namespace {
-
-static bool isCall(Operation *op, std::vector<const char *> &&names) {
-  if (op) {
-    if (auto callOp = dyn_cast<func::CallOp>(op)) {
-      if (auto calleeAttr = callOp.getCalleeAttr()) {
-        auto funcName = calleeAttr.getValue().str();
-        if (std::find(names.begin(), names.end(), funcName) != names.end())
-          return true;
-      }
-    }
-  }
-  return false;
-}
-
-static bool isNumberOfQubitsCall(Operation *op) {
-  return isCall(op, {cudaq::getNumQubitsFromCudaqState});
-}
-
 // clang-format off
 /// Replace `quake.init_state` by a call to a (modified) kernel that produced
 /// the state.
 ///
 /// ```
 ///  %0 = cc.get_state "__nvqpp__mlirgen__test_init_state.modified_0" : !cc.ptr<!cc.state>
-///  %1 = call @__nvqpp_cudaq_state_numberOfQubits(%0) : (!cc.ptr<!cc.state>) -> i64
+///  %1 = cc.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
 ///  %2 = quake.alloca !quake.veq<?>[%1 : i64]
 ///  %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
@@ -87,7 +69,8 @@ class ReplaceStateWithKernelPattern
                 "Failed to remove `quake.alloca` in state synthesis");
             return failure();
           }
-          if (isNumberOfQubitsCall(numOfQubits)) {
+
+          if (isa<cudaq::cc::GetNumberOfQubitsOp>(numOfQubits)) {
             if (numOfQubits->getUses().empty())
               rewriter.eraseOp(numOfQubits);
             else {
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index bf903e02786..72ba288e586 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -160,8 +160,87 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
                                                     arrSize);
   }
 
-  // For quantum hardware, replace states with calls to kernels that generated
-  // them.
+  // For quantum hardware, we aim at replacing states with calls to kernels
+  // that generated them. This is done in 2 stages:
+  //
+  // 1. Replace state by cc.get_state instruction during argument conversion:
+  //
+  // Create two functions:
+  // - callee.num_qubits_N
+  //    Calculates the number of qubits needed for the veq allocation
+  // - callee.init_state_N
+  //    Initializes the veq passed as a parameter
+  //
+  // Then replace the state with
+  //   `cc.get_state "callee.num_qubits_0" "callee.init_state_0"`:
+  //
+  // ```
+  // func.func @caller(%arg0: !cc.ptr<!cc.state>) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  //   %1 = cc.get_number_of_qubits %arg0: (!cc.ptr<!cc.state>) -> i64
+  //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+  //   %3 = quake.init_state %2, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+  //   return
+  // }
+  //
+  // func.func private @callee(%arg0: i64) attributes {"cudaq-kernel"} {
+  //   %cst = arith.constant 1.5707963267948966 : f64
+  //   %0 = quake.alloca !quake.veq<?>[%arg0 : i64]
+  //   %1 = quake.extract_ref %0[0] : (!quake.veq<2>) -> !quake.ref
+  //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+  //   return
+  // }
+  //
+  // Call from the user host code:
+  // state = cudaq.get_state(callee, 2)
+  // counts = cudaq.sample(caller, state)
+  // ```
+  //
+  // => after argument synthesis:
+  //
+  // ```
+  // func.func @caller() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  //   %0 = cc.get_state "callee.num_qubits_0" "callee.init_state_0" : !cc.ptr<!cc.state>
+  //   %1 = cc.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
+  //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+  //   %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+  //   return
+  // }
+  //
+  // func.func private @callee.num_qubits_0(%arg0: !quake.veq<?>) -> i64 attributes {"cudaq-kernel"} {
+  //   %cst = arith.constant 2 : i64
+  //   return %cst : i64
+  // }
+  //
+  // func.func private @callee.init_state_0(%arg0: !quake.veq<?>) attributes {"cudaq-kernel"} {
+  //   %cst = arith.constant 1.5707963267948966 : f64
+  //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
+  //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+  //   return
+  // }
+  // ```
+  //
+  // 2. Replace the `cc.get_state` ops with calls to the generated functions
+  //    synthesized with the arguments used to create the state:
+  //
+  // After ReplaceStateWithKernel pass:
+  //
+  // func.func @caller() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  //   %1 = call "callee.num_qubits_0" : () -> i64
+  //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+  //   call "callee.init_0" %2: (!quake.veq<?>) -> ()
+  // }
+  //
+  // func.func private @callee.get_number_of_qubits_0(%arg0: !quake.veq<?>) -> i64 attributes {"cudaq-kernel"} {
+  //   %cst = arith.constant 2 : i64
+  //   return %cst : i64
+  // }
+  //
+  // func.func private @callee.init_0(%arg0: !quake.veq<?>) attributes {"cudaq-kernel"} {
+  //   %cst = arith.constant 1.5707963267948966 : f64
+  //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
+  //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+  //   return
+  // }
   if (simState->getKernelInfo().has_value()) {
     auto [calleeName, calleeArgs] = simState->getKernelInfo().value();
 
diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 09570c62907..3fa8b62d7dd 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -11,12 +11,11 @@
 module {
   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
     %0 = cc.get_state "callee.modified_0" : !cc.ptr<!cc.state>
-    %1 = call @__nvqpp_cudaq_state_numberOfQubits(%0) : (!cc.ptr<!cc.state>) -> i64
+    %1 = cc.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
     %2 = quake.alloca !quake.veq<?>[%1 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-return
+    return
   }
-  func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
   func.func private @callee.modified_0() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
     %cst = arith.constant 1.5707963267948966 : f64
     %0 = quake.alloca !quake.veq<2>

From 1dfa8058fb0ec3e8be3ca99e3f470653dbb8fbe9 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 9 Jan 2025 11:46:58 -0800
Subject: [PATCH 25/54] Fix tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.h   |   4 +-
 .../Transforms/ArgumentSynthesis.cpp          |  13 +-
 runtime/common/ArgumentConversion.cpp         |  24 +-
 runtime/common/BaseRemoteRESTQPU.h            |  21 +-
 .../Remote-Sim/qvector_init_from_state.cpp    | 194 +++++++-------
 .../qvector_init_from_state_lazy.cpp          | 245 ++++++++++--------
 .../execution/qvector_init_from_state.cpp     |  91 ++++---
 test/Quake/arg_subst-5.txt                    |   6 +-
 test/Quake/arg_subst_func.qke                 |  21 +-
 9 files changed, 326 insertions(+), 293 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 6b66c473d53..9dd54c21f6f 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -62,8 +62,8 @@ createArgumentSynthesisPass(mlir::ArrayRef<mlir::StringRef> funcNames,
 /// functions and the substitutions text can be built as an unzipped pair of
 /// lists.
 std::unique_ptr<mlir::Pass>
-createArgumentSynthesisPass(const std::vector<std::string>& funcNames,
-                            const std::vector<std::string>& substitutions);
+createArgumentSynthesisPass(const std::vector<std::string> &funcNames,
+                            const std::vector<std::string> &substitutions);
 
 // declarative passes
 #define GEN_PASS_DECL
diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
index 377164865d3..932c091cb73 100644
--- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
+++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
@@ -164,12 +164,11 @@ cudaq::opt::createArgumentSynthesisPass(ArrayRef<StringRef> funcNames,
       ArgumentSynthesisOptions{pairs});
 }
 
-std::unique_ptr<mlir::Pass>
-cudaq::opt::createArgumentSynthesisPass(const std::vector<std::string>& funcNames,
-                                        const std::vector<std::string>& substitutions) {
+std::unique_ptr<mlir::Pass> cudaq::opt::createArgumentSynthesisPass(
+    const std::vector<std::string> &funcNames,
+    const std::vector<std::string> &substitutions) {
   return cudaq::opt::createArgumentSynthesisPass(
-                mlir::SmallVector<mlir::StringRef>{funcNames.begin(),
-                                                   funcNames.end()},
-                mlir::SmallVector<mlir::StringRef>{substitutions.begin(),
-                                                   substitutions.end()});
+      mlir::SmallVector<mlir::StringRef>{funcNames.begin(), funcNames.end()},
+      mlir::SmallVector<mlir::StringRef>{substitutions.begin(),
+                                         substitutions.end()});
 }
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 08639270754..407fc718a04 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -175,15 +175,16 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // Then replace the state with
   //   `quake.get_state "callee.num_qubits_0" "callee.init_state_0"`:
   //
+  // clang-format off
   // ```
-  // func.func @caller(%arg0: !cc.ptr<!cc.state>) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  // func.func @caller(%arg0: !cc.ptr<!cc.state>) {
   //   %1 = quake.get_number_of_qubits %arg0: (!cc.ptr<!cc.state>) -> i64
   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
   //   %3 = quake.init_state %2, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
   //   return
   // }
   //
-  // func.func private @callee(%arg0: i64) attributes {"cudaq-kernel"} {
+  // func.func private @callee(%arg0: i64) {
   //   %cst = arith.constant 1.5707963267948966 : f64
   //   %0 = quake.alloca !quake.veq<?>[%arg0 : i64]
   //   %1 = quake.extract_ref %0[0] : (!quake.veq<2>) -> !quake.ref
@@ -195,11 +196,13 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // state = cudaq.get_state(callee, 2)
   // counts = cudaq.sample(caller, state)
   // ```
+  // clang-format on
   //
   // => after argument synthesis:
   //
+  // clang-format off
   // ```
-  // func.func @caller() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  // func.func @caller() {
   //   %0 = quake.get_state "callee.num_qubits_0" "callee.init_state_0" : !cc.ptr<!cc.state>
   //   %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
@@ -207,41 +210,46 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   //   return
   // }
   //
-  // func.func private @callee.num_qubits_0(%arg0: !quake.veq<?>) -> i64 attributes {"cudaq-kernel"} {
+  // func.func private @callee.num_qubits_0(%arg0: !quake.veq<?>) -> i64 {
   //   %cst = arith.constant 2 : i64
   //   return %cst : i64
   // }
   //
-  // func.func private @callee.init_state_0(%arg0: !quake.veq<?>) attributes {"cudaq-kernel"} {
+  // func.func private @callee.init_state_0(%arg0: !quake.veq<?>) {
   //   %cst = arith.constant 1.5707963267948966 : f64
   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
   //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
   //   return
   // }
   // ```
+  // clang-format on
   //
   // 2. Replace the `quake.get_state` ops with calls to the generated functions
   //    synthesized with the arguments used to create the state:
   //
   // After ReplaceStateWithKernel pass:
   //
-  // func.func @caller() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+  // clang-format off
+  // ```
+  // func.func @caller() {
   //   %1 = call "callee.num_qubits_0" : () -> i64
   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
-  //   call "callee.init_0" %2: (!quake.veq<?>) -> ()
+  //   call "callee.init_state_0" %2: (!quake.veq<?>) -> ()
   // }
   //
-  // func.func private @callee.get_number_of_qubits_0(%arg0: !quake.veq<?>) -> i64 attributes {"cudaq-kernel"} {
+  // func.func private @callee.num_qubits_0(%arg0: !quake.veq<?>) -> i64 {
   //   %cst = arith.constant 2 : i64
   //   return %cst : i64
   // }
   //
-  // func.func private @callee.init_0(%arg0: !quake.veq<?>) attributes {"cudaq-kernel"} {
+  // func.func private @callee.init_state_0(%arg0: !quake.veq<?>) {
   //   %cst = arith.constant 1.5707963267948966 : f64
   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
   //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
   //   return
   // }
+  // ```
+  // clang-format on
   if (simState->getKernelInfo().has_value()) {
     auto [calleeName, calleeArgs] = simState->getKernelInfo().value();
 
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 6ef988752df..cab04096f97 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -417,19 +417,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     moduleOp->setAttrs(m_module->getAttrDictionary());
 
     for (auto &op : m_module.getOps()) {
-      // if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
-      //   // Add function definitions for runtime functions that must
-      //   // be removed after synthesis in cleanup passes.
-      //   static const std::vector<llvm::StringRef> stateFuncs = {
-      //       cudaq::getNumQubitsFromCudaqState,
-      //       cudaq::createCudaqStateFromDataFP32,
-      //       cudaq::createCudaqStateFromDataFP64};
-
-      //   if (funcOp.getBody().empty() &&
-      //       std::find(stateFuncs.begin(), stateFuncs.end(), funcOp.getName()) !=
-      //           stateFuncs.end())
-      //     moduleOp.push_back(funcOp.clone());
-      // }
       // Add any global symbols, including global constant arrays.
       // Global constant arrays can be created during compilation,
       // `lift-array-alloc`, `argument-synthesis`, `quake-synthesizer`,
@@ -468,17 +455,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         auto [kernels, substs] = argCon.collectAllSubstitutions();
         pm.addNestedPass<mlir::func::FuncOp>(
             cudaq::opt::createArgumentSynthesisPass(kernels, substs));
-        // pm.addNestedPass<mlir::func::FuncOp>(
-        //     cudaq::opt::createArgumentSynthesisPass(
-        //         mlir::SmallVector<mlir::StringRef>{kernels.begin(),
-        //                                            kernels.end()},
-        //         mlir::SmallVector<mlir::StringRef>{substs.begin(),
-        //                                            substs.end()}));
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(
             opt::createReplaceStateWithKernel());
         pm.addPass(mlir::createSymbolDCEPass());
-      } else if (updatedArgs) {;
+      } else if (updatedArgs) {
         cudaq::info("Run Quake Synth.\n");
         pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, updatedArgs));
       }
diff --git a/targettests/Remote-Sim/qvector_init_from_state.cpp b/targettests/Remote-Sim/qvector_init_from_state.cpp
index 90246ac2cfd..098a52194e6 100644
--- a/targettests/Remote-Sim/qvector_init_from_state.cpp
+++ b/targettests/Remote-Sim/qvector_init_from_state.cpp
@@ -19,13 +19,8 @@
 #include <string>
 #include <vector>
 
-__qpu__ void test_init_state() {
-  cudaq::qvector q(2);
-  ry(M_PI / 2.0, q[0]);
-}
-
-__qpu__ void test_init_large_state() {
-  cudaq::qvector q(14);
+__qpu__ void test_init_state(int n) {
+  cudaq::qvector q(n);
   ry(M_PI / 2.0, q[0]);
 }
 
@@ -39,18 +34,18 @@ __qpu__ void test_state_param2(cudaq::state *state, cudaq::pauli_word w) {
   cudaq::exp_pauli(1.0, q, w);
 }
 
-__qpu__ void test_state_param3(cudaq::state *initial_state,
+__qpu__ void test_state_param3(cudaq::state *state,
                                std::vector<cudaq::pauli_word> &words) {
-  cudaq::qvector q(initial_state);
+  cudaq::qvector q(state);
   for (std::size_t i = 0; i < words.size(); ++i) {
     cudaq::exp_pauli(1.0, q, words[i]);
   }
 }
 
-__qpu__ void test_state_param4(cudaq::state *initial_state,
+__qpu__ void test_state_param4(cudaq::state *state,
                                std::vector<double> &coefficients,
                                std::vector<cudaq::pauli_word> &words) {
-  cudaq::qvector q(initial_state);
+  cudaq::qvector q(state);
   for (std::size_t i = 0; i < words.size(); ++i) {
     cudaq::exp_pauli(coefficients[i], q, words[i]);
   }
@@ -83,48 +78,64 @@ int main() {
     counts = cudaq::sample(test_state_param, &state1);
     printCounts(counts);
   }
-
   // clang-format off
-// CHECK: Passing state created from data as argument (kernel mode)
-// CHECK: 011
-// CHECK: 111
+  // CHECK: Passing state created from data as argument (kernel mode)
+  // CHECK: 011
+  // CHECK: 111
 
-// CHECK: 000
-// CHECK: 100
+  // CHECK: 000
+  // CHECK: 100
   // clang-format on
 
   {
     std::cout << "Passing state from another kernel as argument (kernel mode)"
               << std::endl;
-    auto state = cudaq::get_state(test_init_state);
+    auto state = cudaq::get_state(test_init_state, 2);
     auto counts = cudaq::sample(test_state_param, &state);
     printCounts(counts);
   }
   // clang-format off
-// CHECK: Passing state from another kernel as argument (kernel mode)
-// CHECK: 01
-// CHECK: 11
+  // CHECK: Passing state from another kernel as argument (kernel mode)
+  // CHECK: 01
+  // CHECK: 11
   // clang-format on
 
   {
     std::cout
         << "Passing large state from another kernel as argument (kernel mode)"
         << std::endl;
-    auto largeState = cudaq::get_state(test_init_large_state);
+    auto largeState = cudaq::get_state(test_init_state, 14);
     auto counts = cudaq::sample(test_state_param, &largeState);
     printCounts(counts);
   }
   // clang-format off
-// CHECK: Passing large state from another kernel as argument (kernel mode)
-// CHECK: 01111111111111
-// CHECK: 11111111111111
+  // CHECK: Passing large state from another kernel as argument (kernel mode)
+  // CHECK: 01111111111111
+  // CHECK: 11111111111111
+  // clang-format on
+
+  {
+    std::cout << "Passing state from another kernel as argument"
+                 " with pauli word arg (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state, 2);
+    auto counts =
+        cudaq::sample(test_state_param2, &state, cudaq::pauli_word{"XX"});
+    printCounts(counts);
+  }
+  // clang-format off
+  // CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
   // clang-format on
 
   {
     std::cout << "Passing state from another kernel as argument iteratively "
                  "(kernel mode)"
               << std::endl;
-    auto state = cudaq::get_state(test_init_state);
+    auto state = cudaq::get_state(test_init_state, 2);
     for (auto i = 0; i < 4; i++) {
       auto counts = cudaq::sample(test_state_param, &state);
       std::cout << "Iteration: " << i << std::endl;
@@ -133,42 +144,26 @@ int main() {
     }
   }
   // clang-format off
-// CHECK: Passing state from another kernel as argument iteratively (kernel mode)
-// CHECK: Iteration: 0
-// CHECK: 01
-// CHECK: 11
-// CHECK: Iteration: 1
-// CHECK: 00
-// CHECK: 10
-// CHECK: Iteration: 2
-// CHECK: 01
-// CHECK: 11
-// CHECK: Iteration: 3
-// CHECK: 00
-// CHECK: 10
-  // clang-format on
-
-  {
-    std::cout << "Passing state from another kernel as argument"
-                 " with pauli word arg (kernel mode)"
-              << std::endl;
-    auto state = cudaq::get_state(test_init_state);
-    auto counts = cudaq::sample(test_state_param2, &state, cudaq::pauli_word{"XX"});
-    printCounts(counts);
-  }
-  // clang-format off
-// CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
+  // CHECK: Passing state from another kernel as argument iteratively (kernel mode)
+  // CHECK: Iteration: 0
+  // CHECK: 01
+  // CHECK: 11
+  // CHECK: Iteration: 1
+  // CHECK: 00
+  // CHECK: 10
+  // CHECK: Iteration: 2
+  // CHECK: 01
+  // CHECK: 11
+  // CHECK: Iteration: 3
+  // CHECK: 00
+  // CHECK: 10
   // clang-format on
 
   {
     std::cout << "Passing state from another kernel as argument iteratively "
                  "with vector args (kernel mode)"
               << std::endl;
-    auto state = cudaq::get_state(test_init_state);
+    auto state = cudaq::get_state(test_init_state, 2);
     auto words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"XX"}};
     for (auto i = 0; i < 4; i++) {
       auto counts = cudaq::sample(test_state_param3, &state, words);
@@ -178,36 +173,35 @@ int main() {
       words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"XY"}};
     }
   }
-  // Passing state from another kernel as argument iteratively with vector args
-  // (kernel mode)
   // clang-format off
-// CHECK: Iteration: 0
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 1
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 2
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 3
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
+  // CHECK: Passing state from another kernel as argument iteratively with vector args (kernel mode)
+  // CHECK: Iteration: 0
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 1
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 2
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 3
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
   // clang-format on
 
   {
     std::cout << "Passing state from another kernel as argument iteratively "
                  "with vector args with 2 elements (kernel mode)"
               << std::endl;
-    auto state = cudaq::get_state(test_init_state);
+    auto state = cudaq::get_state(test_init_state, 2);
     auto words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"XX"},
                                                 cudaq::pauli_word{"II"}};
     auto coeffs = std::vector<double>{1.0, 2.0};
@@ -222,26 +216,26 @@ int main() {
     }
   }
   // clang-format off
-// CHECK: Passing state from another kernel as argument iteratively with vector args with 2 elements (kernel mode)
-// CHECK: Iteration: 0
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 1
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 2
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 3
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
+  // CHECK: Passing state from another kernel as argument iteratively with vector args with 2 elements (kernel mode)
+  // CHECK: Iteration: 0
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 1
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 2
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 3
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
   // clang-format on
 }
diff --git a/targettests/Remote-Sim/qvector_init_from_state_lazy.cpp b/targettests/Remote-Sim/qvector_init_from_state_lazy.cpp
index abd1c6e3474..9fedb6a995b 100644
--- a/targettests/Remote-Sim/qvector_init_from_state_lazy.cpp
+++ b/targettests/Remote-Sim/qvector_init_from_state_lazy.cpp
@@ -16,38 +16,34 @@
 
 #include <cudaq.h>
 #include <iostream>
+#include <string>
+#include <vector>
 
 struct test_init_state {
-  void operator()() __qpu__ {
-    cudaq::qvector q(2);
-    ry(M_PI/2.0, q[0]);
-  }
-};
-
-struct test_init_large_state {
-  void operator()() __qpu__ {
-    cudaq::qvector q(14);
-    ry(M_PI/2.0, q[0]);
+  void operator()(int n) __qpu__ {
+    cudaq::qvector q(n);
+    ry(M_PI / 2.0, q[0]);
   }
 };
 
 struct test_state_param {
-  void operator()(cudaq::state *initial_state) __qpu__ {
-    cudaq::qvector q(initial_state);
+  void operator()(cudaq::state *state) __qpu__ {
+    cudaq::qvector q(state);
     x(q);
   }
 };
 
 struct test_state_param2 {
-  void operator()(cudaq::state *initial_state, cudaq::pauli_word w) __qpu__ {
-    cudaq::qvector q(initial_state);
+  void operator()(cudaq::state *state, cudaq::pauli_word w) __qpu__ {
+    cudaq::qvector q(state);
     cudaq::exp_pauli(1.0, q, w);
   }
 };
 
 struct test_state_param3 {
-  void operator()(cudaq::state *initial_state, std::vector<cudaq::pauli_word>& words) __qpu__ {
-    cudaq::qvector q(initial_state);
+  void operator()(cudaq::state *state,
+                  std::vector<cudaq::pauli_word> &words) __qpu__ {
+    cudaq::qvector q(state);
     for (std::size_t i = 0; i < words.size(); ++i) {
       cudaq::exp_pauli(1.0, q, words[i]);
     }
@@ -55,15 +51,16 @@ struct test_state_param3 {
 };
 
 struct test_state_param4 {
-  void operator()(cudaq::state *initial_state, std::vector<double> &coefficients, std::vector<cudaq::pauli_word>& words) __qpu__ {
-    cudaq::qvector q(initial_state);
+  void operator()(cudaq::state *state, std::vector<double> &coefficients,
+                  std::vector<cudaq::pauli_word> &words) __qpu__ {
+    cudaq::qvector q(state);
     for (std::size_t i = 0; i < words.size(); ++i) {
       cudaq::exp_pauli(coefficients[i], q, words[i]);
     }
   }
 };
 
-void printCounts(cudaq::sample_result& result) {
+void printCounts(cudaq::sample_result &result) {
   std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
     values.push_back(bits);
@@ -77,47 +74,77 @@ void printCounts(cudaq::sample_result& result) {
 
 int main() {
   std::vector<cudaq::complex> vec{M_SQRT1_2, M_SQRT1_2, 0., 0., 0., 0., 0., 0.};
-  std::vector<cudaq::complex> vec1{0., 0.,  0., 0., 0., 0., M_SQRT1_2, M_SQRT1_2};
+  std::vector<cudaq::complex> vec1{0., 0., 0.,        0.,
+                                   0., 0., M_SQRT1_2, M_SQRT1_2};
   auto state = cudaq::state::from_data(vec);
   auto state1 = cudaq::state::from_data(vec1);
   {
-      std::cout << "Passing state created from data as argument (kernel mode)" << std::endl;
-      auto counts = cudaq::sample(test_state_param{}, &state);
-      printCounts(counts);
+    std::cout << "Passing state created from data as argument (kernel mode)"
+              << std::endl;
+    auto counts = cudaq::sample(test_state_param{}, &state);
+    printCounts(counts);
 
-      counts = cudaq::sample(test_state_param{}, &state1);
-      printCounts(counts);
+    counts = cudaq::sample(test_state_param{}, &state1);
+    printCounts(counts);
   }
-// CHECK: Passing state created from data as argument (kernel mode)
-// CHECK: 011
-// CHECK: 111
+  // clang-format off
+  // CHECK: Passing state created from data as argument (kernel mode)
+  // CHECK: 011
+  // CHECK: 111
 
-// CHECK: 000
-// CHECK: 100
+  // CHECK: 000
+  // CHECK: 100
+  // clang-format on
 
   {
-    std::cout << "Passing state from another kernel as argument (kernel mode)" << std::endl;
-    auto state = cudaq::get_state(test_init_state{});
+    std::cout << "Passing state from another kernel as argument (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
     auto counts = cudaq::sample(test_state_param{}, &state);
     printCounts(counts);
   }
-// CHECK: Passing state from another kernel as argument (kernel mode)
-// CHECK: 01
-// CHECK: 11
+  // clang-format off
+  // CHECK: Passing state from another kernel as argument (kernel mode)
+  // CHECK: 01
+  // CHECK: 11
+  // clang-format on
 
   {
-    std::cout << "Passing large state from another kernel as argument (kernel mode)" << std::endl;
-    auto largeState = cudaq::get_state(test_init_large_state{});
+    std::cout
+        << "Passing large state from another kernel as argument (kernel mode)"
+        << std::endl;
+    auto largeState = cudaq::get_state(test_init_state{}, 14);
     auto counts = cudaq::sample(test_state_param{}, &largeState);
     printCounts(counts);
   }
-// CHECK: Passing large state from another kernel as argument (kernel mode)
-// CHECK: 01111111111111
-// CHECK: 11111111111111
+  // clang-format off
+  // CHECK: Passing large state from another kernel as argument (kernel mode)
+  // CHECK: 01111111111111
+  // CHECK: 11111111111111
+  // clang-format on
+
+  {
+    std::cout << "Passing state from another kernel as argument"
+                 " with pauli word arg (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    auto counts =
+        cudaq::sample(test_state_param2{}, &state, cudaq::pauli_word{"XX"});
+    printCounts(counts);
+  }
+  // clang-format off
+  // CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // clang-format on
 
   {
-    std::cout << "Passing state from another kernel as argument iteratively (kernel mode)" << std::endl;
-    auto state = cudaq::get_state(test_init_state{});
+    std::cout << "Passing state from another kernel as argument iteratively "
+                 "(kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
     for (auto i = 0; i < 4; i++) {
       auto counts = cudaq::sample(test_state_param{}, &state);
       std::cout << "Iteration: " << i << std::endl;
@@ -125,23 +152,27 @@ int main() {
       state = cudaq::get_state(test_state_param{}, &state);
     }
   }
-// CHECK: Passing state from another kernel as argument iteratively (kernel mode)
-// CHECK: Iteration: 0
-// CHECK: 01
-// CHECK: 11
-// CHECK: Iteration: 1
-// CHECK: 00
-// CHECK: 10
-// CHECK: Iteration: 2
-// CHECK: 01
-// CHECK: 11
-// CHECK: Iteration: 3
-// CHECK: 00
-// CHECK: 10
+  // clang-format off
+  // CHECK: Passing state from another kernel as argument iteratively (kernel mode)
+  // CHECK: Iteration: 0
+  // CHECK: 01
+  // CHECK: 11
+  // CHECK: Iteration: 1
+  // CHECK: 00
+  // CHECK: 10
+  // CHECK: Iteration: 2
+  // CHECK: 01
+  // CHECK: 11
+  // CHECK: Iteration: 3
+  // CHECK: 00
+  // CHECK: 10
+  // clang-format on
 
   {
-    std::cout << "Passing state from another kernel as argument iteratively with vector args (kernel mode)" << std::endl;
-    auto state = cudaq::get_state(test_init_state{});
+    std::cout << "Passing state from another kernel as argument iteratively "
+                 "with vector args (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
     auto words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"XX"}};
     for (auto i = 0; i < 4; i++) {
       auto counts = cudaq::sample(test_state_param3{}, &state, words);
@@ -151,61 +182,69 @@ int main() {
       words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"XY"}};
     }
   }
-// CHECK: Passing state from another kernel as argument iteratively with vector args (kernel mode)
-// CHECK: Iteration: 0
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 1
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 2
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 3
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
+  // clang-format off
+  // CHECK: Passing state from another kernel as argument iteratively with vector args (kernel mode)
+  // CHECK: Iteration: 0
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 1
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 2
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 3
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // clang-format on
 
   {
-    std::cout << "Passing state from another kernel as argument iteratively with vector args with 2 elements (kernel mode)" << std::endl;
-    auto state = cudaq::get_state(test_init_state{});
-    auto words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"XX"}, cudaq::pauli_word{"II"}};
+    std::cout << "Passing state from another kernel as argument iteratively "
+                 "with vector args with 2 elements (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    auto words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"XX"},
+                                                cudaq::pauli_word{"II"}};
     auto coeffs = std::vector<double>{1.0, 2.0};
     for (auto i = 0; i < 4; i++) {
       auto counts = cudaq::sample(test_state_param4{}, &state, coeffs, words);
       std::cout << "Iteration: " << i << std::endl;
       printCounts(counts);
       state = cudaq::get_state(test_state_param4{}, &state, coeffs, words);
-      words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"II"}, cudaq::pauli_word{"XY"}};
+      words = std::vector<cudaq::pauli_word>{cudaq::pauli_word{"II"},
+                                             cudaq::pauli_word{"XY"}};
       coeffs = std::vector<double>{1.0, 2.0};
     }
   }
-// CHECK: Passing state from another kernel as argument iteratively with vector args with 2 elements (kernel mode)
-// CHECK: Iteration: 0
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 1
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 2
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
-// CHECK: Iteration: 3
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
+  // clang-format off
+  // CHECK: Passing state from another kernel as argument iteratively with vector args with 2 elements (kernel mode)
+  // CHECK: Iteration: 0
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 1
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 2
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // CHECK: Iteration: 3
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // clang-format on
 }
diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index 681e42eee07..62d162e1781 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
  * All rights reserved.                                                        *
  *                                                                             *
  * This source code and the accompanying materials are made available under    *
@@ -27,7 +27,7 @@
 struct test_init_state {
   void operator()(int n) __qpu__ {
     cudaq::qvector q(n);
-    ry(M_PI/2.0, q[0]);
+    ry(M_PI / 2.0, q[0]);
   }
 };
 
@@ -45,6 +45,26 @@ struct test_state_param2 {
   }
 };
 
+struct test_state_param3 {
+  void operator()(cudaq::state *state,
+                  std::vector<cudaq::pauli_word> &words) __qpu__ {
+    cudaq::qvector q(state);
+    for (std::size_t i = 0; i < words.size(); ++i) {
+      cudaq::exp_pauli(1.0, q, words[i]);
+    }
+  }
+};
+
+struct test_state_param4 {
+  void operator()(cudaq::state *state, std::vector<double> &coefficients,
+                  std::vector<cudaq::pauli_word> &words) __qpu__ {
+    cudaq::qvector q(state);
+    for (std::size_t i = 0; i < words.size(); ++i) {
+      cudaq::exp_pauli(coefficients[i], q, words[i]);
+    }
+  }
+};
+
 void printCounts(cudaq::sample_result &result) {
   std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
@@ -72,14 +92,13 @@ int main() {
     counts = cudaq::sample(test_state_param{}, &state1);
     printCounts(counts);
   }
-
   // clang-format off
-// CHECK: Passing state created from data as argument (kernel mode)
-// CHECK: 011
-// CHECK: 111
+  // CHECK: Passing state created from data as argument (kernel mode)
+  // CHECK: 011
+  // CHECK: 111
 
-// CHECK: 000
-// CHECK: 100
+  // CHECK: 000
+  // CHECK: 100
   // clang-format on
 
   {
@@ -90,16 +109,17 @@ int main() {
     printCounts(counts);
   }
   // clang-format off
-// CHECK: Passing state from another kernel as argument (kernel mode)
-// CHECK: 01
-// CHECK: 11
+  // CHECK: Passing state from another kernel as argument (kernel mode)
+  // CHECK: 01
+  // CHECK: 11
   // clang-format on
 
   {
     std::cout
         << "Passing large state from another kernel as argument (kernel mode)"
         << std::endl;
-    // TODO: State larger than 5 qubits fails on iqm machines with Adonis architecture
+    // TODO: State larger than 5 qubits fails on iqm machines with Adonis
+    // architecture
     // TODO: State larger than 8 qubits fails on oqc and anyon
     // Up to 14 bits works with quantinuum an ionq
     auto largeState = cudaq::get_state(test_init_state{}, 5);
@@ -107,9 +127,9 @@ int main() {
     printCounts(counts);
   }
   // clang-format off
-// CHECK: Passing large state from another kernel as argument (kernel mode)
-// CHECK: 01111
-// CHECK: 11111
+  // CHECK: Passing large state from another kernel as argument (kernel mode)
+  // CHECK: 01111
+  // CHECK: 11111
   // clang-format on
 
   {
@@ -117,15 +137,16 @@ int main() {
                  " with pauli word arg (kernel mode)"
               << std::endl;
     auto state = cudaq::get_state(test_init_state{}, 2);
-    auto counts = cudaq::sample(test_state_param2{}, &state, cudaq::pauli_word{"XX"});
+    auto counts =
+        cudaq::sample(test_state_param2{}, &state, cudaq::pauli_word{"XX"});
     printCounts(counts);
   }
   // clang-format off
-// CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
-// CHECK: 00
-// CHECK: 01
-// CHECK: 10
-// CHECK: 11
+  // CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
   // clang-format on
 
   {
@@ -141,18 +162,20 @@ int main() {
     }
   }
   // clang-format off
-// CHECK: Passing state from another kernel as argument iteratively (kernel mode)
-// CHECK: Iteration: 0
-// CHECK: 01
-// CHECK: 11
-// CHECK: Iteration: 1
-// CHECK: 00
-// CHECK: 10
-// CHECK: Iteration: 2
-// CHECK: 01
-// CHECK: 11
-// CHECK: Iteration: 3
-// CHECK: 00
-// CHECK: 10
+  // CHECK: Passing state from another kernel as argument iteratively (kernel mode)
+  // CHECK: Iteration: 0
+  // CHECK: 01
+  // CHECK: 11
+  // CHECK: Iteration: 1
+  // CHECK: 00
+  // CHECK: 10
+  // CHECK: Iteration: 2
+  // CHECK: 01
+  // CHECK: 11
+  // CHECK: Iteration: 3
+  // CHECK: 00
+  // CHECK: 10
   // clang-format on
+
+  // TODO: add tests for vectors of pauli words after we can lift the arrays of pauli words.
 }
diff --git a/test/Quake/arg_subst-5.txt b/test/Quake/arg_subst-5.txt
index c5e727bb79e..2038ad31ccd 100644
--- a/test/Quake/arg_subst-5.txt
+++ b/test/Quake/arg_subst-5.txt
@@ -7,9 +7,5 @@
 // ========================================================================== //
 
 cc.arg_subst[0] {
-  %0 = cc.string_literal "init" : !cc.ptr<!cc.array<i8 x 46>>
-  %1 = cc.cast %0 : (!cc.ptr<!cc.array<i8 x 46>>) -> !cc.ptr<i8>
-  %2 = func.call @__nvqpp_cudaq_state_get(%1) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-  %3 = cc.cast %2 : (!cc.ptr<!cc.state>) -> !cc.ptr<!cc.state>
+  %0 = quake.get_state "init" : !cc.ptr<!cc.state>
 }
-func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index 97bb3ff3a4d..768216567d7 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -148,16 +148,13 @@ func.func @testy4(%arg0: !cc.stdvec<!cc.struct<{i32, f64, i8, i16}>>) {
 // CHECK:         }
 
 func.func @testy5(%arg0: !cc.ptr<!cc.state>) {
-  %3 = call @__nvqpp_cudaq_state_numberOfQubits(%arg0) : (!cc.ptr<!cc.state>) -> i64
-  %4 = quake.alloca !quake.veq<?>[%3 : i64]
-  %5 = quake.init_state %4, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+  %0 = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
+  %1 = quake.alloca !quake.veq<?>[%0 : i64]
+  %5 = quake.init_state %1, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
   return
 }
 
-func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
-func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-
-func.func private @init(%arg0: i32) -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+func.func private @init(%arg0: i32) -> !quake.veq<?> {
   %cst = arith.constant 1.5707963267948966 : f64
   %0 = cc.cast signed %arg0 : (i32) -> i64
   %1 = quake.alloca !quake.veq<?>[%0 : i64]
@@ -166,17 +163,13 @@ func.func private @init(%arg0: i32) -> !quake.veq<?> attributes {"cudaq-entrypoi
 }
 
 // CHECK-LABEL:   func.func @testy5() {
-// CHECK:           %[[VAL_0:.*]] = cc.string_literal "init" : !cc.ptr<!cc.array<i8 x 46>>
-// CHECK:           %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<i8 x 46>>) -> !cc.ptr<i8>
-// CHECK:           %[[VAL_2:.*]] = call @__nvqpp_cudaq_state_get(%[[VAL_1]]) : (!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-// CHECK:           %[[VAL_3:.*]] = call @__nvqpp_cudaq_state_numberOfQubits(%[[VAL_2]]) : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_2:.*]] = quake.get_state "init" : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_2]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_2]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 // CHECK:           return
 // CHECK:         }
-// CHECK:         func.func private @__nvqpp_cudaq_state_numberOfQubits(!cc.ptr<!cc.state>) -> i64
-// CHECK:         func.func private @__nvqpp_cudaq_state_get(!cc.ptr<i8>) -> !cc.ptr<!cc.state>
-// CHECK:         func.func private @init() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK:         func.func private @init() -> !quake.veq<?> {
 // CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<2>
 // CHECK:           %[[VAL_8:.*]] = quake.relax_size %[[VAL_7:.*]] : (!quake.veq<2>) -> !quake.veq<?>
 // CHECK:           return %[[VAL_8]] : !quake.veq<?>

From 95633714a23ad2823369a86b7455537239da5b02 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 21 Jan 2025 14:52:19 -0800
Subject: [PATCH 26/54] Make intermediate IR legal by separating allocs

---
 .../cudaq/Optimizer/Dialect/Quake/QuakeOps.td |  22 +-
 include/cudaq/Optimizer/Transforms/Passes.td  |  10 +-
 .../Transforms/ReplaceStateWithKernel.cpp     |  79 +++--
 runtime/common/ArgumentConversion.cpp         | 312 ++++++++++++++----
 runtime/cudaq/qis/quantum_state.h             |   4 +-
 runtime/test/CMakeLists.txt                   |   1 +
 runtime/test/FakeQuantumState.h               | 159 +++++++++
 runtime/test/FakeSimulationState.h            |  20 +-
 runtime/test/test_argument_conversion.cpp     | 238 ++++++++++++-
 .../execution/qvector_init_from_state.cpp     |   1 +
 test/Quake/arg_subst-5.txt                    |   2 +-
 test/Quake/arg_subst-6.txt                    |   2 +-
 test/Quake/arg_subst_func.qke                 |  26 +-
 test/Quake/replace_state_with_kernel.qke      |  56 +++-
 14 files changed, 774 insertions(+), 158 deletions(-)
 create mode 100644 runtime/test/FakeQuantumState.h

diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index 9a2af0ee622..e5bb1222088 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -1467,19 +1467,29 @@ def QuakeOp_GetStateOp : QuakeOp<"get_state", [Pure] > {
   let summary = "Get state from kernel with the provided name.";
   let description = [{
     This operation is created by argument synthesis of state pointer arguments
-    for quantum devices. It takes a kernel name as ASCIIZ string literal value
-    and returns the kernel's quantum state. The operation is replaced by a call
-    to the kernel with the provided name in ReplaceStateByKernel pass.
+    for quantum devices.
+
+    It takes two kernel names as ASCIIZ string literals:
+      - "num_qubits" for determining the size of the allocation to initialize
+      - "init" for initializing the state the same way as the original kernel
+        passed to `cudaq::get_state`
+
+    It returns the quantum state of the original kernel passed to
+    `cudaq::get_state`. The operation is replaced by calls to the kernels with
+    the provided names in the `ReplaceStateWithKernel` pass.
 
     ```mlir
-      %0 = quake.get_state "callee" : !cc.ptr<!cc.state>
+      %0 = quake.get_state "num_qubits" "init" : !cc.ptr<!cc.state>
     ```
   }];
 
-  let arguments = (ins StrAttr:$calleeName);
+  let arguments = (ins
+    StrAttr:$numQubitsFuncName,
+    StrAttr:$initFuncName
+  );
   let results = (outs cc_PointerType:$result);
   let assemblyFormat = [{
-     $calleeName `:` qualified(type(results)) attr-dict
+     $numQubitsFuncName $initFuncName `:` qualified(type(results)) attr-dict
   }];
 }
 
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 00aecc8a718..71bbd8dd4d5 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -841,8 +841,8 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
 
     Before ReplaceStateWithKernel (replace-state-with-kernel):
     ```
-    func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-      %0 = quake.get_state "callee.modified_0" : !cc.ptr<!cc.state>
+    func.func @foo() {
+      %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
       %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
       %2 = quake.alloca !quake.veq<?>[%1 : i64]
       %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
@@ -852,8 +852,10 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
 
     After ReplaceStateWithKernel (replace-state-with-kernel):
     ```
-    func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-      %3 = call @__nvqpp__mlirgen__test_init_state.modified_0() : () -> !quake.veq<?>
+    func.func @foo() {
+      %1 = call @callee.num_qubits_0() : () -> i64
+      %2 = quake.alloca !quake.veq<?>[%1 : i64]
+      %3 = call @callee.init_0(%2) : (!quake.veq<?>) -> !quake.veq<?>
       return
     }
     ```
diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index e232ae0983a..d102d156da2 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -29,60 +29,77 @@ namespace cudaq::opt {
 using namespace mlir;
 
 namespace {
+// clang-format off
+/// Replace `quake.get_number_of_qubits` by a call to a function
+/// that computes the number of qubits for a state.
+///
+/// ```
+///  %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
+///  %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
+/// ───────────────────────────────────────────
+/// ...
+///  %1 = call @callee.num_qubits_0() : () -> i64
+/// ```
+// clang-format on
+class ReplaceGetNumQubitsPattern
+    : public OpRewritePattern<quake::GetNumberOfQubitsOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::GetNumberOfQubitsOp numQubits,
+                                PatternRewriter &rewriter) const override {
+
+    auto stateOp = numQubits.getOperand();
+    if (auto getState = stateOp.getDefiningOp<quake::GetStateOp>()) {
+      auto numQubitsName = getState.getNumQubitsFuncName();
+
+      rewriter.setInsertionPoint(numQubits);
+      rewriter.replaceOpWithNewOp<func::CallOp>(
+          numQubits, numQubits.getType(), numQubitsName, mlir::ValueRange{});
+      return success();
+    }
+    return numQubits->emitError(
+        "ReplaceStateWithKernel: failed to replace `quake.get_num_qubits`");
+  }
+};
+
 // clang-format off
 /// Replace `quake.init_state` by a call to a (modified) kernel that produced
 /// the state.
 ///
 /// ```
-///  %0 = quake.get_state "callee.modified_0" : !cc.ptr<!cc.state>
-///  %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
-///  %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///  %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
 ///  %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
 /// ...
-///  %5 = call @callee.modified_0() : () -> !quake.veq<?>
+///  %3 = call @callee.init_0(%2) : (!quake.veq<?>) -> !quake.veq<?>
 /// ```
 // clang-format on
-class ReplaceStateWithKernelPattern
+class ReplaceInitStatePattern
     : public OpRewritePattern<quake::InitializeStateOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
                                 PatternRewriter &rewriter) const override {
-    auto *alloca = initState.getOperand(0).getDefiningOp();
+    auto allocaOp = initState.getOperand(0);
     auto stateOp = initState.getOperand(1);
 
     if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
       if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-        auto *numOfQubits = alloca->getOperand(0).getDefiningOp();
-
         if (auto getState = stateOp.getDefiningOp<quake::GetStateOp>()) {
-          auto calleeName = getState.getCalleeName();
+          auto initName = getState.getInitFuncName();
+
+          rewriter.setInsertionPoint(initState);
           rewriter.replaceOpWithNewOp<func::CallOp>(
-              initState, initState.getType(), calleeName, mlir::ValueRange{});
-
-          if (alloca->getUses().empty())
-            rewriter.eraseOp(alloca);
-          else {
-            alloca->emitError(
-                "Failed to remove `quake.alloca` in state synthesis");
-            return failure();
-          }
-
-          if (isa<quake::GetNumberOfQubitsOp>(numOfQubits)) {
-            if (numOfQubits->getUses().empty())
-              rewriter.eraseOp(numOfQubits);
-            else {
-              numOfQubits->emitError("Failed to remove runtime call to get "
-                                     "number of qubits in state synthesis");
-              return failure();
-            }
-          }
+              initState, initState.getType(), initName,
+              mlir::ValueRange{allocaOp});
+
           return success();
         }
-        numOfQubits->emitError(
-            "Failed to replace `quake.init_state` in state synthesis");
+
+        return initState->emitError(
+            "ReplaceStateWithKernel: failed to replace `quake.init_state`");
       }
     }
     return failure();
@@ -99,7 +116,7 @@ class ReplaceStateWithKernelPass
     auto *ctx = &getContext();
     auto func = getOperation();
     RewritePatternSet patterns(ctx);
-    patterns.insert<ReplaceStateWithKernelPattern>(ctx);
+    patterns.insert<ReplaceGetNumQubitsPattern, ReplaceInitStatePattern>(ctx);
 
     LLVM_DEBUG(llvm::dbgs()
                << "Before replace state with kernel: " << func << '\n');
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 407fc718a04..ebc8c52ae18 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -99,6 +99,225 @@ static Value genConstant(OpBuilder &, cudaq::cc::StructType, void *,
 static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
                          ModuleOp substMod, llvm::DataLayout &);
 
+/// Create callee.init_N, which initializes a veq passed as an extra argument.
+/// Callee (the kernel passed to get_state):
+/// func.func @__nvqpp__mlirgen__callee(%arg0: i64) {
+///   %0 = cc.alloca i64
+///   cc.store %arg0, %0 : !cc.ptr<i64>
+///   %1 = cc.load %0 : !cc.ptr<i64>
+///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
+///   quake.x %3 : (!quake.ref) -> ()
+///   return
+/// }
+/// callee.init_N (simplified):
+/// func.func private @callee.init_0(%arg0: i64, %arg1: !quake.veq<?>) ->
+/// !quake.veq<?> {
+///   %1 = quake.extract_ref %arg1[1] : (!quake.veq<?>) -> !quake.ref
+///   quake.x %1 : (!quake.ref) -> ()
+///   return %arg1 : !quake.veq<?>
+/// }
+static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
+                           func::FuncOp calleeFunc,
+                           std::string &initKernelName) {
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToEnd(sourceMod.getBody());
+
+  auto ctx = builder.getContext();
+  auto loc = builder.getUnknownLoc();
+
+  auto initFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+
+  auto argTypes = calleeFunc.getArgumentTypes();
+  auto retTy = quake::VeqType::getUnsized(ctx);
+  auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retTy});
+
+  initFunc.setName(initKernelName);
+  initFunc.setType(funcTy);
+  initFunc.setPrivate();
+
+  OpBuilder newBuilder(ctx);
+
+  auto *entryBlock = &initFunc.getRegion().front();
+  newBuilder.setInsertionPointToStart(entryBlock);
+  auto intType = newBuilder.getI64Type();
+  Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, intType);
+  Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, intType);
+  Value begin = zero;
+
+  auto argPos = initFunc.getArguments().size();
+
+  // Detect errors in kernel passed to get_state.
+  std::function<void(Block &)> processInner = [&](Block &block) {
+    for (auto &op : block) {
+      for (auto &region : op.getRegions()) {
+        for (auto &b : region)
+          processInner(b);
+      }
+      // Don't allow returns in inner scopes
+      if (auto retOp = dyn_cast<func::ReturnOp>(&op))
+        calleeFunc.emitError("Encountered return in inner scope in a kernel "
+                             "passed to get_state");
+    }
+  };
+
+  for (auto &op : calleeFunc.getRegion().front())
+    for (auto &region : op.getRegions())
+      for (auto &b : region)
+        processInner(b);
+
+  // Process outer block to initialize the allocation passed as an argument.
+  std::function<void(Block &)> process = [&](Block &block) {
+    SmallVector<Operation *> cleanUps;
+    Operation *replacedReturn = nullptr;
+
+    Value arg;
+    Value subArg;
+    Value blockBegin = begin;
+    Value blockAllocSize = zero;
+    for (auto &op : block) {
+      if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
+        newBuilder.setInsertionPointAfter(alloc);
+
+        if (!arg) {
+          initFunc.insertArgument(argPos, retTy, {}, loc);
+          arg = initFunc.getArgument(argPos);
+        }
+
+        auto allocSize = alloc.getSize();
+        auto offset = newBuilder.create<arith::SubIOp>(loc, allocSize, one);
+        subArg =
+            newBuilder.create<quake::SubVeqOp>(loc, retTy, arg, begin, offset);
+        alloc.replaceAllUsesWith(subArg);
+        cleanUps.push_back(alloc);
+        begin = newBuilder.create<arith::AddIOp>(loc, begin, allocSize);
+        blockAllocSize =
+            newBuilder.create<arith::AddIOp>(loc, blockAllocSize, allocSize);
+      }
+
+      if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
+        if (retOp != replacedReturn) {
+          // `arg` is only created when an allocation is seen; check it before
+          // it is used as the `quake.subveq` operand below.
+          assert(arg && "No veq allocations found");
+          newBuilder.setInsertionPointAfter(retOp);
+          auto offset =
+              newBuilder.create<arith::SubIOp>(loc, blockAllocSize, one);
+          Value ret = newBuilder.create<quake::SubVeqOp>(loc, retTy, arg,
+                                                         blockBegin, offset);
+          replacedReturn = newBuilder.create<func::ReturnOp>(loc, ret);
+          cleanUps.push_back(retOp);
+        }
+      }
+    }
+
+    for (auto &op : cleanUps) {
+      op->dropAllReferences();
+      op->dropAllUses();
+      op->erase();
+    }
+  };
+
+  // Process the function body
+  process(initFunc.getRegion().front());
+}
+
+/// Create callee.num_qubits_N that computes the number of qubits to initialize.
+/// Callee: func.func @callee(%arg0: i64) {
+///   %0 = cc.alloca i64
+///   cc.store %arg0, %0 : !cc.ptr<i64>
+///   %1 = cc.load %0 : !cc.ptr<i64>
+///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
+///   quake.x %3 : (!quake.ref) -> ()
+///   return
+/// }
+///
+/// callee.num_qubits_0:
+/// func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
+///   %0 = cc.alloca i64
+///   cc.store %arg0, %0 : !cc.ptr<i64>
+///   %1 = cc.load %0 : !cc.ptr<i64>
+///   return %1 : i64
+/// }
+static void createNumQubitsFunc(OpBuilder &builder, ModuleOp sourceMod,
+                                func::FuncOp calleeFunc,
+                                std::string &numQubitsKernelName) {
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToEnd(sourceMod.getBody());
+
+  auto ctx = builder.getContext();
+  auto loc = builder.getUnknownLoc();
+
+  auto numQubitsFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+
+  auto argTypes = calleeFunc.getArgumentTypes();
+  auto retType = builder.getI64Type();
+  auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retType});
+
+  numQubitsFunc.setName(numQubitsKernelName);
+  numQubitsFunc.setType(funcTy);
+  numQubitsFunc.setPrivate();
+
+  OpBuilder newBuilder(ctx);
+
+  auto *entryBlock = &numQubitsFunc.getRegion().front();
+  newBuilder.setInsertionPointToStart(entryBlock);
+  Value size = newBuilder.create<arith::ConstantIntOp>(loc, 0, retType);
+
+  // Process the entry block: accumulate the allocation sizes, return the sum,
+  // and remove everything that the size computation does not need.
+  std::function<void(Block &)> process = [&](Block &block) {
+    SmallVector<Operation *> used;
+    Operation *replacedReturn = nullptr;
+
+    for (auto &op : block) {
+      // Calculate allocation size (existing allocation size plus new one)
+      if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
+        auto allocSize = alloc.getSize();
+        newBuilder.setInsertionPointAfter(alloc);
+        size = newBuilder.create<arith::AddIOp>(loc, size, allocSize);
+      }
+
+      // Return allocation size
+      if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
+        if (retOp != replacedReturn) {
+
+          newBuilder.setInsertionPointAfter(retOp);
+          auto newRet = newBuilder.create<func::ReturnOp>(loc, size);
+          replacedReturn = newRet;
+          used.push_back(newRet);
+        }
+      }
+    }
+
+    // Collect all ops needed for size calculation
+    SmallVector<Operation *> keep;
+    while (!used.empty()) {
+      auto *op = used.pop_back_val();
+      keep.push_back(op);
+      for (auto opnd : op->getOperands())
+        if (auto defOp = opnd.getDefiningOp())
+          used.push_back(defOp);
+    }
+
+    // Remove the rest of the ops
+    SmallVector<Operation *> toErase;
+    for (auto &op : block)
+      if (std::find(keep.begin(), keep.end(), &op) == keep.end())
+        toErase.push_back(&op);
+
+    for (auto &op : toErase) {
+      op->dropAllReferences();
+      op->dropAllUses();
+      op->erase();
+    }
+  };
+
+  // Process the function body
+  process(numQubitsFunc.getRegion().front());
+}
+
 static Value genConstant(OpBuilder &builder, const cudaq::state *v,
                          llvm::DataLayout &layout,
                          cudaq::opt::ArgumentConverter &converter) {
@@ -185,10 +404,9 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // }
   //
   // func.func private @callee(%arg0: i64) {
-  //   %cst = arith.constant 1.5707963267948966 : f64
   //   %0 = quake.alloca !quake.veq<?>[%arg0 : i64]
   //   %1 = quake.extract_ref %0[0] : (!quake.veq<2>) -> !quake.ref
-  //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+  //   quake.x %1 : (!quake.ref) -> ()
   //   return
   // }
   //
@@ -210,15 +428,13 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   //   return
   // }
   //
-  // func.func private @callee.num_qubits_0(%arg0: !quake.veq<?>) -> i64 {
-  //   %cst = arith.constant 2 : i64
-  //   return %cst : i64
+  // func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
+  //   return %arg0 : i64
   // }
   //
-  // func.func private @callee.init_state_0(%arg0: !quake.veq<?>) {
-  //   %cst = arith.constant 1.5707963267948966 : f64
+  // func.func private @callee.init_state_0(%arg0: i64, %arg1: !quake.veq<?>) {
   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
-  //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+  //   quake.x %1 : (!quake.ref) -> ()
   //   return
   // }
   // ```
@@ -232,21 +448,21 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // clang-format off
   // ```
   // func.func @caller() {
-  //   %1 = call "callee.num_qubits_0" : () -> i64
+  //   %1 = call @callee.num_qubits_0() : () -> i64
   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
-  //   call "callee.init_0" %2: (!quake.veq<?>) -> ()
+  //   %3 = call @callee.init_0(%2): (!quake.veq<?>) -> !quake.veq<?>
   // }
   //
-  // func.func private @callee.num_qubits_0(%arg0: !quake.veq<?>) -> i64 {
+  // func.func private @callee.num_qubits_0() -> i64 {
   //   %cst = arith.constant 2 : i64
   //   return %cst : i64
   // }
   //
-  // func.func private @callee.init_0(%arg0: !quake.veq<?>) {
+  // func.func private @callee.init_0(%arg0: !quake.veq<?>) -> !quake.veq<?> {
   //   %cst = arith.constant 1.5707963267948966 : f64
   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
   //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
-  //   return
+  //   return %arg0
   // }
   // ```
   // clang-format on
@@ -264,66 +480,32 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto fromModule = parseSourceString<ModuleOp>(code, ctx);
 
     static unsigned counter = 0;
-    std::string modifiedCalleeName =
-        calleeName + ".modified_" + std::to_string(counter++);
-    std::string modifiedCalleeKernelName =
-        cudaq::runtime::cudaqGenPrefixName + modifiedCalleeName;
+    std::string initName = calleeName + ".init_" + std::to_string(counter);
+    std::string initKernelName = cudaq::runtime::cudaqGenPrefixName + initName;
+
+    std::string numQubitsName =
+        calleeName + ".num_qubits_" + std::to_string(counter++);
+    std::string numQubitsKernelName =
+        cudaq::runtime::cudaqGenPrefixName + numQubitsName;
 
-    // Create callee.modified that returns concat of veq allocations.
     auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
     assert(calleeFunc && "callee is missing");
-    auto argTypes = calleeFunc.getArgumentTypes();
-    auto retType = quake::VeqType::getUnsized(ctx);
-    auto funcTy = FunctionType::get(ctx, argTypes, {retType});
-
-    {
-      OpBuilder::InsertionGuard guard(builder);
-      builder.setInsertionPointToEnd(sourceMod.getBody());
-
-      auto modifiedCalleeFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
-      modifiedCalleeFunc.setName(modifiedCalleeKernelName);
-      modifiedCalleeFunc.setType(funcTy);
-      modifiedCalleeFunc.setPrivate();
-
-      OpBuilder modifiedBuilder(ctx);
-      SmallVector<Value> allocations;
-      SmallVector<Operation *> cleanUps;
-      for (auto &op : modifiedCalleeFunc.getOps()) {
-        if (auto alloc = dyn_cast<quake::AllocaOp>(op)) {
-          allocations.push_back(alloc.getResult());
-          // Replace by the result of quake.init_state if used by it
-          for (auto *user : op.getUsers()) {
-            if (auto init = dyn_cast<quake::InitializeStateOp>(*user)) {
-              allocations.pop_back();
-              allocations.push_back(init.getResult());
-            }
-          }
-        }
-        if (auto retOp = dyn_cast<func::ReturnOp>(op)) {
-          if (retOp.getOperands().size() == 0) {
-            modifiedBuilder.setInsertionPointAfter(retOp);
-            assert(allocations.size() > 0 && "No veq allocations found");
-            Value ret = modifiedBuilder.create<quake::ConcatOp>(
-                loc, quake::VeqType::getUnsized(ctx), allocations);
-            modifiedBuilder.create<func::ReturnOp>(loc, ret);
-            cleanUps.push_back(retOp);
-          }
-        }
-      }
-      for (auto *op : cleanUps) {
-        op->dropAllUses();
-        op->erase();
-      }
-    }
 
-    // Create substitutions for the `callee.modified.N`.
-    converter.genCallee(modifiedCalleeName, calleeArgs);
+    // Create `callee.init_N` and `callee.num_qubits_N` used for
+    // `quake.get_state` replacement later in ReplaceStateWithKernel pass
+    createInitFunc(builder, sourceMod, calleeFunc, initKernelName);
+    createNumQubitsFunc(builder, sourceMod, calleeFunc, numQubitsKernelName);
+
+    // Create substitutions for the `callee.init_N` and `callee.num_qubits_N`.
+    converter.genCallee(initName, calleeArgs);
+    converter.genCallee(numQubitsName, calleeArgs);
 
-    // Create a subst for state pointer.
+    // Create a substitution for the state pointer.
     auto statePtrTy =
         cudaq::cc::PointerType::get(cudaq::cc::StateType::get(ctx));
     return builder.create<quake::GetStateOp>(
-        loc, statePtrTy, builder.getStringAttr(modifiedCalleeKernelName));
+        loc, statePtrTy, builder.getStringAttr(numQubitsKernelName),
+        builder.getStringAttr(initKernelName));
   }
 
   TODO("cudaq::state* argument synthesis for quantum hardware for c functions");
diff --git a/runtime/cudaq/qis/quantum_state.h b/runtime/cudaq/qis/quantum_state.h
index 63117eb4629..c9b1b30029b 100644
--- a/runtime/cudaq/qis/quantum_state.h
+++ b/runtime/cudaq/qis/quantum_state.h
@@ -1,5 +1,5 @@
 /****************************************************************-*- C++ -*-****
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
  * All rights reserved.                                                        *
  *                                                                             *
  * This source code and the accompanying materials are made available under    *
@@ -76,6 +76,8 @@ class QuantumState : public cudaq::SimulationState {
     (addArgument(args), ...);
   }
   QuantumState() = default;
+  QuantumState(const QuantumState &other)
+      : kernelName(other.kernelName), args(other.args), deleters() {}
   virtual ~QuantumState();
 
   /// @brief True if the state has amplitudes or density matrix available.
diff --git a/runtime/test/CMakeLists.txt b/runtime/test/CMakeLists.txt
index 753cecb9616..0c93df80f5e 100644
--- a/runtime/test/CMakeLists.txt
+++ b/runtime/test/CMakeLists.txt
@@ -24,6 +24,7 @@ link_directories(${CMAKE_BINARY_DIR}/lib)
 target_link_libraries(${TEST_NAME}
   PUBLIC
     cudaq-mlir-runtime
+    cudaq
 )
 
 set_property(TARGET ${TEST_NAME} PROPERTY FOLDER test)
diff --git a/runtime/test/FakeQuantumState.h b/runtime/test/FakeQuantumState.h
new file mode 100644
index 00000000000..14acec132a6
--- /dev/null
+++ b/runtime/test/FakeQuantumState.h
@@ -0,0 +1,159 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "cudaq/qis/state.h"
+#include <cassert>
+#include <memory>
+
+#include <iostream>
+
+/// @cond DO_NOT_DOCUMENT
+/// @brief Fake simulation state to use in tests.
+class FakeQuantumState : public cudaq::SimulationState {
+private:
+  std::string kernelName;
+  std::vector<void *> args;
+  //std::vector<std::function<void(void *)>> deleters;
+
+public:
+  virtual std::unique_ptr<SimulationState>
+  createFromSizeAndPtr(std::size_t size, void *data,
+                       std::size_t dataType) override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  FakeQuantumState() = default;
+  // FakeQuantumState(const std::string& kernelName, int arg) : kernelName(kernelName) {
+  //   std::cout << "ARG: " << arg << std::endl; 
+  //   addArgument<int>(arg);
+  // }
+
+  FakeQuantumState(const std::string& kernelName, const std::vector<void*> args) : kernelName(kernelName), args(args) {
+    //std::cout << "ARG: " << arg << std::endl; 
+    //addArgument<int>(arg);
+  }
+
+  FakeQuantumState(const FakeQuantumState& other): kernelName(other.kernelName), args(other.args) {}
+
+  // template <typename T>
+  // void addArgument(const T &arg) {
+  //   if constexpr (std::is_pointer_v<std::decay_t<T>>) {
+  //     if constexpr (std::is_copy_constructible_v<
+  //                       std::remove_pointer_t<std::decay_t<T>>>) {
+  //       auto ptr = new std::remove_pointer_t<std::decay_t<T>>(*arg);
+  //       args.push_back(ptr);
+  //       deleters.push_back([](void *ptr) {
+  //         delete static_cast<std::remove_pointer_t<std::decay_t<T>> *>(ptr);
+  //       });
+  //     } else {
+  //       throw std::invalid_argument(
+  //           "Unsupported argument type: only pointers to copy-constructible "
+  //           "types and copy-constructible types are supported.");
+  //     }
+  //   } else if constexpr (std::is_copy_constructible_v<std::decay_t<T>>) {
+  //     auto *ptr = new std::decay_t<T>(arg);
+  //     args.push_back(ptr);
+  //     deleters.push_back(
+  //         [](void *ptr) { delete static_cast<std::decay_t<T> *>(ptr); });
+  //   } else {
+  //     throw std::invalid_argument(
+  //         "Unsupported argument type: only pointers to copy-constructible "
+  //         "types and copy-constructible types are supported.");
+  //   }
+  // }
+
+  virtual std::unique_ptr<cudaq::SimulationState>
+  createFromData(const cudaq::state_data &data) override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual bool hasData() const override { return false; }
+
+  virtual std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const override {
+    return std::make_pair(kernelName, args);
+  }
+
+  virtual Tensor getTensor(std::size_t tensorIdx = 0) const override {
+    throw std::runtime_error("Not implemented");
+    //return Tensor();
+  }
+
+  virtual std::vector<Tensor> getTensors() const override {
+    throw std::runtime_error("Not implemented");
+    //return std::vector<Tensor>();
+  }
+
+  virtual std::size_t getNumTensors() const override { return 1; }
+
+  virtual std::size_t getNumQubits() const override {
+    throw std::runtime_error("Not implemented");
+    //return 0;
+  }
+
+  virtual std::complex<double> overlap(const SimulationState &other) override {
+    throw std::runtime_error("Not implemented");
+    //return 0;
+  }
+
+  virtual std::complex<double>
+  getAmplitude(const std::vector<int> &basisState) override {
+    throw std::runtime_error("Not implemented");
+    //return 0;
+  }
+
+  virtual std::vector<std::complex<double>>
+  getAmplitudes(const std::vector<std::vector<int>> &basisStates) override {
+    throw std::runtime_error("Not implemented");
+    //return {0};
+  }
+
+  virtual void dump(std::ostream &os) const override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual precision getPrecision() const override {
+    return cudaq::SimulationState::precision::fp64;
+  }
+
+  virtual void destroyState() override {
+  }
+
+  virtual std::complex<double>
+  operator()(std::size_t tensorIdx,
+             const std::vector<std::size_t> &indices) override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual std::size_t getNumElements() const override { 
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual bool isDeviceData() const override { return false; }
+
+  virtual bool isArrayLike() const override { return true; }
+
+  virtual void toHost(std::complex<double> *clientAllocatedData,
+                      std::size_t numElements) const override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual void toHost(std::complex<float> *clientAllocatedData,
+                      std::size_t numElements) const override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual ~FakeQuantumState() override {
+    // for (std::size_t counter = 0; auto &ptr : args)
+    //   deleters[counter++](ptr);
+
+    // args.clear();
+    // deleters.clear();
+  }
+};
+/// @endcond
diff --git a/runtime/test/FakeSimulationState.h b/runtime/test/FakeSimulationState.h
index 4dcec050bec..74a0c0c66ed 100644
--- a/runtime/test/FakeSimulationState.h
+++ b/runtime/test/FakeSimulationState.h
@@ -21,7 +21,7 @@ class FakeSimulationState : public cudaq::SimulationState {
   virtual std::unique_ptr<SimulationState>
   createFromSizeAndPtr(std::size_t size, void *data,
                        std::size_t dataType) override {
-    std::runtime_error("Not implemented");
+    throw std::runtime_error("Not implemented");
     return std::make_unique<FakeSimulationState>(size, data);
   }
 
@@ -30,17 +30,17 @@ class FakeSimulationState : public cudaq::SimulationState {
 
   virtual std::unique_ptr<cudaq::SimulationState>
   createFromData(const cudaq::state_data &data) override {
-    std::runtime_error("Not implemented");
+    throw std::runtime_error("Not implemented");
     return std::make_unique<FakeSimulationState>(0, nullptr);
   }
 
   virtual Tensor getTensor(std::size_t tensorIdx = 0) const override {
-    std::runtime_error("Not implemented");
+    throw std::runtime_error("Not implemented");
     return Tensor();
   }
 
   virtual std::vector<Tensor> getTensors() const override {
-    std::runtime_error("Not implemented");
+    throw std::runtime_error("Not implemented");
     return std::vector<Tensor>();
   }
 
@@ -51,33 +51,31 @@ class FakeSimulationState : public cudaq::SimulationState {
   }
 
   virtual std::complex<double> overlap(const SimulationState &other) override {
-    std::runtime_error("Not implemented");
+    throw std::runtime_error("Not implemented");
     return 0;
   }
 
   virtual std::complex<double>
   getAmplitude(const std::vector<int> &basisState) override {
-    std::runtime_error("Not implemented");
+    throw std::runtime_error("Not implemented");
     return 0;
   }
 
   virtual std::vector<std::complex<double>>
   getAmplitudes(const std::vector<std::vector<int>> &basisStates) override {
-    std::runtime_error("Not implemented");
+    throw std::runtime_error("Not implemented");
     return {0};
   }
 
   virtual void dump(std::ostream &os) const override {
-    std::runtime_error("Not implemented");
+    throw std::runtime_error("Not implemented");
   }
 
   virtual precision getPrecision() const override {
     return cudaq::SimulationState::precision::fp64;
   }
 
-  virtual void destroyState() override {
-    std::runtime_error("Not implemented");
-  }
+  virtual void destroyState() override {}
 
   virtual std::complex<double>
   operator()(std::size_t tensorIdx,
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 9f936cdcec8..90ac7b763d3 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -11,6 +11,7 @@
 
 // RUN: test_argument_conversion | FileCheck %s
 
+#include "FakeQuantumState.h"
 #include "FakeSimulationState.h"
 #include "common/ArgumentConversion.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
@@ -20,9 +21,28 @@
 #include "mlir/Parser/Parser.h"
 #include <numeric>
 
+extern "C" void __cudaq_deviceCodeHolderAdd(const char *, const char *);
+
+void dumpSubstitutionModules(cudaq::opt::ArgumentConverter &ab) {
+  std::function<void(cudaq::opt::ArgumentConverter &)> dump =
+      [&dump](cudaq::opt::ArgumentConverter &con) {
+        // Dump the conversions
+        llvm::outs() << "========================================\n"
+                        "Substitution module:\n"
+                     << con.getKernelName() << "\n"
+                     << con.getSubstitutionModule() << '\n';
+
+        for (auto &calleeCon : con.getCalleeConverters())
+          dump(calleeCon);
+      };
+
+  dump(ab);
+}
+
 void doSimpleTest(mlir::MLIRContext *ctx, const std::string &typeName,
-                  std::vector<void *> args) {
-  std::string code = R"#(
+                  std::vector<void *> args,
+                  const std::string &additionalCode = "") {
+  std::string code = additionalCode + R"#(
 func.func private @callee(%0: )#" +
                      typeName + R"#()
 func.func @__nvqpp__mlirgen__testy(%0: )#" +
@@ -37,10 +57,10 @@ func.func @__nvqpp__mlirgen__testy(%0: )#" +
   cudaq::opt::ArgumentConverter ab{"testy", *mod};
   // Create the argument conversions
   ab.gen(args);
-  // Dump the conversions
-  llvm::outs() << "========================================\n"
-                  "Substitution module:\n"
-               << ab.getSubstitutionModule() << '\n';
+  // Dump the modified source module
+  llvm::outs() << "Source module (after):\n" << *mod << '\n';
+  // Dump all conversions
+  dumpSubstitutionModules(ab);
 }
 
 void doTest(mlir::MLIRContext *ctx, std::vector<std::string> &typeNames,
@@ -80,14 +100,12 @@ void doTest(mlir::MLIRContext *ctx, std::vector<std::string> &typeNames,
   auto mod = mlir::parseSourceString<mlir::ModuleOp>(code, ctx);
   llvm::outs() << "Source module:\n" << *mod << '\n';
   cudaq::opt::ArgumentConverter ab{"testy", *mod};
-
   // Create the argument conversions
   ab.gen_drop_front(args, startingArgIdx);
-
-  // Dump the conversions
-  llvm::outs() << "========================================\n"
-                  "Substitution module:\n"
-               << ab.getSubstitutionModule() << '\n';
+  // Dump the modified source module
+  llvm::outs() << "Source module (after):\n" << *mod << '\n';
+  // Dump all conversions
+  dumpSubstitutionModules(ab);
 }
 
 void test_scalars(mlir::MLIRContext *ctx) {
@@ -361,7 +379,7 @@ void test_recursive(mlir::MLIRContext *ctx) {
   // clang-format on
 }
 
-void test_state(mlir::MLIRContext *ctx) {
+void test_simulation_state(mlir::MLIRContext *ctx) {
   {
     std::vector<std::complex<double>> data{M_SQRT1_2, M_SQRT1_2, 0., 0.,
                                            0.,        0.,        0., 0.};
@@ -384,6 +402,197 @@ void test_state(mlir::MLIRContext *ctx) {
   // clang-format on
 }
 
+void test_quantum_state(mlir::MLIRContext *ctx) {
+  {
+    auto kernel = "init";
+    auto kernelCode =
+        ""
+        "func.func private @__nvqpp__mlirgen__init(%arg0: i64) {\n"
+        "  %0 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
+        "  %1 = quake.extract_ref %0[0] : (!quake.veq<?>) -> !quake.ref\n"
+        "  quake.x %1 : (!quake.ref) -> ()\n"
+        "  return\n"
+        "}\n";
+    __cudaq_deviceCodeHolderAdd(kernel, kernelCode);
+
+    std::int64_t n = 2;
+    std::vector<void *> a = {static_cast<void *>(&n)};
+    auto x = cudaq::state(new FakeQuantumState(kernel, a));
+    std::vector<void *> v = {static_cast<void *>(&x)};
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, kernelCode);
+  }
+
+  // clang-format off
+// CHECK:       Source module:
+// CHECK:         func.func private @__nvqpp__mlirgen__init(%arg0: i64) {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
+// CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
+// CHECK:           return
+// CHECK:         }
+// CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
+
+// CHECK:       Source module (after):
+// CHECK:         func.func private @__nvqpp__mlirgen__init(%arg0: i64) {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
+// CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
+// CHECK:           return
+// CHECK:         }
+// CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
+// CHECK:         func.func private @__nvqpp__mlirgen__init.init_0(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = arith.subi %arg0, %[[VAL_1]] : i64
+// CHECK:           %[[VAL_3:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_2]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           %[[VAL_4:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           %[[VAL_5:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           %[[VAL_6:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:           quake.x %[[VAL_6]] : (!quake.ref) -> ()
+// CHECK:           %[[VAL_7:.*]] = arith.subi %[[VAL_5]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_8:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_7]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           return %[[VAL_8]] : !quake.veq<?>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init.num_qubits_0(%arg0: i64) -> i64 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           return %[[VAL_1]] : i64
+// CHECK:         }
+
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         testy
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = quake.get_state "__nvqpp__mlirgen__init.num_qubits_0" "__nvqpp__mlirgen__init.init_0" : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init.init_0
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init.num_qubits_0
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
+// CHECK:         }
+  // clang-format on
+
+  {
+    auto kernel = "init";
+    auto kernelCode =
+        ""
+        " func.func private @__nvqpp__mlirgen__init(%arg0: i64) {\n"
+        "   %2 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
+        "   %3 = quake.extract_ref %2[0] : (!quake.veq<?>) -> !quake.ref\n"
+        "   quake.x %3 : (!quake.ref) -> ()\n"
+        "   %measOut = quake.mz %3 name \"\" : (!quake.ref) -> !quake.measure\n"
+        "   %4 = quake.discriminate %measOut : (!quake.measure) -> i1\n"
+        "   cc.if(%4) {\n"
+        "    %6 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
+        "    %7 = quake.extract_ref %6[0] : (!quake.veq<?>) -> !quake.ref\n"
+        "    quake.x %7 : (!quake.ref) -> ()\n"
+        "    %8 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref\n"
+        "    quake.y %8 : (!quake.ref) -> ()\n"
+        "   }\n"
+        "   return\n"
+        "}\n";
+
+    __cudaq_deviceCodeHolderAdd(kernel, kernelCode);
+
+    std::int64_t n = 2;
+    std::vector<void *> a = {static_cast<void *>(&n)};
+    auto x = cudaq::state(new FakeQuantumState(kernel, a));
+    std::vector<void *> v = {static_cast<void *>(&x)};
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, kernelCode);
+  }
+
+  // clang-format off
+// CHECK:       Source module:
+// CHECK:         func.func private @__nvqpp__mlirgen__init(%arg0: i64) {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
+// CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
+// CHECK:           %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "" : (!quake.ref) -> !quake.measure
+// CHECK:           %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measure) -> i1
+// CHECK:           cc.if(%[[VAL_3]]) {
+// CHECK:             %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
+// CHECK:             %[[VAL_5:.*]] = quake.extract_ref %[[VAL_4]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:             quake.x %[[VAL_5]] : (!quake.ref) -> ()
+// CHECK:             %[[VAL_6:.*]] = quake.extract_ref %[[VAL_0]][1] : (!quake.veq<?>) -> !quake.ref
+// CHECK:             quake.y %[[VAL_6]] : (!quake.ref) -> ()
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+// CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
+
+// CHECK:       Source module (after):
+// CHECK:         func.func private @__nvqpp__mlirgen__init(%arg0: i64) {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
+// CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
+// CHECK:           %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "" : (!quake.ref) -> !quake.measure
+// CHECK:           %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measure) -> i1
+// CHECK:           cc.if(%[[VAL_3]]) {
+// CHECK:             %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
+// CHECK:             %[[VAL_5:.*]] = quake.extract_ref %[[VAL_4]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:             quake.x %[[VAL_5]] : (!quake.ref) -> ()
+// CHECK:             %[[VAL_6:.*]] = quake.extract_ref %[[VAL_0]][1] : (!quake.veq<?>) -> !quake.ref
+// CHECK:             quake.y %[[VAL_6]] : (!quake.ref) -> ()
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+// CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
+// CHECK:         func.func private @__nvqpp__mlirgen__init.init_1(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = arith.subi %arg0, %[[VAL_1]] : i64
+// CHECK:           %[[VAL_3:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_2]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           %[[VAL_4:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           %[[VAL_5:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           %[[VAL_6:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:           quake.x %[[VAL_6]] : (!quake.ref) -> ()
+// CHECK:           %[[VAL_7:.*]] = quake.mz %[[VAL_6]] name "" : (!quake.ref) -> !quake.measure
+// CHECK:           %[[VAL_8:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1
+// CHECK:           cc.if(%[[VAL_8]]) {
+// CHECK:             %[[VAL_11:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
+// CHECK:             %[[VAL_12:.*]] = quake.extract_ref %[[VAL_11]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:             quake.x %[[VAL_12]] : (!quake.ref) -> ()
+// CHECK:             %[[VAL_13:.*]] = quake.extract_ref %[[VAL_3]][1] : (!quake.veq<?>) -> !quake.ref
+// CHECK:             quake.y %[[VAL_13]] : (!quake.ref) -> ()
+// CHECK:           }
+// CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_5]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_10:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_9]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           return %[[VAL_10]] : !quake.veq<?>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init.num_qubits_1(%arg0: i64) -> i64 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           return %[[VAL_1]] : i64
+// CHECK:         }
+
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         testy
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = quake.get_state "__nvqpp__mlirgen__init.num_qubits_1" "__nvqpp__mlirgen__init.init_1" : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init.init_1
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init.num_qubits_1
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
+// CHECK:         }
+  // clang-format on
+}
+
 void test_combinations(mlir::MLIRContext *ctx) {
   {
     bool x = true;
@@ -514,7 +723,8 @@ int main() {
   test_vectors(&context);
   test_aggregates(&context);
   test_recursive(&context);
-  test_state(&context);
+  test_simulation_state(&context);
+  test_quantum_state(&context);
   test_combinations(&context);
   return 0;
 }
diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index 62d162e1781..482440b4b8f 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -13,6 +13,7 @@
 // Quantum emulators
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target ionq                     --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target anyon                    --emulate %s -o %t && %t | FileCheck %s
 // 2 different IQM machines for 2 different topologies
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
diff --git a/test/Quake/arg_subst-5.txt b/test/Quake/arg_subst-5.txt
index 2038ad31ccd..5b43881daf5 100644
--- a/test/Quake/arg_subst-5.txt
+++ b/test/Quake/arg_subst-5.txt
@@ -7,5 +7,5 @@
 // ========================================================================== //
 
 cc.arg_subst[0] {
-  %0 = quake.get_state "init" : !cc.ptr<!cc.state>
+  %0 = quake.get_state "num_qubits" "init" : !cc.ptr<!cc.state>
 }
diff --git a/test/Quake/arg_subst-6.txt b/test/Quake/arg_subst-6.txt
index 4c3a55d883a..7a53d0369de 100644
--- a/test/Quake/arg_subst-6.txt
+++ b/test/Quake/arg_subst-6.txt
@@ -7,5 +7,5 @@
 // ========================================================================== //
 
 cc.arg_subst[0] {
-  %c2_i32 = arith.constant 2 : i32
+  %c2_i64 = arith.constant 2 : i64
 }
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index 768216567d7..b54188850b6 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt --argument-synthesis=functions=foo:%S/arg_subst.txt,blink:%S/arg_subst.txt,testy1:%S/arg_subst-1.txt,testy2:%S/arg_subst-2.txt,testy3:%S/arg_subst-3.txt,testy4:%S/arg_subst-4.txt,testy5:%S/arg_subst-5.txt,init:%S/arg_subst-6.txt --canonicalize %s | FileCheck %s
+// RUN: cudaq-opt --argument-synthesis=functions=foo:%S/arg_subst.txt,blink:%S/arg_subst.txt,testy1:%S/arg_subst-1.txt,testy2:%S/arg_subst-2.txt,testy3:%S/arg_subst-3.txt,testy4:%S/arg_subst-4.txt,testy5:%S/arg_subst-5.txt,num_qubits:%S/arg_subst-6.txt,init:%S/arg_subst-6.txt --canonicalize %s | FileCheck %s
 
 func.func private @bar(i32)
 func.func private @baz(f32)
@@ -154,23 +154,25 @@ func.func @testy5(%arg0: !cc.ptr<!cc.state>) {
   return
 }
 
-func.func private @init(%arg0: i32) -> !quake.veq<?> {
-  %cst = arith.constant 1.5707963267948966 : f64
-  %0 = cc.cast signed %arg0 : (i32) -> i64
-  %1 = quake.alloca !quake.veq<?>[%0 : i64]
-  %2 = quake.concat %1 : (!quake.veq<?>) -> !quake.veq<?>
-  return %2 : !quake.veq<?>
+func.func @init(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+  return %arg1 : !quake.veq<?>
+}
+
+func.func @num_qubits(%arg0: i64) -> i64 {
+  return %arg0 : i64
 }
 
 // CHECK-LABEL:   func.func @testy5() {
-// CHECK:           %[[VAL_2:.*]] = quake.get_state "init" : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_2:.*]] = quake.get_state "num_qubits" "init" : !cc.ptr<!cc.state>
 // CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_2]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_2]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 // CHECK:           return
 // CHECK:         }
-// CHECK:         func.func private @init() -> !quake.veq<?> {
-// CHECK:           %[[VAL_7:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_8:.*]] = quake.relax_size %[[VAL_7:.*]] : (!quake.veq<2>) -> !quake.veq<?>
-// CHECK:           return %[[VAL_8]] : !quake.veq<?>
+// CHECK:         func.func @init(%arg0: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:           return %arg0 : !quake.veq<?>
+// CHECK:         }
+// CHECK:         func.func @num_qubits() -> i64 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
+// CHECK:           return %[[VAL_0]] : i64
 // CHECK:         }
diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index d234db9a617..15bc9cab104 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -9,23 +9,55 @@
 // RUN: cudaq-opt -replace-state-with-kernel -canonicalize %s | FileCheck %s
 
 module {
-  func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-    %0 = quake.get_state "callee.modified_0" : !cc.ptr<!cc.state>
+
+  func.func private @callee.num_qubits_0() -> i64 {
+    %cst = arith.constant 2 : i64
+    return %cst : i64
+  }
+
+  func.func private @callee.init_0(%arg0: !quake.veq<?>) -> !quake.veq<?> {
+    %cst = arith.constant 1.5707963267948966 : f64
+    %1 = quake.extract_ref %arg0[0] : (!quake.veq<?>) -> !quake.ref
+    quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+    return %arg0: !quake.veq<?>
+  }
+
+  func.func @caller0() {
+    %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
     %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
     %2 = quake.alloca !quake.veq<?>[%1 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
   }
-  func.func private @callee.modified_0() -> !quake.veq<?> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
-    %cst = arith.constant 1.5707963267948966 : f64
-    %0 = quake.alloca !quake.veq<2>
-    %1 = quake.extract_ref %0[0] : (!quake.veq<2>) -> !quake.ref
-    quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
-    %2 = quake.relax_size %0 : (!quake.veq<2>) -> !quake.veq<?>
-    return %2 : !quake.veq<?>
+
+// CHECK-LABEL:   func.func @caller0() {
+// CHECK:           %[[VAL_0:.*]] = call @callee.num_qubits_0() : () -> i64
+// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<?>[%[[VAL_0]] : i64]
+// CHECK:           %[[VAL_2:.*]] = call @callee.init_0(%[[VAL_1]]) : (!quake.veq<?>) -> !quake.veq<?>
+// CHECK:           return
+// CHECK:         }
+
+  func.func @caller1(%arg0: i64) {
+    %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
+    %2 = quake.alloca !quake.veq<?>[%arg0 : i64]
+    %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+    return
   }
-// CHECK-LABEL:   func.func @foo() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
-// CHECK:           %[[VAL_0:.*]] = call @callee.modified_0() : () -> !quake.veq<?>
+
+// CHECK-LABEL:   func.func @caller1(%arg0: i64) {
+// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
+// CHECK:           %[[VAL_2:.*]] = call @callee.init_0(%[[VAL_1]]) : (!quake.veq<?>) -> !quake.veq<?>
 // CHECK:           return
 // CHECK:         }
-}
+
+  func.func @caller2() -> i64 {
+    %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
+    %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
+    return %1: i64
+  }
+
+// CHECK-LABEL:   func.func @caller2() -> i64 {
+// CHECK:           %[[VAL_0:.*]] = call @callee.num_qubits_0() : () -> i64
+// CHECK:           return %[[VAL_0]] : i64
+// CHECK:         }
+}
\ No newline at end of file

From 008e8c17f23f68cd178d6fd0d71453c1ca3c2630 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 21 Jan 2025 15:01:17 -0800
Subject: [PATCH 27/54] DCO Remediation Commit for Anna Gringauze
 <agringauze@nvidia.com>

I, Anna Gringauze <agringauze@nvidia.com>, hereby add my Signed-off-by to this commit: 95633714a23ad2823369a86b7455537239da5b02

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/test/FakeQuantumState.h    | 63 ++++--------------------------
 runtime/test/FakeSimulationState.h |  6 ---
 2 files changed, 8 insertions(+), 61 deletions(-)

diff --git a/runtime/test/FakeQuantumState.h b/runtime/test/FakeQuantumState.h
index 14acec132a6..2f20babd955 100644
--- a/runtime/test/FakeQuantumState.h
+++ b/runtime/test/FakeQuantumState.h
@@ -18,7 +18,6 @@ class FakeQuantumState : public cudaq::SimulationState {
 private:
   std::string kernelName;
   std::vector<void *> args;
-  //std::vector<std::function<void(void *)>> deleters;
 
 public:
   virtual std::unique_ptr<SimulationState>
@@ -28,44 +27,11 @@ class FakeQuantumState : public cudaq::SimulationState {
   }
 
   FakeQuantumState() = default;
-  // FakeQuantumState(const std::string& kernelName, int arg) : kernelName(kernelName) {
-  //   std::cout << "ARG: " << arg << std::endl; 
-  //   addArgument<int>(arg);
-  // }
-
-  FakeQuantumState(const std::string& kernelName, const std::vector<void*> args) : kernelName(kernelName), args(args) {
-    //std::cout << "ARG: " << arg << std::endl; 
-    //addArgument<int>(arg);
-  }
-
-  FakeQuantumState(const FakeQuantumState& other): kernelName(other.kernelName), args(other.args) {}
-
-  // template <typename T>
-  // void addArgument(const T &arg) {
-  //   if constexpr (std::is_pointer_v<std::decay_t<T>>) {
-  //     if constexpr (std::is_copy_constructible_v<
-  //                       std::remove_pointer_t<std::decay_t<T>>>) {
-  //       auto ptr = new std::remove_pointer_t<std::decay_t<T>>(*arg);
-  //       args.push_back(ptr);
-  //       deleters.push_back([](void *ptr) {
-  //         delete static_cast<std::remove_pointer_t<std::decay_t<T>> *>(ptr);
-  //       });
-  //     } else {
-  //       throw std::invalid_argument(
-  //           "Unsupported argument type: only pointers to copy-constructible "
-  //           "types and copy-constructible types are supported.");
-  //     }
-  //   } else if constexpr (std::is_copy_constructible_v<std::decay_t<T>>) {
-  //     auto *ptr = new std::decay_t<T>(arg);
-  //     args.push_back(ptr);
-  //     deleters.push_back(
-  //         [](void *ptr) { delete static_cast<std::decay_t<T> *>(ptr); });
-  //   } else {
-  //     throw std::invalid_argument(
-  //         "Unsupported argument type: only pointers to copy-constructible "
-  //         "types and copy-constructible types are supported.");
-  //   }
-  // }
+  FakeQuantumState(const std::string &kernelName,
+                   const std::vector<void *> args)
+      : kernelName(kernelName), args(args) {}
+  FakeQuantumState(const FakeQuantumState &other)
+      : kernelName(other.kernelName), args(other.args) {}
 
   virtual std::unique_ptr<cudaq::SimulationState>
   createFromData(const cudaq::state_data &data) override {
@@ -81,36 +47,30 @@ class FakeQuantumState : public cudaq::SimulationState {
 
   virtual Tensor getTensor(std::size_t tensorIdx = 0) const override {
     throw std::runtime_error("Not implemented");
-    //return Tensor();
   }
 
   virtual std::vector<Tensor> getTensors() const override {
     throw std::runtime_error("Not implemented");
-    //return std::vector<Tensor>();
   }
 
   virtual std::size_t getNumTensors() const override { return 1; }
 
   virtual std::size_t getNumQubits() const override {
     throw std::runtime_error("Not implemented");
-    //return 0;
   }
 
   virtual std::complex<double> overlap(const SimulationState &other) override {
     throw std::runtime_error("Not implemented");
-    //return 0;
   }
 
   virtual std::complex<double>
   getAmplitude(const std::vector<int> &basisState) override {
     throw std::runtime_error("Not implemented");
-    //return 0;
   }
 
   virtual std::vector<std::complex<double>>
   getAmplitudes(const std::vector<std::vector<int>> &basisStates) override {
     throw std::runtime_error("Not implemented");
-    //return {0};
   }
 
   virtual void dump(std::ostream &os) const override {
@@ -121,8 +81,7 @@ class FakeQuantumState : public cudaq::SimulationState {
     return cudaq::SimulationState::precision::fp64;
   }
 
-  virtual void destroyState() override {
-  }
+  virtual void destroyState() override {}
 
   virtual std::complex<double>
   operator()(std::size_t tensorIdx,
@@ -130,7 +89,7 @@ class FakeQuantumState : public cudaq::SimulationState {
     throw std::runtime_error("Not implemented");
   }
 
-  virtual std::size_t getNumElements() const override { 
+  virtual std::size_t getNumElements() const override {
     throw std::runtime_error("Not implemented");
   }
 
@@ -148,12 +107,6 @@ class FakeQuantumState : public cudaq::SimulationState {
     throw std::runtime_error("Not implemented");
   }
 
-  virtual ~FakeQuantumState() override {
-    // for (std::size_t counter = 0; auto &ptr : args)
-    //   deleters[counter++](ptr);
-
-    // args.clear();
-    // deleters.clear();
-  }
+  virtual ~FakeQuantumState() override {}
 };
 /// @endcond
diff --git a/runtime/test/FakeSimulationState.h b/runtime/test/FakeSimulationState.h
index 74a0c0c66ed..49667e481c7 100644
--- a/runtime/test/FakeSimulationState.h
+++ b/runtime/test/FakeSimulationState.h
@@ -31,17 +31,14 @@ class FakeSimulationState : public cudaq::SimulationState {
   virtual std::unique_ptr<cudaq::SimulationState>
   createFromData(const cudaq::state_data &data) override {
     throw std::runtime_error("Not implemented");
-    return std::make_unique<FakeSimulationState>(0, nullptr);
   }
 
   virtual Tensor getTensor(std::size_t tensorIdx = 0) const override {
     throw std::runtime_error("Not implemented");
-    return Tensor();
   }
 
   virtual std::vector<Tensor> getTensors() const override {
     throw std::runtime_error("Not implemented");
-    return std::vector<Tensor>();
   }
 
   virtual std::size_t getNumTensors() const override { return 1; }
@@ -52,19 +49,16 @@ class FakeSimulationState : public cudaq::SimulationState {
 
   virtual std::complex<double> overlap(const SimulationState &other) override {
     throw std::runtime_error("Not implemented");
-    return 0;
   }
 
   virtual std::complex<double>
   getAmplitude(const std::vector<int> &basisState) override {
     throw std::runtime_error("Not implemented");
-    return 0;
   }
 
   virtual std::vector<std::complex<double>>
   getAmplitudes(const std::vector<std::vector<int>> &basisStates) override {
     throw std::runtime_error("Not implemented");
-    return {0};
   }
 
   virtual void dump(std::ostream &os) const override {

From f8e35eb5a330ad0852a9cb6025570f5b49e416e6 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 11 Feb 2025 20:48:19 -0800
Subject: [PATCH 28/54] Address some PR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.h   | 12 ++---
 .../Transforms/ArgumentSynthesis.cpp          | 16 +++---
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  2 +-
 runtime/common/ArgumentConversion.cpp         | 49 ++++++++++---------
 runtime/common/ArgumentConversion.h           | 15 +++---
 runtime/common/BaseRemoteRESTQPU.h            | 32 ++++++++++--
 runtime/cudaq/cudaq.cpp                       |  7 +++
 runtime/cudaq/utils/registry.h                |  1 +
 runtime/test/test_argument_conversion.cpp     |  8 +--
 9 files changed, 88 insertions(+), 54 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index bab623bb88e..89d3268fb11 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -58,12 +58,12 @@ std::unique_ptr<mlir::Pass>
 createArgumentSynthesisPass(mlir::ArrayRef<mlir::StringRef> funcNames,
                             mlir::ArrayRef<mlir::StringRef> substitutions);
 
-/// Helper function to build an argument synthesis pass. The names of the
-/// functions and the substitutions text can be built as an unzipped pair of
-/// lists.
-std::unique_ptr<mlir::Pass>
-createArgumentSynthesisPass(const std::vector<std::string> &funcNames,
-                            const std::vector<std::string> &substitutions);
+// /// Helper function to build an argument synthesis pass. The names of the
+// /// functions and the substitutions text can be built as an unzipped pair of
+// /// lists.
+// std::unique_ptr<mlir::Pass>
+// createArgumentSynthesisPass(const mlir::SmallVector<std::string> &funcNames,
+//                             const mlir::SmallVector<std::string> &substitutions);
 
 // declarative passes
 #define GEN_PASS_DECL
diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
index 932c091cb73..359f8839ee5 100644
--- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
+++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
@@ -164,11 +164,11 @@ cudaq::opt::createArgumentSynthesisPass(ArrayRef<StringRef> funcNames,
       ArgumentSynthesisOptions{pairs});
 }
 
-std::unique_ptr<mlir::Pass> cudaq::opt::createArgumentSynthesisPass(
-    const std::vector<std::string> &funcNames,
-    const std::vector<std::string> &substitutions) {
-  return cudaq::opt::createArgumentSynthesisPass(
-      mlir::SmallVector<mlir::StringRef>{funcNames.begin(), funcNames.end()},
-      mlir::SmallVector<mlir::StringRef>{substitutions.begin(),
-                                         substitutions.end()});
-}
+// std::unique_ptr<mlir::Pass> cudaq::opt::createArgumentSynthesisPass(
+//     const mlir::SmallVector<std::string> &funcNames,
+//     const mlir::SmallVector<std::string> &substitutions) {
+//   return cudaq::opt::createArgumentSynthesisPass(
+//       mlir::SmallVector<mlir::StringRef>{funcNames.begin(), funcNames.end()},
+//       mlir::SmallVector<mlir::StringRef>{substitutions.begin(),
+//                                          substitutions.end()});
+// }
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 6b20a7bdfbb..b67c092660e 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -535,7 +535,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   auto isLocalSimulator = platform.is_simulator() && !platform.is_emulated();
   auto isSimulator = isLocalSimulator || isRemoteSimulator;
 
-  cudaq::opt::ArgumentConverter argCon(name, unwrap(module));
+  opt::ArgumentConverter argCon(name, unwrap(module));
   argCon.gen(runtimeArgs.getArgs());
   std::string kernName = cudaq::runtime::cudaqGenPrefixName + name;
   SmallVector<StringRef> kernels = {kernName};
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index ebc8c52ae18..bc1a558737e 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -20,6 +20,8 @@
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Parser/Parser.h"
 
+#include <iostream>
+
 using namespace mlir;
 
 template <typename A>
@@ -119,7 +121,7 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
 /// }
 static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
                            func::FuncOp calleeFunc,
-                           std::string &initKernelName) {
+                           StringRef initKernelName) {
   OpBuilder::InsertionGuard guard(builder);
   builder.setInsertionPointToEnd(sourceMod.getBody());
 
@@ -242,7 +244,7 @@ static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
 /// }
 static void createNumQubitsFunc(OpBuilder &builder, ModuleOp sourceMod,
                                 func::FuncOp calleeFunc,
-                                std::string &numQubitsKernelName) {
+                                StringRef numQubitsKernelName) {
   OpBuilder::InsertionGuard guard(builder);
   builder.setInsertionPointToEnd(sourceMod.getBody());
 
@@ -478,27 +480,29 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto code = cudaq::get_quake_by_name(calleeName, /*throwException=*/false);
     assert(!code.empty() && "Quake code not found for callee");
     auto fromModule = parseSourceString<ModuleOp>(code, ctx);
+    
+    auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
+    assert(calleeFunc && "callee func is missing");
 
     static unsigned counter = 0;
-    std::string initName = calleeName + ".init_" + std::to_string(counter);
-    std::string initKernelName = cudaq::runtime::cudaqGenPrefixName + initName;
-
-    std::string numQubitsName =
-        calleeName + ".num_qubits_" + std::to_string(counter++);
-    std::string numQubitsKernelName =
-        cudaq::runtime::cudaqGenPrefixName + numQubitsName;
-
-    auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
-    assert(calleeFunc && "callee is missing");
+    auto initName = calleeName + ".init_" + std::to_string(counter);
+    auto numQubitsName = calleeName + ".num_qubits_" + std::to_string(counter++);
+    auto initKernelName = cudaq::runtime::cudaqGenPrefixName + initName;
+    auto numQubitsKernelName = cudaq::runtime::cudaqGenPrefixName + numQubitsName;
 
     // Create `callee.init_N` and `callee.num_qubits_N` used for
     // `quake.get_state` replacement later in ReplaceStateWithKernel pass
     createInitFunc(builder, sourceMod, calleeFunc, initKernelName);
     createNumQubitsFunc(builder, sourceMod, calleeFunc, numQubitsKernelName);
 
-    // Create substitutions for the `callee.init_N` and `callee.num_qubits_N`.
-    converter.genCallee(initName, calleeArgs);
-    converter.genCallee(numQubitsName, calleeArgs);
+    // Create and register names for new `init` and `num_qubits` kernels so
+    // ArgumentConverters can keep a string reference to a valid memory.
+    auto registeredInitName = cudaq::registry::cudaqRegisterAuxKernelName(initName.c_str());
+    auto registeredNumQubitsName = cudaq::registry::cudaqRegisterAuxKernelName(numQubitsName.c_str());
+    
+    // Create substitutions for `callee.init_N` and `callee.num_qubits_N`.
+    converter.genCallee(registeredInitName, calleeArgs);
+    converter.genCallee(registeredNumQubitsName, calleeArgs);
 
     // Create a substitution for the state pointer.
     auto statePtrTy =
@@ -682,8 +686,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::IndirectCallableType indCallTy,
 //===----------------------------------------------------------------------===//
 
 cudaq::opt::ArgumentConverter::ArgumentConverter(StringRef kernelName,
-                                                 ModuleOp sourceModule,
-                                                 bool isSimulator)
+                                                 ModuleOp sourceModule)
     : sourceModule(sourceModule), builder(sourceModule.getContext()),
       kernelName(kernelName) {
   substModule = builder.create<ModuleOp>(builder.getUnknownLoc());
@@ -694,7 +697,7 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
   // We should look up the input type signature here.
 
   auto fun = sourceModule.lookupSymbol<func::FuncOp>(
-      cudaq::runtime::cudaqGenPrefixName + kernelName);
+      cudaq::runtime::cudaqGenPrefixName + kernelName.str());
   FunctionType fromFuncTy = fun.getFunctionType();
   for (auto iter :
        llvm::enumerate(llvm::zip(fromFuncTy.getInputs(), arguments))) {
@@ -816,22 +819,22 @@ void cudaq::opt::ArgumentConverter::gen_drop_front(
   gen(partialArgs);
 }
 
-std::pair<std::vector<std::string>, std::vector<std::string>>
+std::pair<SmallVector<std::string>, SmallVector<std::string>>
 cudaq::opt::ArgumentConverter::collectAllSubstitutions() {
-  std::vector<std::string> kernels;
-  std::vector<std::string> substs;
+  SmallVector<std::string> kernels;
+  SmallVector<std::string> substs;
 
   std::function<void(ArgumentConverter &)> collect =
       [&kernels, &substs, &collect](ArgumentConverter &con) {
         auto name = con.getKernelName();
         std::string kernName = cudaq::runtime::cudaqGenPrefixName + name.str();
-        kernels.push_back(kernName);
+        kernels.emplace_back(kernName);
 
         {
           std::string substBuff;
           llvm::raw_string_ostream ss(substBuff);
           ss << con.getSubstitutionModule();
-          substs.push_back(substBuff);
+          substs.emplace_back(substBuff);
         }
 
         for (auto &calleeCon : con.getCalleeConverters())
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 1cb2b86ac14..25f033efb78 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -15,6 +15,7 @@
 #include "mlir/IR/Types.h"
 #include <unordered_set>
 #include <vector>
+#include <list>
 
 namespace cudaq::opt {
 
@@ -22,8 +23,7 @@ class ArgumentConverter {
 public:
   /// Build an instance to create argument substitutions for a specified \p
   /// kernelName in \p sourceModule.
-  ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule,
-                    bool isSimulator = true);
+  ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule);
 
   /// Generate a substitution ModuleOp for the vector of arguments presented.
   /// The arguments are those presented to the kernel, kernelName.
@@ -52,24 +52,25 @@ class ArgumentConverter {
 
   mlir::StringRef getKernelName() { return kernelName; }
 
-  void genCallee(std::string &calleeName, std::vector<void *> &args) {
-    auto converter = ArgumentConverter(calleeName, sourceModule);
+  void genCallee(mlir::StringRef calleeName, std::vector<void *> &args) {
+    // auto converter = ArgumentConverter(calleeName, sourceModule);
+    // converter.gen(args);
+    auto converter = calleeConverters.emplace_back(ArgumentConverter(calleeName, sourceModule));
     converter.gen(args);
-    calleeConverters.push_back(converter);
   }
 
   std::vector<ArgumentConverter> &getCalleeConverters() {
     return calleeConverters;
   }
 
-  std::pair<std::vector<std::string>, std::vector<std::string>>
+  std::pair<mlir::SmallVector<std::string>, mlir::SmallVector<std::string>>
   collectAllSubstitutions();
 
 private:
   mlir::ModuleOp sourceModule;
   mlir::ModuleOp substModule;
   mlir::OpBuilder builder;
-  std::string kernelName;
+  mlir::StringRef kernelName;
   mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
   std::vector<ArgumentConverter> calleeConverters;
 };
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index de0e768e77d..4b60e4a537d 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -447,14 +447,36 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (!rawArgs.empty() || updatedArgs) {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
-        // For quantum hardware, we collect substitutions for the
-        // whole call tree of states, which are treated as calls to
-        // the kernels and their arguments that produced the state.
         opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
-        auto [kernels, substs] = argCon.collectAllSubstitutions();
+
+        // For quantum hardware, we traverse the tree of ArgumentConverters
+        // for the call tree of states and collect substitutions for all calls.
+        mlir::SmallVector<std::string> kernels;
+        mlir::SmallVector<std::string> substs;
+
+        std::function<void(opt::ArgumentConverter &)> collect =
+          [&kernels, &substs, &collect](opt::ArgumentConverter &con) {
+            auto name = con.getKernelName();
+            std::string kernName = cudaq::runtime::cudaqGenPrefixName + name.str();
+            kernels.emplace_back(kernName);
+          
+            std::string substBuff;
+            llvm::raw_string_ostream ss(substBuff);
+            ss << con.getSubstitutionModule();
+            substs.emplace_back(substBuff);
+
+            for (auto &calleeCon : con.getCalleeConverters())
+              collect(calleeCon);
+          };
+
+        collect(argCon);
+
+        mlir::SmallVector<mlir::StringRef> funcNames{kernels.begin(), kernels.end()};
+        mlir::SmallVector<mlir::StringRef> substitutions{substs.begin(), substs.end()};
+
         pm.addNestedPass<mlir::func::FuncOp>(
-            cudaq::opt::createArgumentSynthesisPass(kernels, substs));
+            cudaq::opt::createArgumentSynthesisPass(funcNames, substitutions));
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(
             opt::createReplaceStateWithKernel());
diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp
index 071f658f43f..07da9f60048 100644
--- a/runtime/cudaq/cudaq.cpp
+++ b/runtime/cudaq/cudaq.cpp
@@ -19,6 +19,7 @@
 #include "distributed/mpi_plugin.h"
 #include <dlfcn.h>
 #include <filesystem>
+#include <list>
 #include <map>
 #include <regex>
 #include <shared_mutex>
@@ -242,6 +243,7 @@ void cudaq::registry::__cudaq_deviceCodeHolderAdd(const char *key,
 //===----------------------------------------------------------------------===//
 
 static std::vector<std::string> kernelRegistry;
+static std::list<std::string> auxKernelRegistry;
 
 static std::map<std::string, cudaq::KernelArgsCreator> argsCreators;
 static std::map<std::string, std::string> lambdaNames;
@@ -252,6 +254,11 @@ void cudaq::registry::cudaqRegisterKernelName(const char *kernelName) {
   kernelRegistry.emplace_back(kernelName);
 }
 
+const char * cudaq::registry::cudaqRegisterAuxKernelName(const char *kernelName) {
+  std::unique_lock<std::shared_mutex> lock(globalRegistryMutex);
+  return auxKernelRegistry.emplace_back(kernelName).c_str();
+}
+
 void cudaq::registry::__cudaq_registerLinkableKernel(void *hostSideFunc,
                                                      const char *kernelName,
                                                      void *deviceSideFunc) {
diff --git a/runtime/cudaq/utils/registry.h b/runtime/cudaq/utils/registry.h
index 2748afe17e7..cb6a7c0c146 100644
--- a/runtime/cudaq/utils/registry.h
+++ b/runtime/cudaq/utils/registry.h
@@ -13,6 +13,7 @@ namespace cudaq::registry {
 extern "C" {
 void __cudaq_deviceCodeHolderAdd(const char *, const char *);
 void cudaqRegisterKernelName(const char *);
+const char* cudaqRegisterAuxKernelName(const char *);
 void cudaqRegisterArgsCreator(const char *, char *);
 void cudaqRegisterLambdaName(const char *, const char *);
 
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index a5e5fa3474f..1ecf85c0404 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -487,7 +487,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
         "   %2 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
         "   %3 = quake.extract_ref %2[0] : (!quake.veq<?>) -> !quake.ref\n"
         "   quake.x %3 : (!quake.ref) -> ()\n"
-        "   %measOut = quake.mz %3 name \"\" : (!quake.ref) -> !quake.measure\n"
+        "   %measOut = quake.mz %3 name \"q0\" : (!quake.ref) -> !quake.measure\n"
         "   %4 = quake.discriminate %measOut : (!quake.measure) -> i1\n"
         "   cc.if(%4) {\n"
         "    %6 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
@@ -514,7 +514,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
 // CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
 // CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
-// CHECK:           %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "" : (!quake.ref) -> !quake.measure
+// CHECK:           %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "q0" : (!quake.ref) -> !quake.measure
 // CHECK:           %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measure) -> i1
 // CHECK:           cc.if(%[[VAL_3]]) {
 // CHECK:             %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
@@ -532,7 +532,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
 // CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
 // CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
-// CHECK:           %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "" : (!quake.ref) -> !quake.measure
+// CHECK:           %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "q0" : (!quake.ref) -> !quake.measure
 // CHECK:           %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measure) -> i1
 // CHECK:           cc.if(%[[VAL_3]]) {
 // CHECK:             %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
@@ -553,7 +553,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:           %[[VAL_5:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
 // CHECK:           %[[VAL_6:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<?>) -> !quake.ref
 // CHECK:           quake.x %[[VAL_6]] : (!quake.ref) -> ()
-// CHECK:           %[[VAL_7:.*]] = quake.mz %[[VAL_6]] name "" : (!quake.ref) -> !quake.measure
+// CHECK:           %[[VAL_7:.*]] = quake.mz %[[VAL_6]] name "q0" : (!quake.ref) -> !quake.measure
 // CHECK:           %[[VAL_8:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1
 // CHECK:           cc.if(%[[VAL_8]]) {
 // CHECK:             %[[VAL_11:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]

From e79ad6abe648e89ae3f620678c3fc62c2a02d71e Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 12 Feb 2025 11:52:52 -0800
Subject: [PATCH 29/54] Address more CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Dialect/CC/CCTypes.td |  6 +++
 .../cudaq/Optimizer/Dialect/Quake/QuakeOps.td | 16 +++---
 include/cudaq/Optimizer/Transforms/Passes.h   |  7 ---
 include/cudaq/Optimizer/Transforms/Passes.td  |  2 +-
 .../Transforms/ArgumentSynthesis.cpp          |  9 ----
 .../Transforms/ReplaceStateWithKernel.cpp     | 10 ++--
 runtime/common/ArgumentConversion.cpp         | 49 +++++--------------
 runtime/common/ArgumentConversion.h           | 15 +++---
 runtime/common/BaseRemoteRESTQPU.h            | 46 ++++++++++-------
 runtime/cudaq/cudaq.cpp                       |  6 ---
 runtime/cudaq/utils/registry.h                |  1 -
 runtime/test/test_argument_conversion.cpp     |  7 +--
 test/Quake/arg_subst-5.txt                    |  2 +-
 test/Quake/arg_subst_func.qke                 |  2 +-
 test/Quake/replace_state_with_kernel.qke      |  6 +--
 15 files changed, 79 insertions(+), 105 deletions(-)

diff --git a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
index 18bce4e156a..03b8d9541d9 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
@@ -313,4 +313,10 @@ def AnyStateInitLike : TypeConstraint<cc_PointerType.predicate,
                          "state initializer types">;
 def AnyStateInitType : Type<AnyStateInitLike.predicate, "initial state type">;
 
+def AnyStatePointerType : Type<
+  And<[
+    cc_PointerType.predicate,
+    CPred<"$_self.cast<cudaq::cc::PointerType>().getElementType().isa<cudaq::cc::StateType>()">
+    ]>,
+    "state pointer type">;
 #endif // CUDAQ_DIALECT_CC_TYPES_TD
diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index e5bb1222088..65730b84f29 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -1418,7 +1418,7 @@ def quake_CreateStateOp : QuakeOp<"create_state", [Pure]> {
     cc_PointerType:$data,
     AnySignlessInteger:$length
   );
-  let results = (outs cc_PointerType:$result);
+  let results = (outs AnyStatePointerType:$result);
   let assemblyFormat = [{
       $data `,` $length `:` functional-type(operands, results) attr-dict
   }];
@@ -1436,7 +1436,7 @@ def QuakeOp_DeleteStateOp : QuakeOp<"delete_state", [] > {
     ```
   }];
 
-  let arguments = (ins cc_PointerType:$state);
+  let arguments = (ins AnyStatePointerType:$state);
   let results = (outs);
   let assemblyFormat = [{
       $state `:` type(operands) attr-dict
@@ -1456,7 +1456,7 @@ def quake_GetNumberOfQubitsOp : QuakeOp<"get_number_of_qubits", [Pure] > {
     ```
   }];
 
-  let arguments = (ins cc_PointerType:$state);
+  let arguments = (ins AnyStatePointerType:$state);
   let results = (outs AnySignlessInteger:$result);
   let assemblyFormat = [{
       $state `:` functional-type(operands, results) attr-dict
@@ -1479,17 +1479,17 @@ def QuakeOp_GetStateOp : QuakeOp<"get_state", [Pure] > {
     the provided names in `ReplaceStateByKernel` pass.
 
     ```mlir
-      %0 = quake.get_state "num_qubits" "init" : !cc.ptr<!cc.state>
+      %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
     ```
   }];
 
   let arguments = (ins
-    StrAttr:$numQubitsFuncName,
-    StrAttr:$initFuncName
+    FlatSymbolRefAttr:$numQubitsFunc,
+    FlatSymbolRefAttr:$initFunc
   );
-  let results = (outs cc_PointerType:$result);
+  let results = (outs AnyStatePointerType:$result);
   let assemblyFormat = [{
-     $numQubitsFuncName $initFuncName `:` qualified(type(results)) attr-dict
+     $numQubitsFunc $initFunc `:` qualified(type(results)) attr-dict
   }];
 }
 
diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h
index 89d3268fb11..4bfddf6101d 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.h
+++ b/include/cudaq/Optimizer/Transforms/Passes.h
@@ -58,13 +58,6 @@ std::unique_ptr<mlir::Pass>
 createArgumentSynthesisPass(mlir::ArrayRef<mlir::StringRef> funcNames,
                             mlir::ArrayRef<mlir::StringRef> substitutions);
 
-// /// Helper function to build an argument synthesis pass. The names of the
-// /// functions and the substitutions text can be built as an unzipped pair of
-// /// lists.
-// std::unique_ptr<mlir::Pass>
-// createArgumentSynthesisPass(const mlir::SmallVector<std::string> &funcNames,
-//                             const mlir::SmallVector<std::string> &substitutions);
-
 // declarative passes
 #define GEN_PASS_DECL
 #define GEN_PASS_REGISTRATION
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 7c77b7d5fbd..9afc56a4083 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -842,7 +842,7 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
     Before ReplaceStateWithKernel (replace-state-with-kernel):
     ```
     func.func @foo() {
-      %0 = quake.get_state "callee.num_qubits_0" "callee.init_0": !cc.ptr<!cc.state>
+      %0 = quake.get_state @callee.num_qubits_0 @callee.init_0: !cc.ptr<!cc.state>
       %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
       %2 = quake.alloca !quake.veq<?>[%1 : i64]
       %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
index 359f8839ee5..76a3ac36ca8 100644
--- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
+++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
@@ -163,12 +163,3 @@ cudaq::opt::createArgumentSynthesisPass(ArrayRef<StringRef> funcNames,
   return std::make_unique<ArgumentSynthesisPass>(
       ArgumentSynthesisOptions{pairs});
 }
-
-// std::unique_ptr<mlir::Pass> cudaq::opt::createArgumentSynthesisPass(
-//     const mlir::SmallVector<std::string> &funcNames,
-//     const mlir::SmallVector<std::string> &substitutions) {
-//   return cudaq::opt::createArgumentSynthesisPass(
-//       mlir::SmallVector<mlir::StringRef>{funcNames.begin(), funcNames.end()},
-//       mlir::SmallVector<mlir::StringRef>{substitutions.begin(),
-//                                          substitutions.end()});
-// }
diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index d102d156da2..872e12c3f32 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -34,7 +34,7 @@ namespace {
 /// that computes the number of qubits for a state.
 ///
 /// ```
-///  %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
+///  %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
 ///  %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
 /// ───────────────────────────────────────────
 /// ...
@@ -51,11 +51,11 @@ class ReplaceGetNumQubitsPattern
 
     auto stateOp = numQubits.getOperand();
     if (auto getState = stateOp.getDefiningOp<quake::GetStateOp>()) {
-      auto numQubitsName = getState.getNumQubitsFuncName();
+      auto numQubitsFunc = getState.getNumQubitsFunc();
 
       rewriter.setInsertionPoint(numQubits);
       rewriter.replaceOpWithNewOp<func::CallOp>(
-          numQubits, numQubits.getType(), numQubitsName, mlir::ValueRange{});
+          numQubits, numQubits.getType(), numQubitsFunc, mlir::ValueRange{});
       return success();
     }
     return numQubits->emitError(
@@ -68,7 +68,7 @@ class ReplaceGetNumQubitsPattern
 /// the state.
 ///
 /// ```
-///  %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
+///  %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
 ///  %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
 /// ...
@@ -88,7 +88,7 @@ class ReplaceInitStatePattern
     if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
       if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
         if (auto getState = stateOp.getDefiningOp<quake::GetStateOp>()) {
-          auto initName = getState.getInitFuncName();
+          auto initName = getState.getInitFunc();
 
           rewriter.setInsertionPoint(initState);
           rewriter.replaceOpWithNewOp<func::CallOp>(
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index bc1a558737e..ceccc2dc24e 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -120,8 +120,7 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
 ///   return %arg0: !quake.veq<?>
 /// }
 static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
-                           func::FuncOp calleeFunc,
-                           StringRef initKernelName) {
+                           func::FuncOp calleeFunc, StringRef initKernelName) {
   OpBuilder::InsertionGuard guard(builder);
   builder.setInsertionPointToEnd(sourceMod.getBody());
 
@@ -394,7 +393,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   //    Initializes the veq passed as a parameter
   //
   // Then replace the state with
-  //   `quake.get_state "callee.num_qubits_0" "callee.init_state_0"`:
+  //   `quake.get_state @callee.num_qubits_0 @callee.init_state_0`:
   //
   // clang-format off
   // ```
@@ -423,7 +422,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // clang-format off
   // ```
   // func.func @caller() {
-  //   %0 = quake.get_state "callee.num_qubits_0" "callee.init_state_0" : !cc.ptr<!cc.state>
+  //   %0 = quake.get_state @callee.num_qubits_0 @callee.init_state_0 : !cc.ptr<!cc.state>
   //   %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
   //   %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
@@ -480,15 +479,17 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto code = cudaq::get_quake_by_name(calleeName, /*throwException=*/false);
     assert(!code.empty() && "Quake code not found for callee");
     auto fromModule = parseSourceString<ModuleOp>(code, ctx);
-    
+
     auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
     assert(calleeFunc && "callee func is missing");
 
     static unsigned counter = 0;
     auto initName = calleeName + ".init_" + std::to_string(counter);
-    auto numQubitsName = calleeName + ".num_qubits_" + std::to_string(counter++);
+    auto numQubitsName =
+        calleeName + ".num_qubits_" + std::to_string(counter++);
     auto initKernelName = cudaq::runtime::cudaqGenPrefixName + initName;
-    auto numQubitsKernelName = cudaq::runtime::cudaqGenPrefixName + numQubitsName;
+    auto numQubitsKernelName =
+        cudaq::runtime::cudaqGenPrefixName + numQubitsName;
 
     // Create `callee.init_N` and `callee.num_qubits_N` used for
     // `quake.get_state` replacement later in ReplaceStateWithKernel pass
@@ -497,9 +498,11 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
 
     // Create and register names for new `init` and `num_qubits` kernels so
     // ArgumentConverters can keep a string reference to a valid memory.
-    auto registeredInitName = cudaq::registry::cudaqRegisterAuxKernelName(initName.c_str());
-    auto registeredNumQubitsName = cudaq::registry::cudaqRegisterAuxKernelName(numQubitsName.c_str());
-    
+    auto &registeredInitName =
+        cudaq::opt::ArgumentConverter::registerKernelName(initName);
+    auto &registeredNumQubitsName =
+        cudaq::opt::ArgumentConverter::registerKernelName(numQubitsName);
+
     // Create substitutions for `callee.init_N` and `callee.num_qubits_N`.
     converter.genCallee(registeredInitName, calleeArgs);
     converter.genCallee(registeredNumQubitsName, calleeArgs);
@@ -818,29 +821,3 @@ void cudaq::opt::ArgumentConverter::gen_drop_front(
   }
   gen(partialArgs);
 }
-
-std::pair<SmallVector<std::string>, SmallVector<std::string>>
-cudaq::opt::ArgumentConverter::collectAllSubstitutions() {
-  SmallVector<std::string> kernels;
-  SmallVector<std::string> substs;
-
-  std::function<void(ArgumentConverter &)> collect =
-      [&kernels, &substs, &collect](ArgumentConverter &con) {
-        auto name = con.getKernelName();
-        std::string kernName = cudaq::runtime::cudaqGenPrefixName + name.str();
-        kernels.emplace_back(kernName);
-
-        {
-          std::string substBuff;
-          llvm::raw_string_ostream ss(substBuff);
-          ss << con.getSubstitutionModule();
-          substs.emplace_back(substBuff);
-        }
-
-        for (auto &calleeCon : con.getCalleeConverters())
-          collect(calleeCon);
-      };
-
-  collect(*this);
-  return {kernels, substs};
-}
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 25f033efb78..d07a5e5e989 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -13,9 +13,9 @@
 #include "cudaq/qis/state.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Types.h"
+#include <list>
 #include <unordered_set>
 #include <vector>
-#include <list>
 
 namespace cudaq::opt {
 
@@ -53,9 +53,7 @@ class ArgumentConverter {
   mlir::StringRef getKernelName() { return kernelName; }
 
   void genCallee(mlir::StringRef calleeName, std::vector<void *> &args) {
-    // auto converter = ArgumentConverter(calleeName, sourceModule);
-    // converter.gen(args);
-    auto converter = calleeConverters.emplace_back(ArgumentConverter(calleeName, sourceModule));
+    auto &converter = calleeConverters.emplace_back(calleeName, sourceModule);
     converter.gen(args);
   }
 
@@ -63,10 +61,15 @@ class ArgumentConverter {
     return calleeConverters;
   }
 
-  std::pair<mlir::SmallVector<std::string>, mlir::SmallVector<std::string>>
-  collectAllSubstitutions();
+  static const std::string &registerKernelName(const std::string &kernelName) {
+    return kernelNameRegistry.emplace_back(kernelName);
+  }
 
 private:
+  // Note: use std::list to make sure we always return valid references
+  // when registering new kernel names.
+  static std::list<std::string> kernelNameRegistry;
+
   mlir::ModuleOp sourceModule;
   mlir::ModuleOp substModule;
   mlir::OpBuilder builder;
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 4b60e4a537d..5cdd07d1646 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -450,31 +450,41 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
 
-        // For quantum hardware, we traverse the tree of ArgumentConverters
-        // for the call tree of states and collect substitutions for all calls.
+        // For quantum devices, we've created a tree of ArgumentConverters
+        // with nodes corresponding to `init` and `num_qubits` functions
+        // created from a kernel that generated the state argument.
+        // Traverse the tree and collect substitutions for all those
+        // functions.
+
+        // Store kernel and substitution strings on the stack.
+        // We pass string references to the `createArgumentSynthesisPass`.
         mlir::SmallVector<std::string> kernels;
         mlir::SmallVector<std::string> substs;
 
         std::function<void(opt::ArgumentConverter &)> collect =
-          [&kernels, &substs, &collect](opt::ArgumentConverter &con) {
-            auto name = con.getKernelName();
-            std::string kernName = cudaq::runtime::cudaqGenPrefixName + name.str();
-            kernels.emplace_back(kernName);
-          
-            std::string substBuff;
-            llvm::raw_string_ostream ss(substBuff);
-            ss << con.getSubstitutionModule();
-            substs.emplace_back(substBuff);
-
-            for (auto &calleeCon : con.getCalleeConverters())
-              collect(calleeCon);
-          };
+            [&kernels, &substs, &collect](opt::ArgumentConverter &con) {
+              {
+                auto name = con.getKernelName();
+                std::string kernName =
+                    cudaq::runtime::cudaqGenPrefixName + name.str();
+                kernels.emplace_back(kernName);
+              }
+              {
+                std::string substBuff;
+                llvm::raw_string_ostream ss(substBuff);
+                ss << con.getSubstitutionModule();
+                substs.emplace_back(substBuff);
+              }
 
+              for (auto &calleeCon : con.getCalleeConverters())
+                collect(calleeCon);
+            };
         collect(argCon);
 
-        mlir::SmallVector<mlir::StringRef> funcNames{kernels.begin(), kernels.end()};
-        mlir::SmallVector<mlir::StringRef> substitutions{substs.begin(), substs.end()};
-
+        mlir::SmallVector<mlir::StringRef> funcNames{kernels.begin(),
+                                                     kernels.end()};
+        mlir::SmallVector<mlir::StringRef> substitutions{substs.begin(),
+                                                         substs.end()};
         pm.addNestedPass<mlir::func::FuncOp>(
             cudaq::opt::createArgumentSynthesisPass(funcNames, substitutions));
         pm.addPass(opt::createDeleteStates());
diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp
index 07da9f60048..5dbdf4ee8cf 100644
--- a/runtime/cudaq/cudaq.cpp
+++ b/runtime/cudaq/cudaq.cpp
@@ -243,7 +243,6 @@ void cudaq::registry::__cudaq_deviceCodeHolderAdd(const char *key,
 //===----------------------------------------------------------------------===//
 
 static std::vector<std::string> kernelRegistry;
-static std::list<std::string> auxKernelRegistry;
 
 static std::map<std::string, cudaq::KernelArgsCreator> argsCreators;
 static std::map<std::string, std::string> lambdaNames;
@@ -254,11 +253,6 @@ void cudaq::registry::cudaqRegisterKernelName(const char *kernelName) {
   kernelRegistry.emplace_back(kernelName);
 }
 
-const char * cudaq::registry::cudaqRegisterAuxKernelName(const char *kernelName) {
-  std::unique_lock<std::shared_mutex> lock(globalRegistryMutex);
-  return auxKernelRegistry.emplace_back(kernelName).c_str();
-}
-
 void cudaq::registry::__cudaq_registerLinkableKernel(void *hostSideFunc,
                                                      const char *kernelName,
                                                      void *deviceSideFunc) {
diff --git a/runtime/cudaq/utils/registry.h b/runtime/cudaq/utils/registry.h
index cb6a7c0c146..2748afe17e7 100644
--- a/runtime/cudaq/utils/registry.h
+++ b/runtime/cudaq/utils/registry.h
@@ -13,7 +13,6 @@ namespace cudaq::registry {
 extern "C" {
 void __cudaq_deviceCodeHolderAdd(const char *, const char *);
 void cudaqRegisterKernelName(const char *);
-const char* cudaqRegisterAuxKernelName(const char *);
 void cudaqRegisterArgsCreator(const char *, char *);
 void cudaqRegisterLambdaName(const char *, const char *);
 
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 1ecf85c0404..84c8d425001 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -463,7 +463,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.get_state "__nvqpp__mlirgen__init.num_qubits_0" "__nvqpp__mlirgen__init.init_0" : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init.num_qubits_0 @__nvqpp__mlirgen__init.init_0 : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
@@ -487,7 +487,8 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
         "   %2 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
         "   %3 = quake.extract_ref %2[0] : (!quake.veq<?>) -> !quake.ref\n"
         "   quake.x %3 : (!quake.ref) -> ()\n"
-        "   %measOut = quake.mz %3 name \"q0\" : (!quake.ref) -> !quake.measure\n"
+        "   %measOut = quake.mz %3 name \"q0\" : (!quake.ref) -> "
+        "!quake.measure\n"
         "   %4 = quake.discriminate %measOut : (!quake.measure) -> i1\n"
         "   cc.if(%4) {\n"
         "    %6 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
@@ -576,7 +577,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.get_state "__nvqpp__mlirgen__init.num_qubits_1" "__nvqpp__mlirgen__init.init_1" : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init.num_qubits_1 @__nvqpp__mlirgen__init.init_1 : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
diff --git a/test/Quake/arg_subst-5.txt b/test/Quake/arg_subst-5.txt
index 5b43881daf5..5020e7fe096 100644
--- a/test/Quake/arg_subst-5.txt
+++ b/test/Quake/arg_subst-5.txt
@@ -7,5 +7,5 @@
 // ========================================================================== //
 
 cc.arg_subst[0] {
-  %0 = quake.get_state "num_qubits" "init" : !cc.ptr<!cc.state>
+  %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
 }
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index 0a3fa2d653f..8df6c5e1433 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -163,7 +163,7 @@ func.func @num_qubits(%arg0: i64) -> i64 {
 }
 
 // CHECK-LABEL:   func.func @testy5() {
-// CHECK:           %[[VAL_2:.*]] = quake.get_state "num_qubits" "init" : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_2:.*]] = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
 // CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_2]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_2]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 15bc9cab104..24bdd787216 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -23,7 +23,7 @@ module {
   }
 
   func.func @caller0() {
-    %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
+    %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
     %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
     %2 = quake.alloca !quake.veq<?>[%1 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
@@ -38,7 +38,7 @@ module {
 // CHECK:         }
 
   func.func @caller1(%arg0: i64) {
-    %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
+    %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
     %2 = quake.alloca !quake.veq<?>[%arg0 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
@@ -51,7 +51,7 @@ module {
 // CHECK:         }
 
   func.func @caller2() -> i64 {
-    %0 = quake.get_state "callee.num_qubits_0" "callee.init_0" : !cc.ptr<!cc.state>
+    %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
     %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
     return %1: i64
   }

From c0d9ae9e51a3ff342f99fab2dc25935be05a2b26 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 12 Feb 2025 22:08:09 -0800
Subject: [PATCH 30/54] Cleanup

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/runtime/cudaq/platform/py_alt_launch_kernel.cpp | 2 +-
 runtime/common/ArgumentConversion.cpp                  | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index b67c092660e..6b20a7bdfbb 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -535,7 +535,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   auto isLocalSimulator = platform.is_simulator() && !platform.is_emulated();
   auto isSimulator = isLocalSimulator || isRemoteSimulator;
 
-  opt::ArgumentConverter argCon(name, unwrap(module));
+  cudaq::opt::ArgumentConverter argCon(name, unwrap(module));
   argCon.gen(runtimeArgs.getArgs());
   std::string kernName = cudaq::runtime::cudaqGenPrefixName + name;
   SmallVector<StringRef> kernels = {kernName};
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index ceccc2dc24e..421faa48c3b 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -20,8 +20,6 @@
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Parser/Parser.h"
 
-#include <iostream>
-
 using namespace mlir;
 
 template <typename A>

From 1ecd8cc0168e2f454efb200da013a37e20ac347c Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 13 Feb 2025 14:09:49 -0800
Subject: [PATCH 31/54] Address CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../cudaq/Optimizer/Dialect/Quake/QuakeOps.td |  19 +--
 include/cudaq/Optimizer/Transforms/Passes.td  |  42 +++++-
 runtime/common/ArgumentConversion.cpp         |  19 ++-
 runtime/cudaq/algorithms/get_state.h          |  13 +-
 runtime/test/FakeQuantumState.h               |  37 +++--
 runtime/test/FakeSimulationState.h            |   1 -
 runtime/test/test_argument_conversion.cpp     | 131 +++++++++++++++++-
 test/Quake/replace_state_with_kernel.qke      |   2 +-
 8 files changed, 216 insertions(+), 48 deletions(-)

diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index 65730b84f29..6bb5e985092 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -1469,14 +1469,17 @@ def QuakeOp_GetStateOp : QuakeOp<"get_state", [Pure] > {
     This operation is created by argument synthesis of state pointer arguments
     for quantum devices.
 
-    It takes two kernel names as ASCIIZ string literals:
-      - "num_qubits" for determining the size of the allocation to initialize
-      - "init" for initializing the state the same way as the original kernel
-        passed to `cudaq::get_state`) as ASCIIZ string literal
-
-    And returns the quantum state of the original kernel passed to
-    `cudaq::get_state`. The operation is replaced by calls to the kernels with
-    the provided names in `ReplaceStateByKernel` pass.
+    It takes two kernel names as symbol references:
+      - @num_qubits for determining the size of the allocation to initialize
+      - @init for initializing the state the same way as the original kernel
+        passed to `cudaq::get_state`.
+
+    This operation returns the quantum state of the original kernel passed to
+    `cudaq::get_state`.
+    
+    In the `ReplaceStateWithKernel` pass, the operation may be replaced by
+    calls to the @num_qubits and @init kernels, which will reproduce the
+    specified state.
 
     ```mlir
       %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 9afc56a4083..687eee5120d 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -828,14 +828,44 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
   let summary =
     "Replace `quake.init_state` instructions with call to the kernel generating the state";
   let description = [{
-    Argument synthesis for state pointers for quantum devices substitutes state
-    argument by a new state created from `__nvqpp_cudaq_state_get` intrinsic, which
-    in turn accepts the name for the synthesized kernel that generated the state.
+    This optimization replaces `quake.init_state`, `quake.get_number_of_qubits`, 
+    and `quake.get_state` operations.
+
+    Before this optimization, argument synthesis for state pointers for quantum
+    devices substituted a new state created from the `quake.get_state` operation
+    for the state argument. 
+    
+    The `quake.get_state` operation accepts symbols for the synthesized kernels
+    `num_qubits` and `init` that argument synthesis generated from the original
+    kernel call that generated the state, e.g., the `cudaq::get_state` call that
+    refers to the result of a specific quantum kernel being invoked with a set
+    of parameters.
+
+    For example, for the user code:
+      ```
+      state = cudaq::get_state(callee, args)
+      caller(state)
+      ```
+
+    The argument synthesis generated the following new kernels from the `callee`
+    and synthesized them to substitute their arguments with `args`:
+      ```
+      func.func @callee_init(qubits: !quake.veq<?>, arguments) -> !quake.veq<?>
+      func.func @callee_num_qubits(arguments) -> i64
+      ```
+
+    The argument synthesis also substituted the state argument in the `caller`
+    with:
+      ```
+      quake.get_state @callee_num_qubits @callee_init: !cc.ptr<!cc.state>
+      ```
 
-    This optimization completes the replacement of `quake.init_state` instruction by:
+    This optimization performs the replacements for the following operations
+    that use a state produced by a `quake.get_state @num_qubits @init` operation:
 
-    - Replace `quake.init_state` by a call that `get_state` call refers to.
-    - Remove all unneeded instructions.
+    - Replace `quake.get_number_of_qubits` operation by the @num_qubits call
+    - Replace `quake.init_state` operation by the @init call
+    - Clean up unused `quake.get_state` operation
 
     For example:
 
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 421faa48c3b..664d10549f9 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -100,7 +100,8 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
                          ModuleOp substMod, llvm::DataLayout &);
 
 /// Create callee.init_N that initializes the state
-/// Callee:
+/// Callee (the kernel captured by state):
+// clang-format off
 /// func.func @__nvqpp__mlirgen__callee(%arg0: i64) {
 ///   %0 = cc.alloca i64
 ///   cc.store %arg0, %0 : !cc.ptr<i64>
@@ -117,6 +118,7 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
 ///   quake.x %1 : (f64, !quake.ref) -> ()
 ///   return %arg0: !quake.veq<?>
 /// }
+// clang-format on
 static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
                            func::FuncOp calleeFunc, StringRef initKernelName) {
   OpBuilder::InsertionGuard guard(builder);
@@ -139,9 +141,8 @@ static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
 
   auto *entryBlock = &initFunc.getRegion().front();
   newBuilder.setInsertionPointToStart(entryBlock);
-  auto intType = newBuilder.getI64Type();
-  Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, intType);
-  Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, intType);
+  Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, 64);
+  Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, 64);
   Value begin = zero;
 
   auto argPos = initFunc.getArguments().size();
@@ -149,10 +150,10 @@ static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
   // Detect errors in kernel passed to get_state.
   std::function<void(Block &)> processInner = [&](Block &block) {
     for (auto &op : block) {
-      for (auto &region : op.getRegions()) {
+      for (auto &region : op.getRegions())
         for (auto &b : region)
           processInner(b);
-      }
+
       // Don't allow returns in inner scopes
       if (auto retOp = dyn_cast<func::ReturnOp>(&op))
         calleeFunc.emitError("Encountered return in inner scope in a kernel "
@@ -222,7 +223,10 @@ static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
 }
 
 /// Create callee.num_qubits_N that calculates the number of qubits to
-/// initialize Callee: func.func @callee(%arg0: i64) {
+/// initialize the state.
+/// Callee (the kernel captured by state):
+// clang-format off
+/// func.func @callee(%arg0: i64) {
 ///   %0 = cc.alloca i64
 ///   cc.store %arg0, %0 : !cc.ptr<i64>
 ///   %1 = cc.load %0 : !cc.ptr<i64>
@@ -239,6 +243,7 @@ static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
 ///   %1 = cc.load %0 : !cc.ptr<i64>
 ///   return %1 : i64
 /// }
+// clang-format on
 static void createNumQubitsFunc(OpBuilder &builder, ModuleOp sourceMod,
                                 func::FuncOp calleeFunc,
                                 StringRef numQubitsKernelName) {
diff --git a/runtime/cudaq/algorithms/get_state.h b/runtime/cudaq/algorithms/get_state.h
index 204de442934..dacb2ef2793 100644
--- a/runtime/cudaq/algorithms/get_state.h
+++ b/runtime/cudaq/algorithms/get_state.h
@@ -119,17 +119,16 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
     return state(new RemoteSimulationState(std::forward<QuantumKernel>(kernel),
                                            std::forward<Args>(args)...));
   }
-#endif
+#else
 #if defined(CUDAQ_QUANTUM_DEVICE)
   // Store kernel name and arguments for quantum states.
-  if (!cudaq::get_quake_by_name(cudaq::getKernelName(kernel), false).empty()) {
+  if (!cudaq::get_quake_by_name(cudaq::getKernelName(kernel), false).empty())
     return state(new QuantumState(std::forward<QuantumKernel>(kernel),
                                   std::forward<Args>(args)...));
-  } else {
-    throw std::runtime_error(
-        "cudaq::state* argument synthesis is not supported for quantum hardware"
-        "for c-like functions, use class kernels instead");
-  }
+  throw std::runtime_error(
+      "cudaq::state* argument synthesis is not supported for quantum hardware"
+      " for c-like functions, use class kernels instead");
+#endif
 #endif
   return details::extractState([&]() mutable {
     cudaq::invokeKernel(std::forward<QuantumKernel>(kernel),
diff --git a/runtime/test/FakeQuantumState.h b/runtime/test/FakeQuantumState.h
index 2f20babd955..87a177a6c21 100644
--- a/runtime/test/FakeQuantumState.h
+++ b/runtime/test/FakeQuantumState.h
@@ -10,14 +10,14 @@
 #include <cassert>
 #include <memory>
 
-#include <iostream>
-
 /// @cond DO_NOT_DOCUMENT
 /// @brief Fake simulation state to use in tests.
-class FakeQuantumState : public cudaq::SimulationState {
+class FakeDeviceState : public cudaq::SimulationState {
 private:
   std::string kernelName;
   std::vector<void *> args;
+  std::size_t size = 0;
+  void *data = 0;
 
 public:
   virtual std::unique_ptr<SimulationState>
@@ -26,11 +26,11 @@ class FakeQuantumState : public cudaq::SimulationState {
     throw std::runtime_error("Not implemented");
   }
 
-  FakeQuantumState() = default;
-  FakeQuantumState(const std::string &kernelName,
-                   const std::vector<void *> args)
+  FakeDeviceState() = default;
+  FakeDeviceState(std::size_t size, void *data) : size(size), data(data) {}
+  FakeDeviceState(const std::string &kernelName, const std::vector<void *> args)
       : kernelName(kernelName), args(args) {}
-  FakeQuantumState(const FakeQuantumState &other)
-      : kernelName(other.kernelName), args(other.args) {}
+  FakeDeviceState(const FakeDeviceState &o)
+      : kernelName(o.kernelName), args(o.args), size(o.size), data(o.data) {}
 
   virtual std::unique_ptr<cudaq::SimulationState>
@@ -38,7 +38,7 @@ class FakeQuantumState : public cudaq::SimulationState {
     throw std::runtime_error("Not implemented");
   }
 
-  virtual bool hasData() const override { return false; }
+  virtual bool hasData() const override { return data != nullptr; }
 
   virtual std::optional<std::pair<std::string, std::vector<void *>>>
   getKernelInfo() const override {
@@ -53,9 +53,15 @@ class FakeQuantumState : public cudaq::SimulationState {
     throw std::runtime_error("Not implemented");
   }
 
-  virtual std::size_t getNumTensors() const override { return 1; }
+  virtual std::size_t getNumTensors() const override {
+    if (hasData())
+      return 1;
+    throw std::runtime_error("Not implemented");
+  }
 
   virtual std::size_t getNumQubits() const override {
+    if (hasData())
+      return std::countr_zero(size);
     throw std::runtime_error("Not implemented");
   }
 
@@ -78,7 +84,9 @@ class FakeQuantumState : public cudaq::SimulationState {
   }
 
   virtual precision getPrecision() const override {
-    return cudaq::SimulationState::precision::fp64;
+    if (hasData())
+      return cudaq::SimulationState::precision::fp64;
+    throw std::runtime_error("Not implemented");
   }
 
   virtual void destroyState() override {}
@@ -86,10 +94,17 @@ class FakeQuantumState : public cudaq::SimulationState {
   virtual std::complex<double>
   operator()(std::size_t tensorIdx,
              const std::vector<std::size_t> &indices) override {
+    if (hasData()) {
+      assert(tensorIdx == 0);
+      assert(indices.size() == 1);
+      return *(static_cast<std::complex<double> *>(data) + indices[0]);
+    }
     throw std::runtime_error("Not implemented");
   }
 
   virtual std::size_t getNumElements() const override {
+    if (hasData())
+      return size;
     throw std::runtime_error("Not implemented");
   }
 
@@ -107,6 +122,6 @@ class FakeQuantumState : public cudaq::SimulationState {
     throw std::runtime_error("Not implemented");
   }
 
-  virtual ~FakeQuantumState() override {}
+  virtual ~FakeDeviceState() override {}
 };
 /// @endcond
diff --git a/runtime/test/FakeSimulationState.h b/runtime/test/FakeSimulationState.h
index 49667e481c7..53e2b0bf936 100644
--- a/runtime/test/FakeSimulationState.h
+++ b/runtime/test/FakeSimulationState.h
@@ -22,7 +22,6 @@ class FakeSimulationState : public cudaq::SimulationState {
   createFromSizeAndPtr(std::size_t size, void *data,
                        std::size_t dataType) override {
     throw std::runtime_error("Not implemented");
-    return std::make_unique<FakeSimulationState>(size, data);
   }
 
   FakeSimulationState() = default;
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 84c8d425001..795f3947dc7 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -11,16 +11,135 @@
 
 // RUN: test_argument_conversion | FileCheck %s
 
-#include "FakeQuantumState.h"
-#include "FakeSimulationState.h"
+// #include "FakeQuantumState.h"
+// #include "FakeSimulationState.h"
 #include "common/ArgumentConversion.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/InitAllDialects.h"
 #include "cudaq/qis/pauli_word.h"
+#include "cudaq/qis/state.h"
 #include "mlir/Parser/Parser.h"
+#include <cassert>
+#include <memory>
 #include <numeric>
 
+/// @cond DO_NOT_DOCUMENT
+/// @brief Fake simulation state to use in tests.
+class FakeDeviceState : public cudaq::SimulationState {
+private:
+  std::string kernelName;
+  std::vector<void *> args;
+  std::size_t size = 0;
+  void *data = 0;
+
+public:
+  virtual std::unique_ptr<SimulationState>
+  createFromSizeAndPtr(std::size_t size, void *data,
+                       std::size_t dataType) override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  FakeDeviceState() = default;
+  FakeDeviceState(std::size_t size, void *data) : size(size), data(data) {}
+  FakeDeviceState(const std::string &kernelName, const std::vector<void *> args)
+      : kernelName(kernelName), args(args) {}
+  // Copy all members so a data-backed state keeps its `size` and `data`.
+  FakeDeviceState(const FakeDeviceState &other) = default;
+
+  virtual std::unique_ptr<cudaq::SimulationState>
+  createFromData(const cudaq::state_data &data) override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual bool hasData() const override { return data != nullptr; }
+
+  virtual std::optional<std::pair<std::string, std::vector<void *>>>
+  getKernelInfo() const override {
+    return std::make_pair(kernelName, args);
+  }
+
+  virtual Tensor getTensor(std::size_t tensorIdx = 0) const override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual std::vector<Tensor> getTensors() const override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual std::size_t getNumTensors() const override {
+    if (hasData())
+      return 1;
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual std::size_t getNumQubits() const override {
+    if (hasData())
+      return std::countr_zero(size);
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual std::complex<double> overlap(const SimulationState &other) override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual std::complex<double>
+  getAmplitude(const std::vector<int> &basisState) override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual std::vector<std::complex<double>>
+  getAmplitudes(const std::vector<std::vector<int>> &basisStates) override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual void dump(std::ostream &os) const override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual precision getPrecision() const override {
+    if (hasData())
+      return cudaq::SimulationState::precision::fp64;
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual void destroyState() override {}
+
+  virtual std::complex<double>
+  operator()(std::size_t tensorIdx,
+             const std::vector<std::size_t> &indices) override {
+    if (hasData()) {
+      assert(tensorIdx == 0);
+      assert(indices.size() == 1);
+      return *(static_cast<std::complex<double> *>(data) + indices[0]);
+    }
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual std::size_t getNumElements() const override {
+    if (hasData())
+      return size;
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual bool isDeviceData() const override { return false; }
+
+  virtual bool isArrayLike() const override { return true; }
+
+  virtual void toHost(std::complex<double> *clientAllocatedData,
+                      std::size_t numElements) const override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual void toHost(std::complex<float> *clientAllocatedData,
+                      std::size_t numElements) const override {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual ~FakeDeviceState() override {}
+};
+/// @endcond
+
 extern "C" void __cudaq_deviceCodeHolderAdd(const char *, const char *);
 
 void dumpSubstitutionModules(cudaq::opt::ArgumentConverter &ab) {
@@ -383,7 +502,7 @@ void test_simulation_state(mlir::MLIRContext *ctx) {
   {
     std::vector<std::complex<double>> data{M_SQRT1_2, M_SQRT1_2, 0., 0.,
                                            0.,        0.,        0., 0.};
-    auto x = cudaq::state(new FakeSimulationState(data.size(), data.data()));
+    auto x = cudaq::state(new FakeDeviceState(data.size(), data.data()));
     std::vector<void *> v = {static_cast<void *>(&x)};
     doSimpleTest(ctx, "!cc.ptr<!cc.state>", v);
   }
@@ -406,7 +525,6 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
   {
     auto kernel = "init";
     auto kernelCode =
-        ""
         "func.func private @__nvqpp__mlirgen__init(%arg0: i64) {\n"
         "  %0 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
         "  %1 = quake.extract_ref %0[0] : (!quake.veq<?>) -> !quake.ref\n"
@@ -417,7 +535,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 
     std::int64_t n = 2;
     std::vector<void *> a = {static_cast<void *>(&n)};
-    auto x = cudaq::state(new FakeQuantumState(kernel, a));
+    auto x = cudaq::state(new FakeDeviceState(kernel, a));
     std::vector<void *> v = {static_cast<void *>(&x)};
     doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, kernelCode);
   }
@@ -482,7 +600,6 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
   {
     auto kernel = "init";
     auto kernelCode =
-        ""
         " func.func private @__nvqpp__mlirgen__init(%arg0: i64) {\n"
         "   %2 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
         "   %3 = quake.extract_ref %2[0] : (!quake.veq<?>) -> !quake.ref\n"
@@ -504,7 +621,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 
     std::int64_t n = 2;
     std::vector<void *> a = {static_cast<void *>(&n)};
-    auto x = cudaq::state(new FakeQuantumState(kernel, a));
+    auto x = cudaq::state(new FakeDeviceState(kernel, a));
     std::vector<void *> v = {static_cast<void *>(&x)};
     doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, kernelCode);
   }
diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 24bdd787216..58b474a65b0 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -60,4 +60,4 @@ module {
 // CHECK:           %[[VAL_0:.*]] = call @callee.num_qubits_0() : () -> i64
 // CHECK:           return %[[VAL_0]] : i64
 // CHECK:         }
-}
\ No newline at end of file
+}

From de387fce8aefb617f460a92c843175368bc8940b Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 14 Feb 2025 13:11:48 -0800
Subject: [PATCH 32/54] Address more CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../cudaq/Optimizer/Dialect/Quake/QuakeOps.td |   4 +-
 include/cudaq/Optimizer/Transforms/Passes.td  |  19 +--
 .../Transforms/ReplaceStateWithKernel.cpp     |   2 +-
 runtime/common/ArgumentConversion.cpp         |  26 ++--
 runtime/common/ArgumentConversion.h           |   7 +-
 runtime/common/BaseRemoteRESTQPU.h            |   1 +
 runtime/cudaq/algorithms/get_state.h          |  15 ++-
 .../default/rest/helpers/braket/braket.yml    |   2 +
 .../rest/helpers/infleqtion/infleqtion.yml    |   2 +
 .../cudaq/platform/fermioniq/fermioniq.yml    |   2 +
 runtime/test/FakeQuantumState.h               | 127 ------------------
 runtime/test/FakeSimulationState.h            | 101 --------------
 runtime/test/test_argument_conversion.cpp     |   8 +-
 .../execution/qvector_init_from_state.cpp     |  47 +------
 .../qvector_init_from_state_pauli.cpp         |  78 +++++++++++
 15 files changed, 139 insertions(+), 302 deletions(-)
 delete mode 100644 runtime/test/FakeQuantumState.h
 delete mode 100644 runtime/test/FakeSimulationState.h
 create mode 100644 targettests/execution/qvector_init_from_state_pauli.cpp

diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index 6bb5e985092..cfb16bd100c 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -1475,8 +1475,8 @@ def QuakeOp_GetStateOp : QuakeOp<"get_state", [Pure] > {
         passed to `cudaq::get_state`.
 
     This operation will return of the original kernel passed to
-    `cudaq::get_state`. `cudaq::get_state`. 
-    
+    `cudaq::get_state`.
+
     The operation may be replaced by calls to the @num_qubits and @init calls,
     which will reproduce the specified state in the `ReplaceStateByKernel`
     pass.
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index f42559cb37d..1351b3bdf17 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -855,15 +855,16 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
   let summary =
     "Replace `quake.init_state` instructions with call to the kernel generating the state";
   let description = [{
-    This optimization replaces `quake.init_state`, `quake.get_number_of_qubits`, 
-    and `quake.get_state` operations.
+    This optimization replaces `quake.init_state`, `quake.get_number_of_qubits`,
+    and `quake.get_state` operations invoked on state pointers during argument
+    synthesis for quantum devices.
 
     Before this optimization, argument synthesis for state pointers for quantum
-    devices substituted a new state created from the `quake.get_state` operation
-    for the state argument. 
-    
+    devices substituted a state created from the `quake.get_state` operation
+    for the state argument.
+
     The `quake.get_state` operation accepts symbols for the synthesized kernels
-    `num_qubits` and `init` that argument synthesis generated from the original
+    `@num_qubits` and `@init` that argument synthesis generated from the original
     kernel call that generated the state, e.g., the `cudaq::get_state` call that
     refers to the result of a specific quantum kernel being invoked with a set
     of parameters
@@ -877,8 +878,8 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
     The argument synthesis generated the following new kernels from the `callee`
     and synthesized them to substitute their arguments with `args`:
       ```
-      func.func @callee_init(qubits: !quake.veq<?>, arguments) -> !quake.veq<?>
       func.func @callee_num_qubits(arguments) -> i64
+      func.func @callee_init(qubits: !quake.veq<?>, arguments) -> !quake.veq<?>
       ```
 
     The argument synthesis also substituted the state argument in the `caller`
@@ -890,8 +891,8 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
     This optimization performs the replacements for the the following operations 
     that use a state produced by  `quake.get_state @num_qubits @init` operation:
 
-    - Replace `quake.get_number_of_qubits` operation by the @num_qubits call
-    - Replace `quake.init_state` operation by the @init call
+    - Replace `quake.get_number_of_qubits` operation by the call to `@num_qubits`
+    - Replace `quake.init_state` operation by the call to `@init`
     - Clean up unused `quake.get_state` operation
 
     For example:
diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index 872e12c3f32..d385f061738 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -30,7 +30,7 @@ using namespace mlir;
 
 namespace {
 // clang-format off
-/// Replace `quake.get_number_of_qubits` by a call to a a function
+/// Replace `quake.get_number_of_qubits` by a call to a function
 /// that computes the number of qubits for a state.
 ///
 /// ```
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 664d10549f9..6078fb45125 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -334,7 +334,10 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
 
   // If the state has amplitude data, we materialize the data as a state
   // vector and create a new state from it.
-  // TODO: how to handle density matrices? Should we just inline calls?
+  // TODO: add an option to use the kernel info if available, i.e. for
+  // remote simulators
+  // TODO: add an option of storing the kernel info on simulators if
+  // preferred i.e. to support synthesis of density matrices.
   if (simState->hasData()) {
     // The call below might cause lazy execution of the state kernel.
     // TODO: For lazy execution scenario on remote simulators, we have the
@@ -384,7 +387,8 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
                                                 arrSize);
   }
 
-  // For quantum hardware, we aim at replacing states with calls to kernels
+  // Otherwise (i.e. quantum hardware, where getting the amplitude data is not
+  // efficient) we aim at replacing states with calls to kernels (`callees`)
   // that generated them. This is done in 2 stages:
   //
   // 1. Replace state by quake.get_state instruction during argument conversion:
@@ -392,11 +396,11 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // Create two functions:
   // - callee.num_qubits_N
   //    Calculates the number of qubits needed for the veq allocation
-  // - callee.init_state_N
+  // - callee.init_N
   //    Initializes the veq passed as a parameter
   //
   // Then replace the state with
-  //   `quake.get_state @callee.num_qubits_0 @callee.init_state_0`:
+  //   `quake.get_state @callee.num_qubits_0 @callee.init_0`:
   //
   // clang-format off
   // ```
@@ -436,7 +440,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   //   return %arg0 : i64
   // }
   //
-  // func.func private @callee.init_state_0(%arg0: i64, %arg1: !quake.veq<?>) {
+  // func.func private @callee.init_0(%arg0: i64, %arg1: !quake.veq<?>) {
   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
   //   quake.x %1 : (f64, !quake.ref) -> ()
   //   return
@@ -444,15 +448,16 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // ```
   // clang-format on
   //
-  // 2. Replace the `quake.get_state` ops with calls to the generated functions
-  //    synthesized with the arguments used to create the state:
+  // 2. Replace the `quake.get_state` and ops that use its state with calls to
+  // the generated functions, synthesized with the arguments used to create the
+  // original state:
   //
   // After ReplaceStateWithKernel pass:
   //
   // clang-format off
   // ```
   // func.func @caller() {
-  //   %1 = call 2callee.num_qubits_0() : () -> i64
+  //   %1 = call @callee.num_qubits_0() : () -> i64
   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
   //   %3 = call @callee.init_0(%2): (!quake.veq<?>) -> !quake.veq<?>
   // }
@@ -506,7 +511,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto &registeredNumQubitsName =
         cudaq::opt::ArgumentConverter::registerKernelName(numQubitsName);
 
-    // Create substitutions for `callee.init_N` and `callee.num_qubits_N`.
+    // Convert arguments for `callee.init_N` and `callee.num_qubits_N`.
     converter.genCallee(registeredInitName, calleeArgs);
     converter.genCallee(registeredNumQubitsName, calleeArgs);
 
@@ -691,6 +696,9 @@ Value genConstant(OpBuilder &builder, cudaq::cc::IndirectCallableType indCallTy,
 
 //===----------------------------------------------------------------------===//
 
+std::list<std::string> cudaq::opt::ArgumentConverter::kernelNameRegistry =
+    std::list<std::string>();
+
 cudaq::opt::ArgumentConverter::ArgumentConverter(StringRef kernelName,
                                                  ModuleOp sourceModule)
     : sourceModule(sourceModule), builder(sourceModule.getContext()),
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index d07a5e5e989..6d2b2135c37 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -66,8 +66,11 @@ class ArgumentConverter {
   }
 
 private:
-  // Note: use std::list to make sure we always return valid references
-  // when registering new kernel names.
+  /// Keeps kernel names created during argument conversion in memory.
+  /// References to those names are used by the argument converters for
+  /// those kernels.
+  /// Note: use std::list to make sure we always return valid references
+  /// when registering new kernel names.
   static std::list<std::string> kernelNameRegistry;
 
   mlir::ModuleOp sourceModule;
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 3a45628a498..0763e11304c 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -450,6 +450,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     if (!rawArgs.empty() || updatedArgs) {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
+        cudaq::info("Run Argument Synth.\n");
         opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
 
diff --git a/runtime/cudaq/algorithms/get_state.h b/runtime/cudaq/algorithms/get_state.h
index dacb2ef2793..caec195715d 100644
--- a/runtime/cudaq/algorithms/get_state.h
+++ b/runtime/cudaq/algorithms/get_state.h
@@ -120,14 +120,25 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
                                            std::forward<Args>(args)...));
   }
 #else
-#if defined(CUDAQ_QUANTUM_DEVICE)
+#if defined(CUDAQ_QUANTUM_DEVICE) && !defined(CUDAQ_LIBRARY_MODE)
   // Store kernel name and arguments for quantum states.
   if (!cudaq::get_quake_by_name(cudaq::getKernelName(kernel), false).empty())
     return state(new QuantumState(std::forward<QuantumKernel>(kernel),
                                   std::forward<Args>(args)...));
   throw std::runtime_error(
       "cudaq::state* argument synthesis is not supported for quantum hardware"
-      "for c-like functions, use class kernels instead");
+      " for c-like functions, use class kernels instead");
+#else
+#if defined(CUDAQ_QUANTUM_DEVICE)
+  // A kernel builder is an MLIR-based kernel.
+  if constexpr (has_name<QuantumKernel>::value)
+    return state(new QuantumState(std::forward<QuantumKernel>(kernel),
+                                  std::forward<Args>(args)...));
+
+  throw std::runtime_error(
+      "cudaq::state* argument synthesis is not supported for quantum hardware"
+      " for c-like functions in library mode");
+#endif
 #endif
 #endif
   return details::extractState([&]() mutable {
diff --git a/runtime/cudaq/platform/default/rest/helpers/braket/braket.yml b/runtime/cudaq/platform/default/rest/helpers/braket/braket.yml
index 7e2b573f65c..f409e4fa298 100644
--- a/runtime/cudaq/platform/default/rest/helpers/braket/braket.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/braket/braket.yml
@@ -16,6 +16,8 @@ config:
   link-libs: ["-lcudaq-rest-qpu"]
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Define the lowering pipeline
   platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,classical-optimization-pipeline,func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),decomposition{enable-patterns=SToR1,TToR1,R1ToU3,U3ToRotations,CHToCX,CCZToCX,CRzToCX,CRyToCX,CRxToCX,CR1ToCX,RxAdjToRx,RyAdjToRy,RzAdjToRz},quake-to-cc-prep,func.func(expand-control-veqs,combine-quantum-alloc,canonicalize,combine-measurements),symbol-dce"
   # Tell the rest-qpu that we are generating OpenQASM 2.0.
diff --git a/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml b/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml
index a9fa9484526..5f9c82b022e 100644
--- a/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml
+++ b/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml
@@ -16,6 +16,8 @@ config:
   link-libs: ["-lcudaq-rest-qpu"]
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Define the lowering pipeline
   platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,classical-optimization-pipeline,func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),decomposition{enable-patterns=SToR1,TToR1,CCZToCX,CRyToCX,CRxToCX,R1AdjToR1,RxAdjToRx,RyAdjToRy,RzAdjToRz},quake-to-cc-prep,func.func(memtoreg{quantum=0}),symbol-dce"
   # Tell the rest-qpu that we are generating OpenQASM 2.0.
diff --git a/runtime/cudaq/platform/fermioniq/fermioniq.yml b/runtime/cudaq/platform/fermioniq/fermioniq.yml
index ec87efd03f4..eed0959caaf 100644
--- a/runtime/cudaq/platform/fermioniq/fermioniq.yml
+++ b/runtime/cudaq/platform/fermioniq/fermioniq.yml
@@ -13,6 +13,8 @@ config:
   platform-qpu: fermioniq
   # Tell NVQ++ to generate glue code to set the target backend name
   gen-target-backend: true
+  # Add preprocessor defines to compilation
+  preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"]
   # Add the fermioniq-qpu library to the link list
   link-libs: ["-lcudaq-fermioniq-qpu"]
   # Library mode is only for simulators, physical backends must turn this off
diff --git a/runtime/test/FakeQuantumState.h b/runtime/test/FakeQuantumState.h
deleted file mode 100644
index 87a177a6c21..00000000000
--- a/runtime/test/FakeQuantumState.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/****************************************************************-*- C++ -*-****
- * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#include "cudaq/qis/state.h"
-#include <cassert>
-#include <memory>
-
-/// @cond DO_NOT_DOCUMENT
-/// @brief Fake simulation state to use in tests.
-class FakeDeviceState : public cudaq::SimulationState {
-private:
-  std::string kernelName;
-  std::vector<void *> args;
-  std::size_t size = 0;
-  void *data = 0;
-
-public:
-  virtual std::unique_ptr<SimulationState>
-  createFromSizeAndPtr(std::size_t size, void *data,
-                       std::size_t dataType) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  FakeDeviceState() = default;
-  FakeDeviceState(std::size_t size, void *data) : size(size), data(data) {}
-  FakeDeviceState(const std::string &kernelName, const std::vector<void *> args)
-      : kernelName(kernelName), args(args) {}
-  FakeDeviceState(const FakeDeviceState &other)
-      : kernelName(other.kernelName), args(other.args) {}
-
-  virtual std::unique_ptr<cudaq::SimulationState>
-  createFromData(const cudaq::state_data &data) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual bool hasData() const override { return data != nullptr; }
-
-  virtual std::optional<std::pair<std::string, std::vector<void *>>>
-  getKernelInfo() const override {
-    return std::make_pair(kernelName, args);
-  }
-
-  virtual Tensor getTensor(std::size_t tensorIdx = 0) const override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::vector<Tensor> getTensors() const override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::size_t getNumTensors() const override {
-    if (hasData())
-      return 1;
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::size_t getNumQubits() const override {
-    if (hasData())
-      return std::countr_zero(size);
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::complex<double> overlap(const SimulationState &other) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::complex<double>
-  getAmplitude(const std::vector<int> &basisState) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::vector<std::complex<double>>
-  getAmplitudes(const std::vector<std::vector<int>> &basisStates) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual void dump(std::ostream &os) const override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual precision getPrecision() const override {
-    if (hasData())
-      return cudaq::SimulationState::precision::fp64;
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual void destroyState() override {}
-
-  virtual std::complex<double>
-  operator()(std::size_t tensorIdx,
-             const std::vector<std::size_t> &indices) override {
-    if (hasData()) {
-      assert(tensorIdx == 0);
-      assert(indices.size() == 1);
-      return *(static_cast<std::complex<double> *>(data) + indices[0]);
-    }
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::size_t getNumElements() const override {
-    if (hasData())
-      return size;
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual bool isDeviceData() const override { return false; }
-
-  virtual bool isArrayLike() const override { return true; }
-
-  virtual void toHost(std::complex<double> *clientAllocatedData,
-                      std::size_t numElements) const override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual void toHost(std::complex<float> *clientAllocatedData,
-                      std::size_t numElements) const override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual ~FakeDeviceState() override {}
-};
-/// @endcond
diff --git a/runtime/test/FakeSimulationState.h b/runtime/test/FakeSimulationState.h
deleted file mode 100644
index 53e2b0bf936..00000000000
--- a/runtime/test/FakeSimulationState.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/****************************************************************-*- C++ -*-****
- * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#include "cudaq/qis/state.h"
-#include <cassert>
-#include <memory>
-
-/// @cond DO_NOT_DOCUMENT
-/// @brief Fake simulation state to use in tests.
-class FakeSimulationState : public cudaq::SimulationState {
-private:
-  std::size_t size = 0;
-  void *data = 0;
-
-public:
-  virtual std::unique_ptr<SimulationState>
-  createFromSizeAndPtr(std::size_t size, void *data,
-                       std::size_t dataType) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  FakeSimulationState() = default;
-  FakeSimulationState(std::size_t size, void *data) : size(size), data(data) {}
-
-  virtual std::unique_ptr<cudaq::SimulationState>
-  createFromData(const cudaq::state_data &data) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual Tensor getTensor(std::size_t tensorIdx = 0) const override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::vector<Tensor> getTensors() const override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::size_t getNumTensors() const override { return 1; }
-
-  virtual std::size_t getNumQubits() const override {
-    return std::countr_zero(size);
-  }
-
-  virtual std::complex<double> overlap(const SimulationState &other) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::complex<double>
-  getAmplitude(const std::vector<int> &basisState) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual std::vector<std::complex<double>>
-  getAmplitudes(const std::vector<std::vector<int>> &basisStates) override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual void dump(std::ostream &os) const override {
-    throw std::runtime_error("Not implemented");
-  }
-
-  virtual precision getPrecision() const override {
-    return cudaq::SimulationState::precision::fp64;
-  }
-
-  virtual void destroyState() override {}
-
-  virtual std::complex<double>
-  operator()(std::size_t tensorIdx,
-             const std::vector<std::size_t> &indices) override {
-    assert(tensorIdx == 0);
-    assert(indices.size() == 1);
-    return *(static_cast<std::complex<double> *>(data) + indices[0]);
-  }
-
-  virtual std::size_t getNumElements() const override { return size; }
-
-  virtual bool isDeviceData() const override { return false; }
-
-  virtual bool isArrayLike() const override { return true; }
-
-  virtual void toHost(std::complex<double> *clientAllocatedData,
-                      std::size_t numElements) const override {
-    throw std::runtime_error(
-        "SimulationState::toHost complex128 not implemented.");
-  }
-
-  virtual void toHost(std::complex<float> *clientAllocatedData,
-                      std::size_t numElements) const override {
-    throw std::runtime_error(
-        "SimulationState::toHost complex64 not implemented.");
-  }
-
-  virtual ~FakeSimulationState() {}
-};
-/// @endcond
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 795f3947dc7..edc9793a5c8 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -25,7 +25,7 @@
 #include <numeric>
 
 /// @cond DO_NOT_DOCUMENT
-/// @brief Fake simulation state to use in tests.
+/// @brief Fake simulation or quantum device state to use in tests.
 class FakeDeviceState : public cudaq::SimulationState {
 private:
   std::string kernelName;
@@ -56,7 +56,9 @@ class FakeDeviceState : public cudaq::SimulationState {
 
   virtual std::optional<std::pair<std::string, std::vector<void *>>>
   getKernelInfo() const override {
-    return std::make_pair(kernelName, args);
+    if (!hasData())
+      return std::make_pair(kernelName, args);
+    throw std::runtime_error("Not implemented");
   }
 
   virtual Tensor getTensor(std::size_t tensorIdx = 0) const override {
@@ -771,7 +773,7 @@ void test_combinations(mlir::MLIRContext *ctx) {
                                            0.,        0.,        0., 0.};
 
     std::vector<double> x = {0.5, 0.6};
-    cudaq::state y{new FakeSimulationState(data.size(), data.data())};
+    cudaq::state y{new FakeDeviceState(data.size(), data.data())};
     std::vector<cudaq::pauli_word> z = {
         cudaq::pauli_word{"XX"},
         cudaq::pauli_word{"XY"},
diff --git a/targettests/execution/qvector_init_from_state.cpp b/targettests/execution/qvector_init_from_state.cpp
index 482440b4b8f..bd1eee026f7 100644
--- a/targettests/execution/qvector_init_from_state.cpp
+++ b/targettests/execution/qvector_init_from_state.cpp
@@ -18,6 +18,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // clang-format on
 
 #include <cudaq.h>
@@ -39,33 +40,6 @@ struct test_state_param {
   }
 };
 
-struct test_state_param2 {
-  void operator()(cudaq::state *state, cudaq::pauli_word w) __qpu__ {
-    cudaq::qvector q(state);
-    cudaq::exp_pauli(1.0, q, w);
-  }
-};
-
-struct test_state_param3 {
-  void operator()(cudaq::state *state,
-                  std::vector<cudaq::pauli_word> &words) __qpu__ {
-    cudaq::qvector q(state);
-    for (std::size_t i = 0; i < words.size(); ++i) {
-      cudaq::exp_pauli(1.0, q, words[i]);
-    }
-  }
-};
-
-struct test_state_param4 {
-  void operator()(cudaq::state *state, std::vector<double> &coefficients,
-                  std::vector<cudaq::pauli_word> &words) __qpu__ {
-    cudaq::qvector q(state);
-    for (std::size_t i = 0; i < words.size(); ++i) {
-      cudaq::exp_pauli(coefficients[i], q, words[i]);
-    }
-  }
-};
-
 void printCounts(cudaq::sample_result &result) {
   std::vector<std::string> values{};
   for (auto &&[bits, counts] : result) {
@@ -133,23 +107,6 @@ int main() {
   // CHECK: 11111
   // clang-format on
 
-  {
-    std::cout << "Passing state from another kernel as argument"
-                 " with pauli word arg (kernel mode)"
-              << std::endl;
-    auto state = cudaq::get_state(test_init_state{}, 2);
-    auto counts =
-        cudaq::sample(test_state_param2{}, &state, cudaq::pauli_word{"XX"});
-    printCounts(counts);
-  }
-  // clang-format off
-  // CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
-  // CHECK: 00
-  // CHECK: 01
-  // CHECK: 10
-  // CHECK: 11
-  // clang-format on
-
   {
     std::cout << "Passing state from another kernel as argument iteratively "
                  "(kernel mode)"
@@ -177,6 +134,4 @@ int main() {
   // CHECK: 00
   // CHECK: 10
   // clang-format on
-
-  // TODO: add tests for vectors of pauli words after we can lifts the arrays of pauli words.
 }
diff --git a/targettests/execution/qvector_init_from_state_pauli.cpp b/targettests/execution/qvector_init_from_state_pauli.cpp
new file mode 100644
index 00000000000..6d83170c17b
--- /dev/null
+++ b/targettests/execution/qvector_init_from_state_pauli.cpp
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// clang-format off
+// Simulators
+// RUN: nvq++ %cpp_std --enable-mlir  %s                              -o %t && %t | FileCheck %s
+
+// Quantum emulators
+// RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target ionq                     --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target anyon                    --emulate %s -o %t && %t | FileCheck %s
+// 2 different IQM machines for 2 different topologies
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
+// clang-format on
+
+#include <cudaq.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+struct test_init_state { // Kernel whose state is handed to another kernel.
+  void operator()(int n) __qpu__ {
+    cudaq::qvector q(n);  // allocate n qubits
+    ry(M_PI / 2.0, q[0]); // rotate only q[0] about the Y axis by pi/2
+  }
+};
+
+struct test_state_param { // Kernel whose qubits start from a state argument.
+  void operator()(cudaq::state *state, cudaq::pauli_word w) __qpu__ {
+    cudaq::qvector q(state);     // initialize the qubits from `state`
+    cudaq::exp_pauli(1.0, q, w); // exp_pauli of word `w` with parameter 1.0
+  }
+};
+
+// Prints every bitstring observed in `result`, one per line, in sorted order.
+void printCounts(cudaq::sample_result &result) {
+  std::vector<std::string> bitstrings;
+  for (auto &&[bitstring, count] : result)
+    bitstrings.push_back(bitstring);
+
+  // Sort so the output is independent of the iteration order of `result`.
+  std::sort(bitstrings.begin(), bitstrings.end());
+  for (const std::string &bitstring : bitstrings)
+    std::cout << bitstring << std::endl;
+}
+
+int main() {
+  // Scenario: the state of one kernel initializes the qvector of a second
+  // kernel that also receives a pauli word.
+  //   1. `cudaq::get_state` requests the state of `test_init_state` for n = 2.
+  //   2. `cudaq::sample` runs `test_state_param` with a pointer to that state
+  //      and the pauli word "XX"; the observed bitstrings are printed sorted.
+  {
+    std::cout << "Passing state from another kernel as argument"
+                 " with pauli word arg (kernel mode)"
+              << std::endl;
+    auto state = cudaq::get_state(test_init_state{}, 2);
+    auto counts =
+        cudaq::sample(test_state_param{}, &state, cudaq::pauli_word{"XX"});
+    printCounts(counts);
+  }
+  // clang-format off
+  // CHECK: Passing state from another kernel as argument with pauli word arg (kernel mode)
+  // CHECK: 00
+  // CHECK: 01
+  // CHECK: 10
+  // CHECK: 11
+  // clang-format on
+
+  // TODO: test vectors of pauli words once arrays of them can be lifted.
+}

From 7cf306ad05dd16ee99db88f1e6de79077f071275 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 18 Feb 2025 10:06:55 -0800
Subject: [PATCH 33/54] Address more CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/cudaq/CMakeLists.txt                  |  2 +-
 runtime/cudaq/algorithms/get_state.h          | 10 ++---
 .../qpu_state.cpp}                            | 45 +++++++++----------
 .../quantum_state.h => platform/qpu_state.h}  | 10 ++---
 4 files changed, 33 insertions(+), 34 deletions(-)
 rename runtime/cudaq/{qis/quantum_state.cpp => platform/qpu_state.cpp} (67%)
 rename runtime/cudaq/{qis/quantum_state.h => platform/qpu_state.h} (96%)

diff --git a/runtime/cudaq/CMakeLists.txt b/runtime/cudaq/CMakeLists.txt
index 7561ca904f6..89f76c68354 100644
--- a/runtime/cudaq/CMakeLists.txt
+++ b/runtime/cudaq/CMakeLists.txt
@@ -17,10 +17,10 @@ add_library(${LIBRARY_NAME}
          SHARED cudaq.cpp 
                 target_control.cpp
                 algorithms/draw.cpp
+                platform/qpu_state.cpp
                 platform/quantum_platform.cpp
                 qis/execution_manager_c_api.cpp
                 qis/execution_manager.cpp
-                qis/quantum_state.cpp
                 qis/remote_state.cpp
                 qis/state.cpp
                 utils/cudaq_utils.cpp
diff --git a/runtime/cudaq/algorithms/get_state.h b/runtime/cudaq/algorithms/get_state.h
index caec195715d..79202f98b48 100644
--- a/runtime/cudaq/algorithms/get_state.h
+++ b/runtime/cudaq/algorithms/get_state.h
@@ -14,7 +14,7 @@
 #include "cudaq/host_config.h"
 #include "cudaq/platform.h"
 #include "cudaq/platform/QuantumExecutionQueue.h"
-#include "cudaq/qis/quantum_state.h"
+#include "cudaq/platform/qpu_state.h"
 #include "cudaq/qis/remote_state.h"
 #include "cudaq/qis/state.h"
 #include <complex>
@@ -123,8 +123,8 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
 #if defined(CUDAQ_QUANTUM_DEVICE) && !defined(CUDAQ_LIBRARY_MODE)
   // Store kernel name and arguments for quantum states.
   if (!cudaq::get_quake_by_name(cudaq::getKernelName(kernel), false).empty())
-    return state(new QuantumState(std::forward<QuantumKernel>(kernel),
-                                  std::forward<Args>(args)...));
+    return state(new QPUState(std::forward<QuantumKernel>(kernel),
+                              std::forward<Args>(args)...));
   throw std::runtime_error(
       "cudaq::state* argument synthesis is not supported for quantum hardware"
       " for c-like functions, use class kernels instead");
@@ -132,8 +132,8 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
 #if defined(CUDAQ_QUANTUM_DEVICE)
   // Kernel builder is MLIR-based kernel.
   if constexpr (has_name<QuantumKernel>::value)
-    return state(new QuantumState(std::forward<QuantumKernel>(kernel),
-                                  std::forward<Args>(args)...));
+    return state(new QPUState(std::forward<QuantumKernel>(kernel),
+                              std::forward<Args>(args)...));
 
   throw std::runtime_error(
       "cudaq::state* argument synthesis is not supported for quantum hardware"
diff --git a/runtime/cudaq/qis/quantum_state.cpp b/runtime/cudaq/platform/qpu_state.cpp
similarity index 67%
rename from runtime/cudaq/qis/quantum_state.cpp
rename to runtime/cudaq/platform/qpu_state.cpp
index faaae5b510a..0561ca29ddb 100644
--- a/runtime/cudaq/qis/quantum_state.cpp
+++ b/runtime/cudaq/platform/qpu_state.cpp
@@ -1,17 +1,17 @@
 /*******************************************************************************
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
  * All rights reserved.                                                        *
  *                                                                             *
  * This source code and the accompanying materials are made available under    *
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-#include "quantum_state.h"
+#include "qpu_state.h"
 #include "common/Logger.h"
 
 namespace cudaq {
 
-QuantumState::~QuantumState() {
+QPUState::~QPUState() {
   if (!platformExecutionLog.empty()) {
     // Flush any info log from the remote execution
     printf("%s\n", platformExecutionLog.c_str());
@@ -25,89 +25,88 @@ QuantumState::~QuantumState() {
   deleters.clear();
 }
 
-std::size_t QuantumState::getNumQubits() const {
+std::size_t QPUState::getNumQubits() const {
   throw std::runtime_error(
       "getNumQubits is not implemented for quantum hardware");
 }
 
 cudaq::SimulationState::Tensor
-QuantumState::getTensor(std::size_t tensorIdx) const {
+QPUState::getTensor(std::size_t tensorIdx) const {
   throw std::runtime_error("getTensor is not implemented for quantum hardware");
 }
 
 /// @brief Return all tensors that represent this state
-std::vector<cudaq::SimulationState::Tensor> QuantumState::getTensors() const {
+std::vector<cudaq::SimulationState::Tensor> QPUState::getTensors() const {
   throw std::runtime_error(
       "getTensors is not implemented for quantum hardware");
   return {getTensor()};
 }
 
 /// @brief Return the number of tensors that represent this state.
-std::size_t QuantumState::getNumTensors() const {
+std::size_t QPUState::getNumTensors() const {
   throw std::runtime_error(
       "getNumTensors is not implemented for quantum hardware");
 }
 
 std::complex<double>
-QuantumState::operator()(std::size_t tensorIdx,
-                         const std::vector<std::size_t> &indices) {
+QPUState::operator()(std::size_t tensorIdx,
+                     const std::vector<std::size_t> &indices) {
   throw std::runtime_error(
       "operator() is not implemented for quantum hardware");
 }
 
 std::unique_ptr<SimulationState>
-QuantumState::createFromSizeAndPtr(std::size_t size, void *ptr, std::size_t) {
+QPUState::createFromSizeAndPtr(std::size_t size, void *ptr, std::size_t) {
   throw std::runtime_error(
       "createFromSizeAndPtr is not implemented for quantum hardware");
 }
 
-void QuantumState::dump(std::ostream &os) const {
+void QPUState::dump(std::ostream &os) const {
   throw std::runtime_error("dump is not implemented for quantum hardware");
 }
 
-cudaq::SimulationState::precision QuantumState::getPrecision() const {
+cudaq::SimulationState::precision QPUState::getPrecision() const {
   throw std::runtime_error(
       "getPrecision is not implemented for quantum hardware");
 }
 
-void QuantumState::destroyState() {
+void QPUState::destroyState() {
   // There is no state data so nothing to destroy.
 }
 
-bool QuantumState::isDeviceData() const {
+bool QPUState::isDeviceData() const {
   throw std::runtime_error(
       "isDeviceData is not implemented for quantum hardware");
 }
 
-void QuantumState::toHost(std::complex<double> *clientAllocatedData,
-                          std::size_t numElements) const {
+void QPUState::toHost(std::complex<double> *clientAllocatedData,
+                      std::size_t numElements) const {
   throw std::runtime_error("toHost is not implemented for quantum hardware");
 }
 
-void QuantumState::toHost(std::complex<float> *clientAllocatedData,
-                          std::size_t numElements) const {
+void QPUState::toHost(std::complex<float> *clientAllocatedData,
+                      std::size_t numElements) const {
   throw std::runtime_error("toHost is not implemented for quantum hardware");
 }
 
 std::optional<std::pair<std::string, std::vector<void *>>>
-QuantumState::getKernelInfo() const {
+QPUState::getKernelInfo() const {
   return std::make_pair(kernelName, args);
 }
 
 std::vector<std::complex<double>>
-QuantumState::getAmplitudes(const std::vector<std::vector<int>> &basisStates) {
+QPUState::getAmplitudes(const std::vector<std::vector<int>> &basisStates) {
   throw std::runtime_error(
       "getAmplitudes is not implemented for quantum hardware");
 }
 
 std::complex<double>
-QuantumState::getAmplitude(const std::vector<int> &basisState) {
+QPUState::getAmplitude(const std::vector<int> &basisState) {
   throw std::runtime_error(
-      "getAmplitudes is not implemented for quantum hardware");
+      "getAmplitude is not implemented for quantum hardware");
 }
 
-std::complex<double>
-QuantumState::overlap(const cudaq::SimulationState &other) {
+std::complex<double> QPUState::overlap(const cudaq::SimulationState &other) {
   throw std::runtime_error("overlap is not implemented for quantum hardware");
 }
 } // namespace cudaq
diff --git a/runtime/cudaq/qis/quantum_state.h b/runtime/cudaq/platform/qpu_state.h
similarity index 96%
rename from runtime/cudaq/qis/quantum_state.h
rename to runtime/cudaq/platform/qpu_state.h
index c9b1b30029b..a13ac6f7b40 100644
--- a/runtime/cudaq/qis/quantum_state.h
+++ b/runtime/cudaq/platform/qpu_state.h
@@ -17,7 +17,7 @@ namespace cudaq {
 // The state is represented by a quantum kernel.
 // Quantum state contains all the information we need to replicate a
 // call to kernel that created the state.
-class QuantumState : public cudaq::SimulationState {
+class QPUState : public cudaq::SimulationState {
 protected:
   std::string kernelName;
   // Lazily-evaluated state data (just keeping the kernel name and arguments).
@@ -65,7 +65,7 @@ class QuantumState : public cudaq::SimulationState {
 
   /// @brief Constructor
   template <typename QuantumKernel, typename... Args>
-  QuantumState(QuantumKernel &&kernel, Args &&...args) {
+  QPUState(QuantumKernel &&kernel, Args &&...args) {
     if constexpr (has_name<QuantumKernel>::value) {
       // kernel_builder kernel: need to JIT code to get it registered.
       static_cast<cudaq::details::kernel_builder_base &>(kernel).jitCode();
@@ -75,10 +75,10 @@ class QuantumState : public cudaq::SimulationState {
     }
     (addArgument(args), ...);
   }
-  QuantumState() = default;
-  QuantumState(const QuantumState &other)
+  QPUState() = default;
+  QPUState(const QPUState &other)
       : kernelName(other.kernelName), args(other.args), deleters() {}
-  virtual ~QuantumState();
+  virtual ~QPUState();
 
   /// @brief True if the state has amplitudes or density matrix available.
   virtual bool hasData() const override { return false; }

From 140247143053d9140794f8a9b7eb006b155b8d00 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 19 Feb 2025 17:01:52 -0800
Subject: [PATCH 34/54] Store new functions in subst module and update
 synthesis

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td  |   2 +-
 .../Transforms/ArgumentSynthesis.cpp          | 132 ++++++----
 runtime/common/ArgumentConversion.cpp         |  46 ++--
 runtime/common/ArgumentConversion.h           |   7 +-
 runtime/test/test_argument_conversion.cpp     | 246 ++++++++++++++----
 test/Quake/arg_subst-5.txt                    |  14 +-
 test/Quake/arg_subst_func.qke                 |   8 -
 7 files changed, 318 insertions(+), 137 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 1351b3bdf17..3d22756d404 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -65,7 +65,7 @@ def ApplySpecialization : Pass<"apply-op-specialization", "mlir::ModuleOp"> {
   ];
 }
 
-def ArgumentSynthesis : Pass<"argument-synthesis", "mlir::func::FuncOp"> {
+def ArgumentSynthesis : Pass<"argument-synthesis", "mlir::ModuleOp"> {
   let summary = "Specialize a function by replacing arguments with constants";
   let description = [{
     This pass takes a list of functions and argument substitutions. For each
diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
index 76a3ac36ca8..9159fdee9de 100644
--- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
+++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Parser/Parser.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
+#include <list>
 
 namespace cudaq::opt {
 #define GEN_PASS_DEF_ARGUMENTSYNTHESIS
@@ -25,69 +26,97 @@ namespace cudaq::opt {
 using namespace mlir;
 
 namespace {
+
+/// Parses the pass's `funcList` once and owns one substitution module per
+/// listed function. Items look like `<function>:<substitutions>`, where the
+/// substitutions are `*` plus inline MLIR text or the name of a file with it.
+class Analysis {
+public:
+  Analysis(MLIRContext *ctx, mlir::Pass::ListOption<std::string> &funcList)
+      : ctx(ctx), funcList(funcList) {
+    parseSubstModules();
+  }
+
+  /// Fill `substModules` from `funcList`. Items without a ':' separator, with
+  /// no substitution text, or whose text fails to parse are skipped.
+  void parseSubstModules() {
+    for (auto &item : funcList) {
+      auto pos = item.find(':');
+      if (pos == std::string::npos)
+        continue;
+
+      std::string funcName = item.substr(0, pos);
+      std::string text = item.substr(pos + 1);
+      if (text.empty()) {
+        LLVM_DEBUG(llvm::dbgs() << funcName << " has no substitutions.\n");
+        continue;
+      }
+      LLVM_DEBUG(llvm::dbgs()
+                 << funcName << " : substitution pattern: '" << text << "'\n");
+      // '*' prefix: inline MLIR text; otherwise a file name (command line).
+      OwningOpRef<ModuleOp> substModule =
+          text.front() == '*' ? parseSourceString<ModuleOp>(text.substr(1), ctx)
+                              : parseSourceFile<ModuleOp>(text, ctx);
+      // Parsing yields a null module on failure; report and skip it.
+      if (!substModule) {
+        emitError(UnknownLoc::get(ctx),
+                  "failed to parse substitutions for '" + funcName + "'");
+        continue;
+      }
+      // The StringRef key must outlive the map entry, so it refers to the
+      // string stored in `funcNames`.
+      auto &name = funcNames.emplace_back(funcName);
+      substModules.try_emplace(name, std::move(substModule));
+    }
+  }
+
+  MLIRContext *ctx;
+  mlir::Pass::ListOption<std::string> &funcList;
+  std::list<std::string> funcNames;
+  DenseMap<StringRef, OwningOpRef<ModuleOp>> substModules;
+};
+
 class ArgumentSynthesisPass
     : public cudaq::opt::impl::ArgumentSynthesisBase<ArgumentSynthesisPass> {
 public:
   using ArgumentSynthesisBase::ArgumentSynthesisBase;
 
-  void runOnOperation() override {
-    func::FuncOp func = getOperation();
-    StringRef funcName = func.getName();
-    std::string text;
-    if (std::find_if(funcList.begin(), funcList.end(),
-                     [&](const std::string &item) {
-                       auto pos = item.find(':');
-                       if (pos == std::string::npos)
-                         return false;
-                       std::string itemName = item.substr(0, pos);
-                       bool result = itemName == funcName;
-                       if (result)
-                         text = item.substr(pos + 1);
-                       return result;
-                     }) == funcList.end()) {
-      // If the function isn't on the list, do nothing.
-      LLVM_DEBUG(llvm::dbgs() << funcName << " not in list.\n");
-      return;
+  /// Clone each symbol of every substitution module into `mod` unless `mod`
+  /// already has a symbol of that name.
+  void mergeSymbols(ModuleOp mod, Analysis &analysis) {
+    for (auto &entry : analysis.substModules) {
+      for (auto &op : *entry.second) {
+        auto symbol = dyn_cast<SymbolOpInterface>(op);
+        if (!symbol)
+          continue;
+        if (!mod.lookupSymbol(symbol.getName()))
+          mod.getBody()->push_back(op.clone());
+      }
     }
+  }
 
-    // If there are no substitutions, we're done.
-    if (text.empty()) {
-      LLVM_DEBUG(llvm::dbgs() << funcName << " has no substitutions.");
+  void processFunction(func::FuncOp func, Analysis &analysis) {
+    MLIRContext *ctx = func.getContext();
+    auto funcName = func.getName();
+    LLVM_DEBUG(llvm::dbgs() << "processing : '" << funcName << "'\n");
+
+    auto it = analysis.substModules.find(funcName);
+    if (it == analysis.substModules.end()) {
+      // If the function isn't on the list, do nothing.
+      LLVM_DEBUG(llvm::dbgs() << funcName << " has no substitutions.\n");
       return;
     }
-
-    // If we're here, we have a FuncOp and we have substitutions that can be
-    // applied.
-    //
-    // 1. Create a Module with the substitutions that we'll be making.
-    auto *ctx = func.getContext();
-    LLVM_DEBUG(llvm::dbgs() << "substitution pattern: '" << text << "'\n");
-    auto substMod = [&]() -> OwningOpRef<ModuleOp> {
-      if (text.front() == '*') {
-        // Substitutions are a raw string after the '*' character.
-        return parseSourceString<ModuleOp>(text.substr(1), ctx);
-      }
-      // Substitutions are in a text file (command-line usage).
-      return parseSourceFile<ModuleOp>(text, ctx);
-    }();
-    assert(*substMod && "module must have been created");
+    auto substMod = *(it->second);
 
     // 2. Go through the Module and process each substitution.
     SmallVector<bool> processedArgs(func.getFunctionType().getNumInputs());
     SmallVector<std::tuple<unsigned, Value, Value>> replacements;
     BitVector replacedArgs(processedArgs.size());
-    for (auto &op : *substMod) {
+    for (auto &op : substMod) {
       auto subst = dyn_cast<cudaq::cc::ArgumentSubstitutionOp>(op);
-      if (!subst) {
-        if (auto symInterface = dyn_cast<SymbolOpInterface>(op)) {
-          auto name = symInterface.getName();
-          auto srcMod = func->getParentOfType<ModuleOp>();
-          auto obj = srcMod.lookupSymbol(name);
-          if (!obj)
-            srcMod.getBody()->push_back(op.clone());
-        }
+      if (!subst)
         continue;
-      }
+
       auto pos = subst.getPosition();
       if (pos >= processedArgs.size()) {
         func.emitError("Argument " + std::to_string(pos) + " is invalid.");
@@ -147,6 +176,15 @@ class ArgumentSynthesisPass
     // substituted.
     func.eraseArguments(replacedArgs);
   }
+
+  /// Pass entry point: parse the substitution list, merge the symbols of the
+  /// substitution modules into the module, then process every func::FuncOp.
+  void runOnOperation() override {
+    ModuleOp moduleOp = getOperation();
+    Analysis parsed(moduleOp.getContext(), funcList);
+    mergeSymbols(moduleOp, parsed);
+    moduleOp->walk([&](func::FuncOp fn) { processFunction(fn, parsed); });
+  }
 };
 } // namespace
 
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 6078fb45125..daf84544ee3 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -329,7 +329,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
       cudaq::state_helper::getSimulationState(const_cast<cudaq::state *>(v));
 
   auto kernelName = converter.getKernelName();
-  auto sourceMod = converter.getSourceModule();
+  // auto sourceMod = converter.getSourceModule();
   auto substMod = converter.getSubstitutionModule();
 
   // If the state has amplitude data, we materialize the data as a state
@@ -491,29 +491,35 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
     assert(calleeFunc && "callee func is missing");
 
-    static unsigned counter = 0;
-    auto initName = calleeName + ".init_" + std::to_string(counter);
-    auto numQubitsName =
-        calleeName + ".num_qubits_" + std::to_string(counter++);
+    // Use the state pointer as a hash to create the new kernel names.
+    // We can reuse the functions previously created from the same state.
+    auto hash = std::to_string(reinterpret_cast<std::size_t>(v));
+    auto initName = calleeName + ".init_" + hash;
+    auto numQubitsName = calleeName + ".num_qubits_" + hash;
+
+    // Function names in the IR
     auto initKernelName = cudaq::runtime::cudaqGenPrefixName + initName;
     auto numQubitsKernelName =
         cudaq::runtime::cudaqGenPrefixName + numQubitsName;
 
-    // Create `callee.init_N` and `callee.num_qubits_N` used for
-    // `quake.get_state` replacement later in ReplaceStateWithKernel pass
-    createInitFunc(builder, sourceMod, calleeFunc, initKernelName);
-    createNumQubitsFunc(builder, sourceMod, calleeFunc, numQubitsKernelName);
-
-    // Create and register names for new `init` and `num_qubits` kernels so
-    // ArgumentConverters can keep a string reference to a valid memory.
-    auto &registeredInitName =
-        cudaq::opt::ArgumentConverter::registerKernelName(initName);
-    auto &registeredNumQubitsName =
-        cudaq::opt::ArgumentConverter::registerKernelName(numQubitsName);
-
-    // Convert arguments  for `callee.init_N` and `callee.num_qubits_N`.
-    converter.genCallee(registeredInitName, calleeArgs);
-    converter.genCallee(registeredNumQubitsName, calleeArgs);
+    if (!cudaq::opt::ArgumentConverter::isRegisteredKernelName(initName) ||
+        !cudaq::opt::ArgumentConverter::isRegisteredKernelName(numQubitsName)) {
+      // Create `callee.init_N` and `callee.num_qubits_N` used for
+      // `quake.get_state` replacement later in ReplaceStateWithKernel pass
+      createInitFunc(builder, substMod, calleeFunc, initKernelName);
+      createNumQubitsFunc(builder, substMod, calleeFunc, numQubitsKernelName);
+
+      // Create and register names for new `init` and `num_qubits` kernels so
+      // ArgumentConverters can keep a string reference to a valid memory.
+      auto &registeredInitName =
+          cudaq::opt::ArgumentConverter::registerKernelName(initName);
+      auto &registeredNumQubitsName =
+          cudaq::opt::ArgumentConverter::registerKernelName(numQubitsName);
+
+      // Convert arguments for `callee.init_N` and `callee.num_qubits_N`.
+      converter.genCallee(registeredInitName, calleeArgs);
+      converter.genCallee(registeredNumQubitsName, calleeArgs);
+    }
 
     // Create a substitution for the state pointer.
     auto statePtrTy =
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 6d2b2135c37..677bc53b066 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -53,7 +53,7 @@ class ArgumentConverter {
   mlir::StringRef getKernelName() { return kernelName; }
 
   void genCallee(mlir::StringRef calleeName, std::vector<void *> &args) {
-    auto &converter = calleeConverters.emplace_back(calleeName, sourceModule);
+    auto &converter = calleeConverters.emplace_back(calleeName, substModule);
     converter.gen(args);
   }
 
@@ -61,6 +61,11 @@ class ArgumentConverter {
     return calleeConverters;
   }
 
+  static bool isRegisteredKernelName(const std::string &kernelName) {
+    const auto last = kernelNameRegistry.end(); // linear scan of the registry
+    return std::find(kernelNameRegistry.begin(), last, kernelName) != last;
+  }
+
   static const std::string &registerKernelName(const std::string &kernelName) {
     return kernelNameRegistry.emplace_back(kernelName);
   }
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index edc9793a5c8..bd3a7b2107b 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -11,8 +11,6 @@
 
 // RUN: test_argument_conversion | FileCheck %s
 
-// #include "FakeQuantumState.h"
-// #include "FakeSimulationState.h"
 #include "common/ArgumentConversion.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
@@ -172,14 +170,13 @@ func.func @__nvqpp__mlirgen__testy(%0: )#" +
                      typeName + R"#() -> ()
   return
 })#";
+
   // Create the Module
   auto mod = mlir::parseSourceString<mlir::ModuleOp>(code, ctx);
   llvm::outs() << "Source module:\n" << *mod << '\n';
   cudaq::opt::ArgumentConverter ab{"testy", *mod};
   // Create the argument conversions
   ab.gen(args);
-  // Dump the modified source module
-  llvm::outs() << "Source module (after):\n" << *mod << '\n';
   // Dump all conversions
   dumpSubstitutionModules(ab);
 }
@@ -223,8 +220,6 @@ void doTest(mlir::MLIRContext *ctx, std::vector<std::string> &typeNames,
   cudaq::opt::ArgumentConverter ab{"testy", *mod};
   // Create the argument conversions
   ab.gen_drop_front(args, startingArgIdx);
-  // Dump the modified source module
-  llvm::outs() << "Source module (after):\n" << *mod << '\n';
   // Dump all conversions
   dumpSubstitutionModules(ab);
 }
@@ -525,21 +520,31 @@ void test_simulation_state(mlir::MLIRContext *ctx) {
 
 void test_quantum_state(mlir::MLIRContext *ctx) {
   {
-    auto kernel = "init";
-    auto kernelCode =
+    // @cudaq.kernel
+    // def init(n: int):
+    //    q = cudaq.qvector(n)
+    //    x(q[0])
+    //
+    // def kernel(s: cudaq.State):
+    //   ...
+    //
+    // s = cudaq.get_state(init, 2)
+    // cudaq.sample(kernel, s)
+    auto init = "init";
+    auto initCode =
         "func.func private @__nvqpp__mlirgen__init(%arg0: i64) {\n"
         "  %0 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
         "  %1 = quake.extract_ref %0[0] : (!quake.veq<?>) -> !quake.ref\n"
         "  quake.x %1 : (!quake.ref) -> ()\n"
         "  return\n"
         "}\n";
-    __cudaq_deviceCodeHolderAdd(kernel, kernelCode);
+    __cudaq_deviceCodeHolderAdd(init, initCode);
 
     std::int64_t n = 2;
     std::vector<void *> a = {static_cast<void *>(&n)};
-    auto x = cudaq::state(new FakeDeviceState(kernel, a));
-    std::vector<void *> v = {static_cast<void *>(&x)};
-    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, kernelCode);
+    auto s = cudaq::state(new FakeDeviceState(init, a));
+    std::vector<void *> v = {static_cast<void *>(&s)};
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
   }
 
   // clang-format off
@@ -552,15 +557,145 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         }
 // CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
 
-// CHECK:       Source module (after):
-// CHECK:         func.func private @__nvqpp__mlirgen__init(%arg0: i64) {
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         testy
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init.init_[[HASH_0]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = arith.subi %arg0, %[[VAL_1]] : i64
+// CHECK:           %[[VAL_3:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_2]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           %[[VAL_4:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           %[[VAL_5:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           %[[VAL_6:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<?>) -> !quake.ref
+// CHECK:           quake.x %[[VAL_6]] : (!quake.ref) -> ()
+// CHECK:           %[[VAL_7:.*]] = arith.subi %[[VAL_5]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_8:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_7]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           return %[[VAL_8]] : !quake.veq<?>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init.num_qubits_[[HASH_0]](%arg0: i64) -> i64 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           return %[[VAL_1]] : i64
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init.init_[[HASH_0]]
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init.num_qubits_[[HASH_0]]
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
+// CHECK:         }
+  // clang-format on
+
+  {
+    // @cudaq.kernel
+    // def init(n: int):
+    //    q = cudaq.qvector(n)
+    //    x(q[0])
+    //
+    // def state_param(s: cudaq.State):
+    //    q = cudaq.qvector(s)
+    //    x(q[0])
+    //
+    // def kernel(s: cudaq.State):
+    //   ...
+    //
+    // s0 = cudaq.get_state(init, 2)
+    // s1 = cudaq.get_state(state_param, s0)
+    // cudaq.sample(kernel, s1)
+    auto init = "init1";
+    auto initCode =
+        "func.func private @__nvqpp__mlirgen__init1(%arg0: i64) {\n"
+        "  %0 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
+        "  %1 = quake.extract_ref %0[0] : (!quake.veq<?>) -> !quake.ref\n"
+        "  quake.x %1 : (!quake.ref) -> ()\n"
+        "  return\n"
+        "}\n";
+    __cudaq_deviceCodeHolderAdd(init, initCode);
+
+    auto stateParam = "state_param";
+    auto stateParamCode =
+        "func.func private @__nvqpp__mlirgen__state_param(%arg0: "
+        "!cc.ptr<!cc.state>) {\n"
+        "  %0 = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> "
+        "i64\n"
+        "  %1 = quake.alloca !quake.veq<?>[%0 : i64]\n"
+        "  %2 = quake.init_state %1, %arg0 : (!quake.veq<?>, "
+        "!cc.ptr<!cc.state>) -> !quake.veq<?>\n"
+        "  %3 = quake.extract_ref %2[0] : (!quake.veq<?>) -> !quake.ref\n"
+        "  quake.x %3 : (!quake.ref) -> ()\n"
+        "  return\n"
+        "}\n";
+
+    __cudaq_deviceCodeHolderAdd(stateParam, stateParamCode);
+
+    std::int64_t n = 2;
+    std::vector<void *> a = {static_cast<void *>(&n)};
+    auto s0 = cudaq::state(new FakeDeviceState(init, a));
+    std::vector<void *> v0 = {static_cast<void *>(&s0)};
+    auto s1 = cudaq::state(new FakeDeviceState(stateParam, v0));
+    std::vector<void *> v1 = {static_cast<void *>(&s1)};
+
+    auto code = std::string{initCode} + std::string{stateParamCode};
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v1, code);
+  }
+
+  // clang-format off
+// CHECK:       Source module:
+// CHECK:         func.func private @__nvqpp__mlirgen__init1(%arg0: i64) {
 // CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
 // CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
 // CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
 // CHECK:           return
 // CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__state_param(%arg0: !cc.ptr<!cc.state>) {
+// CHECK:           %[[VAL_0:.*]] = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<?>[%[[VAL_0]] : i64]
+// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+// CHECK:           return
+// CHECK:         }
 // CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
-// CHECK:         func.func private @__nvqpp__mlirgen__init.init_0(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         testy
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %0 = quake.get_state @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__state_param.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__state_param.init_[[HASH_0]](%arg0: !cc.ptr<!cc.state>, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = arith.subi %[[VAL_2]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_4:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_3]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           %[[VAL_5:.*]] = arith.addi %[[VAL_0]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_6:.*]] = arith.addi %[[VAL_0]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_7:.*]] = quake.init_state %[[VAL_4]], %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+// CHECK:           %[[VAL_8:.*]] = arith.subi %[[VAL_6]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_9:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_8]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           return %[[VAL_9]] : !quake.veq<?>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_0]](%arg0: !cc.ptr<!cc.state>) -> i64 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i64
+// CHECK:           return %[[VAL_2]] : i64
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         state_param.init_[[HASH_0]]
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %0 = quake.get_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1:.*]] @__nvqpp__mlirgen__init1.init_[[HASH_1]] : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init1.init_[[HASH_1]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
 // CHECK:           %[[VAL_2:.*]] = arith.subi %arg0, %[[VAL_1]] : i64
@@ -573,36 +708,51 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:           %[[VAL_8:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_7]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
 // CHECK:           return %[[VAL_8]] : !quake.veq<?>
 // CHECK:         }
-// CHECK:         func.func private @__nvqpp__mlirgen__init.num_qubits_0(%arg0: i64) -> i64 {
+// CHECK:         func.func private @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1]](%arg0: i64) -> i64 {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
 // CHECK:           return %[[VAL_1]] : i64
 // CHECK:         }
-
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         testy
+// CHECK:         init1.init_[[HASH_1]]
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init.num_qubits_0 @__nvqpp__mlirgen__init.init_0 : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         init.init_0
+// CHECK:         init1.num_qubits_[[HASH_1]]
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         init.num_qubits_0
+// CHECK:         state_param.num_qubits_[[HASH_0]]
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
+// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1]] @__nvqpp__mlirgen__init1.init_[[HASH_1]] : !cc.ptr<!cc.state>
 // CHECK:         }
+
   // clang-format on
 
   {
-    auto kernel = "init";
-    auto kernelCode =
-        " func.func private @__nvqpp__mlirgen__init(%arg0: i64) {\n"
+    // @cudaq.kernel
+    // def init(n: int):
+    //    q0 = cudaq.qvector(n)
+    //    x(q0[0])
+    //    r = mz(q0[0])
+    //    if (r):
+    //       q1 = cudaq.qvector(n)
+    //       x(q1[0])
+    //       y(q0[1])
+    //
+    // def kernel(s: cudaq.State):
+    //   ...
+    //
+    // s = cudaq.get_state(init, 2)
+    // cudaq.sample(kernel, s)
+    auto init = "init2";
+    auto initCode =
+        " func.func private @__nvqpp__mlirgen__init2(%arg0: i64) {\n"
         "   %2 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
         "   %3 = quake.extract_ref %2[0] : (!quake.veq<?>) -> !quake.ref\n"
         "   quake.x %3 : (!quake.ref) -> ()\n"
@@ -619,18 +769,18 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
         "   return\n"
         "}\n";
 
-    __cudaq_deviceCodeHolderAdd(kernel, kernelCode);
+    __cudaq_deviceCodeHolderAdd(init, initCode);
 
     std::int64_t n = 2;
     std::vector<void *> a = {static_cast<void *>(&n)};
-    auto x = cudaq::state(new FakeDeviceState(kernel, a));
-    std::vector<void *> v = {static_cast<void *>(&x)};
-    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, kernelCode);
+    auto s = cudaq::state(new FakeDeviceState(init, a));
+    std::vector<void *> v = {static_cast<void *>(&s)};
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
   }
 
   // clang-format off
 // CHECK:       Source module:
-// CHECK:         func.func private @__nvqpp__mlirgen__init(%arg0: i64) {
+// CHECK:         func.func private @__nvqpp__mlirgen__init2(%arg0: i64) {
 // CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
 // CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
 // CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
@@ -647,24 +797,13 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         }
 // CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
 
-// CHECK:       Source module (after):
-// CHECK:         func.func private @__nvqpp__mlirgen__init(%arg0: i64) {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
-// CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
-// CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
-// CHECK:           %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "q0" : (!quake.ref) -> !quake.measure
-// CHECK:           %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measure) -> i1
-// CHECK:           cc.if(%[[VAL_3]]) {
-// CHECK:             %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
-// CHECK:             %[[VAL_5:.*]] = quake.extract_ref %[[VAL_4]][0] : (!quake.veq<?>) -> !quake.ref
-// CHECK:             quake.x %[[VAL_5]] : (!quake.ref) -> ()
-// CHECK:             %[[VAL_6:.*]] = quake.extract_ref %[[VAL_0]][1] : (!quake.veq<?>) -> !quake.ref
-// CHECK:             quake.y %[[VAL_6]] : (!quake.ref) -> ()
-// CHECK:           }
-// CHECK:           return
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         testy
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init2.num_qubits_[[HASH_1:.*]] @__nvqpp__mlirgen__init2.init_[[HASH_1]] : !cc.ptr<!cc.state>
 // CHECK:         }
-// CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
-// CHECK:         func.func private @__nvqpp__mlirgen__init.init_1(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:         func.func private @__nvqpp__mlirgen__init2.init_[[HASH_1]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
 // CHECK:           %[[VAL_2:.*]] = arith.subi %arg0, %[[VAL_1]] : i64
@@ -686,27 +825,20 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:           %[[VAL_10:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_9]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
 // CHECK:           return %[[VAL_10]] : !quake.veq<?>
 // CHECK:         }
-// CHECK:         func.func private @__nvqpp__mlirgen__init.num_qubits_1(%arg0: i64) -> i64 {
+// CHECK:         func.func private @__nvqpp__mlirgen__init2.num_qubits_[[HASH_1]](%arg0: i64) -> i64 {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
 // CHECK:           return %[[VAL_1]] : i64
 // CHECK:         }
-
-// CHECK:         ========================================
-// CHECK:         Substitution module:
-// CHECK:         testy
-// CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init.num_qubits_1 @__nvqpp__mlirgen__init.init_1 : !cc.ptr<!cc.state>
-// CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         init.init_1
+// CHECK:         init2.init_[[HASH_1]]
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         init.num_qubits_1
+// CHECK:         init2.num_qubits_[[HASH_1]]
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
diff --git a/test/Quake/arg_subst-5.txt b/test/Quake/arg_subst-5.txt
index 5020e7fe096..959ec6ba364 100644
--- a/test/Quake/arg_subst-5.txt
+++ b/test/Quake/arg_subst-5.txt
@@ -6,6 +6,14 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-cc.arg_subst[0] {
-  %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
-}
+module {
+  cc.arg_subst[0] {
+    %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
+  }
+  func.func @init(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+    return %arg1 : !quake.veq<?>
+  }
+  func.func @num_qubits(%arg0: i64) -> i64 {
+    return %arg0 : i64
+  }
+}
\ No newline at end of file
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index 8df6c5e1433..dc9a28d9073 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -154,14 +154,6 @@ func.func @testy5(%arg0: !cc.ptr<!cc.state>) {
   return
 }
 
-func.func @init(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
-  return %arg1 : !quake.veq<?>
-}
-
-func.func @num_qubits(%arg0: i64) -> i64 {
-  return %arg0 : i64
-}
-
 // CHECK-LABEL:   func.func @testy5() {
 // CHECK:           %[[VAL_2:.*]] = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
 // CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_2]] : (!cc.ptr<!cc.state>) -> i64

From 9a528dd2f8762cbe7738e6f78309a3049bc0f05d Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 20 Feb 2025 12:02:10 -0800
Subject: [PATCH 35/54] Make argument synthesis transitive

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../Transforms/ArgumentSynthesis.cpp          | 128 ++++++++----------
 test/Quake/arg_subst-5.txt                    |  15 +-
 test/Quake/arg_subst-6.txt                    |   4 +-
 test/Quake/arg_subst-7.txt                    |  19 +++
 test/Quake/arg_subst-8.txt                    |  11 ++
 test/Quake/arg_subst_func.qke                 |  23 +++-
 6 files changed, 111 insertions(+), 89 deletions(-)
 create mode 100644 test/Quake/arg_subst-7.txt
 create mode 100644 test/Quake/arg_subst-8.txt

diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
index 9159fdee9de..187012a4799 100644
--- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
+++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
@@ -26,82 +26,21 @@ namespace cudaq::opt {
 using namespace mlir;
 
 namespace {
-
-class Analysis {
-public:
-  Analysis(MLIRContext *ctx, mlir::Pass::ListOption<std::string> &funcList)
-      : ctx(ctx), funcList(funcList) {
-    parseSubstModules();
-  }
-
-  void parseSubstModules() {
-    for (auto &item : funcList) {
-      auto pos = item.find(':');
-      if (pos == std::string::npos)
-        continue;
-
-      std::string funcName = item.substr(0, pos);
-      std::string text = item.substr(pos + 1);
-
-      // If there are no substitutions, continue to the next subst
-      if (text.empty()) {
-        LLVM_DEBUG(llvm::dbgs() << funcName << " has no substitutions.");
-        continue;
-      }
-
-      // If we're here, we have a FuncOp and we have substitutions that can be
-      // applied.
-      //
-      // 1. Create a Module with the substitutions that we'll be making.
-      LLVM_DEBUG(llvm::dbgs()
-                 << funcName << " : substitution pattern: '" << text << "'\n");
-      auto substModule = [&]() -> OwningOpRef<ModuleOp> {
-        if (text.front() == '*') {
-          // Substitutions are a raw string after the '*' character.
-          return parseSourceString<ModuleOp>(text.substr(1), ctx);
-        }
-        // Substitutions are in a text file (command-line usage).
-        return parseSourceFile<ModuleOp>(text, ctx);
-      }();
-      assert(*substModule && "module must have been created");
-      auto &name = funcNames.emplace_back(funcName);
-      substModules.try_emplace(name, std::move(substModule));
-      // substModules[funcName]->dump();
-    }
-  }
-
-  MLIRContext *ctx;
-  mlir::Pass::ListOption<std::string> &funcList;
-  std::list<std::string> funcNames;
-  DenseMap<StringRef, OwningOpRef<ModuleOp>> substModules;
-};
-
 class ArgumentSynthesisPass
     : public cudaq::opt::impl::ArgumentSynthesisBase<ArgumentSynthesisPass> {
 public:
   using ArgumentSynthesisBase::ArgumentSynthesisBase;
 
-  void mergeSymbols(ModuleOp mod, Analysis &analysis) {
-    for (auto &[funcName, substMod] : analysis.substModules) {
-      // 2. Go through the Module and merge in all its symbols.
-      for (auto &op : *substMod) {
-        if (auto symInterface = dyn_cast<SymbolOpInterface>(op)) {
-          auto name = symInterface.getName();
-          auto obj = mod.lookupSymbol(name);
-          if (!obj)
-            mod.getBody()->push_back(op.clone());
-        }
-      }
-    }
-  }
-
-  void processFunction(func::FuncOp func, Analysis &analysis) {
+  void
+  applySubstitutions(func::FuncOp func,
+                     DenseMap<StringRef, OwningOpRef<ModuleOp>> &substModules) {
     MLIRContext *ctx = func.getContext();
     auto funcName = func.getName();
     LLVM_DEBUG(llvm::dbgs() << "processing : '" << funcName << "'\n");
 
-    auto it = analysis.substModules.find(funcName);
-    if (it == analysis.substModules.end()) {
+    // 1. Find substitution module with argument replacements for the function.
+    auto it = substModules.find(funcName);
+    if (it == substModules.end()) {
       // If the function isn't on the list, do nothing.
       LLVM_DEBUG(llvm::dbgs() << funcName << " has no substitutions.\n");
       return;
@@ -116,7 +55,6 @@ class ArgumentSynthesisPass
       auto subst = dyn_cast<cudaq::cc::ArgumentSubstitutionOp>(op);
       if (!subst)
         continue;
-
       auto pos = subst.getPosition();
       if (pos >= processedArgs.size()) {
         func.emitError("Argument " + std::to_string(pos) + " is invalid.");
@@ -179,11 +117,57 @@ class ArgumentSynthesisPass
 
   void runOnOperation() override {
     ModuleOp mod = getOperation();
-    Analysis analysis(mod.getContext(), funcList);
+    MLIRContext *ctx = mod.getContext();
+
+    // 1. Parse every "name:substitutions" option into a substitution module.
+    std::list<std::string> funcNames;
+    DenseMap<StringRef, OwningOpRef<ModuleOp>> substModules;
+
+    for (auto &item : funcList) {
+      auto pos = item.find(':');
+      if (pos == std::string::npos)
+        continue;
+
+      std::string funcName = item.substr(0, pos);
+      std::string text = item.substr(pos + 1);
 
-    mergeSymbols(mod, analysis);
+      if (text.empty()) {
+        LLVM_DEBUG(llvm::dbgs() << funcName << " has no substitutions.\n");
+        continue;
+      }
+
+      // Create a Module with the substitutions that we'll be making.
+      LLVM_DEBUG(llvm::dbgs()
+                 << funcName << " : substitution pattern: '" << text << "'\n");
+      auto substModule = [&]() -> OwningOpRef<ModuleOp> {
+        if (text.front() == '*') {
+          // Substitutions are a raw string after the '*' character.
+          return parseSourceString<ModuleOp>(text.substr(1), ctx);
+        }
+        // Substitutions are in a text file (command-line usage).
+        return parseSourceFile<ModuleOp>(text, ctx);
+      }();
+      if (!substModule) // Parse failed; the parser already emitted an error.
+        return signalPassFailure();
+      auto &name = funcNames.emplace_back(funcName);
+      substModules.try_emplace(name, std::move(substModule));
+    }
+
+    // 2. Merge symbols from substitution modules into the source module.
+    for (auto &[funcName, substMod] : substModules) {
+      for (auto &op : *substMod) {
+        if (auto symInterface = dyn_cast<SymbolOpInterface>(op)) {
+          auto name = symInterface.getName();
+          auto obj = mod.lookupSymbol(name);
+          if (!obj)
+            mod.getBody()->push_back(op.clone());
+        }
+      }
+    }
 
-    mod->walk([&](func::FuncOp func) { processFunction(func, analysis); });
+    // 3. Apply all substitutions.
+    mod->walk(
+        [&](func::FuncOp func) { applySubstitutions(func, substModules); });
   }
 };
 } // namespace
@@ -200,4 +184,4 @@ cudaq::opt::createArgumentSynthesisPass(ArrayRef<StringRef> funcNames,
       pairs.emplace_back(name.str() + ":*" + text.str());
   return std::make_unique<ArgumentSynthesisPass>(
       ArgumentSynthesisOptions{pairs});
-}
+}
\ No newline at end of file
diff --git a/test/Quake/arg_subst-5.txt b/test/Quake/arg_subst-5.txt
index 959ec6ba364..21fb6ef0c2e 100644
--- a/test/Quake/arg_subst-5.txt
+++ b/test/Quake/arg_subst-5.txt
@@ -6,14 +6,9 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-module {
-  cc.arg_subst[0] {
-    %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
-  }
-  func.func @init(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
-    return %arg1 : !quake.veq<?>
-  }
-  func.func @num_qubits(%arg0: i64) -> i64 {
-    return %arg0 : i64
-  }
+cc.arg_subst[0] {
+  %0 = arith.constant 2 : i32
+}
+func.func private @callee5(%arg0: i32) -> (i32) {
+  return %arg0: i32
 }
\ No newline at end of file
diff --git a/test/Quake/arg_subst-6.txt b/test/Quake/arg_subst-6.txt
index 7a53d0369de..ed5126e1d9b 100644
--- a/test/Quake/arg_subst-6.txt
+++ b/test/Quake/arg_subst-6.txt
@@ -7,5 +7,5 @@
 // ========================================================================== //
 
 cc.arg_subst[0] {
-  %c2_i64 = arith.constant 2 : i64
-}
+  %c4_i64 = arith.constant 4 : i32
+}
\ No newline at end of file
diff --git a/test/Quake/arg_subst-7.txt b/test/Quake/arg_subst-7.txt
new file mode 100644
index 00000000000..959ec6ba364
--- /dev/null
+++ b/test/Quake/arg_subst-7.txt
@@ -0,0 +1,19 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+module {
+  cc.arg_subst[0] {
+    %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
+  }
+  func.func @init(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+    return %arg1 : !quake.veq<?>
+  }
+  func.func @num_qubits(%arg0: i64) -> i64 {
+    return %arg0 : i64
+  }
+}
\ No newline at end of file
diff --git a/test/Quake/arg_subst-8.txt b/test/Quake/arg_subst-8.txt
new file mode 100644
index 00000000000..7a53d0369de
--- /dev/null
+++ b/test/Quake/arg_subst-8.txt
@@ -0,0 +1,11 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under   //
+// the terms of the Apache License 2.0 which accompanies this distribution.   //
+// ========================================================================== //
+
+cc.arg_subst[0] {
+  %c2_i64 = arith.constant 2 : i64
+}
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index dc9a28d9073..b9a7f955981 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.   //
 // ========================================================================== //
 
-// RUN: cudaq-opt --argument-synthesis=functions=foo:%S/arg_subst.txt,blink:%S/arg_subst.txt,testy1:%S/arg_subst-1.txt,testy2:%S/arg_subst-2.txt,testy3:%S/arg_subst-3.txt,testy4:%S/arg_subst-4.txt,testy5:%S/arg_subst-5.txt,num_qubits:%S/arg_subst-6.txt,init:%S/arg_subst-6.txt --canonicalize %s | FileCheck %s
+// RUN: cudaq-opt --argument-synthesis=functions=foo:%S/arg_subst.txt,blink:%S/arg_subst.txt,testy1:%S/arg_subst-1.txt,testy2:%S/arg_subst-2.txt,testy3:%S/arg_subst-3.txt,testy4:%S/arg_subst-4.txt,testy5:%S/arg_subst-5.txt,callee5:%S/arg_subst-6.txt,testy6:%S/arg_subst-7.txt,num_qubits:%S/arg_subst-8.txt,init:%S/arg_subst-8.txt --canonicalize %s | FileCheck %s
 
 func.func private @bar(i32)
 func.func private @baz(f32)
@@ -147,24 +147,37 @@ func.func @testy4(%arg0: !cc.stdvec<!cc.struct<{i32, f64, i8, i16}>>) {
 // CHECK:           return
 // CHECK:         }
 
-func.func @testy5(%arg0: !cc.ptr<!cc.state>) {
+
+func.func @testy5(%arg0: i32) -> i32 {
+  return %arg0: i32
+}
+
+func.func @testy6(%arg0: !cc.ptr<!cc.state>) {
   %0 = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
   %1 = quake.alloca !quake.veq<?>[%0 : i64]
   %5 = quake.init_state %1, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
   return
 }
 
-// CHECK-LABEL:   func.func @testy5() {
+// CHECK-LABEL:   func.func @testy5() -> i32 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i32
+// CHECK:           return %[[VAL_0]] : i32
+// CHECK:         }
+// CHECK-LABEL:   func.func @testy6() {
 // CHECK:           %[[VAL_2:.*]] = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
 // CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_2]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_2]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 // CHECK:           return
 // CHECK:         }
-// CHECK:         func.func @init(%arg0: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK-LABEL:   func.func private @callee5() -> i32 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 4 : i32
+// CHECK:           return %[[VAL_0]] : i32
+// CHECK:         }
+// CHECK-LABEL:   func.func @init(%arg0: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           return %arg0 : !quake.veq<?>
 // CHECK:         }
-// CHECK:         func.func @num_qubits() -> i64 {
+// CHECK-LABEL:   func.func @num_qubits() -> i64 {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:           return %[[VAL_0]] : i64
 // CHECK:         }

From af2fd79e5180b9ea6feba52a76318dfce7060aab Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 20 Feb 2025 12:05:51 -0800
Subject: [PATCH 36/54] Update callers of synthesis

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/runtime/cudaq/platform/py_alt_launch_kernel.cpp | 3 +--
 runtime/common/BaseRemoteRESTQPU.h                     | 3 +--
 runtime/common/BaseRestRemoteClient.h                  | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 74e7d676a49..083b31e4dde 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -552,8 +552,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   ss << argCon.getSubstitutionModule();
   SmallVector<StringRef> substs = {substBuff};
   PassManager pm(context);
-  pm.addNestedPass<func::FuncOp>(
-      cudaq::opt::createArgumentSynthesisPass(kernels, substs));
+  pm.addPass(opt::createArgumentSynthesisPass(kernels, substs));
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addPass(opt::createDeleteStates());
 
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index b69caad2766..409153c6181 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -492,8 +492,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
                                                      kernels.end()};
         mlir::SmallVector<mlir::StringRef> substitutions{substs.begin(),
                                                          substs.end()};
-        pm.addNestedPass<mlir::func::FuncOp>(
-            cudaq::opt::createArgumentSynthesisPass(funcNames, substitutions));
+        pm.addPass(opt::createArgumentSynthesisPass(funcNames, substitutions));
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(
             opt::createReplaceStateWithKernel());
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index 5bcb89066af..ab8d3ba79d5 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -191,8 +191,7 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
           llvm::raw_string_ostream ss(substBuff);
           ss << argCon.getSubstitutionModule();
           mlir::SmallVector<mlir::StringRef> substs = {substBuff};
-          pm.addNestedPass<mlir::func::FuncOp>(
-              opt::createArgumentSynthesisPass(kernels, substs));
+          pm.addPass(opt::createArgumentSynthesisPass(kernels, substs));
           pm.addPass(mlir::createCanonicalizerPass());
           pm.addPass(opt::createDeleteStates());
         } else if (args) {

From 4d6f7ee3eb634fc9aced6d894bb302e74e292ca7 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 20 Feb 2025 12:21:30 -0800
Subject: [PATCH 37/54] Use PointerOf in quake defs

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Dialect/CC/CCTypes.td     | 6 ------
 include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td | 8 ++++----
 lib/Optimizer/Transforms/ArgumentSynthesis.cpp    | 2 +-
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
index 03b8d9541d9..18bce4e156a 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
@@ -313,10 +313,4 @@ def AnyStateInitLike : TypeConstraint<cc_PointerType.predicate,
                          "state initializer types">;
 def AnyStateInitType : Type<AnyStateInitLike.predicate, "initial state type">;
 
-def AnyStatePointerType : Type<
-  And<[
-    cc_PointerType.predicate,
-    CPred<"$_self.cast<cudaq::cc::PointerType>().getElementType().isa<cudaq::cc::StateType>()">
-    ]>,
-    "state pointer type">;
 #endif // CUDAQ_DIALECT_CC_TYPES_TD
diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index cfb16bd100c..b70539acee8 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -1418,7 +1418,7 @@ def quake_CreateStateOp : QuakeOp<"create_state", [Pure]> {
     cc_PointerType:$data,
     AnySignlessInteger:$length
   );
-  let results = (outs AnyStatePointerType:$result);
+  let results = (outs PointerOf<[cc_StateType]>:$result);
   let assemblyFormat = [{
       $data `,` $length `:` functional-type(operands, results) attr-dict
   }];
@@ -1436,7 +1436,7 @@ def QuakeOp_DeleteStateOp : QuakeOp<"delete_state", [] > {
     ```
   }];
 
-  let arguments = (ins AnyStatePointerType:$state);
+  let arguments = (ins PointerOf<[cc_StateType]>:$state);
   let results = (outs);
   let assemblyFormat = [{
       $state `:` type(operands) attr-dict
@@ -1456,7 +1456,7 @@ def quake_GetNumberOfQubitsOp : QuakeOp<"get_number_of_qubits", [Pure] > {
     ```
   }];
 
-  let arguments = (ins AnyStatePointerType:$state);
+  let arguments = (ins PointerOf<[cc_StateType]>:$state);
   let results = (outs AnySignlessInteger:$result);
   let assemblyFormat = [{
       $state `:` functional-type(operands, results) attr-dict
@@ -1490,7 +1490,7 @@ def QuakeOp_GetStateOp : QuakeOp<"get_state", [Pure] > {
     FlatSymbolRefAttr:$numQubitsFunc,
     FlatSymbolRefAttr:$initFunc
   );
-  let results = (outs AnyStatePointerType:$result);
+  let results = (outs PointerOf<[cc_StateType]>:$result);
   let assemblyFormat = [{
      $numQubitsFunc $initFunc `:` qualified(type(results)) attr-dict
   }];
diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
index 187012a4799..b80bdfeea23 100644
--- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
+++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
@@ -184,4 +184,4 @@ cudaq::opt::createArgumentSynthesisPass(ArrayRef<StringRef> funcNames,
       pairs.emplace_back(name.str() + ":*" + text.str());
   return std::make_unique<ArgumentSynthesisPass>(
       ArgumentSynthesisOptions{pairs});
-}
\ No newline at end of file
+}

From e7d95d54e92c931fc7e8bce4472dfbb241f6ce58 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 20 Feb 2025 14:04:56 -0800
Subject: [PATCH 38/54] Addressed more CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../cudaq/Optimizer/Dialect/Quake/QuakeOps.td |  4 +-
 include/cudaq/Optimizer/Transforms/Passes.td  | 31 +++++-----
 .../Transforms/ReplaceStateWithKernel.cpp     | 57 +++++++++----------
 runtime/common/ArgumentConversion.cpp         | 23 ++++----
 runtime/test/test_argument_conversion.cpp     | 10 ++--
 test/Quake/arg_subst-7.txt                    |  2 +-
 test/Quake/arg_subst_func.qke                 |  2 +-
 test/Quake/replace_state_with_kernel.qke      |  6 +-
 8 files changed, 65 insertions(+), 70 deletions(-)

diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index b70539acee8..5dbc506fa73 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -1463,7 +1463,7 @@ def quake_GetNumberOfQubitsOp : QuakeOp<"get_number_of_qubits", [Pure] > {
   }];
 }
 
-def QuakeOp_GetStateOp : QuakeOp<"get_state", [Pure] > {
+def QuakeOp_MaterializeStateOp : QuakeOp<"materialize_state", [Pure] > {
   let summary = "Get state from kernel with the provided name.";
   let description = [{
     This operation is created by argument synthesis of state pointer arguments
@@ -1482,7 +1482,7 @@ def QuakeOp_GetStateOp : QuakeOp<"get_state", [Pure] > {
     pass.
 
     ```mlir
-      %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
+      %0 = quake.materialize_state @num_qubits @init : !cc.ptr<!cc.state>
     ```
   }];
 
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 3d22756d404..1a081409b62 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -856,18 +856,18 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
     "Replace `quake.init_state` instructions with call to the kernel generating the state";
   let description = [{
     This optimization replaces `quake.init_state`, `quake.get_number_of_qubits`,
-    and `quake.get_state` operations invoked on state pointers during argument
-    synthesis for quantum devices.
+    and `quake.materialize_state` operations invoked on state pointers during
+    argument synthesis for quantum devices.
 
     Before this optimization, argument synthesis for state pointers for quantum
-    devices substituted a state created from the `quake.get_state` operation
-    for the state argument.
+    devices substituted a state created from the `quake.materialize_state`
+    operation for the state argument.
 
-    The `quake.get_state` operation accepts symbols for the synthesized kernels
-    `@num_qubits` and `@init` that argument synthesis generated from the original
-    kernel call that generated the state, e.g., the `cudaq::get_state` call that
-    refers to the result of a specific quantum kernel being invoked with a set
-    of parameters
+    The `quake.materialize_state` operation accepts symbols for the synthesized
+    kernels `@num_qubits` and `@init` that argument synthesis generated from
+    the original kernel call that generated the state, e.g.,
+    the `cudaq::get_state` call that refers to the result of a specific quantum
+    kernel being invoked with a set of parameters.
 
     For example, for the user code:
       ```
@@ -885,22 +885,23 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
     The argument synthesis also substituted the state argument in the `caller`
     with:
       ```
-      quake.get_state @callee_num_qubits @callee_init: !cc.ptr<!cc.state>
+      quake.materialize_state @callee_num_qubits @callee_init: !cc.ptr<!cc.state>
       ```
 
     This optimization performs the replacements for the the following operations 
-    that use a state produced by  `quake.get_state @num_qubits @init` operation:
+    that use a state produced by  `quake.materialize_state @num_qubits @init`
+    operation:
 
-    - Replace `quake.get_number_of_qubits` operation by the call to `@num_qubits`
-    - Replace `quake.init_state` operation by the call to `@init`
-    - Clean up unused `quake.get_state` operation
+    - Replace `quake.get_number_of_qubits` operation by call to `@num_qubits`
+    - Replace `quake.init_state` operation by call to `@init`
+    - Clean up unused `quake.materialize_state` operation
 
     For example:
 
     Before ReplaceStateWithKernel (replace-state-with-kernel):
     ```
     func.func @foo() {
-      %0 = quake.get_state @callee.num_qubits_0 @callee.init_0: !cc.ptr<!cc.state>
+      %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0: !cc.ptr<!cc.state>
       %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
       %2 = quake.alloca !quake.veq<?>[%1 : i64]
       %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index d385f061738..a9cd1dd80e1 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -17,7 +17,6 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
-#include <span>
 
 namespace cudaq::opt {
 #define GEN_PASS_DEF_REPLACESTATEWITHKERNEL
@@ -33,11 +32,10 @@ namespace {
 /// Replace `quake.get_number_of_qubits` by a call to a function
 /// that computes the number of qubits for a state.
 ///
-/// ```
-///  %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+/// ```mlir
+///  %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
 ///  %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
 /// ───────────────────────────────────────────
-/// ...
 ///  %1 = call @callee.num_qubits_0() : () -> i64
 /// ```
 // clang-format on
@@ -50,16 +48,16 @@ class ReplaceGetNumQubitsPattern
                                 PatternRewriter &rewriter) const override {
 
     auto stateOp = numQubits.getOperand();
-    if (auto getState = stateOp.getDefiningOp<quake::GetStateOp>()) {
-      auto numQubitsFunc = getState.getNumQubitsFunc();
-
-      rewriter.setInsertionPoint(numQubits);
-      rewriter.replaceOpWithNewOp<func::CallOp>(
-          numQubits, numQubits.getType(), numQubitsFunc, mlir::ValueRange{});
-      return success();
-    }
-    return numQubits->emitError(
-        "ReplaceStateWithKernel: failed to replace `quake.get_num_qubits`");
+    auto materializeState = stateOp.getDefiningOp<quake::MaterializeStateOp>();
+    if (!materializeState)
+      return numQubits->emitError(
+          "ReplaceStateWithKernel: failed to replace `quake.get_num_qubits`");
+
+    auto numQubitsFunc = materializeState.getNumQubitsFunc();
+    rewriter.setInsertionPoint(numQubits);
+    rewriter.replaceOpWithNewOp<func::CallOp>(
+        numQubits, numQubits.getType(), numQubitsFunc, mlir::ValueRange{});
+    return success();
   }
 };
 
@@ -67,11 +65,10 @@ class ReplaceGetNumQubitsPattern
 /// Replace `quake.init_state` by a call to a (modified) kernel that produced
 /// the state.
 ///
-/// ```
-///  %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+/// ```mlir
+///  %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
 ///  %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
-/// ...
 /// %3 = call @callee.init_0(%2): (!quake.veq<?>) -> !quake.veq<?>
 /// ```
 // clang-format on
@@ -87,19 +84,19 @@ class ReplaceInitStatePattern
 
     if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
       if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
-        if (auto getState = stateOp.getDefiningOp<quake::GetStateOp>()) {
-          auto initName = getState.getInitFunc();
-
-          rewriter.setInsertionPoint(initState);
-          rewriter.replaceOpWithNewOp<func::CallOp>(
-              initState, initState.getType(), initName,
-              mlir::ValueRange{allocaOp});
-
-          return success();
-        }
-
-        return initState->emitError(
-            "ReplaceStateWithKernel: failed to replace `quake.init_state`");
+        auto materializeState =
+            stateOp.getDefiningOp<quake::MaterializeStateOp>();
+        if (!materializeState)
+          return initState->emitError(
+              "ReplaceStateWithKernel: failed to replace `quake.init_state`");
+
+        auto initName = materializeState.getInitFunc();
+        rewriter.setInsertionPoint(initState);
+        rewriter.replaceOpWithNewOp<func::CallOp>(initState,
+                                                  initState.getType(), initName,
+                                                  mlir::ValueRange{allocaOp});
+
+        return success();
       }
     }
     return failure();
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index daf84544ee3..84f1c13fcfe 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -329,15 +329,10 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
       cudaq::state_helper::getSimulationState(const_cast<cudaq::state *>(v));
 
   auto kernelName = converter.getKernelName();
-  // auto sourceMod = converter.getSourceModule();
   auto substMod = converter.getSubstitutionModule();
 
   // If the state has amplitude data, we materialize the data as a state
   // vector and create a new state from it.
-  // TODO: add an option to use the kernel info if available, i.e. for
-  // remote simulators
-  // TODO: add an option of storing the kernel info on simulators if
-  // preferred i.e. to support synthesis of density matrices.
   if (simState->hasData()) {
     // The call below might cause lazy execution of the state kernel.
     // TODO: For lazy execution scenario on remote simulators, we have the
@@ -391,7 +386,8 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // efficient) we aim at replacing states with calls to kernels (`callees`)
   // that generated them. This is done in 2 stages:
   //
-  // 1. Replace state by quake.get_state instruction during argument conversion:
+  // 1. Replace state by quake.materialize_state instruction during argument
+  // conversion:
   //
   // Create two functions:
   // - callee.num_qubits_N
@@ -400,7 +396,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   //    Initializes the veq passed as a parameter
   //
   // Then replace the state with
-  //   `quake.get_state @callee.num_qubits_0 @callee.init_0`:
+  //   `quake.materialize_state @callee.num_qubits_0 @callee.init_0`:
   //
   // clang-format off
   // ```
@@ -429,7 +425,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // clang-format off
   // ```
   // func.func @caller() {
-  //   %0 = quake.get_state @callee.num_qubits_0 @callee.init_state_0 : !cc.ptr<!cc.state>
+  //   %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_state_0 : !cc.ptr<!cc.state>
   //   %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
   //   %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
@@ -448,9 +444,9 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // ```
   // clang-format on
   //
-  // 2. Replace the `quake.get_state` and ops that use its state with calls to
-  // the generated functions, synthesized with the arguments used to create the
-  // original state:
+  // 2. Replace the `quake.materialize_state` and ops that use its state with
+  // calls to the generated functions, synthesized with the arguments used to
+  // create the original state:
   //
   // After ReplaceStateWithKernel pass:
   //
@@ -505,7 +501,8 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     if (!cudaq::opt::ArgumentConverter::isRegisteredKernelName(initName) ||
         !cudaq::opt::ArgumentConverter::isRegisteredKernelName(numQubitsName)) {
       // Create `callee.init_N` and `callee.num_qubits_N` used for
-      // `quake.get_state` replacement later in ReplaceStateWithKernel pass
+      // `quake.materialize_state` replacement later in ReplaceStateWithKernel
+      // pass
       createInitFunc(builder, substMod, calleeFunc, initKernelName);
       createNumQubitsFunc(builder, substMod, calleeFunc, numQubitsKernelName);
 
@@ -524,7 +521,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     // Create a substitution for the state pointer.
     auto statePtrTy =
         cudaq::cc::PointerType::get(cudaq::cc::StateType::get(ctx));
-    return builder.create<quake::GetStateOp>(
+    return builder.create<quake::MaterializeStateOp>(
         loc, statePtrTy, builder.getStringAttr(numQubitsKernelName),
         builder.getStringAttr(initKernelName));
   }
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index bd3a7b2107b..246802eb355 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -561,7 +561,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init.init_[[HASH_0]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__init.init_[[HASH_0]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
@@ -668,7 +668,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %0 = quake.get_state @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__state_param.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__state_param.init_[[HASH_0]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__state_param.init_[[HASH_0]](%arg0: !cc.ptr<!cc.state>, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
@@ -693,7 +693,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         state_param.init_[[HASH_0]]
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %0 = quake.get_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1:.*]] @__nvqpp__mlirgen__init1.init_[[HASH_1]] : !cc.ptr<!cc.state>
+// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1:.*]] @__nvqpp__mlirgen__init1.init_[[HASH_1]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__init1.init_[[HASH_1]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
@@ -729,7 +729,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         state_param.num_qubits_[[HASH_0]]
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1]] @__nvqpp__mlirgen__init1.init_[[HASH_1]] : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1]] @__nvqpp__mlirgen__init1.init_[[HASH_1]] : !cc.ptr<!cc.state>
 // CHECK:         }
 
   // clang-format on
@@ -801,7 +801,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.get_state @__nvqpp__mlirgen__init2.num_qubits_[[HASH_1:.*]] @__nvqpp__mlirgen__init2.init_[[HASH_1]] : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init2.num_qubits_[[HASH_1:.*]] @__nvqpp__mlirgen__init2.init_[[HASH_1]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__init2.init_[[HASH_1]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
diff --git a/test/Quake/arg_subst-7.txt b/test/Quake/arg_subst-7.txt
index 959ec6ba364..a3ed90891ab 100644
--- a/test/Quake/arg_subst-7.txt
+++ b/test/Quake/arg_subst-7.txt
@@ -8,7 +8,7 @@
 
 module {
   cc.arg_subst[0] {
-    %0 = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
+    %0 = quake.materialize_state @num_qubits @init : !cc.ptr<!cc.state>
   }
   func.func @init(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
     return %arg1 : !quake.veq<?>
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index b9a7f955981..92b2e712fa3 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -164,7 +164,7 @@ func.func @testy6(%arg0: !cc.ptr<!cc.state>) {
 // CHECK:           return %[[VAL_0]] : i32
 // CHECK:         }
 // CHECK-LABEL:   func.func @testy6() {
-// CHECK:           %[[VAL_2:.*]] = quake.get_state @num_qubits @init : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_2:.*]] = quake.materialize_state @num_qubits @init : !cc.ptr<!cc.state>
 // CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_2]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_2]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 58b474a65b0..38b1c81d36d 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -23,7 +23,7 @@ module {
   }
 
   func.func @caller0() {
-    %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+    %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
     %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
     %2 = quake.alloca !quake.veq<?>[%1 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
@@ -38,7 +38,7 @@ module {
 // CHECK:         }
 
   func.func @caller1(%arg0: i64) {
-    %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+    %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
     %2 = quake.alloca !quake.veq<?>[%arg0 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
@@ -51,7 +51,7 @@ module {
 // CHECK:         }
 
   func.func @caller2() -> i64 {
-    %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+    %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
     %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
     return %1: i64
   }

From c4d600fd723e4eacacb71392d3f779ce876c330c Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 21 Feb 2025 09:13:18 -0800
Subject: [PATCH 39/54] Recursive with caching

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/ArgumentConversion.cpp | 133 ++++++++++++++------------
 runtime/common/ArgumentConversion.h   |  61 ++++++++----
 runtime/common/BaseRemoteRESTQPU.h    |   4 +-
 3 files changed, 120 insertions(+), 78 deletions(-)

diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 84f1c13fcfe..66af2ce7b6b 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -102,6 +102,7 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
 /// Create callee.init_N that initializes the state
 /// Callee (the kernel captured by state):
 // clang-format off
+/// ```mlir
 /// func.func @__nvqpp__mlirgen__callee(%arg0: i64) {
 ///   %0 = cc.alloca i64
 ///   cc.store %arg0, %0 : !cc.ptr<i64>
@@ -118,11 +119,12 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
 ///   quake.x %1 : (f64, !quake.ref) -> ()
 ///   return %arg0: !quake.veq<?>
 /// }
+/// ```
 // clang-format on
-static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
+static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
                            func::FuncOp calleeFunc, StringRef initKernelName) {
   OpBuilder::InsertionGuard guard(builder);
-  builder.setInsertionPointToEnd(sourceMod.getBody());
+  builder.setInsertionPointToEnd(moduleOp.getBody());
 
   auto ctx = builder.getContext();
   auto loc = builder.getUnknownLoc();
@@ -226,6 +228,7 @@ static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
 /// initialize the state
 /// Callee: (the kernel captured by state):
 // clang-format off
+/// ```mlir
 /// func.func @callee(%arg0: i64) {
 ///   %0 = cc.alloca i64
 ///   cc.store %arg0, %0 : !cc.ptr<i64>
@@ -243,12 +246,13 @@ static void createInitFunc(OpBuilder &builder, ModuleOp sourceMod,
 ///   %1 = cc.load %0 : !cc.ptr<i64>
 ///   return %1 : i64
 /// }
+/// ```
 // clang-format on
-static void createNumQubitsFunc(OpBuilder &builder, ModuleOp sourceMod,
+static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
                                 func::FuncOp calleeFunc,
                                 StringRef numQubitsKernelName) {
   OpBuilder::InsertionGuard guard(builder);
-  builder.setInsertionPointToEnd(sourceMod.getBody());
+  builder.setInsertionPointToEnd(moduleOp.getBody());
 
   auto ctx = builder.getContext();
   auto loc = builder.getUnknownLoc();
@@ -384,19 +388,19 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
 
   // Otherwise (ie quantum hardware, where getting the amplitude data is not
   // efficient) we aim at replacing states with calls to kernels (`callees`)
-  // that generated them. This is done in 2 stages:
+  // that generated them. This is done in three stages:
   //
-  // 1. Replace state by quake.materialize_state instruction during argument
-  // conversion:
+  // 1) (done here) Generate `@callee.num_qubits_0` and `@callee.init_0` for
+  //    the callee function and its arguments stored in a state.
+  //
+  //    Create two functions:
+  //      - callee.num_qubits_N
+  //        Calculates the number of qubits needed for the veq allocation
+  //      - callee.init_N
+  //        Initializes the veq passed as a parameter
   //
-  // Create two functions:
-  // - callee.num_qubits_N
-  //    Calculates the number of qubits needed for the veq allocation
-  // - callee.init_N
-  //    Initializes the veq passed as a parameter
-  //
-  // Then replace the state with
-  //   `quake.materialize_state @callee.num_qubits_0 @callee.init_0`:
+  // 2) (done here) Replace the state with
+  //   `quake.materialize_state @callee.num_qubits_0 @callee.init_0`:
   //
   // clang-format off
   // ```
@@ -425,7 +429,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // clang-format off
   // ```
   // func.func @caller() {
-  //   %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_state_0 : !cc.ptr<!cc.state>
+  //   %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_state_0 : !cc.ptr<!cc.state>
   //   %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
   //   %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
@@ -444,32 +448,32 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   // ```
   // clang-format on
   //
-  // 2. Replace the `quake.materialize_state` and ops that use its state with
-  // calls to the generated functions, synthesized with the arguments used to
-  // create the original state:
+  // 3) (done in ReplaceStateWithKernel) Replace `quake.materialize_state` and
+  // ops that use its state with calls to the generated functions, synthesized
+  // with the arguments used to create the original state:
   //
   // After ReplaceStateWithKernel pass:
   //
   // clang-format off
-  // ```
-  // func.func @caller() {
-  //   %1 = call callee.num_qubits_0() : () -> i64
-  //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
-  //   %3 = call @callee.init_0(%2): (!quake.veq<?>) -> !quake.veq<?>
-  // }
-  //
-  // func.func private @callee.num_qubits_0() -> i64 {
-  //   %cst = arith.constant 2 : i64
-  //   return %cst : i64
-  // }
-  //
-  // func.func private @callee.init_0(%arg0: !quake.veq<?>): !quake.veq<?> {
-  //   %cst = arith.constant 1.5707963267948966 : f64
-  //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
-  //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
-  //   return %arg0
-  // }
-  // ```
+   // ```
+   // func.func @caller() {
+   //   %1 = call callee.num_qubits_0() : () -> i64
+   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+   //   %3 = call @callee.init_0(%2): (!quake.veq<?>) -> !quake.veq<?>
+   // }
+   //
+   // func.func private @callee.num_qubits_0() -> i64 {
+   //   %cst = arith.constant 2 : i64
+   //   return %cst : i64
+   // }
+   //
+   // func.func private @callee.init_0(%arg0: !quake.veq<?>): !quake.veq<?> {
+   //   %cst = arith.constant 1.5707963267948966 : f64
+   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
+   //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+   //   return %arg0
+   // }
+   // ```
   // clang-format on
   if (simState->getKernelInfo().has_value()) {
     auto [calleeName, calleeArgs] = simState->getKernelInfo().value();
@@ -487,35 +491,31 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
     assert(calleeFunc && "callee func is missing");
 
-    // Use the state pointer as a hash to create the new kernel names.
-    // We can reuse the functions previously created from the same state.
+    // Use the state pointer as hash to look up the function name
+    // that was created using the same hash in StateAggregator.
     auto hash = std::to_string(reinterpret_cast<std::size_t>(v));
     auto initName = calleeName + ".init_" + hash;
     auto numQubitsName = calleeName + ".num_qubits_" + hash;
-
-    // Function names in the IR
     auto initKernelName = cudaq::runtime::cudaqGenPrefixName + initName;
     auto numQubitsKernelName =
         cudaq::runtime::cudaqGenPrefixName + numQubitsName;
 
-    if (!cudaq::opt::ArgumentConverter::isRegisteredKernelName(initName) ||
-        !cudaq::opt::ArgumentConverter::isRegisteredKernelName(numQubitsName)) {
-      // Create `callee.init_N` and `callee.num_qubits_N` used for
-      // `quake.materialize_state` replacement later in ReplaceStateWithKernel
-      // pass
+    // Create `callee.init_N` and `callee.num_qubits_N` used to replace
+    // `quake.materialize_state` in ReplaceStateWithKernel pass
+    if (!converter.isRegisteredKernel(initName) ||
+        !converter.isRegisteredKernel(numQubitsName)) {
       createInitFunc(builder, substMod, calleeFunc, initKernelName);
       createNumQubitsFunc(builder, substMod, calleeFunc, numQubitsKernelName);
 
-      // Create and register names for new `init` and `num_qubits` kernels so
-      // ArgumentConverters can keep a string reference to a valid memory.
-      auto &registeredInitName =
-          cudaq::opt::ArgumentConverter::registerKernelName(initName);
-      auto &registeredNumQubitsName =
-          cudaq::opt::ArgumentConverter::registerKernelName(numQubitsName);
+      // Convert arguments for `callee.init_N`.
+      auto &initConverter =
+          cudaq::opt::createChildConverter(converter, initName);
+      initConverter.gen(calleeArgs);
 
-      // Convert arguments  for `callee.init_N` and `callee.num_qubits_N`.
-      converter.genCallee(registeredInitName, calleeArgs);
-      converter.genCallee(registeredNumQubitsName, calleeArgs);
+      // Convert arguments for `callee.num_qubits_N`.
+      auto &numQubitsConverter =
+          cudaq::opt::createChildConverter(converter, numQubitsName);
+      numQubitsConverter.gen(calleeArgs);
     }
 
     // Create a substitution for the state pointer.
@@ -699,13 +699,20 @@ Value genConstant(OpBuilder &builder, cudaq::cc::IndirectCallableType indCallTy,
 
 //===----------------------------------------------------------------------===//
 
-std::list<std::string> cudaq::opt::ArgumentConverter::kernelNameRegistry =
-    std::list<std::string>();
+std::list<std::string> cudaq::opt::ArgumentConverter::emptyRegistry;
 
 cudaq::opt::ArgumentConverter::ArgumentConverter(StringRef kernelName,
                                                  ModuleOp sourceModule)
     : sourceModule(sourceModule), builder(sourceModule.getContext()),
-      kernelName(kernelName) {
+      kernelName(kernelName), kernelRegistry(emptyRegistry) {
+  substModule = builder.create<ModuleOp>(builder.getUnknownLoc());
+}
+
+cudaq::opt::ArgumentConverter::ArgumentConverter(
+    std::list<std::string> &kernelRegistry, StringRef kernelName,
+    ModuleOp sourceModule)
+    : sourceModule(sourceModule), builder(sourceModule.getContext()),
+      kernelName(kernelName), kernelRegistry(kernelRegistry) {
   substModule = builder.create<ModuleOp>(builder.getUnknownLoc());
 }
 
@@ -835,3 +842,11 @@ void cudaq::opt::ArgumentConverter::gen_drop_front(
   }
   gen(partialArgs);
 }
+
+cudaq::opt::ArgumentConverter &
+cudaq::opt::createChildConverter(cudaq::opt::ArgumentConverter &parent,
+                                 std::string &calleeName) {
+  // Store the name in the kernel name cache before referencing it.
+  auto &name = parent.registerKernel(calleeName);
+  return parent.createCalleeConverter(name);
+}
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 677bc53b066..2a95178ed1b 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -25,6 +25,12 @@ class ArgumentConverter {
   /// kernelName in \p sourceModule.
   ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule);
 
+  /// Build an instance to create argument substitutions for a specified \p
+  /// kernelName in \p sourceModule. Use \p kernelRegistry to store newly
+  /// generated functions.
+  ArgumentConverter(std::list<std::string> &kernelRegistry,
+                    mlir::StringRef kernelName, mlir::ModuleOp sourceModule);
+
   /// Generate a substitution ModuleOp for the vector of arguments presented.
   /// The arguments are those presented to the kernel, kernelName.
   void gen(const std::vector<void *> &arguments);
@@ -48,42 +54,61 @@ class ArgumentConverter {
   /// created.
   mlir::ModuleOp getSubstitutionModule() { return substModule; }
 
-  mlir::ModuleOp getSourceModule() { return sourceModule; }
-
+  /// Kernel we are converting the arguments for.
   mlir::StringRef getKernelName() { return kernelName; }
 
-  void genCallee(mlir::StringRef calleeName, std::vector<void *> &args) {
-    auto &converter = calleeConverters.emplace_back(calleeName, substModule);
-    converter.gen(args);
-  }
-
+  /// Return child converters for functions created from kernel used in state
+  /// arguments.
   std::vector<ArgumentConverter> &getCalleeConverters() {
     return calleeConverters;
   }
 
-  static bool isRegisteredKernelName(const std::string &kernelName) {
-    return std::find(kernelNameRegistry.begin(), kernelNameRegistry.end(),
-                     kernelName) != kernelNameRegistry.end();
+  /// Is kernel name already created?
+  bool isRegisteredKernel(const std::string &kernelName) {
+    return std::find(kernelRegistry.begin(), kernelRegistry.end(),
+                     kernelName) != kernelRegistry.end();
   }
 
-  static const std::string &registerKernelName(const std::string &kernelName) {
-    return kernelNameRegistry.emplace_back(kernelName);
+  /// Store kernel name in memory for newly created kernels.
+  const std::string &registerKernel(const std::string &kernelName) {
+    return kernelRegistry.emplace_back(kernelName);
   }
 
 private:
-  /// Keeps kernel names created during argument conversion in memory.
-  /// References to those names are used by the argument converters for
-  /// those kernels.
-  /// Note: use std::list to make sure we always return valid references
-  /// when registering new kernel names.
-  static std::list<std::string> kernelNameRegistry;
+  /// Default registry to use when state synthesis is not needed.
+  static std::list<std::string> emptyRegistry;
+
+  /// Create a child converter for the new callee created from a
+  /// state argument.
+  ArgumentConverter &createCalleeConverter(mlir::StringRef calleeName) {
+    assert(&kernelRegistry != &emptyRegistry &&
+           "Argument converter is missing a kernel registry");
+    return calleeConverters.emplace_back(kernelRegistry, calleeName,
+                                         substModule);
+  }
 
   mlir::ModuleOp sourceModule;
   mlir::ModuleOp substModule;
   mlir::OpBuilder builder;
   mlir::StringRef kernelName;
   mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
+
+  /// Converters for functions created during state argument conversion.
   std::vector<ArgumentConverter> calleeConverters;
+
+  /// Keeps new kernel names created during argument conversion in memory.
+  /// References to the names are used by the argument converters for
+  /// their kernels.
+  /// NOTE: use `std::list` to make sure we always return valid references
+  /// when registering new kernel names, as the references are taken while
+  /// the list is growing.
+  std::list<std::string> &kernelRegistry;
+
+  friend ArgumentConverter &createChildConverter(ArgumentConverter &parent,
+                                                 std::string &calleeName);
 };
 
+ArgumentConverter &createChildConverter(ArgumentConverter &parent,
+                                        std::string &calleeName);
+
 } // namespace cudaq::opt
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 409153c6181..8424aa9999e 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -454,7 +454,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
         cudaq::info("Run Argument Synth.\n");
-        opt::ArgumentConverter argCon(kernelName, moduleOp);
+        std::list<std::string> kernelRegistry;
+        opt::ArgumentConverter argCon(kernelRegistry, kernelName, moduleOp);
         argCon.gen(rawArgs);
 
         // For quantum devices, we've created a tree of ArgumentConverters
@@ -488,6 +489,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             };
         collect(argCon);
 
+        // Collect references for the argument synthesis.
         mlir::SmallVector<mlir::StringRef> funcNames{kernels.begin(),
                                                      kernels.end()};
         mlir::SmallVector<mlir::StringRef> substitutions{substs.begin(),

From e58f5775ac055bea2e6e329cc58ac2f9124279a6 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 21 Feb 2025 15:40:58 -0800
Subject: [PATCH 40/54] StateAggregatorWithArgumentConverter

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/ArgumentConversion.cpp     | 287 ++-------------
 runtime/common/ArgumentConversion.h       |  52 ---
 runtime/common/BaseRemoteRESTQPU.h        |  78 ++--
 runtime/common/CMakeLists.txt             |   1 +
 runtime/common/StateAggregator.cpp        | 422 ++++++++++++++++++++++
 runtime/common/StateAggregator.h          |  65 ++++
 runtime/test/test_argument_conversion.cpp |  62 +++-
 7 files changed, 608 insertions(+), 359 deletions(-)
 create mode 100644 runtime/common/StateAggregator.cpp
 create mode 100644 runtime/common/StateAggregator.h

diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 66af2ce7b6b..1fe8d9747d5 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -20,6 +20,8 @@
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Parser/Parser.h"
 
+#include <iostream>
+
 using namespace mlir;
 
 template <typename A>
@@ -99,233 +101,6 @@ static Value genConstant(OpBuilder &, cudaq::cc::StructType, void *,
 static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
                          ModuleOp substMod, llvm::DataLayout &);
 
-/// Create callee.init_N that initializes the state
-/// Callee (the kernel captured by state):
-// clang-format off
-/// ```mlir
-/// func.func @__nvqpp__mlirgen__callee(%arg0: i64) {
-///   %0 = cc.alloca i64
-///   cc.store %arg0, %0 : !cc.ptr<i64>
-///   %1 = cc.load %0 : !cc.ptr<i64>
-///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
-///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
-///   quake.x %3 : (!quake.ref) -> ()
-///   return
-/// }
-/// callee.init_N:
-/// func.func private @callee.init_0(%arg0: !quake.veq<?>, %arg0: i64) ->
-/// !!quake.veq<?> {
-///   %1 = quake.extract_ref %arg0[1] : (!quake.veq<2>) -> !quake.ref
-///   quake.x %1 : (f64, !quake.ref) -> ()
-///   return %arg0: !quake.veq<?>
-/// }
-/// ```
-// clang-format on
-static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
-                           func::FuncOp calleeFunc, StringRef initKernelName) {
-  OpBuilder::InsertionGuard guard(builder);
-  builder.setInsertionPointToEnd(moduleOp.getBody());
-
-  auto ctx = builder.getContext();
-  auto loc = builder.getUnknownLoc();
-
-  auto initFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
-
-  auto argTypes = calleeFunc.getArgumentTypes();
-  auto retTy = quake::VeqType::getUnsized(ctx);
-  auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retTy});
-
-  initFunc.setName(initKernelName);
-  initFunc.setType(funcTy);
-  initFunc.setPrivate();
-
-  OpBuilder newBuilder(ctx);
-
-  auto *entryBlock = &initFunc.getRegion().front();
-  newBuilder.setInsertionPointToStart(entryBlock);
-  Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, 64);
-  Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, 64);
-  Value begin = zero;
-
-  auto argPos = initFunc.getArguments().size();
-
-  // Detect errors in kernel passed to get_state.
-  std::function<void(Block &)> processInner = [&](Block &block) {
-    for (auto &op : block) {
-      for (auto &region : op.getRegions())
-        for (auto &b : region)
-          processInner(b);
-
-      // Don't allow returns in inner scopes
-      if (auto retOp = dyn_cast<func::ReturnOp>(&op))
-        calleeFunc.emitError("Encountered return in inner scope in a kernel "
-                             "passed to get_state");
-    }
-  };
-
-  for (auto &op : calleeFunc.getRegion().front())
-    for (auto &region : op.getRegions())
-      for (auto &b : region)
-        processInner(b);
-
-  // Process outer block to initialize the allocation passed as an argument.
-  std::function<void(Block &)> process = [&](Block &block) {
-    SmallVector<Operation *> cleanUps;
-    Operation *replacedReturn = nullptr;
-
-    Value arg;
-    Value subArg;
-    Value blockBegin = begin;
-    Value blockAllocSize = zero;
-    for (auto &op : block) {
-      if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
-        newBuilder.setInsertionPointAfter(alloc);
-
-        if (!arg) {
-          initFunc.insertArgument(argPos, retTy, {}, loc);
-          arg = initFunc.getArgument(argPos);
-        }
-
-        auto allocSize = alloc.getSize();
-        auto offset = newBuilder.create<arith::SubIOp>(loc, allocSize, one);
-        subArg =
-            newBuilder.create<quake::SubVeqOp>(loc, retTy, arg, begin, offset);
-        alloc.replaceAllUsesWith(subArg);
-        cleanUps.push_back(alloc);
-        begin = newBuilder.create<arith::AddIOp>(loc, begin, allocSize);
-        blockAllocSize =
-            newBuilder.create<arith::AddIOp>(loc, blockAllocSize, allocSize);
-      }
-
-      if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
-        if (retOp != replacedReturn) {
-          newBuilder.setInsertionPointAfter(retOp);
-
-          auto offset =
-              newBuilder.create<arith::SubIOp>(loc, blockAllocSize, one);
-          Value ret = newBuilder.create<quake::SubVeqOp>(loc, retTy, arg,
-                                                         blockBegin, offset);
-
-          assert(arg && "No veq allocations found");
-          replacedReturn = newBuilder.create<func::ReturnOp>(loc, ret);
-          cleanUps.push_back(retOp);
-        }
-      }
-    }
-
-    for (auto &op : cleanUps) {
-      op->dropAllReferences();
-      op->dropAllUses();
-      op->erase();
-    }
-  };
-
-  // Process the function body
-  process(initFunc.getRegion().front());
-}
-
-/// Create callee.num_qubits_N that calculates the number of qubits to
-/// initialize the state
-/// Callee: (the kernel captured by state):
-// clang-format off
-/// ```mlir
-/// func.func @callee(%arg0: i64) {
-///   %0 = cc.alloca i64
-///   cc.store %arg0, %0 : !cc.ptr<i64>
-///   %1 = cc.load %0 : !cc.ptr<i64>
-///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
-///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
-///   quake.x %3 : (!quake.ref) -> ()
-///   return
-/// }
-///
-/// callee.num_qubits_0:
-/// func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
-///   %0 = cc.alloca i64
-///   cc.store %arg0, %0 : !cc.ptr<i64>
-///   %1 = cc.load %0 : !cc.ptr<i64>
-///   return %1 : i64
-/// }
-/// ```
-// clang-format on
-static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
-                                func::FuncOp calleeFunc,
-                                StringRef numQubitsKernelName) {
-  OpBuilder::InsertionGuard guard(builder);
-  builder.setInsertionPointToEnd(moduleOp.getBody());
-
-  auto ctx = builder.getContext();
-  auto loc = builder.getUnknownLoc();
-
-  auto numQubitsFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
-
-  auto argTypes = calleeFunc.getArgumentTypes();
-  auto retType = builder.getI64Type();
-  auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retType});
-
-  numQubitsFunc.setName(numQubitsKernelName);
-  numQubitsFunc.setType(funcTy);
-  numQubitsFunc.setPrivate();
-
-  OpBuilder newBuilder(ctx);
-
-  auto *entryBlock = &numQubitsFunc.getRegion().front();
-  newBuilder.setInsertionPointToStart(entryBlock);
-  Value size = newBuilder.create<arith::ConstantIntOp>(loc, 0, retType);
-
-  // Process block recursively to calculate and return allocation size
-  // and remove everything else.
-  std::function<void(Block &)> process = [&](Block &block) {
-    SmallVector<Operation *> used;
-    Operation *replacedReturn = nullptr;
-
-    for (auto &op : block) {
-      // Calculate allocation size (existing allocation size plus new one)
-      if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
-        auto allocSize = alloc.getSize();
-        newBuilder.setInsertionPointAfter(alloc);
-        size = newBuilder.create<arith::AddIOp>(loc, size, allocSize);
-      }
-
-      // Return allocation size
-      if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
-        if (retOp != replacedReturn) {
-
-          newBuilder.setInsertionPointAfter(retOp);
-          auto newRet = newBuilder.create<func::ReturnOp>(loc, size);
-          replacedReturn = newRet;
-          used.push_back(newRet);
-        }
-      }
-    }
-
-    // Collect all ops needed for size calculation
-    SmallVector<Operation *> keep;
-    while (!used.empty()) {
-      auto *op = used.pop_back_val();
-      keep.push_back(op);
-      for (auto opnd : op->getOperands())
-        if (auto defOp = opnd.getDefiningOp())
-          used.push_back(defOp);
-    }
-
-    // Remove the rest of the ops
-    SmallVector<Operation *> toErase;
-    for (auto &op : block)
-      if (std::find(keep.begin(), keep.end(), &op) == keep.end())
-        toErase.push_back(&op);
-
-    for (auto &op : toErase) {
-      op->dropAllReferences();
-      op->dropAllUses();
-      op->erase();
-    }
-  };
-
-  // Process the function body
-  process(numQubitsFunc.getRegion().front());
-}
-
 static Value genConstant(OpBuilder &builder, const cudaq::state *v,
                          llvm::DataLayout &layout,
                          cudaq::opt::ArgumentConverter &converter) {
@@ -335,6 +110,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
   auto kernelName = converter.getKernelName();
   auto substMod = converter.getSubstitutionModule();
 
+
   // If the state has amplitude data, we materialize the data as a state
   // vector and create a new state from it.
   if (simState->hasData()) {
@@ -385,7 +161,6 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     return builder.create<quake::CreateStateOp>(loc, statePtrTy, buffer,
                                                 arrSize);
   }
-
   // Otherwise (ie quantum hardware, where getting the amplitude data is not
   // efficient) we aim at replacing states with calls to kernels (`callees`)
   // that generated them. This is done in three stages:
@@ -475,6 +250,7 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
    // }
    // ```
   // clang-format on
+
   if (simState->getKernelInfo().has_value()) {
     auto [calleeName, calleeArgs] = simState->getKernelInfo().value();
 
@@ -500,23 +276,23 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto numQubitsKernelName =
         cudaq::runtime::cudaqGenPrefixName + numQubitsName;
 
-    // Create `callee.init_N` and `callee.num_qubits_N` used to replace
-    // `quake.materialize_state` in ReplaceStateWithKernel pass
-    if (!converter.isRegisteredKernel(initName) ||
-        !converter.isRegisteredKernel(numQubitsName)) {
-      createInitFunc(builder, substMod, calleeFunc, initKernelName);
-      createNumQubitsFunc(builder, substMod, calleeFunc, numQubitsKernelName);
-
-      // Convert arguments for `callee.init_N`.
-      auto &initConverter =
-          cudaq::opt::createChildConverter(converter, initName);
-      initConverter.gen(calleeArgs);
-
-      // Convert arguments for `callee.num_qubits_N`.
-      auto &numQubitsConverter =
-          cudaq::opt::createChildConverter(converter, numQubitsName);
-      numQubitsConverter.gen(calleeArgs);
-    }
+    // // Create `callee.init_N` and `callee.num_qubits_N` used to replace
+    // // `quake.materialize_state` in ReplaceStateWithKernel pass
+    // if (!converter.isRegisteredKernel(initName) ||
+    //     !converter.isRegisteredKernel(numQubitsName)) {
+    //   createInitFunc(builder, substMod, calleeFunc, initKernelName);
+    //   createNumQubitsFunc(builder, substMod, calleeFunc, numQubitsKernelName);
+
+    //   // Convert arguments for `callee.init_N`.
+    //   auto &initConverter =
+    //       cudaq::opt::createChildConverter(converter, initName);
+    //   initConverter.gen(calleeArgs);
+
+    //   // Convert arguments for `callee.num_qubits_N`.
+    //   auto &numQubitsConverter =
+    //       cudaq::opt::createChildConverter(converter, numQubitsName);
+    //   numQubitsConverter.gen(calleeArgs);
+    // }
 
     // Create a substitution for the state pointer.
     auto statePtrTy =
@@ -699,20 +475,10 @@ Value genConstant(OpBuilder &builder, cudaq::cc::IndirectCallableType indCallTy,
 
 //===----------------------------------------------------------------------===//
 
-std::list<std::string> cudaq::opt::ArgumentConverter::emptyRegistry;
-
 cudaq::opt::ArgumentConverter::ArgumentConverter(StringRef kernelName,
                                                  ModuleOp sourceModule)
     : sourceModule(sourceModule), builder(sourceModule.getContext()),
-      kernelName(kernelName), kernelRegistry(emptyRegistry) {
-  substModule = builder.create<ModuleOp>(builder.getUnknownLoc());
-}
-
-cudaq::opt::ArgumentConverter::ArgumentConverter(
-    std::list<std::string> &kernelRegistry, StringRef kernelName,
-    ModuleOp sourceModule)
-    : sourceModule(sourceModule), builder(sourceModule.getContext()),
-      kernelName(kernelName), kernelRegistry(kernelRegistry) {
+      kernelName(kernelName) {
   substModule = builder.create<ModuleOp>(builder.getUnknownLoc());
 }
 
@@ -722,6 +488,7 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
 
   auto fun = sourceModule.lookupSymbol<func::FuncOp>(
       cudaq::runtime::cudaqGenPrefixName + kernelName.str());
+
   FunctionType fromFuncTy = fun.getFunctionType();
   for (auto iter :
        llvm::enumerate(llvm::zip(fromFuncTy.getInputs(), arguments))) {
@@ -842,11 +609,3 @@ void cudaq::opt::ArgumentConverter::gen_drop_front(
   }
   gen(partialArgs);
 }
-
-cudaq::opt::ArgumentConverter &
-cudaq::opt::createChildConverter(cudaq::opt::ArgumentConverter &parent,
-                                 std::string &calleeName) {
-  // Store the name in the kernel name cache before referencing it.
-  auto &name = parent.registerKernel(calleeName);
-  return parent.createCalleeConverter(name);
-}
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 2a95178ed1b..b876955385f 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -25,12 +25,6 @@ class ArgumentConverter {
   /// kernelName in \p sourceModule.
   ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule);
 
-  /// Build an instance to create argument substitutions for a specified \p
-  /// kernelName in \p sourceModule. Use \p kernelRegistry to store newly
-  /// generated functions.
-  ArgumentConverter(std::list<std::string> &kernelRegistry,
-                    mlir::StringRef kernelName, mlir::ModuleOp sourceModule);
-
   /// Generate a substitution ModuleOp for the vector of arguments presented.
   /// The arguments are those presented to the kernel, kernelName.
   void gen(const std::vector<void *> &arguments);
@@ -57,58 +51,12 @@ class ArgumentConverter {
   /// Kernel we are converting the arguments for.
   mlir::StringRef getKernelName() { return kernelName; }
 
-  /// Return child converters for functions created from kernel used in state
-  /// arguments.
-  std::vector<ArgumentConverter> &getCalleeConverters() {
-    return calleeConverters;
-  }
-
-  /// Is kernel name already created?
-  bool isRegisteredKernel(const std::string &kernelName) {
-    return std::find(kernelRegistry.begin(), kernelRegistry.end(),
-                     kernelName) != kernelRegistry.end();
-  }
-
-  /// Store kernel name in memory for newly created kernels.
-  const std::string &registerKernel(const std::string &kernelName) {
-    return kernelRegistry.emplace_back(kernelName);
-  }
-
 private:
-  /// Default registry to use when state synthesis is not needed.
-  static std::list<std::string> emptyRegistry;
-
-  /// Create a child converter for the new callee created from a
-  /// state argument.
-  ArgumentConverter &createCalleeConverter(mlir::StringRef calleeName) {
-    assert(&kernelRegistry != &emptyRegistry &&
-           "Argument converter is missing a kernel registry");
-    return calleeConverters.emplace_back(kernelRegistry, calleeName,
-                                         substModule);
-  }
-
   mlir::ModuleOp sourceModule;
   mlir::ModuleOp substModule;
   mlir::OpBuilder builder;
   mlir::StringRef kernelName;
   mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
-
-  /// Converters for functions created during state argument conversion.
-  std::vector<ArgumentConverter> calleeConverters;
-
-  /// Keeps new kernel names created during argument conversion in memory.
-  /// References to the names are used by the argument converters for
-  /// their kernels.
-  /// NOTE: use `std::list` to make sure we always return valid references
-  /// when registering new kernel names, as the references are taken while
-  /// the list is growing.
-  std::list<std::string> &kernelRegistry;
-
-  friend ArgumentConverter &createChildConverter(ArgumentConverter &parent,
-                                                 std::string &calleeName);
 };
 
-ArgumentConverter &createChildConverter(ArgumentConverter &parent,
-                                        std::string &calleeName);
-
 } // namespace cudaq::opt
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 8424aa9999e..4b12d396126 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "common/ArgumentConversion.h"
+#include "common/StateAggregator.h"
 #include "common/Environment.h"
 #include "common/ExecutionContext.h"
 #include "common/Executor.h"
@@ -454,40 +455,69 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
         cudaq::info("Run Argument Synth.\n");
-        std::list<std::string> kernelRegistry;
-        opt::ArgumentConverter argCon(kernelRegistry, kernelName, moduleOp);
-        argCon.gen(rawArgs);
-
-        // For quantum devices, we've created a tree of ArgumentConverters
+        // For quantum devices, create a list of ArgumentConverters
         // with nodes corresponding to `init` and `num_qubits` functions
         // created from a kernel that generated the state argument.
         // Traverse the tree and collect substitutions for all those
         // functions.
+        cudaq::opt::StateAggregator aggregator;
+        aggregator.collect(moduleOp, kernelName, rawArgs);
 
         // Store kernel and substitution strings on the stack.
         // We pass string references to the `createArgumentSynthesisPass`.
         mlir::SmallVector<std::string> kernels;
         mlir::SmallVector<std::string> substs;
+        for (auto &kInfo : aggregator.getKernelInfo()) {
+          auto con = kInfo.converter;
+          con.gen(kInfo.args);
+          {
+            auto name = con.getKernelName();
+            std::string kernName =
+                cudaq::runtime::cudaqGenPrefixName + name.str();
+            kernels.emplace_back(kernName);
+          }
+          {
+            std::string substBuff;
+            llvm::raw_string_ostream ss(substBuff);
+            ss << con.getSubstitutionModule();
+            substs.emplace_back(substBuff);
+          }
+        }
 
-        std::function<void(opt::ArgumentConverter &)> collect =
-            [&kernels, &substs, &collect](opt::ArgumentConverter &con) {
-              {
-                auto name = con.getKernelName();
-                std::string kernName =
-                    cudaq::runtime::cudaqGenPrefixName + name.str();
-                kernels.emplace_back(kernName);
-              }
-              {
-                std::string substBuff;
-                llvm::raw_string_ostream ss(substBuff);
-                ss << con.getSubstitutionModule();
-                substs.emplace_back(substBuff);
-              }
-
-              for (auto &calleeCon : con.getCalleeConverters())
-                collect(calleeCon);
-            };
-        collect(argCon);
+        // std::list<std::string> kernelRegistry;
+        // opt::ArgumentConverter argCon(kernelRegistry, kernelName, moduleOp);
+        // argCon.gen(rawArgs);
+
+        // // For quantum devices, we've created a tree of ArgumentConverters
+        // // with nodes corresponding to `init` and `num_qubits` functions
+        // // created from a kernel that generated the state argument.
+        // // Traverse the tree and collect substitutions for all those
+        // // functions.
+
+        // // Store kernel and substitution strings on the stack.
+        // // We pass string references to the `createArgumentSynthesisPass`.
+        // mlir::SmallVector<std::string> kernels;
+        // mlir::SmallVector<std::string> substs;
+
+        // std::function<void(opt::ArgumentConverter &)> collect =
+        //     [&kernels, &substs, &collect](opt::ArgumentConverter &con) {
+        //       {
+        //         auto name = con.getKernelName();
+        //         std::string kernName =
+        //             cudaq::runtime::cudaqGenPrefixName + name.str();
+        //         kernels.emplace_back(kernName);
+        //       }
+        //       {
+        //         std::string substBuff;
+        //         llvm::raw_string_ostream ss(substBuff);
+        //         ss << con.getSubstitutionModule();
+        //         substs.emplace_back(substBuff);
+        //       }
+
+        //       for (auto &calleeCon : con.getCalleeConverters())
+        //         collect(calleeCon);
+        //     };
+        // collect(argCon);
 
         // Collect references for the argument synthesis.
         mlir::SmallVector<mlir::StringRef> funcNames{kernels.begin(),
diff --git a/runtime/common/CMakeLists.txt b/runtime/common/CMakeLists.txt
index 3d6061f4ef0..8567416bf4a 100644
--- a/runtime/common/CMakeLists.txt
+++ b/runtime/common/CMakeLists.txt
@@ -90,6 +90,7 @@ add_library(cudaq-mlir-runtime
     JIT.cpp
     Logger.cpp
     RuntimeMLIR.cpp
+    StateAggregator.cpp
 )
 set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS cudaq-mlir-runtime)
 
diff --git a/runtime/common/StateAggregator.cpp b/runtime/common/StateAggregator.cpp
new file mode 100644
index 00000000000..80f6d30e0d0
--- /dev/null
+++ b/runtime/common/StateAggregator.cpp
@@ -0,0 +1,422 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+ #include "StateAggregator.h"
+ #include "cudaq.h"
+ #include "cudaq/Optimizer/Builder/Intrinsics.h"
+ #include "cudaq/Optimizer/Builder/Runtime.h"
+ #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+ #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
+ #include "cudaq/Todo.h"
+ #include "cudaq/qis/pauli_word.h"
+ #include "cudaq/utils/registry.h"
+ #include "llvm/ADT/TypeSwitch.h"
+ #include "mlir/Dialect/Arith/IR/Arith.h"
+ #include "mlir/Dialect/Complex/IR/Complex.h"
+ #include "mlir/IR/BuiltinAttributes.h"
+ #include "mlir/Parser/Parser.h"
+
+ #include <iostream>
+ 
+ using namespace mlir;
+ 
+ /// Create callee.init_N: a clone of the callee that initializes a veq passed
+ /// in as a trailing argument. Callee (the kernel captured by state):
+ // clang-format off
+ /// func.func @callee(%arg0: i64) {
+ ///   %0 = cc.alloca i64
+ ///   cc.store %arg0, %0 : !cc.ptr<i64>
+ ///   %1 = cc.load %0 : !cc.ptr<i64>
+ ///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+ ///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
+ ///   quake.x %3 : (!quake.ref) -> ()
+ ///   return
+ /// }
+ /// callee.init_N:
+ /// func.func private @callee.init_0(%arg0: i64, %arg1: !quake.veq<?>) ->
+ /// !quake.veq<?> {
+ ///   %1 = quake.extract_ref %arg1[1] : (!quake.veq<?>) -> !quake.ref
+ ///   quake.x %1 : (!quake.ref) -> ()
+ ///   return %arg1 : !quake.veq<?>
+ /// }
+ // clang-format on
+ static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
+                            func::FuncOp calleeFunc, StringRef initKernelName) {
+   OpBuilder::InsertionGuard guard(builder);
+   builder.setInsertionPointToEnd(moduleOp.getBody());
+ 
+   auto ctx = builder.getContext();
+   auto loc = builder.getUnknownLoc();
+ 
+   auto initFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+ 
+   auto argTypes = calleeFunc.getArgumentTypes();
+   auto retTy = quake::VeqType::getUnsized(ctx);
+   auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retTy});
+ 
+   initFunc.setName(initKernelName);
+   initFunc.setType(funcTy);
+   initFunc.setPrivate();
+ 
+   OpBuilder newBuilder(ctx);
+ 
+   auto *entryBlock = &initFunc.getRegion().front();
+   newBuilder.setInsertionPointToStart(entryBlock);
+   Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, 64);
+   Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, 64);
+   Value begin = zero;
+   // The veq to initialize is appended after the callee's own arguments.
+   auto argPos = initFunc.getArguments().size();
+ 
+   // Detect errors in kernel passed to get_state.
+   std::function<void(Block &)> processInner = [&](Block &block) {
+     for (auto &op : block) {
+       for (auto &region : op.getRegions())
+         for (auto &b : region)
+           processInner(b);
+ 
+       // Don't allow returns in inner scopes
+       if (isa<func::ReturnOp>(&op))
+         calleeFunc.emitError("Encountered return in inner scope in a kernel "
+                              "passed to get_state");
+     }
+   };
+ 
+   for (auto &op : calleeFunc.getRegion().front())
+     for (auto &region : op.getRegions())
+       for (auto &b : region)
+         processInner(b);
+ 
+   // Process outer block to initialize the allocation passed as an argument.
+   std::function<void(Block &)> process = [&](Block &block) {
+     SmallVector<Operation *> cleanUps;
+     Operation *replacedReturn = nullptr;
+ 
+     Value arg;
+     Value blockBegin = begin;
+     Value blockAllocSize = zero;
+     for (auto &op : block) {
+       if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
+         newBuilder.setInsertionPointAfter(alloc);
+         if (!arg) {
+           initFunc.insertArgument(argPos, retTy, {}, loc);
+           arg = initFunc.getArgument(argPos);
+         }
+         auto allocSize = alloc.getSize();
+         // `quake.subveq` bounds are inclusive: this allocation maps to
+         // [begin, begin + allocSize - 1] of the veq argument.
+         Value end = newBuilder.create<arith::AddIOp>(loc, begin, allocSize);
+         Value last = newBuilder.create<arith::SubIOp>(loc, end, one);
+         Value subArg =
+             newBuilder.create<quake::SubVeqOp>(loc, retTy, arg, begin, last);
+         alloc.replaceAllUsesWith(subArg);
+         cleanUps.push_back(alloc);
+         begin = end;
+         blockAllocSize =
+             newBuilder.create<arith::AddIOp>(loc, blockAllocSize, allocSize);
+       }
+ 
+       if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
+         if (retOp != replacedReturn) {
+           newBuilder.setInsertionPointAfter(retOp);
+           // `arg` only exists once an allocation was seen; check it before
+           // it is used as an operand below.
+           assert(arg && "No veq allocations found");
+           auto offset =
+               newBuilder.create<arith::SubIOp>(loc, blockAllocSize, one);
+           Value ret = newBuilder.create<quake::SubVeqOp>(loc, retTy, arg,
+                                                          blockBegin, offset);
+           replacedReturn = newBuilder.create<func::ReturnOp>(loc, ret);
+           cleanUps.push_back(retOp);
+         }
+       }
+     }
+ 
+     for (auto &op : cleanUps) {
+       op->dropAllReferences();
+       op->dropAllUses();
+       op->erase();
+     }
+   };
+ 
+   // Process the function body
+   process(initFunc.getRegion().front());
+ }
+ 
+ /// Create callee.num_qubits_N, a clone of the callee that returns the total
+ /// number of qubits the callee allocates.
+ /// Callee: (the kernel captured by state):
+ // clang-format off
+ /// func.func @callee(%arg0: i64) {
+ ///   %0 = cc.alloca i64
+ ///   cc.store %arg0, %0 : !cc.ptr<i64>
+ ///   %1 = cc.load %0 : !cc.ptr<i64>
+ ///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+ ///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
+ ///   quake.x %3 : (!quake.ref) -> ()
+ ///   return
+ /// }
+ ///
+ /// callee.num_qubits_0:
+ /// func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
+ ///   %0 = cc.alloca i64
+ ///   cc.store %arg0, %0 : !cc.ptr<i64>
+ ///   %1 = cc.load %0 : !cc.ptr<i64>
+ ///   return %1 : i64
+ /// }
+ // clang-format on
+ static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
+                                 func::FuncOp calleeFunc,
+                                 StringRef numQubitsKernelName) {
+   OpBuilder::InsertionGuard guard(builder);
+   builder.setInsertionPointToEnd(moduleOp.getBody());
+ 
+   auto ctx = builder.getContext();
+   auto loc = builder.getUnknownLoc();
+ 
+   auto numQubitsFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+ 
+   auto argTypes = calleeFunc.getArgumentTypes();
+   auto retType = builder.getI64Type();
+   auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retType});
+ 
+   numQubitsFunc.setName(numQubitsKernelName);
+   numQubitsFunc.setType(funcTy);
+   numQubitsFunc.setPrivate();
+ 
+   OpBuilder newBuilder(ctx);
+ 
+   auto *entryBlock = &numQubitsFunc.getRegion().front();
+   newBuilder.setInsertionPointToStart(entryBlock);
+   Value size = newBuilder.create<arith::ConstantIntOp>(loc, 0, retType);
+ 
+   // Process the entry block (no recursion into nested regions): sum all
+   // allocation sizes, return the sum, and remove everything else.
+   std::function<void(Block &)> process = [&](Block &block) {
+     SmallVector<Operation *> used;
+     Operation *replacedReturn = nullptr;
+ 
+     for (auto &op : block) {
+       // Accumulate the allocation size (running total plus this allocation)
+       if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
+         auto allocSize = alloc.getSize();
+         newBuilder.setInsertionPointAfter(alloc);
+         size = newBuilder.create<arith::AddIOp>(loc, size, allocSize);
+       }
+ 
+       // Return the accumulated size; skip the return inserted by this loop
+       if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
+         if (retOp != replacedReturn) {
+ 
+           newBuilder.setInsertionPointAfter(retOp);
+           auto newRet = newBuilder.create<func::ReturnOp>(loc, size);
+           replacedReturn = newRet;
+           used.push_back(newRet);
+         }
+       }
+     }
+ 
+     // Transitively collect the ops that define operands of the new return
+     SmallVector<Operation *> keep;
+     while (!used.empty()) {
+       auto *op = used.pop_back_val();
+       keep.push_back(op);
+       for (auto opnd : op->getOperands())
+         if (auto defOp = opnd.getDefiningOp())
+           used.push_back(defOp);
+     }
+ 
+     // Erase every other op in the block (including the original return)
+     SmallVector<Operation *> toErase;
+     for (auto &op : block)
+       if (std::find(keep.begin(), keep.end(), &op) == keep.end())
+         toErase.push_back(&op);
+ 
+     for (auto &op : toErase) {
+       op->dropAllReferences();
+       op->dropAllUses();
+       op->erase();
+     }
+   };
+ 
+   // Process the function body
+   process(numQubitsFunc.getRegion().front());
+ }
+ 
+ /// Generate the init/num_qubits kernels for a state without amplitude data.
+ void cudaq::opt::StateAggregator::collectKernelInfo(ModuleOp moduleOp, const cudaq::state *v) {
+   auto simState =
+       cudaq::state_helper::getSimulationState(const_cast<cudaq::state *>(v));
+ 
+   // If the state has amplitude data, we materialize the data as a state
+   // vector and create a new state from it in the ArgumentConverter.
+   // TODO: add an option to use the kernel info if available, i.e. for remote
+   // simulators, and an option of storing the kernel info on simulators if
+   // preferred, i.e. to support synthesis of density matrices.
+   if (simState->hasData()) {
+     return;
+   }
+ 
+   // Otherwise (i.e. quantum hardware, where getting the amplitude data is not
+   // efficient) we aim at replacing states with calls to kernels (`callees`)
+   // that generated them. This is done in three stages:
+   //
+   // 1) (done here) Generate `@callee.num_qubits_0` and `@callee.init_0` for
+   //    the callee function and its arguments stored in a state.
+ 
+   //    Create two functions:
+   //      - callee.num_qubits_N
+   //        Calculates the number of qubits needed for the veq allocation
+   //      - callee.init_N
+   //        Initializes the veq passed as a parameter
+   //
+   // 2) (done in ArgumentConverter) Replace the state with
+   //   `quake.get_state @callee.num_qubits_0 @callee.init_0`:
+   //
+   // clang-format off
+   // ```
+   // func.func @caller(%arg0: !cc.ptr<!cc.state>) {
+   //   %1 = quake.get_number_of_qubits %arg0: (!cc.ptr<!cc.state>) -> i64
+   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+   //   %3 = quake.init_state %2, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+   //   return
+   // }
+   //
+   // func.func private @callee(%arg0: i64) {
+   //   %0 = quake.alloca !quake.veq<?>[%arg0 : i64]
+   //   %1 = quake.extract_ref %0[0] : (!quake.veq<?>) -> !quake.ref
+   //   quake.x %1 : (!quake.ref) -> ()
+   //   return
+   // }
+   //
+   // Call from the user host code:
+   // state = cudaq.get_state(callee, 2)
+   // counts = cudaq.sample(caller, state)
+   // ```
+   // clang-format on
+   //
+   // => after argument synthesis:
+   //
+   // clang-format off
+   // ```
+   // func.func @caller() {
+   //   %0 = quake.get_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+   //   %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
+   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+   //   %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+   //   return
+   // }
+   //
+   // func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
+   //   return %arg0 : i64
+   // }
+   //
+   // func.func private @callee.init_0(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+   //   %1 = quake.extract_ref %arg1[0] : (!quake.veq<?>) -> !quake.ref
+   //   quake.x %1 : (!quake.ref) -> ()
+   //   return %arg1 : !quake.veq<?>
+   // }
+   // ```
+   // clang-format on
+   //
+   // 3) (done in ReplaceStateWithKernel) Replace the `quake.get_state` and ops
+   // that use its state with calls to the generated functions, synthesized with
+   // the arguments used to create the original state:
+   //
+   // After ReplaceStateWithKernel pass:
+   //
+   // clang-format off
+   // ```
+   // func.func @caller() {
+   //   %1 = call @callee.num_qubits_0() : () -> i64
+   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+   //   %3 = call @callee.init_0(%2): (!quake.veq<?>) -> !quake.veq<?>
+   // }
+   //
+   // func.func private @callee.num_qubits_0() -> i64 {
+   //   %cst = arith.constant 2 : i64
+   //   return %cst : i64
+   // }
+   //
+   // func.func private @callee.init_0(%arg0: !quake.veq<?>) -> !quake.veq<?> {
+   //   %cst = arith.constant 1.5707963267948966 : f64
+   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<?>) -> !quake.ref
+   //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
+   //   return %arg0 : !quake.veq<?>
+   // }
+   // ```
+   // clang-format on
+   if (simState->getKernelInfo().has_value()) {
+     auto [calleeName, calleeArgs] = simState->getKernelInfo().value();
+ 
+     std::string calleeKernelName =
+         cudaq::runtime::cudaqGenPrefixName + calleeName;
+ 
+     auto builder = IRBuilder(moduleOp);
+     auto ctx = builder.getContext();
+ 
+     auto code = cudaq::get_quake_by_name(calleeName, /*throwException=*/false);
+     assert(!code.empty() && "Quake code not found for callee");
+     auto fromModule = parseSourceString<ModuleOp>(code, ctx);
+ 
+     auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
+     assert(calleeFunc && "callee func is missing");
+ 
+     // Use the state pointer value as a name suffix for the new functions
+     // so we can look them up later in ArgumentConverter.
+     auto hash = std::to_string(reinterpret_cast<std::size_t>(v));
+     auto initName = calleeName + ".init_" + hash;
+     auto numQubitsName = calleeName + ".num_qubits_" + hash;
+ 
+     if (!hasKernelInfo(initName) && !hasKernelInfo(numQubitsName)) {
+       auto initKernelName = cudaq::runtime::cudaqGenPrefixName + initName;
+       auto numQubitsKernelName =
+           cudaq::runtime::cudaqGenPrefixName + numQubitsName;
+ 
+       // Create `callee.init_N` and `callee.num_qubits_N` functions used to
+       // replace `quake.get_state` later in ReplaceStateWithKernel pass
+       createInitFunc(builder, moduleOp, calleeFunc, initKernelName);
+       createNumQubitsFunc(builder, moduleOp, calleeFunc, numQubitsKernelName);
+ 
+       // Register both new kernels and recurse into their state arguments
+       collect(moduleOp, initName, calleeArgs);
+       collect(moduleOp, numQubitsName, calleeArgs);
+     }
+     return;
+   }
+ 
+   TODO("cudaq::state* argument synthesis for quantum hardware for c functions");
+ }
+ 
+ //===----------------------------------------------------------------------===//
+
+
+ void cudaq::opt::StateAggregator::collect(ModuleOp moduleOp,
+     const std::string& kernelName, const std::vector<void *> &arguments) {
+   // Register the kernel first; its converter owns the substitution module.
+   auto &kernelEntry = addKernelInfo(moduleOp, kernelName, arguments);
+   auto substitutions = kernelEntry.converter.getSubstitutionModule();
+   auto *context = moduleOp.getContext();
+ 
+   auto kernelFunc = moduleOp.lookupSymbol<func::FuncOp>(
+       cudaq::runtime::cudaqGenPrefixName + kernelName);
+   assert(kernelFunc && "callee func is missing in state aggregator");
+ 
+   // Walk the formal parameter types and the actual argument pointers in
+   // lockstep. Only non-null arguments whose parameter type is
+   // `!cc.ptr<!cc.state>` need kernel info collected for them.
+   for (auto [paramTy, rawArg] :
+        llvm::zip(kernelFunc.getFunctionType().getInputs(), arguments)) {
+     if (!rawArg)
+       continue;
+     auto ptrTy = dyn_cast<cc::PointerType>(paramTy);
+     if (!ptrTy || ptrTy.getElementType() != cc::StateType::get(context))
+       continue;
+     collectKernelInfo(substitutions, static_cast<const state *>(rawArg));
+   }
+ }
\ No newline at end of file
diff --git a/runtime/common/StateAggregator.h b/runtime/common/StateAggregator.h
new file mode 100644
index 00000000000..69dd1ca621f
--- /dev/null
+++ b/runtime/common/StateAggregator.h
@@ -0,0 +1,65 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+ #pragma once
+
+ #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
+ #include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
+ #include "cudaq/qis/state.h"
+ #include "mlir/IR/Builders.h"
+ #include "mlir/IR/Types.h"
+ #include <list>
+ #include <unordered_set>
+ #include <vector>
+ #include "ArgumentConversion.h"
+ 
+ namespace cudaq::opt {
+  struct KernelInfo { // One entry per kernel added via addKernelInfo().
+    ArgumentConverter converter; // Converter constructed for this kernel.
+    const std::vector<void *> args; // Argument pointers given for the kernel.
+  };
+
+ class StateAggregator {
+ public:
+   /// Create an empty state aggregator; kernels are added via `collect()`.
+   StateAggregator(){}
+ 
+   /// Register \p kernelName and collect kernel info for its state arguments.
+   void collect(mlir::ModuleOp moduleOp, const std::string& kernelName,
+                const std::vector<void *> &arguments);
+ 
+   /// Get the list of kernel info entries (converter plus arguments) that
+   /// were collected by `collect()`.
+   std::list<KernelInfo>& getKernelInfo() {
+     return kernelInfo;
+   }
+ 
+ private:
+   /// Create the init/num_qubits kernels for state \p v if it has kernel info.
+   void collectKernelInfo(mlir::ModuleOp moduleOp, const cudaq::state *v);
+   /// Return true if \p kernelName is already present in `nameRegistry`.
+   bool hasKernelInfo(const std::string &kernelName) {
+     return std::find(nameRegistry.begin(), nameRegistry.end(), kernelName) != nameRegistry.end();
+   }
+   /// Store \p kernelName and append a `KernelInfo` whose converter uses it.
+   KernelInfo& addKernelInfo(mlir::ModuleOp moduleOp, const std::string &kernelName,
+                      const std::vector<void *> &args) {
+    auto &name = nameRegistry.emplace_back(kernelName);
+    return kernelInfo.emplace_back(std::move(ArgumentConverter(name, moduleOp)), args);
+   }
+ 
+ private:
+   /// Memory to store new kernel names generated during argument conversion.
+   std::list<std::string> nameRegistry;
+
+   /// Kernel info for kernels we are converting the arguments for, including
+   /// new kernels generated from state arguments.
+   std::list<KernelInfo> kernelInfo;
+ };
+ 
+ } // namespace cudaq::opt
\ No newline at end of file
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 246802eb355..5ab571f46fb 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -12,6 +12,7 @@
 // RUN: test_argument_conversion | FileCheck %s
 
 #include "common/ArgumentConversion.h"
+#include "common/StateAggregator.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/InitAllDialects.h"
@@ -142,20 +143,12 @@ class FakeDeviceState : public cudaq::SimulationState {
 
 extern "C" void __cudaq_deviceCodeHolderAdd(const char *, const char *);
 
-void dumpSubstitutionModules(cudaq::opt::ArgumentConverter &ab) {
-  std::function<void(cudaq::opt::ArgumentConverter &)> dump =
-      [&dump](cudaq::opt::ArgumentConverter &con) {
-        // Dump the conversions
-        llvm::outs() << "========================================\n"
-                        "Substitution module:\n"
-                     << con.getKernelName() << "\n"
-                     << con.getSubstitutionModule() << '\n';
-
-        for (auto &calleeCon : con.getCalleeConverters())
-          dump(calleeCon);
-      };
-
-  dump(ab);
+void dumpSubstitutionModule(cudaq::opt::ArgumentConverter &con) {
+  // Print a separator, the kernel name, and the substitution module.
+  llvm::outs() << "========================================\n"
+                  "Substitution module:\n"
+               << con.getKernelName() << "\n"
+               << con.getSubstitutionModule() << '\n';
 }
 
 void doSimpleTest(mlir::MLIRContext *ctx, const std::string &typeName,
@@ -178,7 +171,38 @@ func.func @__nvqpp__mlirgen__testy(%0: )#" +
   // Create the argument conversions
   ab.gen(args);
   // Dump all conversions
-  dumpSubstitutionModules(ab);
+  dumpSubstitutionModule(ab);
+}
+
+// Collect state kernels for `testy` and dump each substitution module.
+void doStateAggregationTest(mlir::MLIRContext *ctx, const std::string &typeName,
+  std::vector<void *> args,
+  const std::string &additionalCode = "") {
+std::string code = additionalCode + R"#(
+func.func private @callee(%0: )#" +
+     typeName + R"#()
+func.func @__nvqpp__mlirgen__testy(%0: )#" +
+     typeName + R"#() {
+call @callee(%0) : ()#" +
+     typeName + R"#() -> ()
+return
+})#";
+
+// Parse the generated source into a module and echo it
+auto mod = mlir::parseSourceString<mlir::ModuleOp>(code, ctx);
+llvm::outs() << "Source module:\n" << *mod << '\n';
+
+  // Register `testy` and collect kernel info for its state arguments
+  cudaq::opt::StateAggregator sa;
+  sa.collect(*mod, "testy", args);
+
+  for (auto &kInfo : sa.getKernelInfo()) {
+    cudaq::opt::ArgumentConverter &cab = kInfo.converter;
+    // Generate the argument substitutions for this collected kernel
+    cab.gen(kInfo.args);
+    // Dump the resulting substitution module
+    dumpSubstitutionModule(cab);
+  }
 }
 
 void doTest(mlir::MLIRContext *ctx, std::vector<std::string> &typeNames,
@@ -221,7 +245,7 @@ void doTest(mlir::MLIRContext *ctx, std::vector<std::string> &typeNames,
   // Create the argument conversions
   ab.gen_drop_front(args, startingArgIdx);
   // Dump all conversions
-  dumpSubstitutionModules(ab);
+  dumpSubstitutionModule(ab);
 }
 
 void test_scalars(mlir::MLIRContext *ctx) {
@@ -544,7 +568,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     std::vector<void *> a = {static_cast<void *>(&n)};
     auto s = cudaq::state(new FakeDeviceState(init, a));
     std::vector<void *> v = {static_cast<void *>(&s)};
-    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
+    doStateAggregationTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
   }
 
   // clang-format off
@@ -645,7 +669,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     std::vector<void *> v1 = {static_cast<void *>(&s1)};
 
     auto code = std::string{initCode} + std::string{stateParamCode};
-    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v1, code);
+    doStateAggregationTest(ctx, "!cc.ptr<!cc.state>", v1, code);
   }
 
   // clang-format off
@@ -775,7 +799,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     std::vector<void *> a = {static_cast<void *>(&n)};
     auto s = cudaq::state(new FakeDeviceState(init, a));
     std::vector<void *> v = {static_cast<void *>(&s)};
-    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
+    doStateAggregationTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
   }
 
   // clang-format off

From 6c0dd7d36c64c24dbd1c414ed32f55d0db09e9b0 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 25 Feb 2025 12:05:07 -0800
Subject: [PATCH 41/54] Make ArgumentConverter handle the state call tree

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  37 +-
 runtime/common/ArgumentConversion.cpp         | 273 ++++++++++-
 runtime/common/ArgumentConversion.h           |  70 ++-
 runtime/common/BaseRemoteRESTQPU.h            |  64 +--
 runtime/common/BaseRestRemoteClient.h         |  35 +-
 runtime/common/CMakeLists.txt                 |   1 -
 runtime/common/StateAggregator.cpp            | 422 ------------------
 runtime/common/StateAggregator.h              |  65 ---
 runtime/test/test_argument_conversion.cpp     |  56 +--
 9 files changed, 394 insertions(+), 629 deletions(-)
 delete mode 100644 runtime/common/StateAggregator.cpp
 delete mode 100644 runtime/common/StateAggregator.h

diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 083b31e4dde..3f15beac689 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -543,18 +543,39 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   auto isLocalSimulator = platform.is_simulator() && !platform.is_emulated();
   auto isSimulator = isLocalSimulator || isRemoteSimulator;
 
-  cudaq::opt::ArgumentConverter argCon(name, unwrap(module));
+  auto argCon = cudaq::opt::ArgumentConverter(name, unwrap(module));
   argCon.gen(runtimeArgs.getArgs());
-  std::string kernName = cudaq::runtime::cudaqGenPrefixName + name;
-  SmallVector<StringRef> kernels = {kernName};
-  std::string substBuff;
-  llvm::raw_string_ostream ss(substBuff);
-  ss << argCon.getSubstitutionModule();
-  SmallVector<StringRef> substs = {substBuff};
+
+  // Store kernel and substitution strings on the stack.
+  // We pass string references to the `createArgumentSynthesisPass`.
+  mlir::SmallVector<std::string> kernels;
+  mlir::SmallVector<std::string> substs;
+  for (auto &[kName, kInfo] : argCon.getKernelInfo()) {
+    {
+      std::string kernName =
+          cudaq::runtime::cudaqGenPrefixName + kName.str();
+      kernels.emplace_back(kernName);
+    }
+    {
+      std::string substBuff;
+      llvm::raw_string_ostream ss(substBuff);
+      ss << kInfo.getSubstitutionModule();
+      substs.emplace_back(substBuff);
+    }
+  }
+
+  // Collect references for the argument synthesis.
+  mlir::SmallVector<mlir::StringRef> kernelRefs{kernels.begin(),
+                                               kernels.end()};
+  mlir::SmallVector<mlir::StringRef> substRefs{substs.begin(),
+                                                   substs.end()};
+
   PassManager pm(context);
-  pm.addPass(opt::createArgumentSynthesisPass(kernels, substs));
+  pm.addPass(opt::createArgumentSynthesisPass(kernelRefs, substRefs));
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addPass(opt::createDeleteStates());
+  pm.addNestedPass<mlir::func::FuncOp>(opt::createReplaceStateWithKernel());
+  pm.addPass(mlir::createSymbolDCEPass());
 
   // Run state preparation for quantum devices (or their emulation) only.
   // Simulators have direct implementation of state initialization
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 1fe8d9747d5..3b0efa4fe70 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -101,15 +101,238 @@ static Value genConstant(OpBuilder &, cudaq::cc::StructType, void *,
 static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
                          ModuleOp substMod, llvm::DataLayout &);
 
+
+ /// Create callee.init_N: a clone of the callee that initializes a veq passed
+ /// in as a trailing argument. Callee (the kernel captured by state):
+ // clang-format off
+ /// func.func @callee(%arg0: i64) {
+ ///   %0 = cc.alloca i64
+ ///   cc.store %arg0, %0 : !cc.ptr<i64>
+ ///   %1 = cc.load %0 : !cc.ptr<i64>
+ ///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+ ///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
+ ///   quake.x %3 : (!quake.ref) -> ()
+ ///   return
+ /// }
+ /// callee.init_N:
+ /// func.func private @callee.init_0(%arg0: i64, %arg1: !quake.veq<?>) ->
+ /// !quake.veq<?> {
+ ///   %1 = quake.extract_ref %arg1[1] : (!quake.veq<?>) -> !quake.ref
+ ///   quake.x %1 : (!quake.ref) -> ()
+ ///   return %arg1 : !quake.veq<?>
+ /// }
+ // clang-format on
+static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
+                           func::FuncOp calleeFunc, StringRef initKernelName) {
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToEnd(moduleOp.getBody());
+
+  auto ctx = builder.getContext();
+  auto loc = builder.getUnknownLoc();
+
+  auto initFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+
+  auto argTypes = calleeFunc.getArgumentTypes();
+  auto retTy = quake::VeqType::getUnsized(ctx);
+  auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retTy});
+
+  initFunc.setName(initKernelName);
+  initFunc.setType(funcTy);
+  initFunc.setPrivate();
+
+  OpBuilder newBuilder(ctx);
+
+  auto *entryBlock = &initFunc.getRegion().front();
+  newBuilder.setInsertionPointToStart(entryBlock);
+  Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, 64);
+  Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, 64);
+  Value begin = zero;
+  // The veq to initialize is appended after the callee's own arguments.
+  auto argPos = initFunc.getArguments().size();
+
+  // Detect errors in kernel passed to get_state.
+  std::function<void(Block &)> processInner = [&](Block &block) {
+    for (auto &op : block) {
+      for (auto &region : op.getRegions())
+        for (auto &b : region)
+          processInner(b);
+
+      // Don't allow returns in inner scopes
+      if (isa<func::ReturnOp>(&op))
+        calleeFunc.emitError("Encountered return in inner scope in a kernel "
+                             "passed to get_state");
+    }
+  };
+
+  for (auto &op : calleeFunc.getRegion().front())
+    for (auto &region : op.getRegions())
+      for (auto &b : region)
+        processInner(b);
+
+  // Process outer block to initialize the allocation passed as an argument.
+  std::function<void(Block &)> process = [&](Block &block) {
+    SmallVector<Operation *> cleanUps;
+    Operation *replacedReturn = nullptr;
+
+    Value arg;
+    Value blockBegin = begin;
+    Value blockAllocSize = zero;
+    for (auto &op : block) {
+      if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
+        newBuilder.setInsertionPointAfter(alloc);
+        if (!arg) {
+          initFunc.insertArgument(argPos, retTy, {}, loc);
+          arg = initFunc.getArgument(argPos);
+        }
+        auto allocSize = alloc.getSize();
+        // `quake.subveq` bounds are inclusive: this allocation maps to
+        // [begin, begin + allocSize - 1] of the veq argument.
+        Value end = newBuilder.create<arith::AddIOp>(loc, begin, allocSize);
+        Value last = newBuilder.create<arith::SubIOp>(loc, end, one);
+        Value subArg =
+            newBuilder.create<quake::SubVeqOp>(loc, retTy, arg, begin, last);
+        alloc.replaceAllUsesWith(subArg);
+        cleanUps.push_back(alloc);
+        begin = end;
+        blockAllocSize =
+            newBuilder.create<arith::AddIOp>(loc, blockAllocSize, allocSize);
+      }
+
+      if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
+        if (retOp != replacedReturn) {
+          newBuilder.setInsertionPointAfter(retOp);
+          // `arg` only exists once an allocation was seen; check it before
+          // it is used as an operand below.
+          assert(arg && "No veq allocations found");
+          auto offset =
+              newBuilder.create<arith::SubIOp>(loc, blockAllocSize, one);
+          Value ret = newBuilder.create<quake::SubVeqOp>(loc, retTy, arg,
+                                                         blockBegin, offset);
+          replacedReturn = newBuilder.create<func::ReturnOp>(loc, ret);
+          cleanUps.push_back(retOp);
+        }
+      }
+    }
+
+    for (auto &op : cleanUps) {
+      op->dropAllReferences();
+      op->dropAllUses();
+      op->erase();
+    }
+  };
+
+  // Process the function body
+  process(initFunc.getRegion().front());
+}
+
+/// Build callee.num_qubits_N: a private clone of the callee that returns the
+/// total number of qubits the callee allocates.
+/// The clone is appended to \p moduleOp and named \p numQubitsKernelName;
+/// \p builder's insertion point is left unchanged.
+/// Callee: (the kernel captured by state):
+// clang-format off
+/// func.func @callee(%arg0: i64) {
+///   %0 = cc.alloca i64
+///   cc.store %arg0, %0 : !cc.ptr<i64>
+///   %1 = cc.load %0 : !cc.ptr<i64>
+///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
+///   quake.x %3 : (!quake.ref) -> ()
+///   return
+/// }
+///
+/// callee.num_qubits_0:
+/// func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
+///   %0 = cc.alloca i64
+///   cc.store %arg0, %0 : !cc.ptr<i64>
+///   %1 = cc.load %0 : !cc.ptr<i64>
+///   return %1 : i64
+/// }
+// clang-format on
+static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
+                                func::FuncOp calleeFunc,
+                                StringRef numQubitsKernelName) {
+  // Clone the callee at the end of the module; the guard restores the
+  // caller's insertion point when this function returns.
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToEnd(moduleOp.getBody());
+
+  auto *context = builder.getContext();
+  auto unknownLoc = builder.getUnknownLoc();
+  auto i64Ty = builder.getI64Type();
+
+  auto sizeFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+
+  // Same inputs as the callee, but the single result is the i64 qubit count.
+  sizeFunc.setName(numQubitsKernelName);
+  sizeFunc.setType(FunctionType::get(context, calleeFunc.getArgumentTypes(),
+                                     TypeRange{i64Ty}));
+  sizeFunc.setPrivate();
+
+  // The running total starts as a zero constant at the top of the entry block.
+  Block &body = sizeFunc.getRegion().front();
+  OpBuilder bodyBuilder(context);
+  bodyBuilder.setInsertionPointToStart(&body);
+  Value totalQubits =
+      bodyBuilder.create<arith::ConstantIntOp>(unknownLoc, 0, i64Ty);
+
+  // Walk the entry block once: add every allocation size to the total and
+  // insert a return of the total after the original return.
+  SmallVector<Operation *> worklist;
+  Operation *insertedReturn = nullptr;
+  for (Operation &op : body) {
+    if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
+      Value allocSize = alloc.getSize();
+      bodyBuilder.setInsertionPointAfter(alloc);
+      totalQubits =
+          bodyBuilder.create<arith::AddIOp>(unknownLoc, totalQubits, allocSize);
+    }
+
+    auto oldReturn = dyn_cast<func::ReturnOp>(&op);
+    if (!oldReturn || oldReturn == insertedReturn)
+      continue;
+
+    // The new return lands after the old one, so the loop reaches it next;
+    // `insertedReturn` is how it gets skipped.
+    bodyBuilder.setInsertionPointAfter(oldReturn);
+    insertedReturn =
+        bodyBuilder.create<func::ReturnOp>(unknownLoc, totalQubits);
+    worklist.push_back(insertedReturn);
+  }
+
+  // Everything reachable through operands of the new return must survive.
+  SmallVector<Operation *> live;
+  while (!worklist.empty()) {
+    Operation *current = worklist.pop_back_val();
+    live.push_back(current);
+    for (Value operand : current->getOperands())
+      if (Operation *producer = operand.getDefiningOp())
+        worklist.push_back(producer);
+  }
+
+  // Every other op in the entry block is erased.
+  SmallVector<Operation *> dead;
+  for (Operation &op : body)
+    if (std::find(live.begin(), live.end(), &op) == live.end())
+      dead.push_back(&op);
+
+  // References and uses are dropped before each erase, as in the original.
+  for (Operation *op : dead) {
+    op->dropAllReferences();
+    op->dropAllUses();
+    op->erase();
+  }
+}
+
 static Value genConstant(OpBuilder &builder, const cudaq::state *v,
-                         llvm::DataLayout &layout,
+                         llvm::DataLayout &layout, StringRef kernelName, ModuleOp substMod,
                          cudaq::opt::ArgumentConverter &converter) {
   auto simState =
       cudaq::state_helper::getSimulationState(const_cast<cudaq::state *>(v));
 
-  auto kernelName = converter.getKernelName();
-  auto substMod = converter.getSubstitutionModule();
-
+  //auto kernelName = converter.getKernelName();
+  //auto substMod = converter.getSubstitutionModule();
 
   // If the state has amplitude data, we materialize the data as a state
   // vector and create a new state from it.
@@ -276,23 +499,21 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
     auto numQubitsKernelName =
         cudaq::runtime::cudaqGenPrefixName + numQubitsName;
 
-    // // Create `callee.init_N` and `callee.num_qubits_N` used to replace
-    // // `quake.materialize_state` in ReplaceStateWithKernel pass
-    // if (!converter.isRegisteredKernel(initName) ||
-    //     !converter.isRegisteredKernel(numQubitsName)) {
-    //   createInitFunc(builder, substMod, calleeFunc, initKernelName);
-    //   createNumQubitsFunc(builder, substMod, calleeFunc, numQubitsKernelName);
+    // Create `callee.init_N` and `callee.num_qubits_N` used to replace
+    // `quake.materialize_state` in ReplaceStateWithKernel pass
+    if (!converter.isRegisteredKernel(initName) ||
+        !converter.isRegisteredKernel(numQubitsName)) {
+      createInitFunc(builder, substMod, calleeFunc, initKernelName);
+      createNumQubitsFunc(builder, substMod, calleeFunc, numQubitsKernelName);
 
-    //   // Convert arguments for `callee.init_N`.
-    //   auto &initConverter =
-    //       cudaq::opt::createChildConverter(converter, initName);
-    //   initConverter.gen(calleeArgs);
+      // Convert arguments for `callee.init_N`.
+      auto &registeredInitName = converter.registerKernel(initName);
+      converter.gen(registeredInitName, calleeArgs);
 
-    //   // Convert arguments for `callee.num_qubits_N`.
-    //   auto &numQubitsConverter =
-    //       cudaq::opt::createChildConverter(converter, numQubitsName);
-    //   numQubitsConverter.gen(calleeArgs);
-    // }
+      // Convert arguments for `callee.num_qubits_N`.
+      auto &registeredNumQubitsName = converter.registerKernel(numQubitsName);
+      converter.gen(registeredNumQubitsName, calleeArgs);
+    }
 
     // Create a substitution for the state pointer.
     auto statePtrTy =
@@ -479,15 +700,23 @@ cudaq::opt::ArgumentConverter::ArgumentConverter(StringRef kernelName,
                                                  ModuleOp sourceModule)
     : sourceModule(sourceModule), builder(sourceModule.getContext()),
       kernelName(kernelName) {
-  substModule = builder.create<ModuleOp>(builder.getUnknownLoc());
 }
 
 void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
+  gen(kernelName, arguments);
+}
+
+void cudaq::opt::ArgumentConverter::gen(StringRef kernelName, const std::vector<void *> &arguments) {
   auto *ctx = builder.getContext();
   // We should look up the input type signature here.
+  auto &kernelInfo = addKernelInfo(kernelName);
+  auto substModule = kernelInfo.getSubstitutionModule();
 
   auto fun = sourceModule.lookupSymbol<func::FuncOp>(
       cudaq::runtime::cudaqGenPrefixName + kernelName.str());
+  if (!fun) {
+    throw std::runtime_error("missing fun in argument conversion: " + kernelName.str());
+  }
 
   FunctionType fromFuncTy = fun.getFunctionType();
   for (auto iter :
@@ -556,7 +785,7 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
             .Case([&](cc::PointerType ptrTy) -> cc::ArgumentSubstitutionOp {
               if (ptrTy.getElementType() == cc::StateType::get(ctx))
                 return buildSubst(static_cast<const state *>(argPtr),
-                                  dataLayout, *this);
+                                  dataLayout, kernelName, substModule, *this);
               return {};
             })
             .Case([&](cc::StdvecType ty) {
@@ -574,7 +803,7 @@ void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
             })
             .Default({});
     if (subst)
-      substitutions.emplace_back(std::move(subst));
+      kernelInfo.getSubstitutions().emplace_back(std::move(subst));
   }
 }
 
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index b876955385f..2be7ba579dc 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -19,6 +19,34 @@
 
 namespace cudaq::opt {
 
+  
+class KernelInfo {
+  public:
+    KernelInfo(mlir::OpBuilder builder, mlir::StringRef kernelName)
+    :  kernelName(kernelName) {
+      substModule = builder.create<mlir::ModuleOp>(builder.getUnknownLoc());
+    }
+  
+    /// Some substitutions may generate global constant information. Use this
+    /// interface to access both the substitutions and any global constants
+    /// created.
+    mlir::ModuleOp getSubstitutionModule() {
+      return substModule;
+    }
+
+    /// Get the list of substitutions for this kernel that were generated
+    /// by `ArgumentConverter::gen()`.
+    mlir::SmallVector<cc::ArgumentSubstitutionOp> &getSubstitutions() {
+      return substitutions;
+    }
+
+  private:
+    mlir::ModuleOp substModule;
+    mlir::StringRef kernelName;
+    mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
+  };
+
+  
 class ArgumentConverter {
 public:
   /// Build an instance to create argument substitutions for a specified \p
@@ -29,6 +57,10 @@ class ArgumentConverter {
   /// The arguments are those presented to the kernel, kernelName.
   void gen(const std::vector<void *> &arguments);
 
+  /// Generate a substitution ModuleOp for the vector of arguments presented.
+  /// The arguments are those presented to the kernel, kernelName.
+  void gen(mlir::StringRef kernelName, const std::vector<void *> &arguments);
+
   /// Generate a substitution ModuleOp but include only the arguments that do
   /// not appear in the set of \p exclusions.
   void gen(const std::vector<void *> &arguments,
@@ -38,25 +70,39 @@ class ArgumentConverter {
   /// and thereby exclude them from the substitutions.
   void gen_drop_front(const std::vector<void *> &arguments, unsigned numDrop);
 
-  /// Get the list of substitutions that were generated by `gen()`.
-  mlir::SmallVector<cudaq::cc::ArgumentSubstitutionOp> &getSubstitutions() {
-    return substitutions;
+  /// Kernel we are converting the arguments for.
+  mlir::StringRef getKernelName() { return kernelName; }
+
+  /// Get the map of kernel names to their kernel info that
+  /// were collected by `gen()`.
+   mlir::DenseMap<mlir::StringRef, KernelInfo>& getKernelInfo() {
+    return kernelInfo;
   }
 
-  /// Some substitutions may generate global constant information. Use this
-  /// interface to access both the substitutions and any global constants
-  /// created.
-  mlir::ModuleOp getSubstitutionModule() { return substModule; }
+  bool isRegisteredKernel(const std::string &kernelName) {
+    return std::find(nameRegistry.begin(), nameRegistry.end(), kernelName) != nameRegistry.end();
+  }
 
-  /// Kernel we are converting the arguments for.
-  mlir::StringRef getKernelName() { return kernelName; }
+  std::string &registerKernel(const std::string &kernelName) {
+    return nameRegistry.emplace_back(kernelName);
+  }
+
+  /// Get the info for \p kernelName, creating it (and its module) if absent.
+  KernelInfo& addKernelInfo(mlir::StringRef kernelName) {
+    return kernelInfo.try_emplace(kernelName, builder, kernelName).first->second;
+  }
+
+  private:
+  /// Memory to store new kernel names generated during argument conversion.
+  std::list<std::string> nameRegistry;
+
+  /// Kernel info for kernels we are converting the arguments for, including
+  /// new kernels generated from state arguments.
+  mlir::DenseMap<mlir::StringRef, KernelInfo> kernelInfo;
 
-private:
   mlir::ModuleOp sourceModule;
-  mlir::ModuleOp substModule;
   mlir::OpBuilder builder;
   mlir::StringRef kernelName;
-  mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
 };
 
 } // namespace cudaq::opt
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 4b12d396126..1e6654e6132 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -9,7 +9,6 @@
 #pragma once
 
 #include "common/ArgumentConversion.h"
-#include "common/StateAggregator.h"
 #include "common/Environment.h"
 #include "common/ExecutionContext.h"
 #include "common/Executor.h"
@@ -458,73 +457,35 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         // For quantum devices, create a list of ArgumentConverters
         // with nodes corresponding to `init` and `num_qubits` functions
         // created from a kernel that generated the state argument.
-        // Traverse the tree and collect substitutions for all those
+        // Traverse the list and collect substitutions for all those
         // functions.
-        cudaq::opt::StateAggregator aggregator;
-        aggregator.collect(moduleOp, kernelName, rawArgs);
+        auto argCon = cudaq::opt::ArgumentConverter(kernelName, moduleOp);
+        argCon.gen(rawArgs);
 
         // Store kernel and substitution strings on the stack.
         // We pass string references to the `createArgumentSynthesisPass`.
         mlir::SmallVector<std::string> kernels;
         mlir::SmallVector<std::string> substs;
-        for (auto &kInfo : aggregator.getKernelInfo()) {
-          auto con = kInfo.converter;
-          con.gen(kInfo.args);
+        for (auto &[kName, kInfo] : argCon.getKernelInfo()) {
           {
-            auto name = con.getKernelName();
             std::string kernName =
-                cudaq::runtime::cudaqGenPrefixName + name.str();
+                cudaq::runtime::cudaqGenPrefixName + kName.str();
             kernels.emplace_back(kernName);
           }
           {
             std::string substBuff;
             llvm::raw_string_ostream ss(substBuff);
-            ss << con.getSubstitutionModule();
+            ss << kInfo.getSubstitutionModule();
             substs.emplace_back(substBuff);
           }
         }
 
-        // std::list<std::string> kernelRegistry;
-        // opt::ArgumentConverter argCon(kernelRegistry, kernelName, moduleOp);
-        // argCon.gen(rawArgs);
-
-        // // For quantum devices, we've created a tree of ArgumentConverters
-        // // with nodes corresponding to `init` and `num_qubits` functions
-        // // created from a kernel that generated the state argument.
-        // // Traverse the tree and collect substitutions for all those
-        // // functions.
-
-        // // Store kernel and substitution strings on the stack.
-        // // We pass string references to the `createArgumentSynthesisPass`.
-        // mlir::SmallVector<std::string> kernels;
-        // mlir::SmallVector<std::string> substs;
-
-        // std::function<void(opt::ArgumentConverter &)> collect =
-        //     [&kernels, &substs, &collect](opt::ArgumentConverter &con) {
-        //       {
-        //         auto name = con.getKernelName();
-        //         std::string kernName =
-        //             cudaq::runtime::cudaqGenPrefixName + name.str();
-        //         kernels.emplace_back(kernName);
-        //       }
-        //       {
-        //         std::string substBuff;
-        //         llvm::raw_string_ostream ss(substBuff);
-        //         ss << con.getSubstitutionModule();
-        //         substs.emplace_back(substBuff);
-        //       }
-
-        //       for (auto &calleeCon : con.getCalleeConverters())
-        //         collect(calleeCon);
-        //     };
-        // collect(argCon);
-
         // Collect references for the argument synthesis.
-        mlir::SmallVector<mlir::StringRef> funcNames{kernels.begin(),
+        mlir::SmallVector<mlir::StringRef> kernelRefs{kernels.begin(),
                                                      kernels.end()};
-        mlir::SmallVector<mlir::StringRef> substitutions{substs.begin(),
+        mlir::SmallVector<mlir::StringRef> substRefs{substs.begin(),
                                                          substs.end()};
-        pm.addPass(opt::createArgumentSynthesisPass(funcNames, substitutions));
+        pm.addPass(opt::createArgumentSynthesisPass(kernelRefs, substRefs));
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(
             opt::createReplaceStateWithKernel());
@@ -625,6 +586,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     } else
       modules.emplace_back(kernelName, moduleOp);
 
+    cudaq::info("Modules: {}", modules.size());
+
     if (emulate) {
       // If we are in emulation mode, we need to first get a full QIR
       // representation of the code. Then we'll map to an LLVM Module, create a
@@ -750,7 +713,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             std::vector<cudaq::ExecutionResult> results;
 
             // If seed is 0, then it has not been set.
-            if (seed > 0)
+            if (seed > 0)
               cudaq::set_random_seed(seed);
 
             bool hasConditionals =
@@ -758,6 +721,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             if (hasConditionals && isObserve)
               throw std::runtime_error("error: spin_ops not yet supported with "
                                        "kernels containing conditionals");
+
             if (hasConditionals) {
               executor->setShots(1); // run one shot at a time
 
@@ -783,6 +747,8 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
                       counts.sequential_data(regName);
                 }
               }
+              localJIT.clear();
+              return cudaq::sample_result(results);
             }
 
             for (std::size_t i = 0; i < codes.size(); i++) {
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index ab8d3ba79d5..f5f63d132b2 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -185,15 +185,36 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
           cudaq::info("Run Argument Synth.\n");
           opt::ArgumentConverter argCon(name, moduleOp);
           argCon.gen_drop_front(*rawArgs, startingArgIdx);
-          std::string kernName = runtime::cudaqGenPrefixName + name;
-          mlir::SmallVector<mlir::StringRef> kernels = {kernName};
-          std::string substBuff;
-          llvm::raw_string_ostream ss(substBuff);
-          ss << argCon.getSubstitutionModule();
-          mlir::SmallVector<mlir::StringRef> substs = {substBuff};
-          pm.addPass(opt::createArgumentSynthesisPass(kernels, substs));
+
+          // Store kernel and substitution strings on the stack.
+          // We pass string references to the `createArgumentSynthesisPass`.
+          mlir::SmallVector<std::string> kernels;
+          mlir::SmallVector<std::string> substs;
+          for (auto &[kName, kInfo] : argCon.getKernelInfo()) {
+            {
+              std::string kernName =
+                  cudaq::runtime::cudaqGenPrefixName + kName.str();
+              kernels.emplace_back(kernName);
+            }
+            {
+              std::string substBuff;
+              llvm::raw_string_ostream ss(substBuff);
+              ss << kInfo.getSubstitutionModule();
+              substs.emplace_back(substBuff);
+            }
+          }
+  
+          // Collect references for the argument synthesis.
+          mlir::SmallVector<mlir::StringRef> kernelRefs{kernels.begin(),
+            kernels.end()};
+          mlir::SmallVector<mlir::StringRef> substRefs{substs.begin(),
+                        substs.end()};
+          pm.addPass(opt::createArgumentSynthesisPass(kernelRefs, substRefs));
           pm.addPass(mlir::createCanonicalizerPass());
           pm.addPass(opt::createDeleteStates());
+          pm.addNestedPass<mlir::func::FuncOp>(
+            opt::createReplaceStateWithKernel());
+          pm.addPass(mlir::createSymbolDCEPass());
         } else if (args) {
           cudaq::info("Run Quake Synth.\n");
           pm.addPass(opt::createQuakeSynthesizer(name, args, startingArgIdx));
diff --git a/runtime/common/CMakeLists.txt b/runtime/common/CMakeLists.txt
index 8567416bf4a..3d6061f4ef0 100644
--- a/runtime/common/CMakeLists.txt
+++ b/runtime/common/CMakeLists.txt
@@ -90,7 +90,6 @@ add_library(cudaq-mlir-runtime
     JIT.cpp
     Logger.cpp
     RuntimeMLIR.cpp
-    StateAggregator.cpp
 )
 set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS cudaq-mlir-runtime)
 
diff --git a/runtime/common/StateAggregator.cpp b/runtime/common/StateAggregator.cpp
deleted file mode 100644
index 80f6d30e0d0..00000000000
--- a/runtime/common/StateAggregator.cpp
+++ /dev/null
@@ -1,422 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
- #include "StateAggregator.h"
- #include "cudaq.h"
- #include "cudaq/Optimizer/Builder/Intrinsics.h"
- #include "cudaq/Optimizer/Builder/Runtime.h"
- #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
- #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
- #include "cudaq/Todo.h"
- #include "cudaq/qis/pauli_word.h"
- #include "cudaq/utils/registry.h"
- #include "llvm/ADT/TypeSwitch.h"
- #include "mlir/Dialect/Arith/IR/Arith.h"
- #include "mlir/Dialect/Complex/IR/Complex.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/Parser/Parser.h"
-
- #include <iostream>
- 
- using namespace mlir;
- 
- /// Create callee.init_N that initializes the state
- /// Callee (the kernel captured by state):
- // clang-format off
- /// func.func @callee(%arg0: i64) {
- ///   %0 = cc.alloca i64
- ///   cc.store %arg0, %0 : !cc.ptr<i64>
- ///   %1 = cc.load %0 : !cc.ptr<i64>
- ///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
- ///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
- ///   quake.x %3 : (!quake.ref) -> ()
- ///   return
- /// }
- /// callee.init_N:
- /// func.func private @callee.init_0(%arg0: !quake.veq<?>, %arg0: i64) ->
- /// !!quake.veq<?> {
- ///   %1 = quake.extract_ref %arg0[1] : (!quake.veq<2>) -> !quake.ref
- ///   quake.x %1 : (f64, !quake.ref) -> ()
- ///   return %arg0: !quake.veq<?>
- /// }
- // clang-format on
- static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
-                            func::FuncOp calleeFunc, StringRef initKernelName) {
-   OpBuilder::InsertionGuard guard(builder);
-   builder.setInsertionPointToEnd(moduleOp.getBody());
- 
-   auto ctx = builder.getContext();
-   auto loc = builder.getUnknownLoc();
- 
-   auto initFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
- 
-   auto argTypes = calleeFunc.getArgumentTypes();
-   auto retTy = quake::VeqType::getUnsized(ctx);
-   auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retTy});
- 
-   initFunc.setName(initKernelName);
-   initFunc.setType(funcTy);
-   initFunc.setPrivate();
- 
-   OpBuilder newBuilder(ctx);
- 
-   auto *entryBlock = &initFunc.getRegion().front();
-   newBuilder.setInsertionPointToStart(entryBlock);
-   Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, 64);
-   Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, 64);
-   Value begin = zero;
- 
-   auto argPos = initFunc.getArguments().size();
- 
-   // Detect errors in kernel passed to get_state.
-   std::function<void(Block &)> processInner = [&](Block &block) {
-     for (auto &op : block) {
-       for (auto &region : op.getRegions())
-         for (auto &b : region)
-           processInner(b);
- 
-       // Don't allow returns in inner scopes
-       if (auto retOp = dyn_cast<func::ReturnOp>(&op))
-         calleeFunc.emitError("Encountered return in inner scope in a kernel "
-                              "passed to get_state");
-     }
-   };
- 
-   for (auto &op : calleeFunc.getRegion().front())
-     for (auto &region : op.getRegions())
-       for (auto &b : region)
-         processInner(b);
- 
-   // Process outer block to initialize the allocation passed as an argument.
-   std::function<void(Block &)> process = [&](Block &block) {
-     SmallVector<Operation *> cleanUps;
-     Operation *replacedReturn = nullptr;
- 
-     Value arg;
-     Value subArg;
-     Value blockBegin = begin;
-     Value blockAllocSize = zero;
-     for (auto &op : block) {
-       if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
-         newBuilder.setInsertionPointAfter(alloc);
- 
-         if (!arg) {
-           initFunc.insertArgument(argPos, retTy, {}, loc);
-           arg = initFunc.getArgument(argPos);
-         }
- 
-         auto allocSize = alloc.getSize();
-         auto offset = newBuilder.create<arith::SubIOp>(loc, allocSize, one);
-         subArg =
-             newBuilder.create<quake::SubVeqOp>(loc, retTy, arg, begin, offset);
-         alloc.replaceAllUsesWith(subArg);
-         cleanUps.push_back(alloc);
-         begin = newBuilder.create<arith::AddIOp>(loc, begin, allocSize);
-         blockAllocSize =
-             newBuilder.create<arith::AddIOp>(loc, blockAllocSize, allocSize);
-       }
- 
-       if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
-         if (retOp != replacedReturn) {
-           newBuilder.setInsertionPointAfter(retOp);
- 
-           auto offset =
-               newBuilder.create<arith::SubIOp>(loc, blockAllocSize, one);
-           Value ret = newBuilder.create<quake::SubVeqOp>(loc, retTy, arg,
-                                                          blockBegin, offset);
- 
-           assert(arg && "No veq allocations found");
-           replacedReturn = newBuilder.create<func::ReturnOp>(loc, ret);
-           cleanUps.push_back(retOp);
-         }
-       }
-     }
- 
-     for (auto &op : cleanUps) {
-       op->dropAllReferences();
-       op->dropAllUses();
-       op->erase();
-     }
-   };
- 
-   // Process the function body
-   process(initFunc.getRegion().front());
- }
- 
- /// Create callee.num_qubits_N that calculates the number of qubits to
- /// initialize the state
- /// Callee: (the kernel captured by state):
- // clang-format off
- /// func.func @callee(%arg0: i64) {
- ///   %0 = cc.alloca i64
- ///   cc.store %arg0, %0 : !cc.ptr<i64>
- ///   %1 = cc.load %0 : !cc.ptr<i64>
- ///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
- ///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
- ///   quake.x %3 : (!quake.ref) -> ()
- ///   return
- /// }
- ///
- /// callee.num_qubits_0:
- /// func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
- ///   %0 = cc.alloca i64
- ///   cc.store %arg0, %0 : !cc.ptr<i64>
- ///   %1 = cc.load %0 : !cc.ptr<i64>
- ///   return %1 : i64
- /// }
- // clang-format on
- static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
-                                 func::FuncOp calleeFunc,
-                                 StringRef numQubitsKernelName) {
-   OpBuilder::InsertionGuard guard(builder);
-   builder.setInsertionPointToEnd(moduleOp.getBody());
- 
-   auto ctx = builder.getContext();
-   auto loc = builder.getUnknownLoc();
- 
-   auto numQubitsFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
- 
-   auto argTypes = calleeFunc.getArgumentTypes();
-   auto retType = builder.getI64Type();
-   auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retType});
- 
-   numQubitsFunc.setName(numQubitsKernelName);
-   numQubitsFunc.setType(funcTy);
-   numQubitsFunc.setPrivate();
- 
-   OpBuilder newBuilder(ctx);
- 
-   auto *entryBlock = &numQubitsFunc.getRegion().front();
-   newBuilder.setInsertionPointToStart(entryBlock);
-   Value size = newBuilder.create<arith::ConstantIntOp>(loc, 0, retType);
- 
-   // Process block recursively to calculate and return allocation size
-   // and remove everything else.
-   std::function<void(Block &)> process = [&](Block &block) {
-     SmallVector<Operation *> used;
-     Operation *replacedReturn = nullptr;
- 
-     for (auto &op : block) {
-       // Calculate allocation size (existing allocation size plus new one)
-       if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
-         auto allocSize = alloc.getSize();
-         newBuilder.setInsertionPointAfter(alloc);
-         size = newBuilder.create<arith::AddIOp>(loc, size, allocSize);
-       }
- 
-       // Return allocation size
-       if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
-         if (retOp != replacedReturn) {
- 
-           newBuilder.setInsertionPointAfter(retOp);
-           auto newRet = newBuilder.create<func::ReturnOp>(loc, size);
-           replacedReturn = newRet;
-           used.push_back(newRet);
-         }
-       }
-     }
- 
-     // Collect all ops needed for size calculation
-     SmallVector<Operation *> keep;
-     while (!used.empty()) {
-       auto *op = used.pop_back_val();
-       keep.push_back(op);
-       for (auto opnd : op->getOperands())
-         if (auto defOp = opnd.getDefiningOp())
-           used.push_back(defOp);
-     }
- 
-     // Remove the rest of the ops
-     SmallVector<Operation *> toErase;
-     for (auto &op : block)
-       if (std::find(keep.begin(), keep.end(), &op) == keep.end())
-         toErase.push_back(&op);
- 
-     for (auto &op : toErase) {
-       op->dropAllReferences();
-       op->dropAllUses();
-       op->erase();
-     }
-   };
- 
-   // Process the function body
-   process(numQubitsFunc.getRegion().front());
- }
- 
- void cudaq::opt::StateAggregator::collectKernelInfo(ModuleOp moduleOp, const cudaq::state *v) {
-   auto simState =
-       cudaq::state_helper::getSimulationState(const_cast<cudaq::state *>(v));
- 
-   // If the state has amplitude data, we materialize the data as a state
-   // vector and create a new state from it in the ArgumentConverter.
-   // TODO: add an option to use the kernel info if available, i.e. for
-   // remote simulators
-   // TODO: add an option of storing the kernel info on simulators if
-   // preferred i.e. to support synthesis of density matrices.
-   if (simState->hasData()) {
-     return;
-   }
- 
-   // Otherwise (ie quantum hardware, where getting the amplitude data is not
-   // efficient) we aim at replacing states with calls to kernels (`callees`)
-   // that generated them. This is done in three stages:
-   //
-   // 1) (done here) Generate @callee.num_qubits_0 @callee.init_0` for the callee
-   //    function and its arguments stored in a state.
- 
-   //    Create two functions:
-   //      - callee.num_qubits_N
-   //        Calculates the number of qubits needed for the veq allocation
-   //      - callee.init_N
-   //        Initializes the veq passed as a parameter
-   //
-   // 2) (done in ArgumentConverter) Replace the state with
-   //   `quake.get_state @callee.num_qubits_0 @callee.init_0`:
-   //
-   // clang-format off
-   // ```
-   // func.func @caller(%arg0: !cc.ptr<!cc.state>) {
-   //   %1 = quake.get_number_of_qubits %arg0: (!cc.ptr<!cc.state>) -> i64
-   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
-   //   %3 = quake.init_state %2, %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-   //   return
-   // }
-   //
-   // func.func private @callee(%arg0: i64) {
-   //   %0 = quake.alloca !quake.veq<?>[%arg0 : i64]
-   //   %1 = quake.extract_ref %0[0] : (!quake.veq<2>) -> !quake.ref
-   //   quake.x %1 : (!quake.ref) -> ()
-   //   return
-   // }
-   //
-   // Call from the user host code:
-   // state = cudaq.get_state(callee, 2)
-   // counts = cudaq.sample(caller, state)
-   // ```
-   // clang-format on
-   //
-   // => after argument synthesis:
-   //
-   // clang-format off
-   // ```
-   // func.func @caller() {
-   //   %0 = quake.get_state @callee.num_qubits_0 @callee.init_state_0 : !cc.ptr<!cc.state>
-   //   %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
-   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
-   //   %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-   //   return
-   // }
-   //
-   // func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
-   //   return %arg0 : i64
-   // }
-   //
-   // func.func private @callee.init_0(%arg0: i64, %arg1: !quake.veq<?>) {
-   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
-   //   quake.x %1 : (f64, !quake.ref) -> ()
-   //   return
-   // }
-   // ```
-   // clang-format on
-   //
-   // 3) (done in ReplaceStateWithKernel) Replace the `quake.get_state` and ops
-   // that use its state with calls to the generated functions, synthesized with
-   // the arguments used to create the original state:
-   //
-   // After ReplaceStateWithKernel pass:
-   //
-   // clang-format off
-   // ```
-   // func.func @caller() {
-   //   %1 = call callee.num_qubits_0() : () -> i64
-   //   %2 = quake.alloca !quake.veq<?>[%1 : i64]
-   //   %3 = call @callee.init_0(%2): (!quake.veq<?>) -> !quake.veq<?>
-   // }
-   //
-   // func.func private @callee.num_qubits_0() -> i64 {
-   //   %cst = arith.constant 2 : i64
-   //   return %cst : i64
-   // }
-   //
-   // func.func private @callee.init_0(%arg0: !quake.veq<?>): !quake.veq<?> {
-   //   %cst = arith.constant 1.5707963267948966 : f64
-   //   %1 = quake.extract_ref %arg0[0] : (!quake.veq<2>) -> !quake.ref
-   //   quake.ry (%cst) %1 : (f64, !quake.ref) -> ()
-   //   return %arg0
-   // }
-   // ```
-   // clang-format on
-   if (simState->getKernelInfo().has_value()) {
-     auto [calleeName, calleeArgs] = simState->getKernelInfo().value();
- 
-     std::string calleeKernelName =
-         cudaq::runtime::cudaqGenPrefixName + calleeName;
- 
-     auto builder = IRBuilder(moduleOp);
-     auto ctx = builder.getContext();
- 
-     auto code = cudaq::get_quake_by_name(calleeName, /*throwException=*/false);
-     assert(!code.empty() && "Quake code not found for callee");
-     auto fromModule = parseSourceString<ModuleOp>(code, ctx);
- 
-     auto calleeFunc = fromModule->lookupSymbol<func::FuncOp>(calleeKernelName);
-     assert(calleeFunc && "callee func is missing");
- 
-     // Use the state pointer as hash to store new function names
-     // so we can look them up later in ArgumentConverter.
-     auto hash = std::to_string(reinterpret_cast<std::size_t>(v));
-     auto initName = calleeName + ".init_" + hash;
-     auto numQubitsName = calleeName + ".num_qubits_" + hash;
- 
-     if (!hasKernelInfo(initName) && !hasKernelInfo(numQubitsName)) {
-       auto initKernelName = cudaq::runtime::cudaqGenPrefixName + initName;
-       auto numQubitsKernelName =
-           cudaq::runtime::cudaqGenPrefixName + numQubitsName;
- 
-       // Create `callee.init_N` and `callee.num_qubits_N` functions used to
-       // replace `quake.get_state` later in ReplaceStateWithKernel pass
-       createInitFunc(builder, moduleOp, calleeFunc, initKernelName);
-       createNumQubitsFunc(builder, moduleOp, calleeFunc, numQubitsKernelName);
- 
-       // Collect kernel info from the callee arguments recursively
-       collect(moduleOp, initName, calleeArgs);
-       collect(moduleOp, numQubitsName, calleeArgs);
-     }
-     return;
-   }
- 
-   TODO("cudaq::state* argument synthesis for quantum hardware for c functions");
- }
- 
- //===----------------------------------------------------------------------===//
-
-
- void cudaq::opt::StateAggregator::collect(ModuleOp moduleOp,
-     const std::string& kernelName, const std::vector<void *> &arguments) {
-
-   auto &info = addKernelInfo(moduleOp, kernelName, arguments);
-   auto substModule = info.converter.getSubstitutionModule();
-   auto *ctx = moduleOp.getContext();
- 
-   auto fun = moduleOp.lookupSymbol<func::FuncOp>(
-       cudaq::runtime::cudaqGenPrefixName + kernelName);
-   assert(fun && "callee func is missing in state aggregator");
- 
-   FunctionType fromFuncTy = fun.getFunctionType();
-   for (auto iter :
-        llvm::enumerate(llvm::zip(fromFuncTy.getInputs(), arguments))) {
-     void *argPtr = std::get<1>(iter.value());
-     if (!argPtr)
-       continue;
-     Type argTy = std::get<0>(iter.value());
- 
-     if (auto ptrTy = dyn_cast<cc::PointerType>(argTy))
-       if (ptrTy.getElementType() == cc::StateType::get(ctx))
-         collectKernelInfo(substModule, static_cast<const state *>(argPtr));
-   }
- }
\ No newline at end of file
diff --git a/runtime/common/StateAggregator.h b/runtime/common/StateAggregator.h
deleted file mode 100644
index 69dd1ca621f..00000000000
--- a/runtime/common/StateAggregator.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/****************************************************************-*- C++ -*-****
- * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
- #pragma once
-
- #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
- #include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
- #include "cudaq/qis/state.h"
- #include "mlir/IR/Builders.h"
- #include "mlir/IR/Types.h"
- #include <list>
- #include <unordered_set>
- #include <vector>
- #include "ArgumentConversion.h"
- 
- namespace cudaq::opt {
-  struct KernelInfo {
-    ArgumentConverter converter;
-    const std::vector<void *> args;
-  };
-
- class StateAggregator {
- public:
-   /// Create an instance of the state aggregator for a specified \p
-   /// sourceModule.
-   StateAggregator(){}
- 
-   /// Collect kernel names and arguments for all state arguments.
-   void collect(mlir::ModuleOp moduleOp, const std::string& kernelName,
-                const std::vector<void *> &arguments);
- 
-   /// Get the map of kernel names to their kernel info that
-   /// were collected by `collect()`.
-   std::list<KernelInfo>& getKernelInfo() {
-     return kernelInfo;
-   }
- 
- private:
-   void collectKernelInfo(mlir::ModuleOp moduleOp, const cudaq::state *v);
- 
-   bool hasKernelInfo(const std::string &kernelName) {
-     return std::find(nameRegistry.begin(), nameRegistry.end(), kernelName) != nameRegistry.end();
-   }
- 
-   KernelInfo& addKernelInfo(mlir::ModuleOp moduleOp, const std::string &kernelName,
-                      const std::vector<void *> &args) {
-    auto &name = nameRegistry.emplace_back(kernelName);
-    return kernelInfo.emplace_back(std::move(ArgumentConverter(name, moduleOp)), args);
-   }
- 
- private:
-   /// Memory to store new kernel names generated during argument conversion.
-   std::list<std::string> nameRegistry;
-
-   /// Kernel info for kernels we are converting the arguments for, including
-   /// new kernels generated from state arguments.
-   std::list<KernelInfo> kernelInfo;
- };
- 
- } // namespace cudaq::opt
\ No newline at end of file
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 5ab571f46fb..afe24478d11 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -12,7 +12,6 @@
 // RUN: test_argument_conversion | FileCheck %s
 
 #include "common/ArgumentConversion.h"
-#include "common/StateAggregator.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/InitAllDialects.h"
@@ -143,12 +142,14 @@ class FakeDeviceState : public cudaq::SimulationState {
 
 extern "C" void __cudaq_deviceCodeHolderAdd(const char *, const char *);
 
-void dumpSubstitutionModule(cudaq::opt::ArgumentConverter &con) {
-  // Dump the conversions
-  llvm::outs() << "========================================\n"
-                  "Substitution module:\n"
-               << con.getKernelName() << "\n"
-               << con.getSubstitutionModule() << '\n';
+void dumpSubstitutionModules(cudaq::opt::ArgumentConverter &con) {
+  for (auto &[kName, kInfo] : con.getKernelInfo()) {
+    // Dump the conversions
+    llvm::outs() << "========================================\n"
+                    "Substitution module:\n"
+                << kName << "\n"
+                << kInfo.getSubstitutionModule() << '\n';
+  }
 }
 
 void doSimpleTest(mlir::MLIRContext *ctx, const std::string &typeName,
@@ -171,38 +172,7 @@ func.func @__nvqpp__mlirgen__testy(%0: )#" +
   // Create the argument conversions
   ab.gen(args);
   // Dump all conversions
-  dumpSubstitutionModule(ab);
-}
-
-
-void doStateAggregationTest(mlir::MLIRContext *ctx, const std::string &typeName,
-  std::vector<void *> args,
-  const std::string &additionalCode = "") {
-std::string code = additionalCode + R"#(
-func.func private @callee(%0: )#" +
-     typeName + R"#()
-func.func @__nvqpp__mlirgen__testy(%0: )#" +
-     typeName + R"#() {
-call @callee(%0) : ()#" +
-     typeName + R"#() -> ()
-return
-})#";
-
-// Create the Module
-auto mod = mlir::parseSourceString<mlir::ModuleOp>(code, ctx);
-llvm::outs() << "Source module:\n" << *mod << '\n';
-
-  // Create the argument conversions for state arguments
-  cudaq::opt::StateAggregator sa;
-  sa.collect(*mod, "testy", args);
-
-  for (auto &kInfo : sa.getKernelInfo()) {
-    cudaq::opt::ArgumentConverter &cab = kInfo.converter;
-    // Create the argument conversions for callee kernels from state arguments
-    cab.gen(kInfo.args);
-    // Dump all conversions
-    dumpSubstitutionModule(cab);
-  }
+  dumpSubstitutionModules(ab);
 }
 
 void doTest(mlir::MLIRContext *ctx, std::vector<std::string> &typeNames,
@@ -245,7 +215,7 @@ void doTest(mlir::MLIRContext *ctx, std::vector<std::string> &typeNames,
   // Create the argument conversions
   ab.gen_drop_front(args, startingArgIdx);
   // Dump all conversions
-  dumpSubstitutionModule(ab);
+  dumpSubstitutionModules(ab);
 }
 
 void test_scalars(mlir::MLIRContext *ctx) {
@@ -568,7 +538,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     std::vector<void *> a = {static_cast<void *>(&n)};
     auto s = cudaq::state(new FakeDeviceState(init, a));
     std::vector<void *> v = {static_cast<void *>(&s)};
-    doStateAggregationTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
   }
 
   // clang-format off
@@ -669,7 +639,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     std::vector<void *> v1 = {static_cast<void *>(&s1)};
 
     auto code = std::string{initCode} + std::string{stateParamCode};
-    doStateAggregationTest(ctx, "!cc.ptr<!cc.state>", v1, code);
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v1, code);
   }
 
   // clang-format off
@@ -799,7 +769,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     std::vector<void *> a = {static_cast<void *>(&n)};
     auto s = cudaq::state(new FakeDeviceState(init, a));
     std::vector<void *> v = {static_cast<void *>(&s)};
-    doStateAggregationTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
   }
 
   // clang-format off

From 5ef4c3d0ec1c24a58d3ae98fd27689a433b49578 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 3 Mar 2025 14:10:16 -0800
Subject: [PATCH 42/54] Make argument converter handle kernels created from
 states

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../Transforms/ArgumentSynthesis.cpp          | 203 ++++-----
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  10 +-
 runtime/common/ArgumentConversion.cpp         | 404 +++++++++---------
 runtime/common/ArgumentConversion.h           |  91 ++--
 runtime/common/BaseRemoteRESTQPU.h            |  10 +-
 runtime/common/BaseRestRemoteClient.h         |  14 +-
 runtime/test/test_argument_conversion.cpp     |   6 +-
 7 files changed, 356 insertions(+), 382 deletions(-)

diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
index b80bdfeea23..51c771eb45e 100644
--- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
+++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp
@@ -14,7 +14,6 @@
 #include "mlir/Parser/Parser.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
-#include <list>
 
 namespace cudaq::opt {
 #define GEN_PASS_DEF_ARGUMENTSYNTHESIS
@@ -31,99 +30,9 @@ class ArgumentSynthesisPass
 public:
   using ArgumentSynthesisBase::ArgumentSynthesisBase;
 
-  void
-  applySubstitutions(func::FuncOp func,
-                     DenseMap<StringRef, OwningOpRef<ModuleOp>> &substModules) {
-    MLIRContext *ctx = func.getContext();
-    auto funcName = func.getName();
-    LLVM_DEBUG(llvm::dbgs() << "processing : '" << funcName << "'\n");
-
-    // 1. Find substitution module with argument replacements for the function.
-    auto it = substModules.find(funcName);
-    if (it == substModules.end()) {
-      // If the function isn't on the list, do nothing.
-      LLVM_DEBUG(llvm::dbgs() << funcName << " has no substitutions.\n");
-      return;
-    }
-    auto substMod = *(it->second);
-
-    // 2. Go through the Module and process each substitution.
-    SmallVector<bool> processedArgs(func.getFunctionType().getNumInputs());
-    SmallVector<std::tuple<unsigned, Value, Value>> replacements;
-    BitVector replacedArgs(processedArgs.size());
-    for (auto &op : substMod) {
-      auto subst = dyn_cast<cudaq::cc::ArgumentSubstitutionOp>(op);
-      if (!subst)
-        continue;
-      auto pos = subst.getPosition();
-      if (pos >= processedArgs.size()) {
-        func.emitError("Argument " + std::to_string(pos) + " is invalid.");
-        signalPassFailure();
-        return;
-      }
-      if (processedArgs[pos]) {
-        func.emitError("Argument " + std::to_string(pos) +
-                       " was already substituted.");
-        signalPassFailure();
-        return;
-      }
-
-      // OK, substitute the code for the argument.
-      Block &entry = func.getRegion().front();
-      processedArgs[pos] = true;
-      if (subst.getBody().front().empty()) {
-        // No code is present. Erase the argument if it is not used.
-        const auto numUses =
-            std::distance(entry.getArgument(pos).getUses().begin(),
-                          entry.getArgument(pos).getUses().end());
-        LLVM_DEBUG(llvm::dbgs() << "maybe erasing an unused argument ("
-                                << std::to_string(numUses) << ")\n");
-        if (numUses == 0)
-          replacedArgs.set(pos);
-        continue;
-      }
-      OpBuilder builder{ctx};
-      Block *splitBlock = entry.splitBlock(entry.begin());
-      builder.setInsertionPointToEnd(&entry);
-      builder.create<cf::BranchOp>(func.getLoc(), &subst.getBody().front());
-      Operation *lastOp = &subst.getBody().front().back();
-      builder.setInsertionPointToEnd(&subst.getBody().front());
-      builder.create<cf::BranchOp>(func.getLoc(), splitBlock);
-      func.getBlocks().splice(Region::iterator{splitBlock},
-                              subst.getBody().getBlocks());
-      if (lastOp &&
-          lastOp->getResult(0).getType() == entry.getArgument(pos).getType()) {
-        LLVM_DEBUG(llvm::dbgs()
-                   << funcName << " argument " << std::to_string(pos)
-                   << " was substituted.\n");
-        replacements.emplace_back(pos, entry.getArgument(pos),
-                                  lastOp->getResult(0));
-      }
-    }
-
-    // Note: if we exited before here, any code that was cloned into the
-    // function is still dead and can be removed by a DCE.
-
-    // 3. Replace the block argument values with the freshly inserted new code.
-    for (auto [pos, fromVal, toVal] : replacements) {
-      replacedArgs.set(pos);
-      fromVal.replaceAllUsesWith(toVal);
-    }
-
-    // 4. Finish specializing func and erase any of func's arguments that were
-    // substituted.
-    func.eraseArguments(replacedArgs);
-  }
-
   void runOnOperation() override {
-    ModuleOp mod = getOperation();
-    MLIRContext *ctx = mod.getContext();
-
-    // 1. Collect all substitution modules.
-    std::list<std::string> funcNames;
-    DenseMap<StringRef, OwningOpRef<ModuleOp>> substModules;
-
-    for (auto &item : funcList) {
+    ModuleOp moduleOp = getOperation();
+    for (auto item : funcList) {
       auto pos = item.find(':');
       if (pos == std::string::npos)
         continue;
@@ -131,15 +40,27 @@ class ArgumentSynthesisPass
       std::string funcName = item.substr(0, pos);
       std::string text = item.substr(pos + 1);
 
+      auto *op = moduleOp.lookupSymbol(funcName);
+      func::FuncOp func = dyn_cast_if_present<func::FuncOp>(op);
+
+      if (!func) {
+        LLVM_DEBUG(llvm::dbgs() << funcName << " is not in the module.");
+        continue;
+      }
+
+      // If there are no substitutions, we're done.
       if (text.empty()) {
         LLVM_DEBUG(llvm::dbgs() << funcName << " has no substitutions.");
         continue;
       }
 
-      // Create a Module with the substitutions that we'll be making.
-      LLVM_DEBUG(llvm::dbgs()
-                 << funcName << " : substitution pattern: '" << text << "'\n");
-      auto substModule = [&]() -> OwningOpRef<ModuleOp> {
+      // If we're here, we have a FuncOp and we have substitutions that can be
+      // applied.
+      //
+      // 1. Create a Module with the substitutions that we'll be making.
+      auto *ctx = func.getContext();
+      LLVM_DEBUG(llvm::dbgs() << "substitution pattern: '" << text << "'\n");
+      auto substMod = [&]() -> OwningOpRef<ModuleOp> {
         if (text.front() == '*') {
           // Substitutions are a raw string after the '*' character.
           return parseSourceString<ModuleOp>(text.substr(1), ctx);
@@ -147,27 +68,83 @@ class ArgumentSynthesisPass
         // Substitutions are in a text file (command-line usage).
         return parseSourceFile<ModuleOp>(text, ctx);
       }();
-      assert(*substModule && "module must have been created");
+      assert(*substMod && "module must have been created");
 
-      auto &name = funcNames.emplace_back(funcName);
-      substModules.try_emplace(name, std::move(substModule));
-    }
-
-    // 2. Merge symbols from substitution modules into the source module.
-    for (auto &[funcName, substMod] : substModules) {
+      // 2. Go through the Module and process each substitution.
+      SmallVector<bool> processedArgs(func.getFunctionType().getNumInputs());
+      SmallVector<std::tuple<unsigned, Value, Value>> replacements;
+      BitVector replacedArgs(processedArgs.size());
       for (auto &op : *substMod) {
-        if (auto symInterface = dyn_cast<SymbolOpInterface>(op)) {
-          auto name = symInterface.getName();
-          auto obj = mod.lookupSymbol(name);
-          if (!obj)
-            mod.getBody()->push_back(op.clone());
+        auto subst = dyn_cast<cudaq::cc::ArgumentSubstitutionOp>(op);
+        if (!subst) {
+          if (auto symInterface = dyn_cast<SymbolOpInterface>(op)) {
+            auto name = symInterface.getName();
+            auto obj = moduleOp.lookupSymbol(name);
+            if (!obj)
+              moduleOp.getBody()->push_back(op.clone());
+          }
+          continue;
+        }
+        auto pos = subst.getPosition();
+        if (pos >= processedArgs.size()) {
+          func.emitError("Argument " + std::to_string(pos) + " is invalid.");
+          signalPassFailure();
+          return;
+        }
+        if (processedArgs[pos]) {
+          func.emitError("Argument " + std::to_string(pos) +
+                         " was already substituted.");
+          signalPassFailure();
+          return;
+        }
+
+        // OK, substitute the code for the argument.
+        Block &entry = func.getRegion().front();
+        processedArgs[pos] = true;
+        if (subst.getBody().front().empty()) {
+          // No code is present. Erase the argument if it is not used.
+          const auto numUses =
+              std::distance(entry.getArgument(pos).getUses().begin(),
+                            entry.getArgument(pos).getUses().end());
+          LLVM_DEBUG(llvm::dbgs() << "maybe erasing an unused argument ("
+                                  << std::to_string(numUses) << ")\n");
+          if (numUses == 0)
+            replacedArgs.set(pos);
+          continue;
+        }
+        OpBuilder builder{ctx};
+        Block *splitBlock = entry.splitBlock(entry.begin());
+        builder.setInsertionPointToEnd(&entry);
+        builder.create<cf::BranchOp>(func.getLoc(), &subst.getBody().front());
+        Operation *lastOp = &subst.getBody().front().back();
+        builder.setInsertionPointToEnd(&subst.getBody().front());
+        builder.create<cf::BranchOp>(func.getLoc(), splitBlock);
+        func.getBlocks().splice(Region::iterator{splitBlock},
+                                subst.getBody().getBlocks());
+        if (lastOp && lastOp->getResult(0).getType() ==
+                          entry.getArgument(pos).getType()) {
+          LLVM_DEBUG(llvm::dbgs()
+                     << funcName << " argument " << std::to_string(pos)
+                     << " was substituted.\n");
+          replacements.emplace_back(pos, entry.getArgument(pos),
+                                    lastOp->getResult(0));
         }
       }
-    }
 
-    // 3. Apply all substitutions.
-    mod->walk(
-        [&](func::FuncOp func) { applySubstitutions(func, substModules); });
+      // Note: if we exited before here, any code that was cloned into the
+      // function is still dead and can be removed by a DCE.
+
+      // 3. Replace the block argument values with the freshly inserted new
+      // code.
+      for (auto [pos, fromVal, toVal] : replacements) {
+        replacedArgs.set(pos);
+        fromVal.replaceAllUsesWith(toVal);
+      }
+
+      // 4. Finish specializing func and erase any of func's arguments that were
+      // substituted.
+      func.eraseArguments(replacedArgs);
+    }
   }
 };
 } // namespace
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 3f15beac689..c2294035ad5 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -550,10 +550,10 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   // We pass string references to the `createArgumentSynthesisPass`.
   mlir::SmallVector<std::string> kernels;
   mlir::SmallVector<std::string> substs;
-  for (auto &[kName, kInfo] : argCon.getKernelInfo()) {
+  for (auto &kInfo : argCon.getKernelSubstitutions()) {
     {
       std::string kernName =
-          cudaq::runtime::cudaqGenPrefixName + kName.str();
+          cudaq::runtime::cudaqGenPrefixName + kInfo.getKernelName().str();
       kernels.emplace_back(kernName);
     }
     {
@@ -565,10 +565,8 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   }
 
   // Collect references for the argument synthesis.
-  mlir::SmallVector<mlir::StringRef> kernelRefs{kernels.begin(),
-                                               kernels.end()};
-  mlir::SmallVector<mlir::StringRef> substRefs{substs.begin(),
-                                                   substs.end()};
+  mlir::SmallVector<mlir::StringRef> kernelRefs{kernels.begin(), kernels.end()};
+  mlir::SmallVector<mlir::StringRef> substRefs{substs.begin(), substs.end()};
 
   PassManager pm(context);
   pm.addPass(opt::createArgumentSynthesisPass(kernelRefs, substRefs));
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 3b0efa4fe70..e563a90f99f 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -20,8 +20,6 @@
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Parser/Parser.h"
 
-#include <iostream>
-
 using namespace mlir;
 
 template <typename A>
@@ -101,128 +99,127 @@ static Value genConstant(OpBuilder &, cudaq::cc::StructType, void *,
 static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
                          ModuleOp substMod, llvm::DataLayout &);
 
+/// Create callee.init_N that initializes the state
+/// Callee (the kernel captured by state):
+// clang-format off
+/// func.func @callee(%arg0: i64) {
+///   %0 = cc.alloca i64
+///   cc.store %arg0, %0 : !cc.ptr<i64>
+///   %1 = cc.load %0 : !cc.ptr<i64>
+///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
+///   quake.x %3 : (!quake.ref) -> ()
+///   return
+/// }
+/// callee.init_N:
+/// func.func private @callee.init_0(%arg0: i64, %arg1: !quake.veq<?>) ->
+/// !quake.veq<?> {
+///   %1 = quake.extract_ref %arg1[1] : (!quake.veq<?>) -> !quake.ref
+///   quake.x %1 : (!quake.ref) -> ()
+///   return %arg1: !quake.veq<?>
+/// }
+// clang-format on
+static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
+                           func::FuncOp calleeFunc, StringRef initKernelName) {
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToEnd(moduleOp.getBody());
 
- /// Create callee.init_N that initializes the state
- /// Callee (the kernel captured by state):
- // clang-format off
- /// func.func @callee(%arg0: i64) {
- ///   %0 = cc.alloca i64
- ///   cc.store %arg0, %0 : !cc.ptr<i64>
- ///   %1 = cc.load %0 : !cc.ptr<i64>
- ///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
- ///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
- ///   quake.x %3 : (!quake.ref) -> ()
- ///   return
- /// }
- /// callee.init_N:
- /// func.func private @callee.init_0(%arg0: !quake.veq<?>, %arg0: i64) ->
- /// !!quake.veq<?> {
- ///   %1 = quake.extract_ref %arg0[1] : (!quake.veq<2>) -> !quake.ref
- ///   quake.x %1 : (f64, !quake.ref) -> ()
- ///   return %arg0: !quake.veq<?>
- /// }
- // clang-format on
- static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
-  func::FuncOp calleeFunc, StringRef initKernelName) {
-OpBuilder::InsertionGuard guard(builder);
-builder.setInsertionPointToEnd(moduleOp.getBody());
-
-auto ctx = builder.getContext();
-auto loc = builder.getUnknownLoc();
-
-auto initFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
-
-auto argTypes = calleeFunc.getArgumentTypes();
-auto retTy = quake::VeqType::getUnsized(ctx);
-auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retTy});
-
-initFunc.setName(initKernelName);
-initFunc.setType(funcTy);
-initFunc.setPrivate();
-
-OpBuilder newBuilder(ctx);
-
-auto *entryBlock = &initFunc.getRegion().front();
-newBuilder.setInsertionPointToStart(entryBlock);
-Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, 64);
-Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, 64);
-Value begin = zero;
-
-auto argPos = initFunc.getArguments().size();
-
-// Detect errors in kernel passed to get_state.
-std::function<void(Block &)> processInner = [&](Block &block) {
-for (auto &op : block) {
-for (auto &region : op.getRegions())
-for (auto &b : region)
-processInner(b);
-
-// Don't allow returns in inner scopes
-if (auto retOp = dyn_cast<func::ReturnOp>(&op))
-calleeFunc.emitError("Encountered return in inner scope in a kernel "
-    "passed to get_state");
-}
-};
-
-for (auto &op : calleeFunc.getRegion().front())
-for (auto &region : op.getRegions())
-for (auto &b : region)
-processInner(b);
-
-// Process outer block to initialize the allocation passed as an argument.
-std::function<void(Block &)> process = [&](Block &block) {
-SmallVector<Operation *> cleanUps;
-Operation *replacedReturn = nullptr;
-
-Value arg;
-Value subArg;
-Value blockBegin = begin;
-Value blockAllocSize = zero;
-for (auto &op : block) {
-if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
-newBuilder.setInsertionPointAfter(alloc);
-
-if (!arg) {
-initFunc.insertArgument(argPos, retTy, {}, loc);
-arg = initFunc.getArgument(argPos);
-}
+  auto ctx = builder.getContext();
+  auto loc = builder.getUnknownLoc();
 
-auto allocSize = alloc.getSize();
-auto offset = newBuilder.create<arith::SubIOp>(loc, allocSize, one);
-subArg =
-newBuilder.create<quake::SubVeqOp>(loc, retTy, arg, begin, offset);
-alloc.replaceAllUsesWith(subArg);
-cleanUps.push_back(alloc);
-begin = newBuilder.create<arith::AddIOp>(loc, begin, allocSize);
-blockAllocSize =
-newBuilder.create<arith::AddIOp>(loc, blockAllocSize, allocSize);
-}
+  auto initFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
 
-if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
-if (retOp != replacedReturn) {
-newBuilder.setInsertionPointAfter(retOp);
+  auto argTypes = calleeFunc.getArgumentTypes();
+  auto retTy = quake::VeqType::getUnsized(ctx);
+  auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retTy});
 
-auto offset =
-newBuilder.create<arith::SubIOp>(loc, blockAllocSize, one);
-Value ret = newBuilder.create<quake::SubVeqOp>(loc, retTy, arg,
-                                blockBegin, offset);
+  initFunc.setName(initKernelName);
+  initFunc.setType(funcTy);
+  initFunc.setPrivate();
 
-assert(arg && "No veq allocations found");
-replacedReturn = newBuilder.create<func::ReturnOp>(loc, ret);
-cleanUps.push_back(retOp);
-}
-}
-}
+  OpBuilder newBuilder(ctx);
 
-for (auto &op : cleanUps) {
-op->dropAllReferences();
-op->dropAllUses();
-op->erase();
-}
-};
+  auto *entryBlock = &initFunc.getRegion().front();
+  newBuilder.setInsertionPointToStart(entryBlock);
+  Value zero = newBuilder.create<arith::ConstantIntOp>(loc, 0, 64);
+  Value one = newBuilder.create<arith::ConstantIntOp>(loc, 1, 64);
+  Value begin = zero;
+
+  auto argPos = initFunc.getArguments().size();
+
+  // Detect errors in kernel passed to get_state.
+  std::function<void(Block &)> processInner = [&](Block &block) {
+    for (auto &op : block) {
+      for (auto &region : op.getRegions())
+        for (auto &b : region)
+          processInner(b);
+
+      // Don't allow returns in inner scopes
+      if (auto retOp = dyn_cast<func::ReturnOp>(&op))
+        calleeFunc.emitError("Encountered return in inner scope in a kernel "
+                             "passed to get_state");
+    }
+  };
+
+  for (auto &op : calleeFunc.getRegion().front())
+    for (auto &region : op.getRegions())
+      for (auto &b : region)
+        processInner(b);
+
+  // Process outer block to initialize the allocation passed as an argument.
+  std::function<void(Block &)> process = [&](Block &block) {
+    SmallVector<Operation *> cleanUps;
+    Operation *replacedReturn = nullptr;
+
+    Value arg;
+    Value subArg;
+    Value blockBegin = begin;
+    Value blockAllocSize = zero;
+    for (auto &op : block) {
+      if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
+        newBuilder.setInsertionPointAfter(alloc);
+
+        if (!arg) {
+          initFunc.insertArgument(argPos, retTy, {}, loc);
+          arg = initFunc.getArgument(argPos);
+        }
+
+        auto allocSize = alloc.getSize();
+        auto offset = newBuilder.create<arith::SubIOp>(loc, allocSize, one);
+        subArg =
+            newBuilder.create<quake::SubVeqOp>(loc, retTy, arg, begin, offset);
+        alloc.replaceAllUsesWith(subArg);
+        cleanUps.push_back(alloc);
+        begin = newBuilder.create<arith::AddIOp>(loc, begin, allocSize);
+        blockAllocSize =
+            newBuilder.create<arith::AddIOp>(loc, blockAllocSize, allocSize);
+      }
+
+      if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
+        if (retOp != replacedReturn) {
+          newBuilder.setInsertionPointAfter(retOp);
+
+          auto offset =
+              newBuilder.create<arith::SubIOp>(loc, blockAllocSize, one);
+          Value ret = newBuilder.create<quake::SubVeqOp>(loc, retTy, arg,
+                                                         blockBegin, offset);
+
+          assert(arg && "No veq allocations found");
+          replacedReturn = newBuilder.create<func::ReturnOp>(loc, ret);
+          cleanUps.push_back(retOp);
+        }
+      }
+    }
+
+    for (auto &op : cleanUps) {
+      op->dropAllReferences();
+      op->dropAllUses();
+      op->erase();
+    }
+  };
 
-// Process the function body
-process(initFunc.getRegion().front());
+  // Process the function body
+  process(initFunc.getRegion().front());
 }
 
 /// Create callee.num_qubits_N that calculates the number of qubits to
@@ -248,92 +245,90 @@ process(initFunc.getRegion().front());
 /// }
 // clang-format on
 static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
-       func::FuncOp calleeFunc,
-       StringRef numQubitsKernelName) {
-OpBuilder::InsertionGuard guard(builder);
-builder.setInsertionPointToEnd(moduleOp.getBody());
-
-auto ctx = builder.getContext();
-auto loc = builder.getUnknownLoc();
-
-auto numQubitsFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
-
-auto argTypes = calleeFunc.getArgumentTypes();
-auto retType = builder.getI64Type();
-auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retType});
-
-numQubitsFunc.setName(numQubitsKernelName);
-numQubitsFunc.setType(funcTy);
-numQubitsFunc.setPrivate();
-
-OpBuilder newBuilder(ctx);
-
-auto *entryBlock = &numQubitsFunc.getRegion().front();
-newBuilder.setInsertionPointToStart(entryBlock);
-Value size = newBuilder.create<arith::ConstantIntOp>(loc, 0, retType);
-
-// Process block recursively to calculate and return allocation size
-// and remove everything else.
-std::function<void(Block &)> process = [&](Block &block) {
-SmallVector<Operation *> used;
-Operation *replacedReturn = nullptr;
-
-for (auto &op : block) {
-// Calculate allocation size (existing allocation size plus new one)
-if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
-auto allocSize = alloc.getSize();
-newBuilder.setInsertionPointAfter(alloc);
-size = newBuilder.create<arith::AddIOp>(loc, size, allocSize);
-}
+                                func::FuncOp calleeFunc,
+                                StringRef numQubitsKernelName) {
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToEnd(moduleOp.getBody());
 
-// Return allocation size
-if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
-if (retOp != replacedReturn) {
+  auto ctx = builder.getContext();
+  auto loc = builder.getUnknownLoc();
 
-newBuilder.setInsertionPointAfter(retOp);
-auto newRet = newBuilder.create<func::ReturnOp>(loc, size);
-replacedReturn = newRet;
-used.push_back(newRet);
-}
-}
-}
+  auto numQubitsFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
 
-// Collect all ops needed for size calculation
-SmallVector<Operation *> keep;
-while (!used.empty()) {
-auto *op = used.pop_back_val();
-keep.push_back(op);
-for (auto opnd : op->getOperands())
-if (auto defOp = opnd.getDefiningOp())
-used.push_back(defOp);
-}
+  auto argTypes = calleeFunc.getArgumentTypes();
+  auto retType = builder.getI64Type();
+  auto funcTy = FunctionType::get(ctx, argTypes, TypeRange{retType});
 
-// Remove the rest of the ops
-SmallVector<Operation *> toErase;
-for (auto &op : block)
-if (std::find(keep.begin(), keep.end(), &op) == keep.end())
-toErase.push_back(&op);
+  numQubitsFunc.setName(numQubitsKernelName);
+  numQubitsFunc.setType(funcTy);
+  numQubitsFunc.setPrivate();
 
-for (auto &op : toErase) {
-op->dropAllReferences();
-op->dropAllUses();
-op->erase();
-}
-};
+  OpBuilder newBuilder(ctx);
+
+  auto *entryBlock = &numQubitsFunc.getRegion().front();
+  newBuilder.setInsertionPointToStart(entryBlock);
+  Value size = newBuilder.create<arith::ConstantIntOp>(loc, 0, retType);
+
+  // Process the given block to calculate and return the total allocation
+  // size, and remove everything not needed for that calculation.
+  std::function<void(Block &)> process = [&](Block &block) {
+    SmallVector<Operation *> used;
+    Operation *replacedReturn = nullptr;
+
+    for (auto &op : block) {
+      // Calculate allocation size (existing allocation size plus new one)
+      if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
+        auto allocSize = alloc.getSize();
+        newBuilder.setInsertionPointAfter(alloc);
+        size = newBuilder.create<arith::AddIOp>(loc, size, allocSize);
+      }
 
-// Process the function body
-process(numQubitsFunc.getRegion().front());
+      // Return allocation size
+      if (auto retOp = dyn_cast<func::ReturnOp>(&op)) {
+        if (retOp != replacedReturn) {
+
+          newBuilder.setInsertionPointAfter(retOp);
+          auto newRet = newBuilder.create<func::ReturnOp>(loc, size);
+          replacedReturn = newRet;
+          used.push_back(newRet);
+        }
+      }
+    }
+
+    // Collect all ops needed for size calculation
+    SmallVector<Operation *> keep;
+    while (!used.empty()) {
+      auto *op = used.pop_back_val();
+      keep.push_back(op);
+      for (auto opnd : op->getOperands())
+        if (auto defOp = opnd.getDefiningOp())
+          used.push_back(defOp);
+    }
+
+    // Remove the rest of the ops
+    SmallVector<Operation *> toErase;
+    for (auto &op : block)
+      if (std::find(keep.begin(), keep.end(), &op) == keep.end())
+        toErase.push_back(&op);
+
+    for (auto &op : toErase) {
+      op->dropAllReferences();
+      op->dropAllUses();
+      op->erase();
+    }
+  };
+
+  // Process the function body
+  process(numQubitsFunc.getRegion().front());
 }
 
 static Value genConstant(OpBuilder &builder, const cudaq::state *v,
-                         llvm::DataLayout &layout, StringRef kernelName, ModuleOp substMod,
+                         llvm::DataLayout &layout, StringRef kernelName,
+                         ModuleOp substMod,
                          cudaq::opt::ArgumentConverter &converter) {
   auto simState =
       cudaq::state_helper::getSimulationState(const_cast<cudaq::state *>(v));
 
-  //auto kernelName = converter.getKernelName();
-  //auto substMod = converter.getSubstitutionModule();
-
   // If the state has amplitude data, we materialize the data as a state
   // vector and create a new state from it.
   if (simState->hasData()) {
@@ -508,11 +503,11 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
 
       // Convert arguments for `callee.init_N`.
       auto &registeredInitName = converter.registerKernel(initName);
-      converter.gen(registeredInitName, calleeArgs);
+      converter.gen(registeredInitName, substMod, calleeArgs);
 
       // Convert arguments for `callee.num_qubits_N`.
-      auto &registeredNumQubitsName = converter.registerKernel(initName);
-      converter.gen(registeredNumQubitsName, calleeArgs);
+      auto &registeredNumQubitsName = converter.registerKernel(numQubitsName);
+      converter.gen(registeredNumQubitsName, substMod, calleeArgs);
     }
 
     // Create a substitution for the state pointer.
@@ -698,24 +693,27 @@ Value genConstant(OpBuilder &builder, cudaq::cc::IndirectCallableType indCallTy,
 
 cudaq::opt::ArgumentConverter::ArgumentConverter(StringRef kernelName,
                                                  ModuleOp sourceModule)
-    : sourceModule(sourceModule), builder(sourceModule.getContext()),
-      kernelName(kernelName) {
-}
+    : sourceModule(sourceModule), kernelName(kernelName) {}
 
 void cudaq::opt::ArgumentConverter::gen(const std::vector<void *> &arguments) {
-  gen(kernelName, arguments);
+  gen(kernelName, sourceModule, arguments);
 }
 
-void cudaq::opt::ArgumentConverter::gen(StringRef kernelName, const std::vector<void *> &arguments) {
-  auto *ctx = builder.getContext();
-  // We should look up the input type signature here.
-  auto &kernelInfo = addKernelInfo(kernelName);
-  auto substModule = kernelInfo.getSubstitutionModule();
+void cudaq::opt::ArgumentConverter::gen(StringRef kernelName,
+                                        ModuleOp sourceModule,
+                                        const std::vector<void *> &arguments) {
+  auto *ctx = sourceModule.getContext();
+  OpBuilder builder(ctx);
+  ModuleOp substModule =
+      builder.create<mlir::ModuleOp>(builder.getUnknownLoc());
+  auto &kernelInfo = addKernelInfo(kernelName, substModule);
 
+  // We should look up the input type signature here.
   auto fun = sourceModule.lookupSymbol<func::FuncOp>(
       cudaq::runtime::cudaqGenPrefixName + kernelName.str());
   if (!fun) {
-    throw std::runtime_error("missing fun in argument conversion: " + kernelName.str());
+    throw std::runtime_error("missing fun in argument conversion: " +
+                             kernelName.str());
   }
 
   FunctionType fromFuncTy = fun.getFunctionType();
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 2be7ba579dc..9252ee1b8a2 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -19,34 +19,34 @@
 
 namespace cudaq::opt {
 
-  
-class KernelInfo {
-  public:
-    KernelInfo(mlir::OpBuilder builder, mlir::StringRef kernelName)
-    :  kernelName(kernelName) {
-      substModule = builder.create<mlir::ModuleOp>(builder.getUnknownLoc());
-    }
-  
-    /// Some substitutions may generate global constant information. Use this
-    /// interface to access both the substitutions and any global constants
-    /// created.
-    mlir::ModuleOp getSubstitutionModule() {
-      return substModule;
-    }
-
-    /// Get the list of substitutions for this kernel that were generated
-    /// by `ArgumentConverter::gen()`.
-    mlir::SmallVector<cc::ArgumentSubstitutionOp> &getSubstitutions() {
-      return substitutions;
-    }
-
-  private:
-    mlir::ModuleOp substModule;
-    mlir::StringRef kernelName;
-    mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
-  };
-
-  
+class ArgumentConverter;
+
+class KernelSubstitutionInfo {
+public:
+  KernelSubstitutionInfo(mlir::StringRef kernelName, mlir::ModuleOp substModule)
+      : kernelName(kernelName), substModule(substModule) {}
+
+  /// Some substitutions may generate global constant information. Use this
+  /// interface to access both the substitutions and any global constants
+  /// created.
+  mlir::ModuleOp getSubstitutionModule() { return substModule; }
+
+  /// Get the list of substitutions for this kernel that were generated
+  /// by `ArgumentConverter::gen()`.
+  mlir::SmallVector<cc::ArgumentSubstitutionOp> &getSubstitutions() {
+    return substitutions;
+  }
+
+  mlir::StringRef getKernelName() { return kernelName; }
+
+private:
+  mlir::StringRef kernelName;
+  mlir::ModuleOp substModule;
+  mlir::SmallVector<cc::ArgumentSubstitutionOp> substitutions;
+
+  friend ArgumentConverter;
+};
+
 class ArgumentConverter {
 public:
   /// Build an instance to create argument substitutions for a specified \p
@@ -59,7 +59,8 @@ class ArgumentConverter {
 
   /// Generate a substitution ModuleOp for the vector of arguments presented.
   /// The arguments are those presented to the kernel, kernelName.
-  void gen(mlir::StringRef kernelName, const std::vector<void *> &arguments);
+  void gen(mlir::StringRef kernelName, mlir::ModuleOp sourceModule,
+           const std::vector<void *> &arguments);
 
   /// Generate a substitution ModuleOp but include only the arguments that do
   /// not appear in the set of \p exclusions.
@@ -70,38 +71,38 @@ class ArgumentConverter {
   /// and thereby exclude them from the substitutions.
   void gen_drop_front(const std::vector<void *> &arguments, unsigned numDrop);
 
-  /// Kernel we are converting the arguments for.
-  mlir::StringRef getKernelName() { return kernelName; }
-
-  /// Get the map of kernel names to their kernel info that
-  /// were collected by `collect()`.
-   mlir::DenseMap<mlir::StringRef, KernelInfo>& getKernelInfo() {
-    return kernelInfo;
+  /// Get the kernel substitution info that was collected by `gen()`.
+  std::list<KernelSubstitutionInfo> &getKernelSubstitutions() {
+    return kernelSubstitutions;
   }
 
   bool isRegisteredKernel(const std::string &kernelName) {
-    return std::find(nameRegistry.begin(), nameRegistry.end(), kernelName) != nameRegistry.end();
+    return std::find(nameRegistry.begin(), nameRegistry.end(), kernelName) !=
+           nameRegistry.end();
   }
 
   std::string &registerKernel(const std::string &kernelName) {
     return nameRegistry.emplace_back(kernelName);
   }
 
-  KernelInfo& addKernelInfo(mlir::StringRef kernelName) {
-    auto [it,b] = kernelInfo.try_emplace(kernelName, std::move(KernelInfo(builder, kernelName)));
-    return it->second;
+private:
+  KernelSubstitutionInfo &addKernelInfo(mlir::StringRef kernelName,
+                                        mlir::ModuleOp substModule) {
+    return kernelSubstitutions.emplace_back(kernelName, substModule);
   }
 
-  private:
   /// Memory to store new kernel names generated during argument conversion.
+  /// Use list here to keep references to those elements valid.
   std::list<std::string> nameRegistry;
 
-  /// Kernel info for kernels we are converting the arguments for, including
-  /// new kernels generated from state arguments.
-  mlir::DenseMap<mlir::StringRef, KernelInfo> kernelInfo;
+  /// Memory to store new kernel info generated during argument conversion.
+  /// Use list here to keep elements sorted in order of creation.
+  std::list<KernelSubstitutionInfo> kernelSubstitutions;
 
+  /// Original module before substitutions.
   mlir::ModuleOp sourceModule;
-  mlir::OpBuilder builder;
+
+  /// Kernel we are substituting the arguments for.
   mlir::StringRef kernelName;
 };
 
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 6937c43233e..94ebdbaab2d 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -466,10 +466,10 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         // We pass string references to the `createArgumentSynthesisPass`.
         mlir::SmallVector<std::string> kernels;
         mlir::SmallVector<std::string> substs;
-        for (auto &[kName, kInfo] : argCon.getKernelInfo()) {
+        for (auto &kInfo : argCon.getKernelSubstitutions()) {
           {
-            std::string kernName =
-                cudaq::runtime::cudaqGenPrefixName + kName.str();
+            std::string kernName = cudaq::runtime::cudaqGenPrefixName +
+                                   kInfo.getKernelName().str();
             kernels.emplace_back(kernName);
           }
           {
@@ -482,9 +482,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
 
         // Collect references for the argument synthesis.
         mlir::SmallVector<mlir::StringRef> kernelRefs{kernels.begin(),
-                                                     kernels.end()};
+                                                      kernels.end()};
         mlir::SmallVector<mlir::StringRef> substRefs{substs.begin(),
-                                                         substs.end()};
+                                                     substs.end()};
         pm.addPass(opt::createArgumentSynthesisPass(kernelRefs, substRefs));
         pm.addPass(opt::createDeleteStates());
         pm.addNestedPass<mlir::func::FuncOp>(
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index f5f63d132b2..fdb34719420 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -190,10 +190,10 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
           // We pass string references to the `createArgumentSynthesisPass`.
           mlir::SmallVector<std::string> kernels;
           mlir::SmallVector<std::string> substs;
-          for (auto &[kName, kInfo] : argCon.getKernelInfo()) {
+          for (auto &kInfo : argCon.getKernelSubstitutions()) {
             {
-              std::string kernName =
-                  cudaq::runtime::cudaqGenPrefixName + kName.str();
+              std::string kernName = cudaq::runtime::cudaqGenPrefixName +
+                                     kInfo.getKernelName().str();
               kernels.emplace_back(kernName);
             }
             {
@@ -203,17 +203,17 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
               substs.emplace_back(substBuff);
             }
           }
-  
+
           // Collect references for the argument synthesis.
           mlir::SmallVector<mlir::StringRef> kernelRefs{kernels.begin(),
-            kernels.end()};
+                                                        kernels.end()};
           mlir::SmallVector<mlir::StringRef> substRefs{substs.begin(),
-                        substs.end()};
+                                                       substs.end()};
           pm.addPass(opt::createArgumentSynthesisPass(kernelRefs, substRefs));
           pm.addPass(mlir::createCanonicalizerPass());
           pm.addPass(opt::createDeleteStates());
           pm.addNestedPass<mlir::func::FuncOp>(
-            opt::createReplaceStateWithKernel());
+              opt::createReplaceStateWithKernel());
           pm.addPass(mlir::createSymbolDCEPass());
         } else if (args) {
           cudaq::info("Run Quake Synth.\n");
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index afe24478d11..75e7eaf96b8 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -143,12 +143,12 @@ class FakeDeviceState : public cudaq::SimulationState {
 extern "C" void __cudaq_deviceCodeHolderAdd(const char *, const char *);
 
 void dumpSubstitutionModules(cudaq::opt::ArgumentConverter &con) {
-  for (auto &[kName, kInfo] : con.getKernelInfo()) {
+  for (auto &kInfo : con.getKernelSubstitutions()) {
     // Dump the conversions
     llvm::outs() << "========================================\n"
                     "Substitution module:\n"
-                << kName << "\n"
-                << kInfo.getSubstitutionModule() << '\n';
+                 << kInfo.getKernelName() << "\n"
+                 << kInfo.getSubstitutionModule() << '\n';
   }
 }
 

From 6fefc2715efebad2b25be8628860f7e36d17b5e2 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Mon, 10 Mar 2025 14:40:39 -0700
Subject: [PATCH 43/54] Fix null alloc size and add tests

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/ArgumentConversion.cpp     |  7 ++
 runtime/test/test_argument_conversion.cpp | 92 +++++++++++++++++++----
 2 files changed, 85 insertions(+), 14 deletions(-)

diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index e563a90f99f..80440e3d57a 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -185,6 +185,10 @@ static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
         }
 
         auto allocSize = alloc.getSize();
+        if (!allocSize)
+          allocSize = newBuilder.create<arith::ConstantIntOp>(
+              loc, quake::getAllocationSize(alloc.getType()), 64);
+
         auto offset = newBuilder.create<arith::SubIOp>(loc, allocSize, one);
         subArg =
             newBuilder.create<quake::SubVeqOp>(loc, retTy, arg, begin, offset);
@@ -279,6 +283,9 @@ static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
       // Calculate allocation size (existing allocation size plus new one)
       if (auto alloc = dyn_cast<quake::AllocaOp>(&op)) {
         auto allocSize = alloc.getSize();
+        if (!allocSize)
+          allocSize = newBuilder.create<arith::ConstantIntOp>(
+              loc, quake::getAllocationSize(alloc.getType()), 64);
         newBuilder.setInsertionPointAfter(alloc);
         size = newBuilder.create<arith::AddIOp>(loc, size, allocSize);
       }
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 75e7eaf96b8..825e08152ed 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -515,18 +515,82 @@ void test_simulation_state(mlir::MLIRContext *ctx) {
 void test_quantum_state(mlir::MLIRContext *ctx) {
   {
     // @cudaq.kernel
-    // def init(n: int):
+    // def init():
+    //    q = cudaq.qvector(2)
+    //
+    // def kernel(s: cudaq.State):
+    //   ...
+    //
+    // s = cudaq.get_state(init)
+    // cudaq.sample(kernel, s)
+    auto init = "init";
+    auto initCode = "func.func private @__nvqpp__mlirgen__init() {\n"
+                    "  %0 = quake.alloca !quake.veq<2>\n"
+                    "  return\n"
+                    "}\n";
+    __cudaq_deviceCodeHolderAdd(init, initCode);
+
+    std::int64_t n = 2;
+    std::vector<void *> a = {static_cast<void *>(&n)};
+    auto s = cudaq::state(new FakeDeviceState(init, a));
+    std::vector<void *> v = {static_cast<void *>(&s)};
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
+  }
+
+  // clang-format off
+// CHECK:       Source module:
+// CHECK:         func.func private @__nvqpp__mlirgen__init() {
+// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
+// CHECK:           return
+// CHECK:         }
+// CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
+
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         testy
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init.init_[[HASH_0]](%arg0: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = arith.constant 2 : i64
+// CHECK:           %[[VAL_3:.*]] = arith.subi %[[VAL_2]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_4:.*]] = quake.subveq %arg0, %[[VAL_0]], %[[VAL_3]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           %[[VAL_5:.*]] = arith.addi %[[VAL_0]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_6:.*]] = arith.addi %[[VAL_0]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_7:.*]] = arith.subi %[[VAL_6]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_8:.*]] = quake.subveq %arg0, %[[VAL_0]], %[[VAL_7]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           return %[[VAL_8]] : !quake.veq<?>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init.num_qubits_[[HASH_0]]() -> i64 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 2 : i64
+// CHECK:           %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i64
+// CHECK:           return %[[VAL_2]] : i64
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init.init_[[HASH_0]]
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init.num_qubits_[[HASH_0]]
+  // clang-format on
+
+  {
+    // @cudaq.kernel
+    // def init0(n: int):
     //    q = cudaq.qvector(n)
     //    x(q[0])
     //
     // def kernel(s: cudaq.State):
     //   ...
     //
-    // s = cudaq.get_state(init, 2)
+    // s = cudaq.get_state(init0, 2)
     // cudaq.sample(kernel, s)
-    auto init = "init";
+    auto init = "init0";
     auto initCode =
-        "func.func private @__nvqpp__mlirgen__init(%arg0: i64) {\n"
+        "func.func private @__nvqpp__mlirgen__init0(%arg0: i64) {\n"
         "  %0 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
         "  %1 = quake.extract_ref %0[0] : (!quake.veq<?>) -> !quake.ref\n"
         "  quake.x %1 : (!quake.ref) -> ()\n"
@@ -543,7 +607,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 
   // clang-format off
 // CHECK:       Source module:
-// CHECK:         func.func private @__nvqpp__mlirgen__init(%arg0: i64) {
+// CHECK:         func.func private @__nvqpp__mlirgen__init0(%arg0: i64) {
 // CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
 // CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
 // CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
@@ -555,9 +619,9 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init0.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init0.init_[[HASH_0]] : !cc.ptr<!cc.state>
 // CHECK:         }
-// CHECK:         func.func private @__nvqpp__mlirgen__init.init_[[HASH_0]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:         func.func private @__nvqpp__mlirgen__init0.init_[[HASH_0]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
 // CHECK:           %[[VAL_2:.*]] = arith.subi %arg0, %[[VAL_1]] : i64
@@ -570,20 +634,20 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:           %[[VAL_8:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_7]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
 // CHECK:           return %[[VAL_8]] : !quake.veq<?>
 // CHECK:         }
-// CHECK:         func.func private @__nvqpp__mlirgen__init.num_qubits_[[HASH_0]](%arg0: i64) -> i64 {
+// CHECK:         func.func private @__nvqpp__mlirgen__init0.num_qubits_[[HASH_0]](%arg0: i64) -> i64 {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
 // CHECK:           return %[[VAL_1]] : i64
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         init.init_[[HASH_0]]
+// CHECK:         init0.init_[[HASH_0]]
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         init.num_qubits_[[HASH_0]]
+// CHECK:         init0.num_qubits_[[HASH_0]]
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
@@ -591,7 +655,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 
   {
     // @cudaq.kernel
-    // def init(n: int):
+    // def init1(n: int):
     //    q = cudaq.qvector(n)
     //    x(q[0])
     //
@@ -602,7 +666,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     // def kernel(s: cudaq.State):
     //   ...
     //
-    // s0 = cudaq.get_state(init, 2)
+    // s0 = cudaq.get_state(init1, 2)
     // s1 = cudaq.get_state(state_param, s0)
     // cudaq.sample(kernel, s1)
     auto init = "init1";
@@ -730,7 +794,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 
   {
     // @cudaq.kernel
-    // def init(n: int):
+    // def init2(n: int):
     //    q0 = cudaq.qvector(n)
     //    x(q0[0])
     //    r = mz(q0[0])
@@ -742,7 +806,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     // def kernel(s: cudaq.State):
     //   ...
     //
-    // s = cudaq.get_state(init, 2)
+    // s = cudaq.get_state(init2, 2)
     // cudaq.sample(kernel, s)
     auto init = "init2";
     auto initCode =

From 55e25dd33ae98019549123d581ad2a2fffc6955e Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 12 Mar 2025 16:25:36 -0700
Subject: [PATCH 44/54] Keep storing ops when generating numQubits func

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/ArgumentConversion.cpp     |  33 ++++---
 runtime/test/test_argument_conversion.cpp | 111 ++++++++++++++--------
 2 files changed, 90 insertions(+), 54 deletions(-)

diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 80440e3d57a..c06970c862a 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -100,17 +100,16 @@ static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *,
                          ModuleOp substMod, llvm::DataLayout &);
 
 /// Create callee.init_N that initializes the state
-/// Callee (the kernel captured by state):
+///
 // clang-format off
+/// Callee (the kernel captured by state):
 /// func.func @callee(%arg0: i64) {
-///   %0 = cc.alloca i64
-///   cc.store %arg0, %0 : !cc.ptr<i64>
-///   %1 = cc.load %0 : !cc.ptr<i64>
-///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///   %2 = quake.alloca !quake.veq<?>[%arg0 : i64]
 ///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
 ///   quake.x %3 : (!quake.ref) -> ()
 ///   return
 /// }
+///
 /// callee.init_N:
 /// func.func private @callee.init_0(%arg0: !quake.veq<?>, %arg0: i64) ->
 /// !!quake.veq<?> {
@@ -228,13 +227,11 @@ static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
 
 /// Create callee.num_qubits_N that calculates the number of qubits to
 /// initialize the state
-/// Callee: (the kernel captured by state):
+///
 // clang-format off
+/// Callee (the kernel captured by state):
 /// func.func @callee(%arg0: i64) {
-///   %0 = cc.alloca i64
-///   cc.store %arg0, %0 : !cc.ptr<i64>
-///   %1 = cc.load %0 : !cc.ptr<i64>
-///   %2 = quake.alloca !quake.veq<?>[%1 : i64]
+///   %2 = quake.alloca !quake.veq<?>[%arg0 : i64]
 ///   %3 = quake.extract_ref %2[1] : (!quake.veq<?>) -> !quake.ref
 ///   quake.x %3 : (!quake.ref) -> ()
 ///   return
@@ -242,10 +239,7 @@ static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
 ///
 /// callee.num_qubits_0:
 /// func.func private @callee.num_qubits_0(%arg0: i64) -> i64 {
-///   %0 = cc.alloca i64
-///   cc.store %arg0, %0 : !cc.ptr<i64>
-///   %1 = cc.load %0 : !cc.ptr<i64>
-///   return %1 : i64
+///   return %arg0 : i64
 /// }
 // clang-format on
 static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
@@ -306,10 +300,21 @@ static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
     SmallVector<Operation *> keep;
     while (!used.empty()) {
       auto *op = used.pop_back_val();
+      if (std::find(keep.begin(), keep.end(), op) != keep.end())
+        continue;
+
       keep.push_back(op);
+
+      // Collect ops creating operands used in ops we already collected
       for (auto opnd : op->getOperands())
         if (auto defOp = opnd.getDefiningOp())
           used.push_back(defOp);
+
+      // Collect ops that store into memory used in ops we already collected.
+      for (auto user : op->getUsers())
+        if (auto iface = dyn_cast<MemoryEffectOpInterface>(user))
+          if (iface.hasEffect<MemoryEffects::Write>())
+            used.push_back(user);
     }
 
     // Remove the rest of the ops
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 825e08152ed..e66a9d37a8f 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -513,6 +513,7 @@ void test_simulation_state(mlir::MLIRContext *ctx) {
 }
 
 void test_quantum_state(mlir::MLIRContext *ctx) {
+
   {
     // @cudaq.kernel
     // def init():
@@ -530,19 +531,13 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
                     "}\n";
     __cudaq_deviceCodeHolderAdd(init, initCode);
 
-    std::int64_t n = 2;
-    std::vector<void *> a = {static_cast<void *>(&n)};
-    auto s = cudaq::state(new FakeDeviceState(init, a));
+    auto s = cudaq::state(new FakeDeviceState(init, {}));
     std::vector<void *> v = {static_cast<void *>(&s)};
     doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
   }
 
   // clang-format off
 // CHECK:       Source module:
-// CHECK:         func.func private @__nvqpp__mlirgen__init() {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           return
-// CHECK:         }
 // CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
 
 // CHECK:         ========================================
@@ -607,12 +602,6 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 
   // clang-format off
 // CHECK:       Source module:
-// CHECK:         func.func private @__nvqpp__mlirgen__init0(%arg0: i64) {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
-// CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
-// CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
-// CHECK:           return
-// CHECK:         }
 // CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
 
 // CHECK:         ========================================
@@ -708,18 +697,6 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 
   // clang-format off
 // CHECK:       Source module:
-// CHECK:         func.func private @__nvqpp__mlirgen__init1(%arg0: i64) {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
-// CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
-// CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
-// CHECK:           return
-// CHECK:         }
-// CHECK:         func.func private @__nvqpp__mlirgen__state_param(%arg0: !cc.ptr<!cc.state>) {
-// CHECK:           %[[VAL_0:.*]] = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
-// CHECK:           %[[VAL_1:.*]] = quake.alloca !quake.veq<?>[%[[VAL_0]] : i64]
-// CHECK:           %[[VAL_2:.*]] = quake.init_state %[[VAL_1]], %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
-// CHECK:           return
-// CHECK:         }
 // CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
 
 // CHECK:         ========================================
@@ -838,21 +815,6 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 
   // clang-format off
 // CHECK:       Source module:
-// CHECK:         func.func private @__nvqpp__mlirgen__init2(%arg0: i64) {
-// CHECK:           %[[VAL_0:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
-// CHECK:           %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<?>) -> !quake.ref
-// CHECK:           quake.x %[[VAL_1]] : (!quake.ref) -> ()
-// CHECK:           %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "q0" : (!quake.ref) -> !quake.measure
-// CHECK:           %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measure) -> i1
-// CHECK:           cc.if(%[[VAL_3]]) {
-// CHECK:             %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%arg0 : i64]
-// CHECK:             %[[VAL_5:.*]] = quake.extract_ref %[[VAL_4]][0] : (!quake.veq<?>) -> !quake.ref
-// CHECK:             quake.x %[[VAL_5]] : (!quake.ref) -> ()
-// CHECK:             %[[VAL_6:.*]] = quake.extract_ref %[[VAL_0]][1] : (!quake.veq<?>) -> !quake.ref
-// CHECK:             quake.y %[[VAL_6]] : (!quake.ref) -> ()
-// CHECK:           }
-// CHECK:           return
-// CHECK:         }
 // CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
 
 // CHECK:         ========================================
@@ -901,6 +863,75 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
   // clang-format on
+
+  {
+    // (No memtoreg pass before argument conversion)
+    // @cudaq.kernel
+    // def init3(n: int):
+    //    q0 = cudaq.qvector(n)
+    //
+    // def kernel(s: cudaq.State):
+    //   ...
+    //
+    // s = cudaq.get_state(init3, 2)
+    // cudaq.sample(kernel, s)
+    auto init = "init3";
+    auto initCode = " func.func @__nvqpp__mlirgen__init3(%arg0: i64) {\n"
+                    "   %0 = cc.alloca i64\n"
+                    "   cc.store %arg0, %0 : !cc.ptr<i64>\n"
+                    "   %1 = cc.load %0 : !cc.ptr<i64>\n"
+                    "   %2 = quake.alloca !quake.veq<?>[%1 : i64]\n"
+                    "   return\n"
+                    "}\n";
+
+    __cudaq_deviceCodeHolderAdd(init, initCode);
+
+    std::int64_t n = 2;
+    std::vector<void *> a = {static_cast<void *>(&n)};
+    auto s = cudaq::state(new FakeDeviceState(init, a));
+    std::vector<void *> v = {static_cast<void *>(&s)};
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v, initCode);
+  }
+
+  // clang-format off
+// CHECK:       Source module:
+// CHECK:         func.func private @callee(!cc.ptr<!cc.state>)
+
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         testy
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init3.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init3.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init3.init_[[HASH_0]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = cc.alloca i64
+// CHECK:           cc.store %arg0, %[[VAL_2]] : !cc.ptr<i64>
+// CHECK:           %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr<i64>
+// CHECK:           %[[VAL_4:.*]] = arith.subi %[[VAL_3]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_5:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_4]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           %[[VAL_6:.*]] = arith.addi %[[VAL_0]], %[[VAL_3]] : i64
+// CHECK:           %[[VAL_7:.*]] = arith.addi %[[VAL_0]], %[[VAL_3]] : i64
+// CHECK:           %[[VAL_8:.*]] = arith.subi %[[VAL_7]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_9:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_8]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           return %[[VAL_9]] : !quake.veq<?>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init3.num_qubits_[[HASH_0]](%arg0: i64) -> i64 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = cc.alloca i64
+// CHECK:           cc.store %arg0, %[[VAL_1]] : !cc.ptr<i64>
+// CHECK:           %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i64>
+// CHECK:           %[[VAL_3:.*]] = arith.addi %[[VAL_0]], %[[VAL_2]] : i64
+// CHECK:           return %[[VAL_3]] : i64
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init3.init_[[HASH_0]]
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         init3.num_qubits_[[HASH_0]]
+  // clang-format on
 }
 
 void test_combinations(mlir::MLIRContext *ctx) {

From 745fc5d9cc7fb15815a39c65a9ddbb1850774e21 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 12 Mar 2025 16:31:47 -0700
Subject: [PATCH 45/54] Cleanup

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 include/cudaq/Optimizer/Transforms/Passes.td | 2 +-
 runtime/common/ArgumentConversion.cpp        | 3 +--
 runtime/common/BaseRemoteRESTQPU.h           | 7 +------
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 715c50623e6..243e70bfaf0 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -874,7 +874,7 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
 
     The `quake.materialize_state` operation accepts symbols for the synthesized
     kernels `@num_qubits` and `@init` that argument synthesis generated from
-    the original kernel call that generated the state, e.g., 
+    the original kernel call that generated the state, e.g.,
     the `cudaq::get_state` call that refers to the result of a specific quantum
     kernel being invoked with a set of parameters
 
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index c06970c862a..bdeeafdcc96 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -723,10 +723,9 @@ void cudaq::opt::ArgumentConverter::gen(StringRef kernelName,
   // We should look up the input type signature here.
   auto fun = sourceModule.lookupSymbol<func::FuncOp>(
       cudaq::runtime::cudaqGenPrefixName + kernelName.str());
-  if (!fun) {
+  if (!fun)
     throw std::runtime_error("missing fun in argument conversion: " +
                              kernelName.str());
-  }
 
   FunctionType fromFuncTy = fun.getFunctionType();
   for (auto iter :
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 94ebdbaab2d..64f68cb6021 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -586,8 +586,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
     } else
       modules.emplace_back(kernelName, moduleOp);
 
-    std::cout << "Modules: " << modules.size() << std::endl;
-
     if (emulate) {
       // If we are in emulation mode, we need to first get a full QIR
       // representation of the code. Then we'll map to an LLVM Module, create a
@@ -714,7 +712,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             std::vector<cudaq::ExecutionResult> results;
 
             // If seed is 0, then it has not been set.
-            if (seed == 0)
+            if (seed > 0)
               cudaq::set_random_seed(seed);
 
             bool hasConditionals =
@@ -722,7 +720,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
             if (hasConditionals && isObserve)
               throw std::runtime_error("error: spin_ops not yet supported with "
                                        "kernels containing conditionals");
-
             if (hasConditionals) {
               executor->setShots(1); // run one shot at a time
 
@@ -748,8 +745,6 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
                       counts.sequential_data(regName);
                 }
               }
-              localJIT.clear();
-              return cudaq::sample_result(results);
             }
 
             for (std::size_t i = 0; i < codes.size(); i++) {

From b8c8c8a0d2794136f031e8d11774dadc82760b05 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 12 Mar 2025 16:56:28 -0700
Subject: [PATCH 46/54] Cleanup

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 test/Quake/arg_subst-5.txt | 2 +-
 test/Quake/arg_subst-6.txt | 2 +-
 test/Quake/arg_subst-7.txt | 2 +-
 test/Quake/arg_subst-8.txt | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/Quake/arg_subst-5.txt b/test/Quake/arg_subst-5.txt
index b1383d071d3..2d6ce5c6cf1 100644
--- a/test/Quake/arg_subst-5.txt
+++ b/test/Quake/arg_subst-5.txt
@@ -1,5 +1,5 @@
 // ========================================================================== //
-// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 //
 // All rights reserved.                                                       //
 //                                                                            //
 // This source code and the accompanying materials are made available under   //
diff --git a/test/Quake/arg_subst-6.txt b/test/Quake/arg_subst-6.txt
index 4871d034829..4227fbd386b 100644
--- a/test/Quake/arg_subst-6.txt
+++ b/test/Quake/arg_subst-6.txt
@@ -1,5 +1,5 @@
 // ========================================================================== //
-// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 //
 // All rights reserved.                                                       //
 //                                                                            //
 // This source code and the accompanying materials are made available under   //
diff --git a/test/Quake/arg_subst-7.txt b/test/Quake/arg_subst-7.txt
index a3ed90891ab..58ca8a163e6 100644
--- a/test/Quake/arg_subst-7.txt
+++ b/test/Quake/arg_subst-7.txt
@@ -1,5 +1,5 @@
 // ========================================================================== //
-// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 //
 // All rights reserved.                                                       //
 //                                                                            //
 // This source code and the accompanying materials are made available under   //
diff --git a/test/Quake/arg_subst-8.txt b/test/Quake/arg_subst-8.txt
index 7a53d0369de..26583075911 100644
--- a/test/Quake/arg_subst-8.txt
+++ b/test/Quake/arg_subst-8.txt
@@ -1,5 +1,5 @@
 // ========================================================================== //
-// Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                 //
+// Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 //
 // All rights reserved.                                                       //
 //                                                                            //
 // This source code and the accompanying materials are made available under   //

From edf02247ac698eaa19b9190e46e7a1a0dced39a1 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 14 Mar 2025 09:35:29 -0700
Subject: [PATCH 47/54] Cleanup

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 python/runtime/cudaq/platform/py_alt_launch_kernel.cpp | 2 +-
 runtime/common/BaseRemoteRESTQPU.h                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 5d53c7098cd..00c2bbcc272 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -543,7 +543,7 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   auto isLocalSimulator = platform.is_simulator() && !platform.is_emulated();
   auto isSimulator = isLocalSimulator || isRemoteSimulator;
 
-  auto argCon = cudaq::opt::ArgumentConverter(name, unwrap(module));
+  cudaq::opt::ArgumentConverter argCon(name, unwrap(module));
   argCon.gen(runtimeArgs.getArgs());
 
   // Store kernel and substitution strings on the stack.
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 64f68cb6021..cec3644d19d 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -459,7 +459,7 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         // created from a kernel that generated the state argument.
         // Traverse the list and collect substitutions for all those
         // functions.
-        auto argCon = cudaq::opt::ArgumentConverter(kernelName, moduleOp);
+        cudaq::opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
 
         // Store kernel and substitution strings on the stack.

From 6b5161a74dd1fcddd74e3258b1ec79f5efcc90f4 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 14 Mar 2025 11:43:25 -0700
Subject: [PATCH 48/54] Fix failing test

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/ArgumentConversion.cpp | 3 ++-
 runtime/common/BaseRemoteRESTQPU.h    | 8 +++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index bdeeafdcc96..4c89a185194 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -313,7 +313,8 @@ static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
       // Collect ops that store into memory used in ops we already collected.
       for (auto user : op->getUsers())
         if (auto iface = dyn_cast<MemoryEffectOpInterface>(user))
-          if (iface.hasEffect<MemoryEffects::Write>())
+          if (iface.hasEffect<MemoryEffects::Write>() &&
+              !iface.hasEffect<MemoryEffects::Allocate>())
             used.push_back(user);
     }
 
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index cec3644d19d..56449854431 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -454,11 +454,9 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
       mlir::PassManager pm(&context);
       if (!rawArgs.empty()) {
         cudaq::info("Run Argument Synth.\n");
-        // For quantum devices, create a list of ArgumentConverters
-        // with nodes corresponding to `init` and `num_qubits` functions
-        // created from a kernel that generated the state argument.
-        // Traverse the list and collect substitutions for all those
-        // functions.
+        // For quantum devices, generate the `init` and `num_qubits` functions
+        // (and their substitutions) for each state argument, derived from the
+        // kernel and arguments that produced that state.
         cudaq::opt::ArgumentConverter argCon(kernelName, moduleOp);
         argCon.gen(rawArgs);
 

From 4df4390479fe43ffe3ce74c0beb425eaaec7c4ca Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Fri, 14 Mar 2025 15:41:27 -0700
Subject: [PATCH 49/54] Fix failing doc build

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 docs/sphinx/api/languages/cpp_api.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/sphinx/api/languages/cpp_api.rst b/docs/sphinx/api/languages/cpp_api.rst
index 0c050f4faf7..69332c4dd2a 100644
--- a/docs/sphinx/api/languages/cpp_api.rst
+++ b/docs/sphinx/api/languages/cpp_api.rst
@@ -91,6 +91,8 @@ Common
 
 .. doxygenclass:: cudaq::RemoteSimulationState
 
+.. doxygenclass:: cudaq::QPUState
+
 .. doxygenclass:: cudaq::registry::RegisteredType
     :members:
 

From a5d63c5969b33333d3b29f283b2ebc469bb108f8 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 18 Mar 2025 11:23:16 -0700
Subject: [PATCH 50/54] Address CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 .../cudaq/Optimizer/Dialect/Quake/QuakeOps.td |   4 +-
 include/cudaq/Optimizer/Transforms/Passes.td  |   4 +-
 .../Transforms/ReplaceStateWithKernel.cpp     |  28 +--
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  20 +-
 runtime/common/ArgumentConversion.cpp         |  14 +-
 runtime/common/ArgumentConversion.h           |  30 +--
 runtime/common/BaseRemoteRESTQPU.h            |  20 +-
 runtime/common/BaseRestRemoteClient.h         |  24 +--
 runtime/cudaq/algorithms/get_state.h          |   8 +-
 runtime/cudaq/cudaq.cpp                       |   1 -
 runtime/cudaq/platform/qpu_state.cpp          |   7 -
 runtime/cudaq/platform/qpu_state.h            |  16 +-
 runtime/test/test_argument_conversion.cpp     |  94 +++++----
 targettests/execution/test_trotter.cpp        | 183 ++++++++++++++++++
 test/Quake/arg_subst-7.txt                    |   2 +-
 test/Quake/arg_subst_func.qke                 |   2 +-
 test/Quake/replace_state_with_kernel.qke      |   6 +-
 17 files changed, 328 insertions(+), 135 deletions(-)
 create mode 100644 targettests/execution/test_trotter.cpp

diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index 1bc97abebc6..8046d5ca4ee 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -1659,7 +1659,7 @@ def QuakeOp_MaterializeStateOp : QuakeOp<"materialize_state", [Pure] > {
     pass.
 
     ```mlir
-      %0 = quake.materialize_state @num_qubits @init : !cc.ptr<!cc.state>
+      %0 = quake.materialize_state @num_qubits, @init : !cc.ptr<!cc.state>
     ```
   }];
 
@@ -1669,7 +1669,7 @@ def QuakeOp_MaterializeStateOp : QuakeOp<"materialize_state", [Pure] > {
   );
   let results = (outs PointerOf<[cc_StateType]>:$result);
   let assemblyFormat = [{
-     $numQubitsFunc $initFunc `:` qualified(type(results)) attr-dict
+     $numQubitsFunc `,` $initFunc `:` qualified(type(results)) attr-dict
   }];
 }
 
diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td
index 76bb242e840..63f4f3b3c0f 100644
--- a/include/cudaq/Optimizer/Transforms/Passes.td
+++ b/include/cudaq/Optimizer/Transforms/Passes.td
@@ -898,7 +898,7 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
       ```
 
     This optimization performs the replacements for the the following operations 
-    that use a state produced by  `quake.materialize_state @num_qubits @init`
+    that use a state produced by  `quake.materialize_state @num_qubits, @init`
     operation:
 
     - Replace `quake.get_number_of_qubits` operation by call to `@num_qubits`
@@ -910,7 +910,7 @@ def ReplaceStateWithKernel : Pass<"replace-state-with-kernel", "mlir::func::Func
     Before ReplaceStateWithKernel (replace-state-with-kernel):
     ```
     func.func @foo() {
-      %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0: !cc.ptr<!cc.state>
+      %0 = quake.materialize_state @callee.num_qubits_0, @callee.init_0: !cc.ptr<!cc.state>
       %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
       %2 = quake.alloca !quake.veq<?>[%1 : i64]
       %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
index a9cd1dd80e1..8ff63140b6f 100644
--- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
+++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp
@@ -33,7 +33,7 @@ namespace {
 /// that computes the number of qubits for a state.
 ///
 /// ```mlir
-///  %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+///  %0 = quake.materialize_state @callee.num_qubits_0, @callee.init_0 : !cc.ptr<!cc.state>
 ///  %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
 /// ───────────────────────────────────────────
 ///  %1 = call @callee.num_qubits_0() : () -> i64
@@ -47,11 +47,14 @@ class ReplaceGetNumQubitsPattern
   LogicalResult matchAndRewrite(quake::GetNumberOfQubitsOp numQubits,
                                 PatternRewriter &rewriter) const override {
 
-    auto stateOp = numQubits.getOperand();
+    auto stateOp = numQubits.getState();
     auto materializeState = stateOp.getDefiningOp<quake::MaterializeStateOp>();
-    if (!materializeState)
-      return numQubits->emitError(
-          "ReplaceStateWithKernel: failed to replace `quake.get_num_qubits`");
+    if (!materializeState) {
+      LLVM_DEBUG(llvm::dbgs() << "ReplaceStateWithKernel: failed to replace "
+                                 "`quake.get_num_qubits`: "
+                              << stateOp << '\n');
+      return failure();
+    }
 
     auto numQubitsFunc = materializeState.getNumQubitsFunc();
     rewriter.setInsertionPoint(numQubits);
@@ -66,7 +69,7 @@ class ReplaceGetNumQubitsPattern
 /// the state.
 ///
 /// ```mlir
-///  %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+///  %0 = quake.materialize_state @callee.num_qubits_0, @callee.init_0 : !cc.ptr<!cc.state>
 ///  %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
 /// ───────────────────────────────────────────
 /// %3 = call @callee.init_0(%2): (!quake.veq<?>) -> !quake.veq<?>
@@ -79,16 +82,19 @@ class ReplaceInitStatePattern
 
   LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
                                 PatternRewriter &rewriter) const override {
-    auto allocaOp = initState.getOperand(0);
-    auto stateOp = initState.getOperand(1);
+    auto allocaOp = initState.getTargets();
+    auto stateOp = initState.getState();
 
     if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateOp.getType())) {
       if (isa<cudaq::cc::StateType>(ptrTy.getElementType())) {
         auto materializeState =
             stateOp.getDefiningOp<quake::MaterializeStateOp>();
-        if (!materializeState)
-          return initState->emitError(
-              "ReplaceStateWithKernel: failed to replace `quake.init_state`");
+        if (!materializeState) {
+          LLVM_DEBUG(llvm::dbgs() << "ReplaceStateWithKernel: failed to "
+                                     "replace `quake.init_state`: "
+                                  << stateOp << '\n');
+          return failure();
+        }
 
         auto initName = materializeState.getInitFunc();
         rewriter.setInsertionPoint(initState);
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 00c2bbcc272..687886cdffb 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -550,18 +550,14 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   // We pass string references to the `createArgumentSynthesisPass`.
   mlir::SmallVector<std::string> kernels;
   mlir::SmallVector<std::string> substs;
-  for (auto &kInfo : argCon.getKernelSubstitutions()) {
-    {
-      std::string kernName =
-          cudaq::runtime::cudaqGenPrefixName + kInfo.getKernelName().str();
-      kernels.emplace_back(kernName);
-    }
-    {
-      std::string substBuff;
-      llvm::raw_string_ostream ss(substBuff);
-      ss << kInfo.getSubstitutionModule();
-      substs.emplace_back(substBuff);
-    }
+  for (auto *kInfo : argCon.getKernelSubstitutions()) {
+    std::string kernName =
+        cudaq::runtime::cudaqGenPrefixName + kInfo->getKernelName().str();
+    kernels.emplace_back(kernName);
+    std::string substBuff;
+    llvm::raw_string_ostream ss(substBuff);
+    ss << kInfo->getSubstitutionModule();
+    substs.emplace_back(substBuff);
   }
 
   // Collect references for the argument synthesis.
diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp
index 4c89a185194..8c1af552461 100644
--- a/runtime/common/ArgumentConversion.cpp
+++ b/runtime/common/ArgumentConversion.cpp
@@ -124,9 +124,8 @@ static void createInitFunc(OpBuilder &builder, ModuleOp moduleOp,
   builder.setInsertionPointToEnd(moduleOp.getBody());
 
   auto ctx = builder.getContext();
-  auto loc = builder.getUnknownLoc();
-
   auto initFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+  auto loc = initFunc.getLoc();
 
   auto argTypes = calleeFunc.getArgumentTypes();
   auto retTy = quake::VeqType::getUnsized(ctx);
@@ -249,9 +248,8 @@ static void createNumQubitsFunc(OpBuilder &builder, ModuleOp moduleOp,
   builder.setInsertionPointToEnd(moduleOp.getBody());
 
   auto ctx = builder.getContext();
-  auto loc = builder.getUnknownLoc();
-
   auto numQubitsFunc = cast<func::FuncOp>(builder.clone(*calleeFunc));
+  auto loc = numQubitsFunc.getLoc();
 
   auto argTypes = calleeFunc.getArgumentTypes();
   auto retType = builder.getI64Type();
@@ -515,11 +513,11 @@ static Value genConstant(OpBuilder &builder, const cudaq::state *v,
       createNumQubitsFunc(builder, substMod, calleeFunc, numQubitsKernelName);
 
       // Convert arguments for `callee.init_N`.
-      auto &registeredInitName = converter.registerKernel(initName);
+      auto registeredInitName = converter.registerKernel(initName);
       converter.gen(registeredInitName, substMod, calleeArgs);
 
       // Convert arguments for `callee.num_qubits_N`.
-      auto &registeredNumQubitsName = converter.registerKernel(numQubitsName);
+      auto registeredNumQubitsName = converter.registerKernel(numQubitsName);
       converter.gen(registeredNumQubitsName, substMod, calleeArgs);
     }
 
@@ -719,7 +717,7 @@ void cudaq::opt::ArgumentConverter::gen(StringRef kernelName,
   OpBuilder builder(ctx);
   ModuleOp substModule =
       builder.create<mlir::ModuleOp>(builder.getUnknownLoc());
-  auto &kernelInfo = addKernelInfo(kernelName, substModule);
+  auto *kernelInfo = addKernelInfo(kernelName, substModule);
 
   // We should look up the input type signature here.
   auto fun = sourceModule.lookupSymbol<func::FuncOp>(
@@ -813,7 +811,7 @@ void cudaq::opt::ArgumentConverter::gen(StringRef kernelName,
             })
             .Default({});
     if (subst)
-      kernelInfo.getSubstitutions().emplace_back(std::move(subst));
+      kernelInfo->getSubstitutions().emplace_back(std::move(subst));
   }
 }
 
diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 9252ee1b8a2..6d4d23958fc 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -53,6 +53,12 @@ class ArgumentConverter {
   /// kernelName in \p sourceModule.
   ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule);
 
+  ~ArgumentConverter() {
+    // Free the records that addKernelInfo() allocated with `new`.
+    for (auto *kInfo : kernelSubstitutions)
+      delete kInfo;
+  }
+
   /// Generate a substitution ModuleOp for the vector of arguments presented.
   /// The arguments are those presented to the kernel, kernelName.
   void gen(const std::vector<void *> &arguments);
@@ -72,32 +78,32 @@ class ArgumentConverter {
   void gen_drop_front(const std::vector<void *> &arguments, unsigned numDrop);
 
   /// Get the kernel info that were collected by `gen()`.
-  std::list<KernelSubstitutionInfo> &getKernelSubstitutions() {
+  mlir::SmallVector<KernelSubstitutionInfo *> &getKernelSubstitutions() {
     return kernelSubstitutions;
   }
 
-  bool isRegisteredKernel(const std::string &kernelName) {
-    return std::find(nameRegistry.begin(), nameRegistry.end(), kernelName) !=
-           nameRegistry.end();
+  bool isRegisteredKernel(mlir::StringRef kernelName) {
+    return std::find(nameRegistry.begin(), nameRegistry.end(), kernelName) !=
+           nameRegistry.end();
   }
 
-  std::string &registerKernel(const std::string &kernelName) {
-    return nameRegistry.emplace_back(kernelName);
+  mlir::StringRef registerKernel(mlir::StringRef kernelName) {
+    return nameRegistry.emplace_back(
+        mlir::StringAttr::get(sourceModule.getContext(), kernelName));
   }
 
 private:
-  KernelSubstitutionInfo &addKernelInfo(mlir::StringRef kernelName,
+  KernelSubstitutionInfo *addKernelInfo(mlir::StringRef kernelName,
                                         mlir::ModuleOp substModule) {
-    return kernelSubstitutions.emplace_back(kernelName, substModule);
+    return kernelSubstitutions.emplace_back(
+        new KernelSubstitutionInfo(kernelName, substModule));
   }
 
   /// Memory to store new kernel names generated during argument conversion.
-  /// Use list here to keep references to those elements valid.
-  std::list<std::string> nameRegistry;
+  mlir::SmallVector<mlir::StringAttr> nameRegistry;
 
   /// Memory to store new kernel info generated during argument conversion.
-  /// Use list here to keep elements sorted in order of creation.
-  std::list<KernelSubstitutionInfo> kernelSubstitutions;
+  mlir::SmallVector<KernelSubstitutionInfo *> kernelSubstitutions;
 
   /// Original module before substitutions.
   mlir::ModuleOp sourceModule;
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index 56449854431..60070249615 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -464,18 +464,14 @@ class BaseRemoteRESTQPU : public cudaq::QPU {
         // We pass string references to the `createArgumentSynthesisPass`.
         mlir::SmallVector<std::string> kernels;
         mlir::SmallVector<std::string> substs;
-        for (auto &kInfo : argCon.getKernelSubstitutions()) {
-          {
-            std::string kernName = cudaq::runtime::cudaqGenPrefixName +
-                                   kInfo.getKernelName().str();
-            kernels.emplace_back(kernName);
-          }
-          {
-            std::string substBuff;
-            llvm::raw_string_ostream ss(substBuff);
-            ss << kInfo.getSubstitutionModule();
-            substs.emplace_back(substBuff);
-          }
+        for (auto *kInfo : argCon.getKernelSubstitutions()) {
+          std::string kernName =
+              cudaq::runtime::cudaqGenPrefixName + kInfo->getKernelName().str();
+          kernels.emplace_back(kernName);
+          std::string substBuff;
+          llvm::raw_string_ostream ss(substBuff);
+          ss << kInfo->getSubstitutionModule();
+          substs.emplace_back(substBuff);
         }
 
         // Collect references for the argument synthesis.
diff --git a/runtime/common/BaseRestRemoteClient.h b/runtime/common/BaseRestRemoteClient.h
index fdb34719420..445463b0c63 100644
--- a/runtime/common/BaseRestRemoteClient.h
+++ b/runtime/common/BaseRestRemoteClient.h
@@ -190,18 +190,14 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
           // We pass string references to the `createArgumentSynthesisPass`.
           mlir::SmallVector<std::string> kernels;
           mlir::SmallVector<std::string> substs;
-          for (auto &kInfo : argCon.getKernelSubstitutions()) {
-            {
-              std::string kernName = cudaq::runtime::cudaqGenPrefixName +
-                                     kInfo.getKernelName().str();
-              kernels.emplace_back(kernName);
-            }
-            {
-              std::string substBuff;
-              llvm::raw_string_ostream ss(substBuff);
-              ss << kInfo.getSubstitutionModule();
-              substs.emplace_back(substBuff);
-            }
+          for (auto *kInfo : argCon.getKernelSubstitutions()) {
+            std::string kernName = cudaq::runtime::cudaqGenPrefixName +
+                                   kInfo->getKernelName().str();
+            kernels.emplace_back(kernName);
+            std::string substBuff;
+            llvm::raw_string_ostream ss(substBuff);
+            ss << kInfo->getSubstitutionModule();
+            substs.emplace_back(substBuff);
           }
 
           // Collect references for the argument synthesis.
@@ -349,6 +345,10 @@ class BaseRemoteRestRuntimeClient : public cudaq::RemoteRuntimeClient {
       if (!castedState1 || !castedState2)
         throw std::runtime_error(
             "Invalid execution context: input states are not compatible");
+      if (!castedState1->getKernelInfo().has_value())
+        throw std::runtime_error("Missing first input state in state-overlap");
+      if (!castedState2->getKernelInfo().has_value())
+        throw std::runtime_error("Missing second input state in state-overlap");
       auto [kernelName1, args1] = castedState1->getKernelInfo().value();
       auto [kernelName2, args2] = castedState2->getKernelInfo().value();
       cudaq::IRPayLoad stateIrPayload1, stateIrPayload2;
diff --git a/runtime/cudaq/algorithms/get_state.h b/runtime/cudaq/algorithms/get_state.h
index 79202f98b48..093ae36dcff 100644
--- a/runtime/cudaq/algorithms/get_state.h
+++ b/runtime/cudaq/algorithms/get_state.h
@@ -119,8 +119,7 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
     return state(new RemoteSimulationState(std::forward<QuantumKernel>(kernel),
                                            std::forward<Args>(args)...));
   }
-#else
-#if defined(CUDAQ_QUANTUM_DEVICE) && !defined(CUDAQ_LIBRARY_MODE)
+#elif defined(CUDAQ_QUANTUM_DEVICE) && !defined(CUDAQ_LIBRARY_MODE)
   // Store kernel name and arguments for quantum states.
   if (!cudaq::get_quake_by_name(cudaq::getKernelName(kernel), false).empty())
     return state(new QPUState(std::forward<QuantumKernel>(kernel),
@@ -128,8 +127,7 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
   throw std::runtime_error(
       "cudaq::state* argument synthesis is not supported for quantum hardware"
       " for c-like functions, use class kernels instead");
-#else
-#if defined(CUDAQ_QUANTUM_DEVICE)
+#elif defined(CUDAQ_QUANTUM_DEVICE)
   // Kernel builder is MLIR-based kernel.
   if constexpr (has_name<QuantumKernel>::value)
     return state(new QPUState(std::forward<QuantumKernel>(kernel),
@@ -138,8 +136,6 @@ auto get_state(QuantumKernel &&kernel, Args &&...args) {
   throw std::runtime_error(
       "cudaq::state* argument synthesis is not supported for quantum hardware"
       " for c-like functions in library mode");
-#endif
-#endif
 #endif
   return details::extractState([&]() mutable {
     cudaq::invokeKernel(std::forward<QuantumKernel>(kernel),
diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp
index 5dbdf4ee8cf..071f658f43f 100644
--- a/runtime/cudaq/cudaq.cpp
+++ b/runtime/cudaq/cudaq.cpp
@@ -19,7 +19,6 @@
 #include "distributed/mpi_plugin.h"
 #include <dlfcn.h>
 #include <filesystem>
-#include <list>
 #include <map>
 #include <regex>
 #include <shared_mutex>
diff --git a/runtime/cudaq/platform/qpu_state.cpp b/runtime/cudaq/platform/qpu_state.cpp
index 0561ca29ddb..24ce4c412c9 100644
--- a/runtime/cudaq/platform/qpu_state.cpp
+++ b/runtime/cudaq/platform/qpu_state.cpp
@@ -7,17 +7,10 @@
  ******************************************************************************/
 
 #include "qpu_state.h"
-#include "common/Logger.h"
 
 namespace cudaq {
 
 QPUState::~QPUState() {
-  if (!platformExecutionLog.empty()) {
-    // Flush any info log from the remote execution
-    printf("%s\n", platformExecutionLog.c_str());
-    platformExecutionLog.clear();
-  }
-
   for (std::size_t counter = 0; auto &ptr : args)
     deleters[counter++](ptr);
 
diff --git a/runtime/cudaq/platform/qpu_state.h b/runtime/cudaq/platform/qpu_state.h
index a13ac6f7b40..a04120b3728 100644
--- a/runtime/cudaq/platform/qpu_state.h
+++ b/runtime/cudaq/platform/qpu_state.h
@@ -13,21 +13,13 @@
 #include "cudaq/utils/cudaq_utils.h"
 
 namespace cudaq {
-/// Implementation of `SimulationState` for quantum device backends.
-// The state is represented by a quantum kernel.
-// Quantum state contains all the information we need to replicate a
-// call to kernel that created the state.
+/// @brief Implementation of `SimulationState` for quantum device backends.
+/// The state is represented by a quantum kernel.
+/// The quantum state contains all the information needed to replicate the
+/// call to the kernel that created the state.
 class QPUState : public cudaq::SimulationState {
 protected:
   std::string kernelName;
-  // Lazily-evaluated state data (just keeping the kernel name and arguments).
-  // e.g., to be evaluated at amplitude accessor APIs (const APIs, hence needs
-  // to be mutable) or overlap calculation with another remote state (combining
-  // the IR of both states for remote evaluation)
-  mutable std::unique_ptr<cudaq::SimulationState> state;
-  // Cache log messages from the remote execution.
-  // Mutable to support lazy execution during `const` API calls.
-  mutable std::string platformExecutionLog;
   using ArgDeleter = std::function<void(void *)>;
   /// @brief  Vector of arguments
   // Note: we create a copy of all arguments except pointers.
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index e66a9d37a8f..fb913384c7d 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -143,13 +143,12 @@ class FakeDeviceState : public cudaq::SimulationState {
 extern "C" void __cudaq_deviceCodeHolderAdd(const char *, const char *);
 
 void dumpSubstitutionModules(cudaq::opt::ArgumentConverter &con) {
-  for (auto &kInfo : con.getKernelSubstitutions()) {
-    // Dump the conversions
+  // Dump the conversions
+  for (auto *kInfo : con.getKernelSubstitutions())
     llvm::outs() << "========================================\n"
                     "Substitution module:\n"
-                 << kInfo.getKernelName() << "\n"
-                 << kInfo.getSubstitutionModule() << '\n';
-  }
+                 << kInfo->getKernelName() << "\n"
+                 << kInfo->getSubstitutionModule() << '\n';
 }
 
 void doSimpleTest(mlir::MLIRContext *ctx, const std::string &typeName,
@@ -544,7 +543,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init.num_qubits_[[HASH_0:.*]], @__nvqpp__mlirgen__init.init_[[HASH_0]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__init.init_[[HASH_0]](%arg0: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
@@ -608,7 +607,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init0.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init0.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init0.num_qubits_[[HASH_0:.*]], @__nvqpp__mlirgen__init0.init_[[HASH_0]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__init0.init_[[HASH_0]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
@@ -657,15 +656,14 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     //
     // s0 = cudaq.get_state(init1, 2)
     // s1 = cudaq.get_state(state_param, s0)
-    // cudaq.sample(kernel, s1)
+    // s2 = cudaq.get_state(state_param, s1)
+    // (the chain is three states deep: s2 -> s1 -> s0)
+    // cudaq.sample(kernel, s2)
     auto init = "init1";
-    auto initCode =
-        "func.func private @__nvqpp__mlirgen__init1(%arg0: i64) {\n"
-        "  %0 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
-        "  %1 = quake.extract_ref %0[0] : (!quake.veq<?>) -> !quake.ref\n"
-        "  quake.x %1 : (!quake.ref) -> ()\n"
-        "  return\n"
-        "}\n";
+    auto initCode = "func.func private @__nvqpp__mlirgen__init1(%arg0: i64) {\n"
+                    "  %0 = quake.alloca !quake.veq<?>[%arg0 : i64]\n"
+                    "  return\n"
+                    "}\n";
     __cudaq_deviceCodeHolderAdd(init, initCode);
 
     auto stateParam = "state_param";
@@ -690,9 +688,11 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
     std::vector<void *> v0 = {static_cast<void *>(&s0)};
     auto s1 = cudaq::state(new FakeDeviceState(stateParam, v0));
     std::vector<void *> v1 = {static_cast<void *>(&s1)};
+    auto s2 = cudaq::state(new FakeDeviceState(stateParam, v1));
+    std::vector<void *> v2 = {static_cast<void *>(&s2)};
 
     auto code = std::string{initCode} + std::string{stateParamCode};
-    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v1, code);
+    doSimpleTest(ctx, "!cc.ptr<!cc.state>", v2, code);
   }
 
   // clang-format off
@@ -703,7 +703,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__state_param.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_0:.*]], @__nvqpp__mlirgen__state_param.init_[[HASH_0]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__state_param.init_[[HASH_0]](%arg0: !cc.ptr<!cc.state>, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
@@ -728,45 +728,73 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         state_param.init_[[HASH_0]]
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1:.*]] @__nvqpp__mlirgen__init1.init_[[HASH_1]] : !cc.ptr<!cc.state>
+// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_1:.*]], @__nvqpp__mlirgen__state_param.init_[[HASH_1]] : !cc.ptr<!cc.state>
 // CHECK:         }
-// CHECK:         func.func private @__nvqpp__mlirgen__init1.init_[[HASH_1]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:         func.func private @__nvqpp__mlirgen__state_param.init_[[HASH_1]](%arg0: !cc.ptr<!cc.state>, %arg1: !quake.veq<?>) -> !quake.veq<?> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_3:.*]] = arith.subi %[[VAL_2]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_4:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_3]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           %[[VAL_5:.*]] = arith.addi %[[VAL_0]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_6:.*]] = arith.addi %[[VAL_0]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_7:.*]] = quake.init_state %[[VAL_4]], %arg0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
+// CHECK:           %[[VAL_8:.*]] = arith.subi %[[VAL_6]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_9:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_8]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           return %[[VAL_9]] : !quake.veq<?>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_1]](%arg0: !cc.ptr<!cc.state>) -> i64 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = quake.get_number_of_qubits %arg0 : (!cc.ptr<!cc.state>) -> i64
+// CHECK:           %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i64
+// CHECK:           return %[[VAL_2]] : i64
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
+// CHECK:         state_param.init_[[HASH_1]]
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_2:.*]], @__nvqpp__mlirgen__init1.init_[[HASH_2]] : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         func.func private @__nvqpp__mlirgen__init1.init_[[HASH_2]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
 // CHECK:           %[[VAL_2:.*]] = arith.subi %arg0, %[[VAL_1]] : i64
-// CHECK:           %[[VAL_3:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_2]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
-// CHECK:           %[[VAL_4:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           %[[VAL_4:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_2]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
 // CHECK:           %[[VAL_5:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
-// CHECK:           %[[VAL_6:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<?>) -> !quake.ref
-// CHECK:           quake.x %[[VAL_6]] : (!quake.ref) -> ()
-// CHECK:           %[[VAL_7:.*]] = arith.subi %[[VAL_5]], %[[VAL_1]] : i64
-// CHECK:           %[[VAL_8:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_7]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
-// CHECK:           return %[[VAL_8]] : !quake.veq<?>
+// CHECK:           %[[VAL_6:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
+// CHECK:           %[[VAL_8:.*]] = arith.subi %[[VAL_6]], %[[VAL_1]] : i64
+// CHECK:           %[[VAL_9:.*]] = quake.subveq %arg1, %[[VAL_0]], %[[VAL_8]] : (!quake.veq<?>, i64, i64) -> !quake.veq<?>
+// CHECK:           return %[[VAL_9]] : !quake.veq<?>
 // CHECK:         }
-// CHECK:         func.func private @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1]](%arg0: i64) -> i64 {
+// CHECK:         func.func private @__nvqpp__mlirgen__init1.num_qubits_[[HASH_2]](%arg0: i64) -> i64 {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.addi %[[VAL_0]], %arg0 : i64
 // CHECK:           return %[[VAL_1]] : i64
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         init1.init_[[HASH_1]]
+// CHECK:         init1.init_[[HASH_2]]
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
-// CHECK:         init1.num_qubits_[[HASH_1]]
+// CHECK:         init1.num_qubits_[[HASH_2]]
 // CHECK-LABEL:   cc.arg_subst[0] {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 2 : i64
 // CHECK:         }
 // CHECK:         ========================================
 // CHECK:         Substitution module:
+// CHECK:         state_param.num_qubits_[[HASH_1]]
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_2]], @__nvqpp__mlirgen__init1.init_[[HASH_2]] : !cc.ptr<!cc.state>
+// CHECK:         }
+// CHECK:         ========================================
+// CHECK:         Substitution module:
 // CHECK:         state_param.num_qubits_[[HASH_0]]
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init1.num_qubits_[[HASH_1]] @__nvqpp__mlirgen__init1.init_[[HASH_1]] : !cc.ptr<!cc.state>
+// CHECK:           %0 = quake.materialize_state @__nvqpp__mlirgen__state_param.num_qubits_[[HASH_1]], @__nvqpp__mlirgen__state_param.init_[[HASH_1]] : !cc.ptr<!cc.state>
 // CHECK:         }
-
   // clang-format on
 
   {
@@ -821,7 +849,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init2.num_qubits_[[HASH_1:.*]] @__nvqpp__mlirgen__init2.init_[[HASH_1]] : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init2.num_qubits_[[HASH_1:.*]], @__nvqpp__mlirgen__init2.init_[[HASH_1]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__init2.init_[[HASH_1]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
@@ -901,7 +929,7 @@ void test_quantum_state(mlir::MLIRContext *ctx) {
 // CHECK:         Substitution module:
 // CHECK:         testy
 // CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init3.num_qubits_[[HASH_0:.*]] @__nvqpp__mlirgen__init3.init_[[HASH_0]] : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_0:.*]] = quake.materialize_state @__nvqpp__mlirgen__init3.num_qubits_[[HASH_0:.*]], @__nvqpp__mlirgen__init3.init_[[HASH_0]] : !cc.ptr<!cc.state>
 // CHECK:         }
 // CHECK:         func.func private @__nvqpp__mlirgen__init3.init_[[HASH_0]](%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
diff --git a/targettests/execution/test_trotter.cpp b/targettests/execution/test_trotter.cpp
new file mode 100644
index 00000000000..4dd06bb3040
--- /dev/null
+++ b/targettests/execution/test_trotter.cpp
@@ -0,0 +1,183 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// clang-format off
+// TODO-FIX-KERNEL-EXEC
+// Simulators
+// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
+
+// Quantum emulators
+// RUN: if %braket_avail; then nvq++ %cpp_std -target braket -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s ; fi
+// RUN: nvq++ %cpp_std -target quantinuum -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -target ionq       -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -target oqc        -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+
+// 2 different IQM machines for 2 different topologies
+// RUN: nvq++ %cpp_std -target iqm --iqm-machine Adonis -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -target iqm --iqm-machine Apollo -emulate %s -fkernel-exec-kind=2 -o %t && %t | FileCheck %s
+// clang-format on
+
+#include <complex>
+#include <cudaq.h>
+#include <iostream>
+#include <string>
+
+// Compute magnetization using Suzuki-Trotter approximation.
+// This example demonstrates usage of quantum states in kernel mode.
+//
+// Details
+// https://pubs.aip.org/aip/jmp/article-abstract/32/2/400/229229/General-theory-of-fractal-path-integrals-with
+//
+// Hamiltonian used
+// https://en.m.wikipedia.org/wiki/Quantum_Heisenberg_model
+
+// If you have an NVIDIA GPU you can use this example to see
+// that the GPU-accelerated backends can easily handle a
+// larger number of qubits compared to the CPU-only backend.
+//
+// Depending on the available memory on your GPU, you can
+// set the number of qubits to around 30 qubits, and run
+// the execution command with `-target nvidia` option.
+//
+// Note: Without setting the target to the `nvidia` backend,
+// there will be a noticeable decrease in simulation performance.
+// This is because the CPU-only backend has difficulty handling
+// 30+ qubit simulations.
+
+int SPINS = 5; // set to around 25 qubits for `nvidia` target
+int STEPS = 4; // set to around 100 for `nvidia` target
+
+// Compile and run with:
+// clang-format off
+// ```
+// nvq++ --enable-mlir -v test_trotter.cpp -o trotter.x --target nvidia && ./trotter.x
+// ```
+// clang-format on
+
+// Alternating up/down spins
+struct initState {
+  void operator()(int num_spins) __qpu__ {
+    cudaq::qvector q(num_spins);
+    for (int qId = 0; qId < num_spins; qId += 2)
+      x(q[qId]);
+  }
+};
+
+std::vector<double> term_coefficients(cudaq::spin_op op) {
+  std::vector<double> result{};
+  op.for_each_term([&](cudaq::spin_op &term) {
+    const auto coeff = term.get_coefficient().real();
+    result.push_back(coeff);
+  });
+  return result;
+}
+
+std::vector<cudaq::pauli_word> term_words(cudaq::spin_op op) {
+  std::vector<cudaq::pauli_word> result{};
+  op.for_each_term(
+      [&](cudaq::spin_op &term) { result.push_back(term.to_string(false)); });
+  return result;
+}
+
+struct trotter {
+  // Note: This performs a single-step Trotter on top of an initial state, e.g.,
+  // result state of the previous Trotter step.
+  void operator()(cudaq::state *initial_state,
+                  std::vector<double> &coefficients,
+                  std::vector<cudaq::pauli_word> &words, double dt) __qpu__ {
+    cudaq::qvector q(initial_state);
+    for (std::size_t i = 0; i < coefficients.size(); ++i) {
+      cudaq::exp_pauli(coefficients[i] * dt, q, words[i]);
+    }
+  }
+};
+
+int run_steps(int steps, int spins) {
+  const double g = 1.0;
+  const double Jx = 1.0;
+  const double Jy = 1.0;
+  const double Jz = g;
+  const double dt = 0.05;
+  const int n_steps = steps;
+  const int n_spins = spins;
+  const double omega = 2 * M_PI;
+  const auto heisenbergModelHam = [&](double t) -> cudaq::spin_op {
+    cudaq::spin_op tdOp(n_spins);
+    for (int i = 0; i < n_spins - 1; ++i) {
+      tdOp += (Jx * cudaq::spin::x(i) * cudaq::spin::x(i + 1));
+      tdOp += (Jy * cudaq::spin::y(i) * cudaq::spin::y(i + 1));
+      tdOp += (Jz * cudaq::spin::z(i) * cudaq::spin::z(i + 1));
+    }
+    for (int i = 0; i < n_spins; ++i)
+      tdOp += (std::cos(omega * t) * cudaq::spin::x(i));
+    return tdOp;
+  };
+  // Observe the average magnetization of all spins (<Z>)
+  cudaq::spin_op average_magnetization(n_spins);
+  for (int i = 0; i < n_spins; ++i)
+    average_magnetization += ((1.0 / n_spins) * cudaq::spin::z(i));
+  average_magnetization -= 1.0;
+
+  // Run loop
+  auto state = cudaq::get_state(initState{}, n_spins);
+  std::vector<double> expResults;
+  std::vector<double> runtimeMs;
+  for (int i = 0; i < n_steps; ++i) {
+    const auto start = std::chrono::high_resolution_clock::now();
+    auto ham = heisenbergModelHam(i * dt);
+    auto coefficients = term_coefficients(ham);
+    auto words = term_words(ham);
+    auto magnetization_exp_val = cudaq::observe(
+        trotter{}, average_magnetization, &state, coefficients, words, dt);
+    auto result = magnetization_exp_val.expectation();
+    expResults.emplace_back(result);
+    state = cudaq::get_state(trotter{}, &state, coefficients, words, dt);
+    const auto stop = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
+    auto timeInSeconds = duration.count() / 1000.0 / 1000.0;
+    runtimeMs.emplace_back(timeInSeconds);
+    std::cout << "Step " << i << ": time [s]: " << timeInSeconds
+              << ", result: " << result << std::endl;
+  }
+  std::cout << std::endl;
+
+  // Print runtimes and results (useful for plotting).
+  std::cout << "Step times [s]: [";
+  for (const auto &x : runtimeMs)
+    std::cout << x << ", ";
+  std::cout << "]" << std::endl;
+
+  std::cout << "Results: [";
+  for (const auto &x : expResults)
+    std::cout << x << ", ";
+  std::cout << "]" << std::endl;
+
+  std::cout << std::endl;
+  return 0;
+}
+
+int main() {
+  const auto start = std::chrono::high_resolution_clock::now();
+  run_steps(STEPS, SPINS);
+  const auto stop = std::chrono::high_resolution_clock::now();
+  auto duration =
+      std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
+  std::cout << "Total running time: " << duration.count() / 1000.0 / 1000.0
+            << "s" << std::endl;
+}
+
+// CHECK:  Step 0: time [s]: [[t0:.*]], result: [[v0:.*]]
+// CHECK:  Step 1: time [s]: [[t1:.*]], result: [[v1:.*]]
+// CHECK:  Step 2: time [s]: [[t2:.*]], result: [[v2:.*]]
+// CHECK:  Step 3: time [s]: [[t3:.*]], result: [[v3:.*]]
+
+// CHECK:  Step times [s]: [[ts:.*]]
+// CHECK:  Results: [[rs:.*]]
+
+// CHECK:  Total running time: [[tts:.*]]s
diff --git a/test/Quake/arg_subst-7.txt b/test/Quake/arg_subst-7.txt
index 58ca8a163e6..e5ec93f57f8 100644
--- a/test/Quake/arg_subst-7.txt
+++ b/test/Quake/arg_subst-7.txt
@@ -8,7 +8,7 @@
 
 module {
   cc.arg_subst[0] {
-    %0 = quake.materialize_state @num_qubits @init : !cc.ptr<!cc.state>
+    %0 = quake.materialize_state @num_qubits, @init : !cc.ptr<!cc.state>
   }
   func.func @init(%arg0: i64, %arg1: !quake.veq<?>) -> !quake.veq<?> {
     return %arg1 : !quake.veq<?>
diff --git a/test/Quake/arg_subst_func.qke b/test/Quake/arg_subst_func.qke
index 5310404c3cd..2125ca99710 100644
--- a/test/Quake/arg_subst_func.qke
+++ b/test/Quake/arg_subst_func.qke
@@ -163,7 +163,7 @@ func.func @testy6(%arg0: !cc.ptr<!cc.state>) {
 // CHECK:           return %[[VAL_0]] : i32
 // CHECK:         }
 // CHECK-LABEL:   func.func @testy6() {
-// CHECK:           %[[VAL_2:.*]] = quake.materialize_state @num_qubits @init : !cc.ptr<!cc.state>
+// CHECK:           %[[VAL_2:.*]] = quake.materialize_state @num_qubits, @init : !cc.ptr<!cc.state>
 // CHECK:           %[[VAL_3:.*]] = quake.get_number_of_qubits %[[VAL_2]] : (!cc.ptr<!cc.state>) -> i64
 // CHECK:           %[[VAL_4:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
 // CHECK:           %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_2]] : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
diff --git a/test/Quake/replace_state_with_kernel.qke b/test/Quake/replace_state_with_kernel.qke
index 38b1c81d36d..40ca88badd9 100644
--- a/test/Quake/replace_state_with_kernel.qke
+++ b/test/Quake/replace_state_with_kernel.qke
@@ -23,7 +23,7 @@ module {
   }
 
   func.func @caller0() {
-    %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+    %0 = quake.materialize_state @callee.num_qubits_0, @callee.init_0 : !cc.ptr<!cc.state>
     %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
     %2 = quake.alloca !quake.veq<?>[%1 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
@@ -38,7 +38,7 @@ module {
 // CHECK:         }
 
   func.func @caller1(%arg0: i64) {
-    %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+    %0 = quake.materialize_state @callee.num_qubits_0, @callee.init_0 : !cc.ptr<!cc.state>
     %2 = quake.alloca !quake.veq<?>[%arg0 : i64]
     %3 = quake.init_state %2, %0 : (!quake.veq<?>, !cc.ptr<!cc.state>) -> !quake.veq<?>
     return
@@ -51,7 +51,7 @@ module {
 // CHECK:         }
 
   func.func @caller2() -> i64 {
-    %0 = quake.materialize_state @callee.num_qubits_0 @callee.init_0 : !cc.ptr<!cc.state>
+    %0 = quake.materialize_state @callee.num_qubits_0, @callee.init_0 : !cc.ptr<!cc.state>
     %1 = quake.get_number_of_qubits %0 : (!cc.ptr<!cc.state>) -> i64
     return %1: i64
   }

From 4a12db08bc5514e4c3ba317cb4e94d3f523094e5 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 18 Mar 2025 11:37:49 -0700
Subject: [PATCH 51/54] Address more CR comments and add a test

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/common/ArgumentConversion.h    | 1 -
 targettests/execution/test_trotter.cpp | 1 -
 2 files changed, 2 deletions(-)

diff --git a/runtime/common/ArgumentConversion.h b/runtime/common/ArgumentConversion.h
index 6d4d23958fc..d38de7399cf 100644
--- a/runtime/common/ArgumentConversion.h
+++ b/runtime/common/ArgumentConversion.h
@@ -13,7 +13,6 @@
 #include "cudaq/qis/state.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Types.h"
-#include <list>
 #include <unordered_set>
 #include <vector>
 
diff --git a/targettests/execution/test_trotter.cpp b/targettests/execution/test_trotter.cpp
index 4dd06bb3040..341594ecefb 100644
--- a/targettests/execution/test_trotter.cpp
+++ b/targettests/execution/test_trotter.cpp
@@ -12,7 +12,6 @@
 // RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
 
 // Quantum emulators
-// RUN: if %braket_avail; then nvq++ %cpp_std -target braket -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s ; fi
 // RUN: nvq++ %cpp_std -target quantinuum -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std -target ionq       -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std -target oqc        -emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s

From cc1faea5fd87773d53ee13b5c144ee667fcb7a79 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Tue, 18 Mar 2025 11:59:53 -0700
Subject: [PATCH 52/54] Address more CR comments

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 runtime/test/test_argument_conversion.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index fb913384c7d..2e3f43720df 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -18,7 +18,6 @@
 #include "cudaq/qis/pauli_word.h"
 #include "cudaq/qis/state.h"
 #include "mlir/Parser/Parser.h"
-#include <cassert>
 #include <memory>
 #include <numeric>
 
@@ -109,8 +108,13 @@ class FakeDeviceState : public cudaq::SimulationState {
   operator()(std::size_t tensorIdx,
              const std::vector<std::size_t> &indices) override {
     if (hasData()) {
-      assert(tensorIdx == 0);
-      assert(indices.size() == 1);
+      if (tensorIdx != 0)
+        throw std::runtime_error("Non-zero tensor index is not supported");
+
+      if (indices.size() != 1)
+        throw std::runtime_error(
+            "Multi-dimensional tensor index is not supported");
+
       return *(static_cast<std::complex<double> *>(data) + indices[0]);
     }
     throw std::runtime_error("Not implemented");

From 175a70d38c9fbcc5176f29c6c5945e65edd369a8 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Wed, 19 Mar 2025 21:28:45 -0700
Subject: [PATCH 53/54] Fix links

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 docs/sphinx/using/backends/hardware/neutralatom.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sphinx/using/backends/hardware/neutralatom.rst b/docs/sphinx/using/backends/hardware/neutralatom.rst
index d54031c5fdc..918bc42ff5e 100644
--- a/docs/sphinx/using/backends/hardware/neutralatom.rst
+++ b/docs/sphinx/using/backends/hardware/neutralatom.rst
@@ -11,8 +11,8 @@ accessed via `Superstaq <https://superstaq.infleqtion.com/>`__, a cross-platform
 that performs low-level compilation and cross-layer optimization. To get started users can create a Superstaq
 account by following `these instructions <https://superstaq.readthedocs.io/en/latest/get_started/credentials.html>`__.
 
-For access to Infleqtion's neutral atom quantum computer, Sqale,
-`pre-registration <https://www.infleqtion.com/sqale-preregistration>`__ is now open.
+For access to Infleqtion's neutral atom quantum computer, Sqale, see details about
+`pre-registration <https://infleqtion.com/infleqtion-delivers-first-quantum-material-design-application-powered-by-logical-qubits-and-nvidia-cuda-q/>`__.
 
 Setting Credentials
 `````````````````````````

From e1449de8bb7cc1f3477fda04f4c4ae0f861dfe62 Mon Sep 17 00:00:00 2001
From: Anna Gringauze <agringauze@nvidia.com>
Date: Thu, 20 Mar 2025 09:54:49 -0700
Subject: [PATCH 54/54] Fix links

Signed-off-by: Anna Gringauze <agringauze@nvidia.com>
---
 docs/sphinx/using/backends/hardware/neutralatom.rst | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/sphinx/using/backends/hardware/neutralatom.rst b/docs/sphinx/using/backends/hardware/neutralatom.rst
index 918bc42ff5e..313a32224fc 100644
--- a/docs/sphinx/using/backends/hardware/neutralatom.rst
+++ b/docs/sphinx/using/backends/hardware/neutralatom.rst
@@ -11,8 +11,6 @@ accessed via `Superstaq <https://superstaq.infleqtion.com/>`__, a cross-platform
 that performs low-level compilation and cross-layer optimization. To get started users can create a Superstaq
 account by following `these instructions <https://superstaq.readthedocs.io/en/latest/get_started/credentials.html>`__.
 
-For access to Infleqtion's neutral atom quantum computer, Sqale, see details about
-`pre-registration <https://infleqtion.com/infleqtion-delivers-first-quantum-material-design-application-powered-by-logical-qubits-and-nvidia-cuda-q/>`__.
 
 Setting Credentials
 `````````````````````````