NVIDIA
diff --git a/cmake/Modules/CUDA-QX.cmake b/cmake/Modules/CUDA-QX.cmake
Lines changed: 11 additions & 6 deletions
diff --git a/libs/core/include/cuda-qx/core/kwargs_utils.h b/libs/core/include/cuda-qx/core/kwargs_utils.h
Lines changed: 25 additions & 0 deletions
diff --git a/libs/qec/CMakeLists.txt b/libs/qec/CMakeLists.txt
Lines changed: 17 additions & 0 deletions
diff --git a/libs/qec/include/cudaq/qec/decoder.h b/libs/qec/include/cudaq/qec/decoder.h
Lines changed: 89 additions & 4 deletions
diff --git a/libs/qec/include/cudaq/qec/pcm_utils.h b/libs/qec/include/cudaq/qec/pcm_utils.h
Lines changed: 56 additions & 0 deletions
diff --git a/libs/qec/include/cudaq/qec/realtime/decoding.h b/libs/qec/include/cudaq/qec/realtime/decoding.h
Lines changed: 52 additions & 0 deletions
@@ -1,5 +1,5 @@
 # ============================================================================ #
-# Copyright (c) 2024 NVIDIA Corporation & Affiliates.                          #
+# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates.                   #
 # All rights reserved.                                                         #
 #                                                                              #
 # This source code and the accompanying materials are made available under     #
@@ -43,6 +43,9 @@ resulting object files to the specified library target.
 Note: This function assumes that the CUDAQ_INSTALL_DIR variable is set
 to the CUDAQ installation directory.
 
+Note: Use DEPENDS_ON to list targets (or files) that must be built before the
+device code sources are compiled.
+
 Example usage:
   cudaqx_add_device_code(
     my_library
@@ -52,13 +55,15 @@ Example usage:
     COMPILER_FLAGS
       --enable-mlir
       -v
+    DEPENDS_ON
+      SomeOtherTarget
   )
 
 #]=======================================================================]
 function(cudaqx_add_device_code LIBRARY_NAME)
   set(options)
   set(oneValueArgs)
-  set(multiValueArgs SOURCES COMPILER_FLAGS)
+  set(multiValueArgs SOURCES COMPILER_FLAGS DEPENDS_ON)
   cmake_parse_arguments(ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
   if(NOT DEFINED CUDAQ_INSTALL_DIR)
@@ -83,7 +88,7 @@ function(cudaqx_add_device_code LIBRARY_NAME)
   set(prop "$<TARGET_PROPERTY:${LIBRARY_NAME},INCLUDE_DIRECTORIES>")
   foreach(source ${ARGS_SOURCES})
     get_filename_component(filename ${source} NAME_WE)
-    set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.o")
+    set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${LIBRARY_NAME}_${filename}.o")
     cmake_path(GET output_file FILENAME baseName)
 
     add_custom_command(
@@ -92,15 +97,15 @@ function(cudaqx_add_device_code LIBRARY_NAME)
         ${ARGS_COMPILER_FLAGS} -c -fPIC --enable-mlir
         ${CMAKE_CURRENT_SOURCE_DIR}/${source} -o ${baseName}
         "$<$<BOOL:${prop}>:-I $<JOIN:${prop}, -I >>"
-      DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+      DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} ${ARGS_DEPENDS_ON}
       COMMENT "Compiling ${source} with nvq++"
       VERBATIM
     )
 
     list(APPEND object_files ${output_file})
-    list(APPEND custom_targets ${filename}_target)
+    list(APPEND custom_targets ${LIBRARY_NAME}_${filename}_target)
 
-    add_custom_target(${filename}_target DEPENDS ${output_file})
+    add_custom_target(${LIBRARY_NAME}_${filename}_target DEPENDS ${output_file})
   endforeach()
 
   add_dependencies(${LIBRARY_NAME} ${custom_targets})
 
@@ -47,6 +47,31 @@ inline heterogeneous_map hetMapFromKwargs(const py::kwargs &kwargs) {
     } else if (py::isinstance<py::dict>(value)) {
       // Recursively convert nested dictionary
       result.insert(key, hetMapFromKwargs(value.cast<py::dict>()));
+    } else if (py::isinstance<py::list>(value)) {
+      // Handle Python lists; an empty list is skipped (nothing inserted here)
+      py::list py_list = value.cast<py::list>();
+      if (py_list.size() > 0) {
+        // Treat as a list of lists when the first element is itself a list
+        if (py::isinstance<py::list>(py_list[0])) {
+          std::vector<std::vector<double>> vec_vec;
+          for (const auto &item : py_list) {
+            py::list inner_list = item.cast<py::list>();
+            std::vector<double> inner_vec;
+            for (const auto &v : inner_list) {
+              inner_vec.push_back(v.cast<double>());
+            }
+            vec_vec.push_back(inner_vec);
+          }
+          result.insert(key, std::move(vec_vec));
+        } else {
+          // Single-level list - convert each element to double
+          std::vector<double> vec;
+          for (const auto &item : py_list) {
+            vec.push_back(item.cast<double>());
+          }
+          result.insert(key, std::move(vec));
+        }
+      }
     } else if (py::isinstance<py::array>(value)) {
       py::array np_array = value.cast<py::array>();
       py::buffer_info info = np_array.request();
 
@@ -226,3 +226,20 @@ if (QEC_EXTERNAL_DECODERS)
     add_target_libs_to_wheel(${LIB_FILE})
   endwhile()
 endif()
+
+# External Dependencies
+# ==============================================================================
+
+include(FetchContent)
+
+# Pinned to v3.11.1 to match CUDA-Q; update when CUDA-Q updates. Shallow clone
+# avoids fetching the repo's large history. TODO: remove when no longer needed.
+FetchContent_Declare(
+  json
+  GIT_REPOSITORY https://github.com/nlohmann/json
+  GIT_TAG v3.11.1
+  GIT_SHALLOW TRUE
+  EXCLUDE_FROM_ALL
+)
+
+FetchContent_MakeAvailable(json)
@@ -123,6 +123,13 @@ class async_decoder_result {
 class decoder
     : public cudaqx::extension_point<decoder, const cudaqx::tensor<uint8_t> &,
                                      const cudaqx::heterogeneous_map &> {
+private:
+  struct rt_impl;
+  struct rt_impl_deleter {
+    void operator()(rt_impl *p) const;
+  };
+  std::unique_ptr<rt_impl, rt_impl_deleter> pimpl;
+
 public:
   decoder() = delete;
 
@@ -173,8 +180,59 @@ class decoder
   std::size_t get_block_size() { return block_size; }
   std::size_t get_syndrome_size() { return syndrome_size; }
 
+  // -- Begin realtime decoding API --
+
+  // Note: all of the current realtime decoding API is designed to be used with
+  // hard syndromes.
+
+  /// @brief Get the number of measurement syndromes per decode call. This
+  /// depends on D_sparse, so you must have called set_D_sparse() first.
+  uint32_t get_num_msyn_per_decode() const;
+
+  /// @brief Set the observable matrix.
+  void set_O_sparse(const std::vector<std::vector<uint32_t>> &O_sparse);
+
+  /// @brief Set the observable matrix, using a single long vector with -1 as
+  /// row terminators.
+  void set_O_sparse(const std::vector<int64_t> &O_sparse);
+
+  /// @brief Set the D_sparse matrix.
+  void set_D_sparse(const std::vector<std::vector<uint32_t>> &D_sparse);
+
+  /// @brief Set the D_sparse matrix, using a single long vector with -1 as row
+  /// terminators.
+  void set_D_sparse(const std::vector<int64_t> &D_sparse);
+
+  /// @brief Set the decoder id.
+  void set_decoder_id(uint32_t decoder_id);
+
+  /// @brief Get the decoder id.
+  uint32_t get_decoder_id() const;
+
+  /// @brief Enqueue a syndrome for decoding (pointer version)
+  /// @return True if enough syndromes have been enqueued to trigger a decode.
+  bool enqueue_syndrome(const uint8_t *syndrome, std::size_t syndrome_length);
+
+  /// @brief Enqueue a syndrome for decoding (vector version)
+  /// @return True if enough syndromes have been enqueued to trigger a decode.
+  bool enqueue_syndrome(const std::vector<uint8_t> &syndrome);
+
+  /// @brief Get the current observable corrections.
+  const uint8_t *get_obs_corrections() const;
+
+  /// @brief Get the number of observables.
+  std::size_t get_num_observables() const;
+
+  /// @brief Clear any stored corrections.
+  void clear_corrections();
+
+  /// @brief Reset the decoder, clearing all per-shot memory and corrections.
+  void reset_decoder();
+
+  // -- End realtime decoding API --
+
   /// @brief Destructor
-  virtual ~decoder() {}
+  virtual ~decoder() = default;
 
   /// @brief Get the version of the decoder. Subclasses that are not part of the
   /// standard GitHub repo should override this to provide a more tailored
@@ -191,6 +249,12 @@ class decoder
 
   /// @brief The decoder's parity check matrix
   cudaqx::tensor<uint8_t> H;
+
+  /// @brief The decoder's observable matrix in sparse format
+  std::vector<std::vector<uint32_t>> O_sparse;
+
+  /// @brief The decoder's D matrix in sparse format
+  std::vector<std::vector<uint32_t>> D_sparse;
 };
 
 /// @brief Convert a vector of soft probabilities to a vector of hard
@@ -243,6 +307,7 @@ inline void convert_vec_soft_to_tensor_hard(const std::vector<t_soft> &in,
 /// @brief Convert a vector of hard probabilities to a vector of soft
 /// probabilities.
 /// @param in Hard probability input vector containing only 0/false or 1/true.
+/// @param in_size The size of the input vector (in elements)
 /// @param out Soft probability output vector in the range [0.0, 1.0]
 /// @param true_val The soft probability value assigned when the input is 1
 /// (default to 1.0)
@@ -253,15 +318,35 @@ template <typename t_soft, typename t_hard,
                                       (std::is_integral<t_hard>::value ||
                                        std::is_same<t_hard, bool>::value),
                                   int>::type = 0>
-inline void convert_vec_hard_to_soft(const std::vector<t_hard> &in,
+inline void convert_vec_hard_to_soft(const t_hard *in, std::size_t in_size,
                                      std::vector<t_soft> &out,
                                      const t_soft true_val = 1.0,
                                      const t_soft false_val = 0.0) {
-  out.resize(in.size());
-  for (std::size_t i = 0; i < in.size(); i++)
+  out.resize(in_size);
+  for (std::size_t i = 0; i < in_size; i++)
     out[i] = static_cast<t_soft>(in[i] ? true_val : false_val);
 }
 
+/// @brief Convert a vector of hard probabilities to a vector of soft
+/// probabilities.
+/// @param in Hard probability input vector containing only 0/false or 1/true.
+/// @param out Soft probability output vector in the range [0.0, 1.0]
+/// @param true_val The soft probability value assigned when the input is 1
+/// (default to 1.0)
+/// @param false_val The soft probability value assigned when the input is 0
+/// (default to 0.0)
+template <typename t_soft, typename t_hard,
+          typename std::enable_if<std::is_floating_point<t_soft>::value &&
+                                      (std::is_integral<t_hard>::value ||
+                                       std::is_same<t_hard, bool>::value),
+                                  int>::type = 0>
+inline void convert_vec_hard_to_soft(const std::vector<t_hard> &in,
+                                     std::vector<t_soft> &out,
+                                     const t_soft true_val = 1.0,
+                                     const t_soft false_val = 0.0) {
+  convert_vec_hard_to_soft(in.data(), in.size(), out, true_val, false_val);
+}
+
 /// @brief Convert a 2D vector of soft probabilities to a 2D vector of hard
 /// probabilities.
 /// @param in Soft probability input vector in range [0.0, 1.0]
 
@@ -22,6 +22,62 @@ namespace cudaq::qec {
 std::vector<std::vector<std::uint32_t>>
 dense_to_sparse(const cudaqx::tensor<uint8_t> &pcm);
 
+/// @brief Return a sparse representation of the PCM as a string.
+/// @param pcm The PCM to convert to a sparse representation.
+/// @return A string that represents the PCM in a sparse format.
+std::string pcm_to_sparse_string(const cudaqx::tensor<uint8_t> &pcm);
+
+/// @brief Return a PCM from a sparse representation.
+/// @param sparse_str The sparse representation of the PCM.
+/// @param num_rows The number of rows in the PCM.
+/// @param num_cols The number of columns in the PCM.
+/// @return A PCM tensor.
+cudaqx::tensor<uint8_t> pcm_from_sparse_string(const std::string &sparse_str,
+                                               std::size_t num_rows,
+                                               std::size_t num_cols);
+
+/// @brief Return a PCM from a sparse representation.
+/// @param sparse_vec The sparse representation of the PCM.
+/// @param num_rows The number of rows in the PCM.
+/// @param num_cols The number of columns in the PCM.
+/// @return A PCM tensor.
+cudaqx::tensor<uint8_t>
+pcm_from_sparse_vec(const std::vector<std::int64_t> &sparse_vec,
+                    std::size_t num_rows, std::size_t num_cols);
+
+/// @brief Return a sparse representation of the PCM.
+/// @param pcm The PCM to convert to a sparse representation.
+/// @return A vector of integers that represents the PCM in a sparse format.
+std::vector<std::int64_t> pcm_to_sparse_vec(const cudaqx::tensor<uint8_t> &pcm);
+
+/// @brief Generate a sparse detector matrix for a given number of syndromes per
+/// round and number of rounds. Timelike here means that each round of syndrome
+/// bits is XORed with the preceding round.
+/// @param num_syndromes_per_round The number of syndromes per round.
+/// @param num_rounds The number of rounds.
+/// @param include_first_round Whether to include the first round in the
+/// detector matrix.
+/// @return The detector matrix in a CSR-like format, where each -1 value
+/// marks the end of a row.
+std::vector<std::int64_t>
+generate_timelike_sparse_detector_matrix(std::uint32_t num_syndromes_per_round,
+                                         std::uint32_t num_rounds,
+                                         bool include_first_round = false);
+
+/// @brief Generate a sparse detector matrix for a given number of syndromes per
+/// round and number of rounds. Timelike here means that each round of syndrome
+/// bits is XORed with the preceding round. The first round is supplied by
+/// the user, to allow for a mixture of detectors and non-detectors.
+/// @param num_syndromes_per_round The number of syndromes per round.
+/// @param num_rounds The number of rounds.
+/// @param first_round_matrix User specified detector matrix for the first
+/// round.
+/// @return The detector matrix in a CSR-like format, where each -1 value
+/// marks the end of a row.
+std::vector<std::int64_t> generate_timelike_sparse_detector_matrix(
+    std::uint32_t num_syndromes_per_round, std::uint32_t num_rounds,
+    std::vector<std::int64_t> first_round_matrix);
+
 /// @brief Return a vector of column indices that would sort the PCM columns
 /// in topological order.
 /// @param row_indices For each column, a vector of row indices that have a
 
@@ -0,0 +1,52 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "cudaq/qis/qubit_qis.h"
+
+// Define the CUDA-Q QEC Realtime Decoding API
+//
+// These functions enable CUDA-Q quantum kernel code to
+// offload decoding work to our QEC decoders in real time
+// (within qubit coherence times)
+//
+// The design here is as follows: We declare but do not
+// implement the API. Then we allow users to specify concrete
+// implementations of the API via the target specification passed to
+// nvq++.
+
+namespace cudaq::qec::decoding {
+// CUDA-Q QEC Realtime Decoding API (declarations)
+
+/// @brief Enqueue syndromes for decoding.
+/// @param decoder_id The ID of the decoder to use.
+/// @param syndromes The syndromes to enqueue.
+/// @param tag The tag to use for the syndrome (currently useful for logging
+/// only)
+__qpu__ void
+enqueue_syndromes(std::uint64_t decoder_id,
+                  const std::vector<cudaq::measure_result> &syndromes,
+                  std::uint64_t tag = 0);
+
+/// @brief Get the corrections for a given decoder.
+/// @param decoder_id The ID of the decoder to use.
+/// @param return_size The number of correction bits to return. This is
+/// expected to match the number of observables in the decoder.
+/// @param reset Whether to reset the decoder corrections after retrieving them.
+/// @return The corrections (detected bit flips) for the given decoder, based on
+/// all of the decoded syndromes since the last time any corrections were reset.
+__qpu__ std::vector<bool> get_corrections(std::uint64_t decoder_id,
+                                          std::uint64_t return_size,
+                                          bool reset = false);
+
+/// @brief Reset the decoder. This clears any queued syndromes and resets any
+/// corrections back to 0.
+/// @param decoder_id The ID of the decoder to reset.
+__qpu__ void reset_decoder(std::uint64_t decoder_id);
+} // namespace cudaq::qec::decoding