diff --git a/docs/sphinx/using/backends/sims/tnsims.rst b/docs/sphinx/using/backends/sims/tnsims.rst index 2e0a46ad33f..24c6fbd76aa 100644 --- a/docs/sphinx/using/backends/sims/tnsims.rst +++ b/docs/sphinx/using/backends/sims/tnsims.rst @@ -81,6 +81,9 @@ Specific aspects of the simulation can be configured by setting the following of * **`CUDA_VISIBLE_DEVICES=X`**: Makes the process only see GPU X on multi-GPU nodes. Each MPI process must only see its own dedicated GPU. For example, if you run 8 MPI processes on a DGX system with 8 GPUs, each MPI process should be assigned its own dedicated GPU via `CUDA_VISIBLE_DEVICES` when invoking `mpiexec` (or `mpirun`) commands. * **`OMP_PLACES=cores`**: Set this environment variable to improve CPU parallelization. * **`OMP_NUM_THREADS=X`**: To enable CPU parallelization, set X to `NUMBER_OF_CORES_PER_NODE/NUMBER_OF_GPUS_PER_NODE`. +* **`CUDAQ_TENSORNET_CONTROLLED_RANK=X`**: Specify the number of controlled qubits whereby the full tensor body of the controlled gate is expanded. If the number of controlled qubits is greater than this value, the gate is applied as a controlled tensor operator to the tensor network state. Default value is 1. +* **`CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=X`**: Set this environment variable to `TRUE` (`ON`) or `FALSE` (`OFF`) to enable or disable contraction path reuse when computing expectation values. Default is `OFF`. +* **`CUDAQ_TENSORNET_NUM_HYPER_SAMPLES=X`**: Specify the number of hyper samples used in the tensor network contraction path finder. Default value is 8 if not specified. .. 
note:: diff --git a/runtime/common/NoiseModel.cpp b/runtime/common/NoiseModel.cpp index ab7b4446df1..aa276f68284 100644 --- a/runtime/common/NoiseModel.cpp +++ b/runtime/common/NoiseModel.cpp @@ -97,7 +97,7 @@ kraus_channel &kraus_channel::operator=(const kraus_channel &other) { return *this; } -std::vector kraus_channel::get_ops() { return ops; } +std::vector kraus_channel::get_ops() const { return ops; } void kraus_channel::push_back(kraus_op op) { ops.push_back(op); } void noise_model::add_channel(const std::string &quantumOp, diff --git a/runtime/common/NoiseModel.h b/runtime/common/NoiseModel.h index 23f8997a081..6ee2bc5e3da 100644 --- a/runtime/common/NoiseModel.h +++ b/runtime/common/NoiseModel.h @@ -185,7 +185,7 @@ class kraus_channel { kraus_channel &operator=(const kraus_channel &other); /// @brief Return all kraus_ops in this channel - std::vector get_ops(); + std::vector get_ops() const; /// @brief Add a kraus_op to this channel. void push_back(kraus_op op); diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt index 4e73dc9a32a..6490e16d36e 100644 --- a/runtime/nvqir/cutensornet/CMakeLists.txt +++ b/runtime/nvqir/cutensornet/CMakeLists.txt @@ -60,8 +60,8 @@ set(CUTENSORNET_PATCH ${CMAKE_MATCH_1}) set(CUTENSORNET_VERSION ${CUTENSORNET_MAJOR}.${CUTENSORNET_MINOR}.${CUTENSORNET_PATCH}) message(STATUS "Found cutensornet version: ${CUTENSORNET_VERSION}") -# We need cutensornet v2.5.0+ -if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.5") +# We need cutensornet v2.6.0+ (cutensornetStateApplyUnitaryChannel) +if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.6") set (BASE_TENSOR_BACKEND_SRS simulator_cutensornet.cpp tensornet_spin_op.cpp diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp index f99e189fe4f..a6aaa7a81d4 100644 --- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp +++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp @@ 
-11,6 +11,17 @@ #include "cutensornet.h" #include "tensornet_spin_op.h" +namespace { +const std::vector> matPauliI = { + {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}}; +const std::vector> matPauliX{ + {0.0, 0.0}, {1.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}}; +const std::vector> matPauliY{ + {0.0, 0.0}, {0.0, -1.0}, {0.0, 1.0}, {0.0, 0.0}}; +const std::vector> matPauliZ{ + {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {-1.0, 0.0}}; +} // namespace + namespace nvqir { SimulatorTensorNetBase::SimulatorTensorNetBase() @@ -137,6 +148,201 @@ void SimulatorTensorNetBase::applyGate(const GateApplicationTask &task) { } } +// Helper to check whether a matrix is a scaled unitary matrix, i.e., `k * U` +// where U is a unitary matrix. If so, it also returns the `k` factor. +// Otherwise, return a nullopt. +template +std::optional isScaledUnitary(const std::vector> &mat, + double eps) { + typedef Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor> + RowMajorMatTy; + const int dim = std::log2(mat.size()); + Eigen::Map kMat(mat.data(), dim, dim); + if (kMat.isZero()) + return std::nullopt; + // Check that (K_dag * K) is a scaled identity matrix + // i.e., the K matrix is a scaled unitary. 
+ auto kdK = kMat.adjoint() * kMat; + if (!kdK.isDiagonal()) + return std::nullopt; + // First element + std::complex val = kdK(0, 0); + if (std::abs(val) > eps && std::abs(val.imag()) < eps) { + auto scaledKdK = (std::complex{1.0} / val) * kdK; + if (scaledKdK.isIdentity()) + return val.real(); + } + return std::nullopt; +} + +std::optional, + std::vector>>>> static computeUnitaryMixture(const std:: + vector>> + &krausOps, + double tol = 1e-6) { + std::vector probs; + std::vector>> mats; + const auto scaleMat = [](const std::vector> &mat, + double scaleFactor) { + std::vector> scaledMat = mat; + for (auto &x : scaledMat) + x /= scaleFactor; + return scaledMat; + }; + for (const auto &op : krausOps) { + const auto scaledFactor = isScaledUnitary(op, tol); + if (!scaledFactor.has_value()) + return std::nullopt; + probs.emplace_back(scaledFactor.value()); + mats.emplace_back(scaleMat(op, scaledFactor.value())); + } + + if (std::abs(1.0 - std::reduce(probs.begin(), probs.end())) > tol) + return std::nullopt; + + return std::make_pair(probs, mats); +} + +// Helper to look up a device memory pointer from a cache. +// If not found, allocate a new device memory buffer and put it to the cache. +static void * +getOrCacheMat(const std::string &key, + const std::vector> &mat, + std::unordered_map &gateDeviceMemCache) { + const auto iter = gateDeviceMemCache.find(key); + + if (iter == gateDeviceMemCache.end()) { + void *dMem = allocateGateMatrix(mat); + gateDeviceMemCache[key] = dMem; + return dMem; + } + return iter->second; +}; + +void SimulatorTensorNetBase::applyKrausChannel( + const std::vector &qubits, + const cudaq::kraus_channel &krausChannel) { + LOG_API_TIME(); + switch (krausChannel.noise_type) { + case cudaq::noise_model_type::depolarization_channel: { + if (krausChannel.parameters.size() != 1) + throw std::runtime_error( + fmt::format("Invalid parameters for a depolarization channel. 
" "Expecting 1 parameter, got {}.", + krausChannel.parameters.size())); + const std::vector<void *> channelMats{ + getOrCacheMat("PauliI", matPauliI, m_gateDeviceMemCache), + getOrCacheMat("PauliX", matPauliX, m_gateDeviceMemCache), + getOrCacheMat("PauliY", matPauliY, m_gateDeviceMemCache), + getOrCacheMat("PauliZ", matPauliZ, m_gateDeviceMemCache)}; + const double p = krausChannel.parameters[0]; + const std::vector<double> probabilities = {1 - p, p / 3., p / 3., p / 3.}; + m_state->applyUnitaryChannel(qubits, channelMats, probabilities); + break; + } + case cudaq::noise_model_type::bit_flip_channel: { + if (krausChannel.parameters.size() != 1) + throw std::runtime_error( + fmt::format("Invalid parameters for a bit-flip channel. " + "Expecting 1 parameter, got {}.", + krausChannel.parameters.size())); + + const std::vector<void *> channelMats{ + getOrCacheMat("PauliI", matPauliI, m_gateDeviceMemCache), + getOrCacheMat("PauliX", matPauliX, m_gateDeviceMemCache)}; + const double p = krausChannel.parameters[0]; + const std::vector<double> probabilities = {1 - p, p}; + m_state->applyUnitaryChannel(qubits, channelMats, probabilities); + break; + } + case cudaq::noise_model_type::phase_flip_channel: { + if (krausChannel.parameters.size() != 1) + throw std::runtime_error( + fmt::format("Invalid parameters for a phase-flip channel. " + "Expecting 1 parameter, got {}.", + krausChannel.parameters.size())); + + const std::vector<void *> channelMats{ + getOrCacheMat("PauliI", matPauliI, m_gateDeviceMemCache), + getOrCacheMat("PauliZ", matPauliZ, m_gateDeviceMemCache)}; + const double p = krausChannel.parameters[0]; + const std::vector<double> probabilities = {1 - p, p}; + m_state->applyUnitaryChannel(qubits, channelMats, probabilities); + break; + } + case cudaq::noise_model_type::amplitude_damping_channel: { + if (krausChannel.parameters.size() != 1) + throw std::runtime_error( + fmt::format("Invalid parameters for an amplitude damping channel. 
" + "Expecting 1 parameter, got {}.", + krausChannel.parameters.size())); + if (krausChannel.parameters[0] != 0.0) + throw std::runtime_error("Non-unitary noise channels are not supported."); + break; + } + case cudaq::noise_model_type::unknown: { + std::vector>> mats; + for (const auto &op : krausChannel.get_ops()) + mats.emplace_back(op.data); + auto asUnitaryMixture = computeUnitaryMixture(mats); + if (asUnitaryMixture.has_value()) { + auto &[probabilities, unitaries] = asUnitaryMixture.value(); + std::vector channelMats; + for (const auto &mat : unitaries) + channelMats.emplace_back(getOrCacheMat( + "ScaledUnitary_" + std::to_string(vecComplexHash(mat)), mat, + m_gateDeviceMemCache)); + m_state->applyUnitaryChannel(qubits, channelMats, probabilities); + } else { + throw std::runtime_error("Non-unitary noise channels are not supported."); + } + break; + } + default: + throw std::runtime_error( + "Unsupported noise model type: " + + std::to_string(static_cast(krausChannel.noise_type))); + } +} + +void SimulatorTensorNetBase::applyNoiseChannel( + const std::string_view gateName, const std::vector &controls, + const std::vector &targets, + const std::vector ¶ms) { + LOG_API_TIME(); + // Do nothing if no execution context + if (!executionContext) + return; + + // Do nothing if no noise model + if (!executionContext->noiseModel) + return; + + // Get the name as a string + std::string gName(gateName); + std::vector qubits{controls.begin(), controls.end()}; + qubits.insert(qubits.end(), targets.begin(), targets.end()); + + // Get the Kraus channels specified for this gate and qubits + auto krausChannels = executionContext->noiseModel->get_channels( + gName, targets, controls, params); + + // If none, do nothing + if (krausChannels.empty()) + return; + + cudaq::info( + "[SimulatorTensorNetBase] Applying {} kraus channels on qubits: {}", + krausChannels.size(), qubits); + + for (const auto &krausChannel : krausChannels) + applyKrausChannel(qubits, krausChannel); +} + /// 
@brief Reset the state of a given qubit to zero void SimulatorTensorNetBase::resetQubit(const std::size_t qubitIdx) { flushGateQueue(); @@ -225,10 +431,7 @@ cudaq::ExecutionResult SimulatorTensorNetBase::sample(const std::vector &measuredBits, const int shots) { LOG_API_TIME(); - std::vector measuredBitIds; - std::transform(measuredBits.begin(), measuredBits.end(), - std::back_inserter(measuredBitIds), - [](std::size_t idx) { return static_cast(idx); }); + std::vector measuredBitIds(measuredBits.begin(), measuredBits.end()); if (shots < 1) { cudaq::spin_op::spin_op_term allZTerm(2 * m_state->getNumQubits(), 0); for (const auto &m : measuredBits) @@ -239,7 +442,8 @@ SimulatorTensorNetBase::sample(const std::vector &measuredBits, } prepareQubitTensorState(); - const auto samples = m_state->sample(measuredBitIds, shots); + const auto samples = + m_state->sample(measuredBitIds, shots, requireCacheWorkspace()); cudaq::ExecutionResult counts(samples); double expVal = 0.0; // Compute the expectation value from the counts @@ -286,7 +490,8 @@ SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) { // cutensornetNetworkOperator_t and compute the expectation value. 
TensorNetworkSpinOp spinOp(ham, m_cutnHandle); std::complex expVal = - m_state->computeExpVal(spinOp.getNetworkOperator()); + m_state->computeExpVal(spinOp.getNetworkOperator(), + this->executionContext->numberTrajectories); expVal += spinOp.getIdentityTermOffset(); return cudaq::observe_result(expVal.real(), ham, cudaq::sample_result(cudaq::ExecutionResult( @@ -316,7 +521,8 @@ SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) { }); // Compute the expectation value for all terms - const auto termExpVals = m_state->computeExpVals(terms); + const auto termExpVals = m_state->computeExpVals( + terms, this->executionContext->numberTrajectories); std::complex expVal = 0.0; // Construct per-term data in the final observe_result std::vector results; diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.h b/runtime/nvqir/cutensornet/simulator_cutensornet.h index 8d11c88deed..c99912496c5 100644 --- a/runtime/nvqir/cutensornet/simulator_cutensornet.h +++ b/runtime/nvqir/cutensornet/simulator_cutensornet.h @@ -30,6 +30,12 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase { /// @brief Apply quantum gate void applyGate(const GateApplicationTask &task) override; + /// @brief Apply a noise channel + void applyNoiseChannel(const std::string_view gateName, + const std::vector &controls, + const std::vector &targets, + const std::vector ¶ms) override; + // Override base calculateStateDim (we don't instantiate full state vector in // the tensornet backend). When the user want to retrieve the state vector, we // check if it is feasible to do so. 
@@ -88,6 +94,15 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase { /// @brief Query if direct expectation value calculation is enabled virtual bool canHandleObserve() override; + /// @brief Return true if this simulator can use cache workspace (e.g., for + /// intermediate tensors) + virtual bool requireCacheWorkspace() const = 0; + +private: + // Helper to apply a Kraus channel + void applyKrausChannel(const std::vector &qubits, + const cudaq::kraus_channel &channel); + protected: cutensornetHandle_t m_cutnHandle; std::unique_ptr m_state; diff --git a/runtime/nvqir/cutensornet/simulator_mps_register.cpp b/runtime/nvqir/cutensornet/simulator_mps_register.cpp index d6830b90a45..57b7f090bef 100644 --- a/runtime/nvqir/cutensornet/simulator_mps_register.cpp +++ b/runtime/nvqir/cutensornet/simulator_mps_register.cpp @@ -153,6 +153,164 @@ class SimulatorMPS : public SimulatorTensorNetBase { SimulatorTensorNetBase::applyExpPauli(theta, controls, qubitIds, op); } + // Helper to compute expectation value from a bit string distribution + static double computeExpValFromDistribution( + const std::unordered_map &distribution, + int shots) { + double expVal = 0.0; + // Compute the expectation value from the distribution + for (auto &kv : distribution) { + auto par = cudaq::sample_result::has_even_parity(kv.first); + auto p = kv.second / (double)shots; + if (!par) { + p = -p; + } + expVal += p; + } + return expVal; + }; + + // Set up the MPS factorization before trajectory simulation run loop. 
+ // We only need to do cutensornetStateFinalizeMPS once + void setUpFactorizeForTrajectoryRuns() { + for (auto &tensor : m_mpsTensors_d) { + HANDLE_CUDA_ERROR(cudaFree(tensor.deviceData)); + } + m_mpsTensors_d.clear(); + m_mpsTensors_d = + m_state->setupMPSFactorize(m_settings.maxBond, m_settings.absCutoff, + m_settings.relCutoff, m_settings.svdAlgo); + } + + /// @brief Sample a subset of qubits + cudaq::ExecutionResult sample(const std::vector &measuredBits, + const int shots) override { + const bool hasNoise = executionContext && executionContext->noiseModel; + if (!hasNoise || shots < 1) + return SimulatorTensorNetBase::sample(measuredBits, shots); + + LOG_API_TIME(); + cudaq::ExecutionResult counts; + std::vector measuredBitIds(measuredBits.begin(), + measuredBits.end()); + + setUpFactorizeForTrajectoryRuns(); + std::map, std::pair> + samplerCache; + for (int i = 0; i < shots; ++i) { + // As the Kraus operator sampling may change the MPS state, we need to + // re-compute the factorization in each trajectory. 
+ m_state->computeMPSFactorize(m_mpsTensors_d); + std::vector samplerKey; + for (const auto &tensor : m_mpsTensors_d) + samplerKey.insert(samplerKey.end(), tensor.extents.begin(), + tensor.extents.end()); + + auto iter = samplerCache.find(samplerKey); + if (iter == samplerCache.end()) { + const auto [itInsert, success] = samplerCache.insert( + {samplerKey, m_state->prepareSample(measuredBitIds)}); + assert(success); + iter = itInsert; + } + + assert(iter != samplerCache.end()); + auto &[sampler, workDesc] = iter->second; + const auto samples = m_state->executeSample( + sampler, workDesc, measuredBitIds, 1, requireCacheWorkspace()); + assert(samples.size() == 1); + for (const auto &[bitString, count] : samples) + counts.appendResult(bitString, count); + } + + for (const auto &[k, v] : samplerCache) { + auto &[sampler, workDesc] = v; + // Destroy the workspace descriptor + HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + // Destroy the quantum circuit sampler + HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler)); + } + + counts.expectationValue = + computeExpValFromDistribution(counts.counts, shots); + + return counts; + } + + // Helper to prepare term-by-term data from a spin op + static std::tuple, + std::vector, + std::vector>> + prepareSpinOpTermData(const cudaq::spin_op &ham) { + std::vector termStrs; + std::vector terms; + std::vector> coeffs; + termStrs.reserve(ham.num_terms()); + terms.reserve(ham.num_terms()); + coeffs.reserve(ham.num_terms()); + + // Note: as the spin_op terms are stored as an unordered map, we need to + // iterate in one loop to collect all the data (string, symplectic data, and + // coefficient). 
+ ham.for_each_term([&](cudaq::spin_op &term) { + termStrs.emplace_back(term.to_string(false)); + auto [symplecticRep, coeff] = term.get_raw_data(); + if (symplecticRep.size() != 1 || coeff.size() != 1) + throw std::runtime_error(fmt::format( + "Unexpected data encountered when iterating spin operator terms: " + "expecting a single term, got {} terms.", + symplecticRep.size())); + terms.emplace_back(symplecticRep[0]); + coeffs.emplace_back(coeff[0]); + }); + return std::make_tuple(termStrs, terms, coeffs); + } + + cudaq::observe_result observe(const cudaq::spin_op &ham) override { + LOG_API_TIME(); + const bool hasNoise = executionContext && executionContext->noiseModel; + // If no noise, just use base class implementation. + if (!hasNoise) + return SimulatorTensorNetBase::observe(ham); + + setUpFactorizeForTrajectoryRuns(); + const std::size_t numObserveTrajectories = + this->executionContext->numberTrajectories.has_value() + ? this->executionContext->numberTrajectories.value() + : TensorNetState::g_numberTrajectoriesForObserve; + + auto [termStrs, terms, coeffs] = prepareSpinOpTermData(ham); + std::vector> termExpVals(terms.size(), 0.0); + + for (std::size_t i = 0; i < numObserveTrajectories; ++i) { + // As the Kraus operator sampling may change the MPS state, we need to + // re-compute the factorization in each trajectory. + m_state->computeMPSFactorize(m_mpsTensors_d); + // We run a single trajectory for MPS as the final MPS form depends on the + // randomly-selected noise op. 
+ const auto trajTermExpVals = m_state->computeExpVals(terms, 1); + + for (std::size_t idx = 0; idx < terms.size(); ++idx) { + termExpVals[idx] += (trajTermExpVals[idx] / + static_cast(numObserveTrajectories)); + } + } + std::complex expVal = 0.0; + // Construct per-term data in the final observe_result + std::vector results; + results.reserve(terms.size()); + + for (std::size_t i = 0; i < terms.size(); ++i) { + expVal += (coeffs[i] * termExpVals[i]); + results.emplace_back( + cudaq::ExecutionResult({}, termStrs[i], termExpVals[i].real())); + } + + cudaq::sample_result perTermData(expVal.real(), results); + return cudaq::observe_result(expVal.real(), ham, perTermData); + } + virtual std::string name() const override { return "tensornet-mps"; } CircuitSimulator *clone() override { @@ -248,6 +406,8 @@ class SimulatorMPS : public SimulatorTensorNetBase { m_cutnHandle, m_randomEngine); } + bool requireCacheWorkspace() const override { return false; } + virtual ~SimulatorMPS() noexcept { for (auto &tensor : m_mpsTensors_d) { HANDLE_CUDA_ERROR(cudaFree(tensor.deviceData)); diff --git a/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp b/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp index 132e80f3b4c..649a8775b2e 100644 --- a/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp +++ b/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp @@ -134,6 +134,7 @@ class SimulatorTensorNet : public SimulatorTensorNetBase { } } } + bool requireCacheWorkspace() const override { return true; } private: friend nvqir::CircuitSimulator * ::getCircuitSimulator_tensornet(); diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp index c6e2b2d712c..1e6f83f8199 100644 --- a/runtime/nvqir/cutensornet/tensornet_state.cpp +++ b/runtime/nvqir/cutensornet/tensornet_state.cpp @@ -12,6 +12,27 @@ #include namespace nvqir { +std::int32_t TensorNetState::numHyperSamples = []() { + constexpr int32_t 
defaultNumHyperSamples = 8; + if (auto envVal = std::getenv("CUDAQ_TENSORNET_NUM_HYPER_SAMPLES")) { + int32_t specifiedNumHyperSamples = 0; + try { + specifiedNumHyperSamples = std::stoi(envVal); + if (specifiedNumHyperSamples <= 0) { + // for the 'catch' below to handle. + throw std::invalid_argument("must be a positive number"); + } + cudaq::info("Update number of hyper samples from {} to {}.", + defaultNumHyperSamples, specifiedNumHyperSamples); + return specifiedNumHyperSamples; + } catch (...) { + throw std::runtime_error( + "Invalid CUDAQ_TENSORNET_NUM_HYPER_SAMPLES environment " + "variable, must be a positive integer."); + } + } + return defaultNumHyperSamples; +}(); TensorNetState::TensorNetState(std::size_t numQubits, ScratchDeviceMem &inScratchPad, @@ -74,6 +95,21 @@ void TensorNetState::applyGate(const std::vector &controlQubits, controlQubits, adjoint, true}); } +void TensorNetState::applyUnitaryChannel( + const std::vector &qubits, const std::vector &krausOps, + const std::vector &probabilities) { + LOG_API_TIME(); + HANDLE_CUTN_ERROR(cutensornetStateApplyUnitaryChannel( + m_cutnHandle, m_quantumState, /*numStateModes=*/qubits.size(), + /*stateModes=*/qubits.data(), + /*numTensors=*/krausOps.size(), + /*tensorData=*/const_cast(krausOps.data()), + /*tensorModeStrides=*/nullptr, + /*probabilities=*/probabilities.data(), &m_tensorId)); + m_tensorOps.emplace_back(AppliedTensorOp{qubits, krausOps, probabilities}); + m_hasNoiseChannel = true; +} + void TensorNetState::applyQubitProjector(void *proj_d, const std::vector &qubitIdx) { LOG_API_TIME(); @@ -99,23 +135,38 @@ void TensorNetState::addQubits(std::size_t numQubits) { // the new wires are all empty (zero state). 
int64_t tensorId = 0; for (auto &op : m_tensorOps) - if (op.controlQubitIds.empty()) { - HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator( - m_cutnHandle, m_quantumState, op.targetQubitIds.size(), - op.targetQubitIds.data(), op.deviceData, nullptr, /*immutable*/ 1, - /*adjoint*/ static_cast(op.isAdjoint), - /*unitary*/ static_cast(op.isUnitary), &tensorId)); - } else { - HANDLE_CUTN_ERROR(cutensornetStateApplyControlledTensorOperator( + if (op.deviceData) { + if (op.controlQubitIds.empty()) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator( + m_cutnHandle, m_quantumState, op.targetQubitIds.size(), + op.targetQubitIds.data(), op.deviceData, nullptr, /*immutable*/ 1, + /*adjoint*/ static_cast(op.isAdjoint), + /*unitary*/ static_cast(op.isUnitary), &tensorId)); + } else { + HANDLE_CUTN_ERROR(cutensornetStateApplyControlledTensorOperator( + m_cutnHandle, m_quantumState, + /*numControlModes=*/op.controlQubitIds.size(), + /*stateControlModes=*/op.controlQubitIds.data(), + /*stateControlValues=*/nullptr, + /*numTargetModes*/ op.targetQubitIds.size(), + /*stateTargetModes*/ op.targetQubitIds.data(), op.deviceData, + nullptr, + /*immutable*/ 1, + /*adjoint*/ static_cast(op.isAdjoint), + /*unitary*/ static_cast(op.isUnitary), &m_tensorId)); + } + } else if (op.unitaryChannel.has_value()) { + HANDLE_CUTN_ERROR(cutensornetStateApplyUnitaryChannel( m_cutnHandle, m_quantumState, - /*numControlModes=*/op.controlQubitIds.size(), - /*stateControlModes=*/op.controlQubitIds.data(), - /*stateControlValues=*/nullptr, - /*numTargetModes*/ op.targetQubitIds.size(), - /*stateTargetModes*/ op.targetQubitIds.data(), op.deviceData, nullptr, - /*immutable*/ 1, - /*adjoint*/ static_cast(op.isAdjoint), - /*unitary*/ static_cast(op.isUnitary), &m_tensorId)); + /*numStateModes=*/op.targetQubitIds.size(), + /*stateModes=*/op.targetQubitIds.data(), + /*numTensors=*/op.unitaryChannel->tensorData.size(), + /*tensorData=*/op.unitaryChannel->tensorData.data(), + 
/*tensorModeStrides=*/nullptr, + /*probabilities=*/op.unitaryChannel->probabilities.data(), + &m_tensorId)); + } else { + throw std::runtime_error("Invalid AppliedTensorOp encountered."); } } @@ -148,9 +199,8 @@ void TensorNetState::addQubits(std::span> stateVec) { m_tempDevicePtrs.emplace_back(d_proj); } -std::unordered_map -TensorNetState::sample(const std::vector &measuredBitIds, - int32_t shots) { +std::pair +TensorNetState::prepareSample(const std::vector &measuredBitIds) { LOG_API_TIME(); // Create the quantum circuit sampler cutensornetStateSampler_t sampler; @@ -161,14 +211,10 @@ TensorNetState::sample(const std::vector &measuredBitIds, measuredBitIds.data(), &sampler)); } - // Configure the quantum circuit sampler - constexpr int32_t numHyperSamples = - 8; // desired number of hyper samples used in the tensor network - // contraction path finder { ScopedTraceWithContext("cutensornetSamplerConfigure"); HANDLE_CUTN_ERROR(cutensornetSamplerConfigure( - m_cutnHandle, sampler, CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES, + m_cutnHandle, sampler, CUTENSORNET_SAMPLER_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); // Generate a random seed from the backend simulator's random engine. 
@@ -211,9 +257,47 @@ TensorNetState::sample(const std::vector &measuredBitIds, throw std::runtime_error("ERROR: Insufficient workspace size on Device!"); } + return std::make_pair(sampler, workDesc); +} + +std::unordered_map +TensorNetState::executeSample(cutensornetStateSampler_t &sampler, + cutensornetWorkspaceDescriptor_t &workDesc, + const std::vector &measuredBitIds, + int32_t shots, bool enableCacheWorkspace) { + int64_t reqCacheSize{0}; + void *d_cache{nullptr}; + if (enableCacheWorkspace) { + ScopedTraceWithContext("Allocate Cache Workspace"); + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize( + m_cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_CACHE, + &reqCacheSize)); + + // Query the GPU memory capacity + std::size_t freeSize{0}, totalSize{0}; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); + // Compute the minimum of [required size, or 90% of the free memory (to + // avoid oversubscribing)] (see cutensornet examples) + const std::size_t cacheSizeAvailable = + std::min(static_cast(reqCacheSize), + size_t(freeSize * 0.9) - (size_t(freeSize * 0.9) % 4096)); + cudaq::info("Cache size = {} bytes", cacheSizeAvailable); + const auto errCode = cudaMalloc(&d_cache, cacheSizeAvailable); + if (errCode == cudaSuccess) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory( + m_cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_CACHE, d_cache, cacheSizeAvailable)); + } else { + cudaq::info("Failed to allocate cache workspace memory."); + d_cache = nullptr; + } + } // Sample the quantum circuit state std::unordered_map counts; - constexpr int MAX_SHOTS_PER_RUNS = 10000; + // If this is a trajectory simulation, each shot needs an independent + // trajectory sampling. + const int MAX_SHOTS_PER_RUNS = m_hasNoiseChannel ? 
1 : 10000; int shotsToRun = shots; while (shotsToRun > 0) { const int numShots = std::min(shotsToRun, MAX_SHOTS_PER_RUNS); @@ -236,6 +320,19 @@ TensorNetState::sample(const std::vector &measuredBitIds, shotsToRun -= numShots; } + if (enableCacheWorkspace && d_cache) { + HANDLE_CUDA_ERROR(cudaFree(d_cache)); + } + return counts; +} + +std::unordered_map +TensorNetState::sample(const std::vector &measuredBitIds, + int32_t shots, bool enableCacheWorkspace) { + LOG_API_TIME(); + auto [sampler, workDesc] = prepareSample(measuredBitIds); + std::unordered_map counts = executeSample( + sampler, workDesc, measuredBitIds, shots, enableCacheWorkspace); // Destroy the workspace descriptor HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); // Destroy the quantum circuit sampler @@ -270,13 +367,10 @@ std::pair TensorNetState::contractStateVectorInternal( projectedModes.data(), nullptr, &accessor)); } - const int32_t numHyperSamples = - 8; // desired number of hyper samples used in the tensor network - // contraction path finder { ScopedTraceWithContext("cutensornetAccessorConfigure"); HANDLE_CUTN_ERROR(cutensornetAccessorConfigure( - m_cutnHandle, accessor, CUTENSORNET_ACCESSOR_OPT_NUM_HYPER_SAMPLES, + m_cutnHandle, accessor, CUTENSORNET_ACCESSOR_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); } // Prepare the quantum state amplitudes accessor @@ -327,100 +421,10 @@ std::pair TensorNetState::contractStateVectorInternal( return std::make_pair(d_sv, svDim); } -std::vector> TensorNetState::getStateVector( - const std::vector &projectedModes, - const std::vector &projectedModeValues) { - auto [d_sv, svDim] = - contractStateVectorInternal(projectedModes, projectedModeValues); - std::vector> h_sv(svDim); - HANDLE_CUDA_ERROR(cudaMemcpy(h_sv.data(), d_sv, - svDim * sizeof(std::complex), - cudaMemcpyDeviceToHost)); - // Free resources - HANDLE_CUDA_ERROR(cudaFree(d_sv)); - - return h_sv; -} - -std::vector> -TensorNetState::computeRDM(const 
std::vector &qubits) { - // Make sure that we don't overflow the memory size calculation. - // Note: the actual limitation will depend on the system memory. - if (qubits.size() >= 32 || - (1ull << (2 * qubits.size())) > - std::numeric_limits::max() / sizeof(std::complex)) - throw std::runtime_error("Too many qubits are requested for reduced " - "density matrix contraction."); - LOG_API_TIME(); - void *d_rdm{nullptr}; - const uint64_t rdmSize = 1ull << (2 * qubits.size()); - const uint64_t rdmSizeBytes = rdmSize * sizeof(std::complex); - HANDLE_CUDA_ERROR(cudaMalloc(&d_rdm, rdmSizeBytes)); - - cutensornetStateMarginal_t marginal; - { - ScopedTraceWithContext("cutensornetCreateMarginal"); - HANDLE_CUTN_ERROR(cutensornetCreateMarginal( - m_cutnHandle, m_quantumState, qubits.size(), qubits.data(), - /*numProjectedModes*/ 0, /*projectedModes*/ nullptr, - /*marginalTensorStrides*/ nullptr, &marginal)); - } - - const int32_t numHyperSamples = - 8; // desired number of hyper samples used in the tensor network - // contraction path finder - { - ScopedTraceWithContext("cutensornetMarginalConfigure"); - HANDLE_CUTN_ERROR(cutensornetMarginalConfigure( - m_cutnHandle, marginal, CUTENSORNET_MARGINAL_OPT_NUM_HYPER_SAMPLES, - &numHyperSamples, sizeof(numHyperSamples))); - } - - // Prepare the specified quantum circuit reduced density matrix (marginal) - cutensornetWorkspaceDescriptor_t workDesc; - HANDLE_CUTN_ERROR( - cutensornetCreateWorkspaceDescriptor(m_cutnHandle, &workDesc)); - { - ScopedTraceWithContext("cutensornetMarginalPrepare"); - HANDLE_CUTN_ERROR(cutensornetMarginalPrepare( - m_cutnHandle, marginal, scratchPad.scratchSize, workDesc, 0)); - } - // Attach the workspace buffer - int64_t worksize{0}; - HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize( - m_cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, - CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH, &worksize)); - if (worksize <= static_cast(scratchPad.scratchSize)) { - 
HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory( - m_cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, scratchPad.d_scratch, worksize)); - } else { - throw std::runtime_error("ERROR: Insufficient workspace size on Device!"); - } - { - ScopedTraceWithContext("cutensornetMarginalCompute"); - // Compute the specified quantum circuit reduced density matrix (marginal) - HANDLE_CUTN_ERROR(cutensornetMarginalCompute(m_cutnHandle, marginal, - nullptr, workDesc, d_rdm, 0)); - } - std::vector> h_rdm(rdmSize); - HANDLE_CUDA_ERROR( - cudaMemcpy(h_rdm.data(), d_rdm, rdmSizeBytes, cudaMemcpyDeviceToHost)); - - // Clean up - HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); - HANDLE_CUTN_ERROR(cutensornetDestroyMarginal(marginal)); - HANDLE_CUDA_ERROR(cudaFree(d_rdm)); - - return h_rdm; -} - -// Returns MPS tensors (device mems) -// Note: user needs to clean up these tensors std::vector -TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff, - double relCutoff, - cutensornetTensorSVDAlgo_t algo) { +TensorNetState::setupMPSFactorize(int64_t maxExtent, double absCutoff, + double relCutoff, + cutensornetTensorSVDAlgo_t algo) { LOG_API_TIME(); if (m_numQubits == 0) return {}; @@ -430,18 +434,10 @@ TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff, tensor.extents = {2}; HANDLE_CUDA_ERROR( cudaMalloc(&tensor.deviceData, 2 * sizeof(std::complex))); - // Just contract all the gates to the tensor. - // Note: if none gates, don't call `getStateVector`, which performs a - // contraction (`Flop count is zero` error). - const std::vector> stateVec = - isDirty() ? 
getStateVector() - : std::vector>{1.0, 0.0}; - assert(stateVec.size() == 2); - HANDLE_CUDA_ERROR(cudaMemcpy(tensor.deviceData, stateVec.data(), - 2 * sizeof(std::complex), - cudaMemcpyHostToDevice)); + return {tensor}; } + std::vector mpsTensors(m_numQubits); std::vector extentsPtr(m_numQubits); for (std::size_t i = 0; i < m_numQubits; ++i) { @@ -480,6 +476,35 @@ TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff, HANDLE_CUTN_ERROR(cutensornetStateConfigure( m_cutnHandle, m_quantumState, CUTENSORNET_STATE_CONFIG_MPS_SVD_REL_CUTOFF, &relCutoff, sizeof(relCutoff))); + return mpsTensors; +} + +void TensorNetState::computeMPSFactorize(std::vector &mpsTensors) { + LOG_API_TIME(); + if (mpsTensors.empty()) + return; + if (mpsTensors.size() == 1) { + MPSTensor &tensor = mpsTensors[0]; + // Just contract all the gates to the tensor. + // Note: if none gates, don't call `getStateVector`, which performs a + // contraction (`Flop count is zero` error). + const std::vector> stateVec = + isDirty() ? 
getStateVector() + : std::vector>{1.0, 0.0}; + assert(stateVec.size() == 2); + HANDLE_CUDA_ERROR(cudaMemcpy(tensor.deviceData, stateVec.data(), + 2 * sizeof(std::complex), + cudaMemcpyHostToDevice)); + return; + } + + std::vector extentsPtr(mpsTensors.size()); + std::vector allData(mpsTensors.size()); + for (std::size_t i = 0; auto &tensor : mpsTensors) { + allData[i] = tensor.deviceData; + extentsPtr[i] = tensor.extents.data(); + ++i; + } // Prepare the MPS computation and attach workspace cutensornetWorkspaceDescriptor_t workDesc; @@ -522,9 +547,6 @@ TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff, m_cutnHandle, workDesc, CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize)); - std::vector allData(m_numQubits); - for (std::size_t i = 0; auto &tensor : mpsTensors) - allData[i++] = tensor.deviceData; { ScopedTraceWithContext("cutensornetStateCompute"); // Execute MPS computation @@ -533,14 +555,110 @@ TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff, /*strides=*/nullptr, allData.data(), 0)); } - if (hostWork) { + if (hostWork) free(hostWork); +} + +std::vector> TensorNetState::getStateVector( + const std::vector &projectedModes, + const std::vector &projectedModeValues) { + auto [d_sv, svDim] = + contractStateVectorInternal(projectedModes, projectedModeValues); + std::vector> h_sv(svDim); + HANDLE_CUDA_ERROR(cudaMemcpy(h_sv.data(), d_sv, + svDim * sizeof(std::complex), + cudaMemcpyDeviceToHost)); + // Free resources + HANDLE_CUDA_ERROR(cudaFree(d_sv)); + + return h_sv; +} + +std::vector> +TensorNetState::computeRDM(const std::vector &qubits) { + // Make sure that we don't overflow the memory size calculation. + // Note: the actual limitation will depend on the system memory. 
+ if (qubits.size() >= 32 || + (1ull << (2 * qubits.size())) > + std::numeric_limits::max() / sizeof(std::complex)) + throw std::runtime_error("Too many qubits are requested for reduced " + "density matrix contraction."); + LOG_API_TIME(); + void *d_rdm{nullptr}; + const uint64_t rdmSize = 1ull << (2 * qubits.size()); + const uint64_t rdmSizeBytes = rdmSize * sizeof(std::complex); + HANDLE_CUDA_ERROR(cudaMalloc(&d_rdm, rdmSizeBytes)); + + cutensornetStateMarginal_t marginal; + { + ScopedTraceWithContext("cutensornetCreateMarginal"); + HANDLE_CUTN_ERROR(cutensornetCreateMarginal( + m_cutnHandle, m_quantumState, qubits.size(), qubits.data(), + /*numProjectedModes*/ 0, /*projectedModes*/ nullptr, + /*marginalTensorStrides*/ nullptr, &marginal)); + } + + { + ScopedTraceWithContext("cutensornetMarginalConfigure"); + HANDLE_CUTN_ERROR(cutensornetMarginalConfigure( + m_cutnHandle, marginal, CUTENSORNET_MARGINAL_CONFIG_NUM_HYPER_SAMPLES, + &numHyperSamples, sizeof(numHyperSamples))); + } + + // Prepare the specified quantum circuit reduced density matrix (marginal) + cutensornetWorkspaceDescriptor_t workDesc; + HANDLE_CUTN_ERROR( + cutensornetCreateWorkspaceDescriptor(m_cutnHandle, &workDesc)); + { + ScopedTraceWithContext("cutensornetMarginalPrepare"); + HANDLE_CUTN_ERROR(cutensornetMarginalPrepare( + m_cutnHandle, marginal, scratchPad.scratchSize, workDesc, 0)); + } + // Attach the workspace buffer + int64_t worksize{0}; + HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize( + m_cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, + CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH, &worksize)); + if (worksize <= static_cast(scratchPad.scratchSize)) { + HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory( + m_cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, + CUTENSORNET_WORKSPACE_SCRATCH, scratchPad.d_scratch, worksize)); + } else { + throw std::runtime_error("ERROR: Insufficient workspace size on Device!"); + } + { + 
ScopedTraceWithContext("cutensornetMarginalCompute"); + // Compute the specified quantum circuit reduced density matrix (marginal) + HANDLE_CUTN_ERROR(cutensornetMarginalCompute(m_cutnHandle, marginal, + nullptr, workDesc, d_rdm, 0)); } + std::vector> h_rdm(rdmSize); + HANDLE_CUDA_ERROR( + cudaMemcpy(h_rdm.data(), d_rdm, rdmSizeBytes, cudaMemcpyDeviceToHost)); + + // Clean up + HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); + HANDLE_CUTN_ERROR(cutensornetDestroyMarginal(marginal)); + HANDLE_CUDA_ERROR(cudaFree(d_rdm)); + + return h_rdm; +} + +// Returns MPS tensors (device mems) +// Note: user needs to clean up these tensors +std::vector +TensorNetState::factorizeMPS(int64_t maxExtent, double absCutoff, + double relCutoff, + cutensornetTensorSVDAlgo_t algo) { + LOG_API_TIME(); + auto mpsTensors = setupMPSFactorize(maxExtent, absCutoff, relCutoff, algo); + computeMPSFactorize(mpsTensors); return mpsTensors; } std::vector> TensorNetState::computeExpVals( - const std::vector> &symplecticRepr) { + const std::vector> &symplecticRepr, + const std::optional &numberTrajectories) { LOG_API_TIME(); if (symplecticRepr.empty()) return {}; @@ -607,16 +725,11 @@ std::vector> TensorNetState::computeExpVals( &tensorNetworkExpectation)); } // Step 2: configure - // Note: as we reuse this path across many contractions, use a higher number - // of hyper samples. 
- const int32_t numHyperSamples = - 512; // desired number of hyper samples used in the tensor network - // contraction path finder { ScopedTraceWithContext("cutensornetExpectationConfigure"); HANDLE_CUTN_ERROR(cutensornetExpectationConfigure( m_cutnHandle, tensorNetworkExpectation, - CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, + CUTENSORNET_EXPECTATION_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); } @@ -646,6 +759,13 @@ std::vector> TensorNetState::computeExpVals( } // Step 4: Compute + const std::size_t numObserveTrajectories = [&]() -> std::size_t { + if (!m_hasNoiseChannel) + return 1; + if (numberTrajectories.has_value()) + return numberTrajectories.value(); + return g_numberTrajectoriesForObserve; + }(); for (const auto &term : symplecticRepr) { bool allIdOps = true; for (std::size_t i = 0; i < numQubits; ++i) { @@ -680,13 +800,14 @@ std::vector> TensorNetState::computeExpVals( HANDLE_CUDA_ERROR(cudaMemcpy(pauliMats_d, pauliMats_h, placeHolderArraySize, cudaMemcpyHostToDevice)); - std::complex expVal; - { + std::complex expVal = 0.0; + for (std::size_t trajId = 0; trajId < numObserveTrajectories; ++trajId) { + std::complex result; ScopedTraceWithContext("cutensornetExpectationCompute"); HANDLE_CUTN_ERROR(cutensornetExpectationCompute( - m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal, - /*stateNorm*/ nullptr, + m_cutnHandle, tensorNetworkExpectation, workDesc, &result, nullptr, /*cudaStream*/ 0)); + expVal += (result / static_cast(numObserveTrajectories)); } allExpVals.emplace_back(expVal); } @@ -699,7 +820,8 @@ std::vector> TensorNetState::computeExpVals( } std::complex TensorNetState::computeExpVal( - cutensornetNetworkOperator_t tensorNetworkOperator) { + cutensornetNetworkOperator_t tensorNetworkOperator, + const std::optional &numberTrajectories) { LOG_API_TIME(); cutensornetStateExpectation_t tensorNetworkExpectation; // Step 1: create @@ -710,14 +832,11 @@ std::complex 
TensorNetState::computeExpVal( &tensorNetworkExpectation)); } // Step 2: configure - const int32_t numHyperSamples = - 8; // desired number of hyper samples used in the tensor network - // contraction path finder { ScopedTraceWithContext("cutensornetExpectationConfigure"); HANDLE_CUTN_ERROR(cutensornetExpectationConfigure( m_cutnHandle, tensorNetworkExpectation, - CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, + CUTENSORNET_EXPECTATION_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); } @@ -746,14 +865,23 @@ std::complex TensorNetState::computeExpVal( } // Step 4: Compute - std::complex expVal; - - { + const std::size_t numObserveTrajectories = [&]() -> std::size_t { + if (!m_hasNoiseChannel) + return 1; + if (numberTrajectories.has_value()) + return numberTrajectories.value(); + return g_numberTrajectoriesForObserve; + }(); + + std::complex expVal = 0.0; + for (std::size_t trajId = 0; trajId < numObserveTrajectories; ++trajId) { + std::complex result; ScopedTraceWithContext("cutensornetExpectationCompute"); HANDLE_CUTN_ERROR(cutensornetExpectationCompute( - m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal, + m_cutnHandle, tensorNetworkExpectation, workDesc, &result, /*stateNorm*/ nullptr, /*cudaStream*/ 0)); + expVal += (result / static_cast(numObserveTrajectories)); } // Step 5: clean up HANDLE_CUTN_ERROR(cutensornetDestroyExpectation(tensorNetworkExpectation)); @@ -814,6 +942,55 @@ TensorNetState::reverseQubitOrder(std::span> stateVec) { return ket; } +void TensorNetState::applyCachedOps() { + int64_t tensorId = 0; + for (auto &op : m_tensorOps) + if (op.deviceData) { + if (op.controlQubitIds.empty()) { + HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator( + m_cutnHandle, m_quantumState, op.targetQubitIds.size(), + op.targetQubitIds.data(), op.deviceData, nullptr, /*immutable*/ 1, + /*adjoint*/ static_cast(op.isAdjoint), + /*unitary*/ static_cast(op.isUnitary), &tensorId)); + } else { + 
HANDLE_CUTN_ERROR(cutensornetStateApplyControlledTensorOperator( + m_cutnHandle, m_quantumState, + /*numControlModes=*/op.controlQubitIds.size(), + /*stateControlModes=*/op.controlQubitIds.data(), + /*stateControlValues=*/nullptr, + /*numTargetModes*/ op.targetQubitIds.size(), + /*stateTargetModes*/ op.targetQubitIds.data(), op.deviceData, + nullptr, + /*immutable*/ 1, + /*adjoint*/ static_cast(op.isAdjoint), + /*unitary*/ static_cast(op.isUnitary), &m_tensorId)); + } + } else if (op.unitaryChannel.has_value()) { + HANDLE_CUTN_ERROR(cutensornetStateApplyUnitaryChannel( + m_cutnHandle, m_quantumState, + /*numStateModes=*/op.targetQubitIds.size(), + /*stateModes=*/op.targetQubitIds.data(), + /*numTensors=*/op.unitaryChannel->tensorData.size(), + /*tensorData=*/op.unitaryChannel->tensorData.data(), + /*tensorModeStrides=*/nullptr, + /*probabilities=*/op.unitaryChannel->probabilities.data(), + &m_tensorId)); + } else { + throw std::runtime_error("Invalid AppliedTensorOp encountered."); + } +} + +void TensorNetState::setZeroState() { + LOG_API_TIME(); + // Destroy the current quantum circuit state + HANDLE_CUTN_ERROR(cutensornetDestroyState(m_quantumState)); + const std::vector qubitDims(m_numQubits, 2); + // Re-create the state + HANDLE_CUTN_ERROR(cutensornetCreateState( + m_cutnHandle, CUTENSORNET_STATE_PURITY_PURE, m_numQubits, + qubitDims.data(), CUDA_C_64F, &m_quantumState)); +} + std::unique_ptr TensorNetState::createFromStateVector( std::span> stateVec, ScratchDeviceMem &inScratchPad, cutensornetHandle_t handle, std::mt19937 &randomEngine) { diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h index c680fd5c677..5400b0565f8 100644 --- a/runtime/nvqir/cutensornet/tensornet_state.h +++ b/runtime/nvqir/cutensornet/tensornet_state.h @@ -12,6 +12,7 @@ #include "cutensornet.h" #include "tensornet_utils.h" #include "timing_utils.h" +#include #include #include @@ -28,13 +29,31 @@ struct MPSTensor { std::vector extents; }; 
+struct UnitaryChannel { + std::vector tensorData; + std::vector probabilities; +}; + /// Track gate tensors that were appended to the tensor network. struct AppliedTensorOp { void *deviceData = nullptr; + std::optional unitaryChannel; std::vector targetQubitIds; std::vector controlQubitIds; bool isAdjoint; bool isUnitary; + AppliedTensorOp(void *dataPtr, const std::vector &targetQubits, + const std::vector &controlQubits, bool adjoint, + bool unitary) + : deviceData(dataPtr), targetQubitIds(targetQubits), + controlQubitIds(controlQubits), isAdjoint(adjoint), isUnitary(unitary) { + } + + AppliedTensorOp(const std::vector &qubits, + const std::vector &krausOps, + const std::vector &probabilities) + : targetQubitIds(qubits), + unitaryChannel(UnitaryChannel(krausOps, probabilities)) {} }; /// @brief Wrapper of cutensornetState_t to provide convenient API's for CUDA-Q @@ -56,8 +75,13 @@ class TensorNetState { // This is a reference to the backend random number generator, which can be // reseeded by users. std::mt19937 &m_randomEngine; + bool m_hasNoiseChannel = false; public: + // The number of hyper samples used in the tensor network contraction path + // finder + static std::int32_t numHyperSamples; + /// @brief Constructor TensorNetState(std::size_t numQubits, ScratchDeviceMem &inScratchPad, cutensornetHandle_t handle, std::mt19937 &randomEngine); @@ -69,6 +93,10 @@ class TensorNetState { std::unique_ptr clone() const; + /// Default number of trajectories for observe (with noise) in case no option + /// is provided. 
+ static inline constexpr int g_numberTrajectoriesForObserve = 1000; + /// Reconstruct/initialize a state from MPS tensors static std::unique_ptr createFromMpsTensors(const std::vector &mpsTensors, @@ -101,6 +129,11 @@ class TensorNetState { const std::vector &targetQubits, void *gateDeviceMem, bool adjoint = false); + /// @brief Apply a unitary channel + void applyUnitaryChannel(const std::vector &qubits, + const std::vector &krausOps, + const std::vector &probabilities); + /// @brief Apply a projector matrix (non-unitary) /// @param proj_d Projector matrix (expected a 2x2 matrix in column major) /// @param qubitIdx Qubit operand @@ -122,7 +155,8 @@ class TensorNetState { /// @brief Perform measurement sampling on the quantum state. std::unordered_map - sample(const std::vector &measuredBitIds, int32_t shots); + sample(const std::vector &measuredBitIds, int32_t shots, + bool enableCacheWorkspace); /// @brief Contract the tensor network representation to retrieve the state /// vector. @@ -149,12 +183,14 @@ class TensorNetState { /// @param symplecticRepr The symplectic representation of the observable /// @return std::vector> - computeExpVals(const std::vector> &symplecticRepr); + computeExpVals(const std::vector> &symplecticRepr, + const std::optional &numberTrajectories); /// @brief Evaluate the expectation value of a given /// `cutensornetNetworkOperator_t` std::complex - computeExpVal(cutensornetNetworkOperator_t tensorNetworkOperator); + computeExpVal(cutensornetNetworkOperator_t tensorNetworkOperator, + const std::optional &numberTrajectories); /// @brief Number of qubits that this state represents. std::size_t getNumQubits() const { return m_numQubits; } @@ -167,6 +203,12 @@ class TensorNetState { static std::vector> reverseQubitOrder(std::span> stateVec); + /// @brief Apply all the cached ops to the state. 
+ void applyCachedOps(); + + /// @brief Set the state to a zero state + void setZeroState(); + /// @brief Destructor ~TensorNetState(); @@ -178,5 +220,24 @@ class TensorNetState { std::pair contractStateVectorInternal( const std::vector &projectedModes, const std::vector &projectedModeValues = {}); + + /// Internal methods to perform MPS factorize. + // Note: `factorizeMPS` is an end-to-end API for factorization. + // This factorization can be split into `cutensornetStateFinalizeMPS` and + // `cutensornetStateCompute` to facilitate reuse. + std::vector setupMPSFactorize(int64_t maxExtent, double absCutoff, + double relCutoff, + cutensornetTensorSVDAlgo_t algo); + void computeMPSFactorize(std::vector &mpsTensors); + + /// Internal methods for sampling + std::pair + prepareSample(const std::vector &measuredBitIds); + + std::unordered_map + executeSample(cutensornetStateSampler_t &sampler, + cutensornetWorkspaceDescriptor_t &workspaceDesc, + const std::vector &measuredBitIds, int32_t shots, + bool enableCacheWorkspace); }; } // namespace nvqir diff --git a/runtime/nvqir/cutensornet/tn_simulation_state.cpp b/runtime/nvqir/cutensornet/tn_simulation_state.cpp index 0f4a065101d..baf18a163cf 100644 --- a/runtime/nvqir/cutensornet/tn_simulation_state.cpp +++ b/runtime/nvqir/cutensornet/tn_simulation_state.cpp @@ -75,23 +75,38 @@ TensorNetSimulationState::overlap(const cudaq::SimulationState &other) { allTensorOps.insert(allTensorOps.end(), tensorOps.begin(), tensorOps.end()); for (auto &op : allTensorOps) { - if (op.controlQubitIds.empty()) { - HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator( - cutnHandle, tempQuantumState, op.targetQubitIds.size(), - op.targetQubitIds.data(), op.deviceData, nullptr, /*immutable*/ 1, - /*adjoint*/ static_cast(op.isAdjoint), - /*unitary*/ static_cast(op.isUnitary), &tensorId)); - } else { - HANDLE_CUTN_ERROR(cutensornetStateApplyControlledTensorOperator( + if (op.deviceData) { + if (op.controlQubitIds.empty()) { + 
HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator( + cutnHandle, tempQuantumState, op.targetQubitIds.size(), + op.targetQubitIds.data(), op.deviceData, nullptr, /*immutable*/ 1, + /*adjoint*/ static_cast(op.isAdjoint), + /*unitary*/ static_cast(op.isUnitary), &tensorId)); + } else { + HANDLE_CUTN_ERROR(cutensornetStateApplyControlledTensorOperator( + cutnHandle, tempQuantumState, + /*numControlModes=*/op.controlQubitIds.size(), + /*stateControlModes=*/op.controlQubitIds.data(), + /*stateControlValues=*/nullptr, + /*numTargetModes*/ op.targetQubitIds.size(), + /*stateTargetModes*/ op.targetQubitIds.data(), op.deviceData, + nullptr, + /*immutable*/ 1, + /*adjoint*/ static_cast(op.isAdjoint), + /*unitary*/ static_cast(op.isUnitary), &tensorId)); + } + } else if (op.unitaryChannel.has_value()) { + HANDLE_CUTN_ERROR(cutensornetStateApplyUnitaryChannel( cutnHandle, tempQuantumState, - /*numControlModes=*/op.controlQubitIds.size(), - /*stateControlModes=*/op.controlQubitIds.data(), - /*stateControlValues=*/nullptr, - /*numTargetModes*/ op.targetQubitIds.size(), - /*stateTargetModes*/ op.targetQubitIds.data(), op.deviceData, nullptr, - /*immutable*/ 1, - /*adjoint*/ static_cast(op.isAdjoint), - /*unitary*/ static_cast(op.isUnitary), &tensorId)); + /*numStateModes=*/op.targetQubitIds.size(), + /*stateModes=*/op.targetQubitIds.data(), + /*numTensors=*/op.unitaryChannel->tensorData.size(), + /*tensorData=*/op.unitaryChannel->tensorData.data(), + /*tensorModeStrides=*/nullptr, + /*probabilities=*/op.unitaryChannel->probabilities.data(), + &tensorId)); + } else { + throw std::runtime_error("Invalid AppliedTensorOp encountered."); } } @@ -110,13 +125,11 @@ TensorNetSimulationState::overlap(const cudaq::SimulationState &other) { projectedModes.data(), nullptr, &accessor)); } - const int32_t numHyperSamples = - 8; // desired number of hyper samples used in the tensor network - // contraction path finder + const int32_t numHyperSamples = TensorNetState::numHyperSamples; { 
ScopedTraceWithContext("cutensornetAccessorConfigure"); HANDLE_CUTN_ERROR(cutensornetAccessorConfigure( - cutnHandle, accessor, CUTENSORNET_ACCESSOR_OPT_NUM_HYPER_SAMPLES, + cutnHandle, accessor, CUTENSORNET_ACCESSOR_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples))); } // Prepare the quantum state amplitudes accessor diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index 81b7202ae6c..fa1a9be06d4 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -227,6 +227,7 @@ if(TARGET nvqir-tensornet) integration/builder_tester.cpp integration/deuteron_variational_tester.cpp integration/observe_result_tester.cpp + integration/noise_tester.cpp # This test contains noisy observe test cases. ) target_include_directories(test_tensornet_observe_path_reuse PRIVATE .) target_compile_definitions(test_tensornet_observe_path_reuse diff --git a/unittests/integration/noise_tester.cpp b/unittests/integration/noise_tester.cpp index 746a8930a0c..8368f7a6a01 100644 --- a/unittests/integration/noise_tester.cpp +++ b/unittests/integration/noise_tester.cpp @@ -11,7 +11,8 @@ #include #include -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) struct xOp { void operator()() __qpu__ { cudaq::qubit q; @@ -28,7 +29,7 @@ struct bell { }; #endif -#if defined(CUDAQ_BACKEND_DM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_TENSORNET) // Stim does not support arbitrary cudaq::kraus_channel specification. CUDAQ_TEST(NoiseTest, checkSimple) { @@ -93,81 +94,82 @@ CUDAQ_TEST(NoiseTest, checkAmplitudeDamping) { } #endif -#if defined(CUDAQ_BACKEND_DM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_TENSORNET) // Stim does not support arbitrary cudaq::kraus_op specification. 
 CUDAQ_TEST(NoiseTest, checkCNOT) {
   cudaq::set_random_seed(13);
-  cudaq::kraus_op op0{cudaq::complex{0.99498743710662, 0.0},
+  // 10% depolarization (amplitudes sqrt(0.9) and sqrt(0.1/3))
+  cudaq::kraus_op op0{cudaq::complex{0.94868329805, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
-                      {0.99498743710662, 0.0},
+                      {0.94868329805, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
-                      {0.99498743710662, 0.0},
+                      {0.94868329805, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
-                      {0.99498743710662, 0.0}},
+                      {0.94868329805, 0.0}},
      op1{cudaq::complex{0.0, 0.0},
          {0.0, 0.0},
-         {0.05773502691896258, 0.0},
+         {0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.05773502691896258, 0.0},
-         {0.05773502691896258, 0.0},
+         {0.18257418583, 0.0},
+         {0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.05773502691896258, 0.0},
+         {0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0}},
      op2{cudaq::complex{0.0, 0.0},
          {0.0, 0.0},
-         {0.0, -0.05773502691896258},
+         {0.0, -0.18257418583},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.0, -0.05773502691896258},
-         {0.0, 0.05773502691896258},
+         {0.0, -0.18257418583},
+         {0.0, 0.18257418583},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.0, 0.05773502691896258},
+         {0.0, 0.18257418583},
          {0.0, 0.0},
          {0.0, 0.0}},
-     op3{cudaq::complex{0.05773502691896258, 0.0},
+     op3{cudaq::complex{0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.05773502691896258, 0.0},
+         {0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {-0.05773502691896258, 0.0},
+         {-0.18257418583, 0.0},
          {-0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {-0.0, 0.0},
-         {-0.05773502691896258, 0.0}};
+         {-0.18257418583, 0.0}};
   cudaq::kraus_channel cnotNoise(
       std::vector{op0, op1, op2, op3});
   cudaq::noise_model noise;
   noise.add_channel({0, 1}, cnotNoise);
   cudaq::set_noise(noise);
-  auto counts = cudaq::sample(10000, bell{});
+  auto counts = cudaq::sample(100, bell{});
   counts.dump();
   EXPECT_TRUE(counts.size() > 2);
   cudaq::unset_noise(); // clear for
subsequent tests @@ -188,7 +190,8 @@ CUDAQ_TEST(NoiseTest, checkExceptions) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) CUDAQ_TEST(NoiseTest, checkDepolType) { cudaq::set_random_seed(13); @@ -203,7 +206,8 @@ CUDAQ_TEST(NoiseTest, checkDepolType) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) CUDAQ_TEST(NoiseTest, checkDepolTypeSimple) { cudaq::set_random_seed(13); @@ -255,7 +259,8 @@ CUDAQ_TEST(NoiseTest, checkAmpDampTypeSimple) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) CUDAQ_TEST(NoiseTest, checkBitFlipType) { cudaq::set_random_seed(13); @@ -272,7 +277,8 @@ CUDAQ_TEST(NoiseTest, checkBitFlipType) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) CUDAQ_TEST(NoiseTest, checkBitFlipTypeSimple) { cudaq::set_random_seed(13); @@ -288,7 +294,8 @@ CUDAQ_TEST(NoiseTest, checkBitFlipTypeSimple) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) // Same as above but use alternate sample interface that specifies the number of // shots and the noise model to use. 
CUDAQ_TEST(NoiseTest, checkBitFlipTypeSimpleOptions) { @@ -308,7 +315,8 @@ CUDAQ_TEST(NoiseTest, checkBitFlipTypeSimpleOptions) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) CUDAQ_TEST(NoiseTest, checkPhaseFlipType) { cudaq::set_random_seed(13); @@ -333,7 +341,8 @@ CUDAQ_TEST(NoiseTest, checkPhaseFlipType) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) template struct xOpAll { @@ -344,7 +353,8 @@ struct xOpAll { }; #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) CUDAQ_TEST(NoiseTest, checkAllQubitChannel) { cudaq::set_random_seed(13); @@ -364,74 +374,75 @@ CUDAQ_TEST(NoiseTest, checkAllQubitChannel) { } #endif -#if defined(CUDAQ_BACKEND_DM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_TENSORNET) // Stim does not support arbitrary cudaq::kraus_op specification. 
 static cudaq::kraus_channel create2pNoiseChannel() {
-  cudaq::kraus_op op0{cudaq::complex{0.99498743710662, 0.0},
+  // 10% depolarization (amplitudes sqrt(0.9) and sqrt(0.1/3))
+  cudaq::kraus_op op0{cudaq::complex{0.94868329805, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
-                      {0.99498743710662, 0.0},
+                      {0.94868329805, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
-                      {0.99498743710662, 0.0},
+                      {0.94868329805, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
                       {0.0, 0.0},
-                      {0.99498743710662, 0.0}},
+                      {0.94868329805, 0.0}},
      op1{cudaq::complex{0.0, 0.0},
          {0.0, 0.0},
-         {0.05773502691896258, 0.0},
+         {0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.05773502691896258, 0.0},
-         {0.05773502691896258, 0.0},
+         {0.18257418583, 0.0},
+         {0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.05773502691896258, 0.0},
+         {0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0}},
      op2{cudaq::complex{0.0, 0.0},
          {0.0, 0.0},
-         {0.0, -0.05773502691896258},
+         {0.0, -0.18257418583},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.0, -0.05773502691896258},
-         {0.0, 0.05773502691896258},
+         {0.0, -0.18257418583},
+         {0.0, 0.18257418583},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.0, 0.05773502691896258},
+         {0.0, 0.18257418583},
          {0.0, 0.0},
          {0.0, 0.0}},
-     op3{cudaq::complex{0.05773502691896258, 0.0},
+     op3{cudaq::complex{0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {0.05773502691896258, 0.0},
+         {0.18257418583, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
-         {-0.05773502691896258, 0.0},
+         {-0.18257418583, 0.0},
          {-0.0, 0.0},
          {0.0, 0.0},
          {0.0, 0.0},
          {-0.0, 0.0},
-         {-0.05773502691896258, 0.0}};
+         {-0.18257418583, 0.0}};
   cudaq::kraus_channel noise2q(
       std::vector{op0, op1, op2, op3});
   return noise2q;
@@ -447,7 +458,7 @@ struct bellRandom {
   };
 };
 #endif
-#if defined(CUDAQ_BACKEND_DM)
+#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_TENSORNET)
 // Stim does not support arbitrary cudaq::kraus_op specification.
CUDAQ_TEST(NoiseTest, checkAllQubitChannelWithControl) { @@ -455,7 +466,7 @@ CUDAQ_TEST(NoiseTest, checkAllQubitChannelWithControl) { cudaq::noise_model noise; noise.add_all_qubit_channel(create2pNoiseChannel(), /*numControls=*/1); - const std::size_t shots = 1024; + const std::size_t shots = 100; constexpr std::size_t numQubits = 5; std::vector qubitIds(numQubits); std::iota(qubitIds.begin(), qubitIds.end(), 0); @@ -476,14 +487,14 @@ CUDAQ_TEST(NoiseTest, checkAllQubitChannelWithControl) { } #endif -#if defined(CUDAQ_BACKEND_DM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_TENSORNET) // Stim does not support arbitrary cudaq::kraus_op specification. CUDAQ_TEST(NoiseTest, checkAllQubitChannelWithControlPrefix) { cudaq::set_random_seed(13); cudaq::noise_model noise; noise.add_all_qubit_channel("cx", create2pNoiseChannel()); - const std::size_t shots = 1024; + const std::size_t shots = 100; constexpr std::size_t numQubits = 5; std::vector qubitIds(numQubits); std::iota(qubitIds.begin(), qubitIds.end(), 0); @@ -504,7 +515,8 @@ CUDAQ_TEST(NoiseTest, checkAllQubitChannelWithControlPrefix) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) CUDAQ_TEST(NoiseTest, checkCallbackChannel) { cudaq::set_random_seed(13); @@ -515,7 +527,7 @@ CUDAQ_TEST(NoiseTest, checkCallbackChannel) { return cudaq::bit_flip_channel(1.); return cudaq::kraus_channel(); }); - const std::size_t shots = 252; + const std::size_t shots = 100; auto counts = cudaq::sample({.shots = shots, .noise = noise}, xOpAll<5>{}); // Check results EXPECT_EQ(1, counts.size()); @@ -537,7 +549,7 @@ struct rxOp { }; #endif -#if defined(CUDAQ_BACKEND_DM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_TENSORNET) // Stim does not support rx gate. 
CUDAQ_TEST(NoiseTest, checkCallbackChannelWithParams) { @@ -574,14 +586,14 @@ CUDAQ_TEST(NoiseTest, checkCallbackChannelWithParams) { } #endif -#if defined(CUDAQ_BACKEND_DM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_TENSORNET) // Stim does not support custom operations. -CUDAQ_REGISTER_OPERATION(CustomX, 1, 0, {0, 1, 1, 0}); +CUDAQ_REGISTER_OPERATION(CustomXOp, 1, 0, {0, 1, 1, 0}); CUDAQ_TEST(NoiseTest, checkCustomOperation) { auto kernel = []() { cudaq::qubit q; - CustomX(q); + CustomXOp(q); }; // Add channel for custom operation using the (name + operand) API @@ -589,7 +601,7 @@ CUDAQ_TEST(NoiseTest, checkCustomOperation) { cudaq::set_random_seed(13); cudaq::bit_flip_channel bf(1.); cudaq::noise_model noise; - noise.add_channel("CustomX", {0}, bf); + noise.add_channel("CustomXOp", {0}, bf); const std::size_t shots = 252; auto counts = cudaq::sample({.shots = shots, .noise = noise}, kernel); // Check results @@ -607,7 +619,7 @@ CUDAQ_TEST(NoiseTest, checkCustomOperation) { cudaq::set_random_seed(13); cudaq::bit_flip_channel bf(1.); cudaq::noise_model noise; - noise.add_all_qubit_channel("CustomX", bf); + noise.add_all_qubit_channel("CustomXOp", bf); const std::size_t shots = 252; auto counts = cudaq::sample({.shots = shots, .noise = noise}, kernel); // Check results @@ -624,7 +636,7 @@ CUDAQ_TEST(NoiseTest, checkCustomOperation) { cudaq::set_random_seed(13); cudaq::noise_model noise; noise.add_channel( - "CustomX", + "CustomXOp", [](const auto &qubits, const auto ¶ms) -> cudaq::kraus_channel { return cudaq::bit_flip_channel(1.); }); @@ -642,14 +654,15 @@ CUDAQ_TEST(NoiseTest, checkCustomOperation) { } #endif -#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_STIM) || \ + defined(CUDAQ_BACKEND_TENSORNET) CUDAQ_TEST(NoiseTest, checkMeasurementNoise) { cudaq::set_random_seed(13); - constexpr double bitFlipRate = 0.1; + constexpr double bitFlipRate = 0.25; cudaq::bit_flip_channel 
bf(bitFlipRate); cudaq::noise_model noise; - // 10% bit flipping during measurement + // 25% bit flipping during measurement noise.add_channel("mz", {0}, bf); cudaq::set_noise(noise); { @@ -658,12 +671,12 @@ CUDAQ_TEST(NoiseTest, checkMeasurementNoise) { x(q); mz(q); }; - auto counts = cudaq::sample(10000, kernel); + auto counts = cudaq::sample(1000, kernel); counts.dump(); // Due to measurement errors, we have both 0/1 results. EXPECT_EQ(2, counts.size()); - EXPECT_NEAR(counts.probability("0"), bitFlipRate, 0.01); - EXPECT_NEAR(counts.probability("1"), 1.0 - bitFlipRate, 0.01); + EXPECT_NEAR(counts.probability("0"), bitFlipRate, 0.1); + EXPECT_NEAR(counts.probability("1"), 1.0 - bitFlipRate, 0.1); } { auto kernel = []() { @@ -671,14 +684,101 @@ CUDAQ_TEST(NoiseTest, checkMeasurementNoise) { x(q); mz(q); }; - auto counts = cudaq::sample(10000, kernel); + auto counts = cudaq::sample(1000, kernel); counts.dump(); // We only have measurement noise on the first qubit. EXPECT_EQ(2, counts.size()); - EXPECT_NEAR(counts.probability("01"), bitFlipRate, 0.01); - EXPECT_NEAR(counts.probability("11"), 1.0 - bitFlipRate, 0.01); + EXPECT_NEAR(counts.probability("01"), bitFlipRate, 0.1); + EXPECT_NEAR(counts.probability("11"), 1.0 - bitFlipRate, 0.1); } cudaq::unset_noise(); // clear for subsequent tests } #endif + +#if defined(CUDAQ_BACKEND_DM) || defined(CUDAQ_BACKEND_TENSORNET) +CUDAQ_TEST(NoiseTest, checkObserveHamiltonianWithNoise) { + using namespace cudaq::spin; + cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) + + .21829 * z(0) - 6.125 * z(1); + cudaq::set_random_seed(13); + cudaq::depolarization_channel depol(0.1); + cudaq::noise_model noise; + noise.add_all_qubit_channel(depol); + noise.add_all_qubit_channel(depol); + cudaq::set_noise(noise); + + auto ansatz = [](double theta) __qpu__ { + cudaq::qubit q, r; + x(q); + ry(theta, r); + x(r, q); + }; + + double result = cudaq::observe(ansatz, h, 0.59); + printf("Energy value = %lf\n", result); + 
EXPECT_GT(std::abs(result + 1.7487), 0.1); + cudaq::unset_noise(); // clear for subsequent tests +} +#endif + +#if defined(CUDAQ_BACKEND_TENSORNET) +CUDAQ_REGISTER_OPERATION(CustomIdOp, 1, 0, {1, 0, 0, 1}); + +CUDAQ_TEST(NoiseTest, checkNoiseMatrixRepresentation) { + cudaq::set_random_seed(13); + cudaq::depolarization_channel depol( + 1.0); // 1/3 probability of X, Y, Z noise op. + cudaq::noise_model noise; + noise.add_channel("CustomIdOp", {0}, depol); + cudaq::set_noise(noise); + const auto noisyCircuit = []() __qpu__ { + cudaq::qubit q; + h(q); + CustomIdOp(q); // inject noise + }; + + // Referent states for X, Y, or Z noise + std::vector> stateVecX(2), stateVecY(2), stateVecZ(2); + + cudaq::get_state([]() __qpu__ { + cudaq::qubit q; + h(q); + x(q); + }).to_host(stateVecX.data(), 2); + cudaq::get_state([]() __qpu__ { + cudaq::qubit q; + h(q); + y(q); + }).to_host(stateVecY.data(), 2); + cudaq::get_state([]() __qpu__ { + cudaq::qubit q; + h(q); + z(q); + }).to_host(stateVecZ.data(), 2); + + const auto checkEqualVec = [](const auto &vec1, const auto &vec2) { + constexpr double tol = 1e-12; + const auto vecSize = vec1.size(); + if (vec2.size() != vecSize) + return false; + for (std::size_t i = 0; i < vecSize; ++i) { + if (std::abs(vec1[i] - vec2[i]) > tol) + return false; + } + return true; + }; + + for (int i = 0; i < 10; ++i) { + std::vector> noisyStateVec(2); + auto noisyState = cudaq::get_state(noisyCircuit); + noisyState.to_host(noisyStateVec.data(), 2); + const auto equalX = checkEqualVec(noisyStateVec, stateVecX); + const auto equalY = checkEqualVec(noisyStateVec, stateVecY); + const auto equalZ = checkEqualVec(noisyStateVec, stateVecZ); + // One of these expected output states + EXPECT_TRUE(equalX || equalY || equalZ); + } + cudaq::unset_noise(); // clear for subsequent tests +} +#endif