diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp
index 1de935a32d5..7197334d8af 100644
--- a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp
+++ b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp
@@ -39,14 +39,13 @@ CuDensityMatState::overlap(const cudaq::SimulationState &other) {
 
   if (!isDensityMatrix) {
     Eigen::VectorXcd state(dimension);
-    const auto size = dimension;
     HANDLE_CUDA_ERROR(cudaMemcpy(state.data(), devicePtr,
-                                 size * sizeof(std::complex<double>),
+                                 dimension * sizeof(std::complex<double>),
                                  cudaMemcpyDeviceToHost));
 
     Eigen::VectorXcd otherState(dimension);
     HANDLE_CUDA_ERROR(cudaMemcpy(otherState.data(), other.getTensor().data,
-                                 size * sizeof(std::complex<double>),
+                                 dimension * sizeof(std::complex<double>),
                                  cudaMemcpyDeviceToHost));
     return std::abs(std::inner_product(
         state.begin(), state.end(), otherState.begin(),
@@ -55,15 +54,17 @@ CuDensityMatState::overlap(const cudaq::SimulationState &other) {
   }
 
   // FIXME: implement this in GPU memory
-  Eigen::MatrixXcd state(dimension, dimension);
-  const auto size = dimension * dimension;
+  // For density matrices, `dimension` is the total number of elements (N^2).
+  // The matrix side length is sqrt(dimension).
+  const std::size_t matDim = static_cast<std::size_t>(std::sqrt(dimension));
+  Eigen::MatrixXcd state(matDim, matDim);
   HANDLE_CUDA_ERROR(cudaMemcpy(state.data(), devicePtr,
-                               size * sizeof(std::complex<double>),
+                               dimension * sizeof(std::complex<double>),
                                cudaMemcpyDeviceToHost));
 
-  Eigen::MatrixXcd otherState(dimension, dimension);
+  Eigen::MatrixXcd otherState(matDim, matDim);
   HANDLE_CUDA_ERROR(cudaMemcpy(otherState.data(), other.getTensor().data,
-                               size * sizeof(std::complex<double>),
+                               dimension * sizeof(std::complex<double>),
                                cudaMemcpyDeviceToHost));
 
   return (state.adjoint() * otherState).trace();
@@ -423,11 +424,14 @@ CuDensityMatState::CuDensityMatState(CuDensityMatState &&other) noexcept
     : isDensityMatrix(other.isDensityMatrix), dimension(other.dimension),
       devicePtr(other.devicePtr), cudmState(other.cudmState),
       cudmHandle(other.cudmHandle), hilbertSpaceDims(other.hilbertSpaceDims),
-      batchSize(other.batchSize), borrowedData(other.borrowedData) {
+      batchSize(other.batchSize),
+      singleStateDimension(other.singleStateDimension),
+      borrowedData(other.borrowedData) {
   other.isDensityMatrix = false;
   other.dimension = 0;
   other.devicePtr = nullptr;
   other.batchSize = 1;
+  other.singleStateDimension = 0;
   other.cudmState = nullptr;
   other.cudmHandle = nullptr;
   other.hilbertSpaceDims.clear();
@@ -453,14 +457,15 @@ CuDensityMatState::operator=(CuDensityMatState &&other) noexcept {
     cudmHandle = other.cudmHandle;
     hilbertSpaceDims = std::move(other.hilbertSpaceDims);
     batchSize = other.batchSize;
+    singleStateDimension = other.singleStateDimension;
     borrowedData = other.borrowedData;
     // Nullify other
     other.isDensityMatrix = false;
     other.dimension = 0;
     other.devicePtr = nullptr;
-
     other.cudmState = nullptr;
     other.batchSize = 1;
+    other.singleStateDimension = 0;
     other.borrowedData = false;
   }
   return *this;
diff --git a/unittests/dynamics/test_CuDensityMatState.cpp b/unittests/dynamics/test_CuDensityMatState.cpp
index 927e0bc5d66..ffb64df532b 100644
--- a/unittests/dynamics/test_CuDensityMatState.cpp
+++ b/unittests/dynamics/test_CuDensityMatState.cpp
@@ -257,6 +257,63 @@ TEST_F(CuDensityMatStateTest, DensityMatrixIndexing) {
   EXPECT_THROW(state(0, {4, 4}), std::runtime_error);
 }
 
+// Regression test for density matrix overlap dimension bug.
+// The bug was that the density matrix branch of overlap() used `dimension`
+// (total element count, N^2) as the matrix side length, creating an N^2 x N^2
+// Eigen matrix and copying N^4 elements -- far exceeding the actual buffer.
+TEST_F(CuDensityMatStateTest, DensityMatrixOverlapSelf) {
+  // |00><00| density matrix: self-overlap should be Tr(rho^2) = 1.0
+  CuDensityMatState state(densityMatrixData.size(),
+                          cudaq::dynamics::createArrayGpu(densityMatrixData));
+  state.initialize_cudm(handle, hilbertSpaceDims, /*batchSize=*/1);
+  EXPECT_TRUE(state.is_density_matrix());
+
+  auto result = state.overlap(state);
+  EXPECT_NEAR(result.real(), 1.0, 1e-12);
+  EXPECT_NEAR(result.imag(), 0.0, 1e-12);
+}
+
+TEST_F(CuDensityMatStateTest, DensityMatrixOverlapOrthogonal) {
+  // |00><00| vs |11><11|: overlap should be Tr(rho1 * rho2) = 0.0
+  CuDensityMatState state1(densityMatrixData.size(),
+                           cudaq::dynamics::createArrayGpu(densityMatrixData));
+  state1.initialize_cudm(handle, hilbertSpaceDims, /*batchSize=*/1);
+
+  // |11><11|: element (3,3)=1 in a 4x4 matrix (column-major storage)
+  std::vector<std::complex<double>> dm11(16, {0.0, 0.0});
+  dm11[15] = {1.0, 0.0}; // col-major index for (3,3) = 3*4+3 = 15
+  CuDensityMatState state2(dm11.size(), cudaq::dynamics::createArrayGpu(dm11));
+  state2.initialize_cudm(handle, hilbertSpaceDims, /*batchSize=*/1);
+
+  auto result = state1.overlap(state2);
+  EXPECT_NEAR(result.real(), 0.0, 1e-12);
+  EXPECT_NEAR(result.imag(), 0.0, 1e-12);
+}
+
+TEST_F(CuDensityMatStateTest, DensityMatrixOverlapPartial) {
+  // Single-qubit system: |0><0| vs |+><+| where |+> = (|0>+|1>)/sqrt(2)
+  // |0><0| = {{1,0},{0,0}}, |+><+| = {{0.5,0.5},{0.5,0.5}}
+  // Tr(|0><0| * |+><+|) = 0.5
+  std::vector<int64_t> dims1q = {2};
+
+  // |0><0| in column-major: [1, 0, 0, 0]
+  std::vector<std::complex<double>> dm0 = {
+      {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}};
+  CuDensityMatState state0(dm0.size(), cudaq::dynamics::createArrayGpu(dm0));
+  state0.initialize_cudm(handle, dims1q, /*batchSize=*/1);
+
+  // |+><+| in column-major: [0.5, 0.5, 0.5, 0.5]
+  std::vector<std::complex<double>> dmPlus = {
+      {0.5, 0.0}, {0.5, 0.0}, {0.5, 0.0}, {0.5, 0.0}};
+  CuDensityMatState statePlus(dmPlus.size(),
+                              cudaq::dynamics::createArrayGpu(dmPlus));
+  statePlus.initialize_cudm(handle, dims1q, /*batchSize=*/1);
+
+  auto result = state0.overlap(statePlus);
+  EXPECT_NEAR(result.real(), 0.5, 1e-12);
+  EXPECT_NEAR(result.imag(), 0.0, 1e-12);
+}
+
 // Test indexing for single-qubit density matrix
 TEST_F(CuDensityMatStateTest, SingleQubitDensityMatrixIndexing) {
   // 1-qubit system: 2x2 density matrix (4 elements)