forked from NVIDIA/cuda-quantum
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtensornet_utils.h
More file actions
108 lines (97 loc) · 4.68 KB
/
tensornet_utils.h
File metadata and controls
108 lines (97 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/****************************************************************-*- C++ -*-****
* Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/
#pragma once

#include "cutensornet.h"

#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <stdexcept>
#include <vector>
/// @brief Check the result of a CUDA runtime API call; print the error string
/// and source line, then abort, on any failure.
/// Wrapped in `do { } while (0)` so the expansion is a single statement: the
/// original bare-brace form miscompiles under `if (cond) MACRO; else ...`
/// (the semicolon after the braces orphans the `else`).
#define HANDLE_CUDA_ERROR(x)                                                   \
  do {                                                                         \
    const auto err = x;                                                        \
    if (err != cudaSuccess) {                                                  \
      printf("CUDA error %s in line %d\n", cudaGetErrorString(err),            \
             __LINE__);                                                        \
      fflush(stdout);                                                          \
      std::abort();                                                            \
    }                                                                          \
  } while (0)
/// @brief Check the result of a cuTensorNet API call; print the error string
/// and source line, then abort, on any failure.
/// Wrapped in `do { } while (0)` so the expansion is a single statement and
/// composes safely with `if`/`else` (the bare-brace form does not).
#define HANDLE_CUTN_ERROR(x)                                                   \
  do {                                                                         \
    const auto err = x;                                                        \
    if (err != CUTENSORNET_STATUS_SUCCESS) {                                   \
      printf("cuTensorNet error %s in line %d\n",                              \
             cutensornetGetErrorString(err), __LINE__);                        \
      fflush(stdout);                                                          \
      std::abort();                                                            \
    }                                                                          \
  } while (0)
/// @brief Allocate device memory sized to the host-side gate matrix and copy
/// the host data into it.
/// @param gateMatHost Host buffer holding the gate matrix elements.
/// @return Raw device pointer owning `gateMatHost.size()` complex doubles;
///         the caller is responsible for `cudaFree`.
inline void *
allocateGateMatrix(const std::vector<std::complex<double>> &gateMatHost) {
  const std::size_t numBytes =
      gateMatHost.size() * sizeof(std::complex<double>);
  // Stage the quantum gate data into freshly-allocated device memory.
  void *devicePtr = nullptr;
  HANDLE_CUDA_ERROR(cudaMalloc(&devicePtr, numBytes));
  HANDLE_CUDA_ERROR(cudaMemcpy(devicePtr, gateMatHost.data(), numBytes,
                               cudaMemcpyHostToDevice));
  return devicePtr;
}
/// @brief Generate a sorted array of `num_samples` values drawn uniformly
/// from [0.0, max_value).
/// @param num_samples Number of values to draw.
/// @param max_value Exclusive upper bound of the sampling range.
/// @param randomEngine Engine supplying the randomness; advanced by exactly
///        `num_samples` draws, so results are reproducible for a given seed.
/// @return The samples in ascending order.
inline std::vector<double> randomValues(uint64_t num_samples, double max_value,
                                        std::mt19937 &randomEngine) {
  std::uniform_real_distribution<double> dist(0.0, max_value);
  std::vector<double> samples(num_samples);
  std::generate(samples.begin(), samples.end(),
                [&]() { return dist(randomEngine); });
  std::sort(samples.begin(), samples.end());
  return samples;
}
/// @brief Struct to allocate and clean up device memory scratch space.
/// Owns the device allocation and releases it in the destructor (RAII).
/// Copying is disabled: the implicitly-defaulted copy operations would leave
/// two objects holding the same `d_scratch`, causing a double `cudaFree`.
struct ScratchDeviceMem {
  void *d_scratch = nullptr;      // device pointer; nullptr until allocate()
  std::size_t scratchSize = 0;    // bytes requested for the scratch buffer

  ScratchDeviceMem() = default;
  // Non-copyable: this type uniquely owns the device allocation.
  ScratchDeviceMem(const ScratchDeviceMem &) = delete;
  ScratchDeviceMem &operator=(const ScratchDeviceMem &) = delete;

  // Compute the scratch size to allocate.
  void computeScratchSize() {
    // Query the free memory on Device
    std::size_t freeSize{0}, totalSize{0};
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
    // Use half of the currently-free device memory, rounded down to a
    // 4096-byte multiple for alignment.
    scratchSize = (freeSize - (freeSize % 4096)) / 2;
  }

  // Allocate scratch device memory based on available memory.
  // Throws std::runtime_error if called while an allocation is already live.
  void allocate() {
    if (d_scratch)
      throw std::runtime_error(
          "Multiple scratch device memory allocations is not allowed.");
    computeScratchSize();
    // Try allocate device memory
    auto errCode = cudaMalloc(&d_scratch, scratchSize);
    if (errCode == cudaErrorMemoryAllocation) {
      // This indicates race condition whereby other GPU code is allocating
      // memory while we are calling cudaMemGetInfo.
      // Attempt to redo the allocation with an updated cudaMemGetInfo data.
      computeScratchSize();
      HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
    } else {
      HANDLE_CUDA_ERROR(errCode);
    }
  }

  ~ScratchDeviceMem() {
    // Guard on the owning pointer (not the size) so nothing is freed when
    // allocate() was never called or never succeeded.
    if (d_scratch)
      HANDLE_CUDA_ERROR(cudaFree(d_scratch));
  }
};
/// Initialize `cutensornet` MPI Comm
/// If MPI is not available, fallback to an empty implementation.
void initCuTensornetComm(cutensornetHandle_t cutnHandle);
/// Reset `cutensornet` MPI Comm, e.g., in preparation for shutdown.
/// Note: this will make sure no further MPI activities from `cutensornet` can
/// occur once MPI has been finalized by CUDA-Q.
void resetCuTensornetComm(cutensornetHandle_t cutnHandle);