forked from NVIDIA/cuda-quantum
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtensornet_utils.h
More file actions
108 lines (97 loc) · 4.68 KB
/
tensornet_utils.h
File metadata and controls
108 lines (97 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/****************************************************************-*- C++ -*-****
* Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/
#pragma once

#include "cutensornet.h"

#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <stdexcept>
#include <vector>
/// @brief Check the result of a CUDA runtime API call; print the error string
/// and source line, then abort, on any failure.
/// Wrapped in `do { } while (0)` so the expansion is a single statement: the
/// original bare-brace form miscompiles under `if (cond) MACRO; else ...`
/// (the semicolon after the braces orphans the `else`).
#define HANDLE_CUDA_ERROR(x)                                                   \
  do {                                                                         \
    const auto err = x;                                                        \
    if (err != cudaSuccess) {                                                  \
      printf("CUDA error %s in line %d\n", cudaGetErrorString(err),            \
             __LINE__);                                                        \
      fflush(stdout);                                                          \
      std::abort();                                                            \
    }                                                                          \
  } while (0)
/// @brief Check the result of a cuTensorNet API call; print the error string
/// and source line, then abort, on any failure.
/// Wrapped in `do { } while (0)` so the expansion is a single statement and
/// composes safely with `if`/`else` (the bare-brace form does not).
#define HANDLE_CUTN_ERROR(x)                                                   \
  do {                                                                         \
    const auto err = x;                                                        \
    if (err != CUTENSORNET_STATUS_SUCCESS) {                                   \
      printf("cuTensorNet error %s in line %d\n",                              \
             cutensornetGetErrorString(err), __LINE__);                        \
      fflush(stdout);                                                          \
      std::abort();                                                            \
    }                                                                          \
  } while (0)
/// @brief Allocate device memory sized to the host-side gate matrix and copy
/// the host data into it.
/// @param gateMatHost Host buffer holding the gate matrix elements.
/// @return Raw device pointer owning `gateMatHost.size()` complex doubles;
///         the caller is responsible for `cudaFree`.
inline void *
allocateGateMatrix(const std::vector<std::complex<double>> &gateMatHost) {
  const std::size_t numBytes =
      gateMatHost.size() * sizeof(std::complex<double>);
  // Stage the quantum gate data into freshly-allocated device memory.
  void *devicePtr = nullptr;
  HANDLE_CUDA_ERROR(cudaMalloc(&devicePtr, numBytes));
  HANDLE_CUDA_ERROR(cudaMemcpy(devicePtr, gateMatHost.data(), numBytes,
                               cudaMemcpyHostToDevice));
  return devicePtr;
}
/// @brief Generate a sorted array of `num_samples` values drawn uniformly
/// from [0.0, max_value).
/// @param num_samples Number of values to draw.
/// @param max_value Exclusive upper bound of the sampling range.
/// @param randomEngine Engine supplying the randomness; advanced by exactly
///        `num_samples` draws, so results are reproducible for a given seed.
/// @return The samples in ascending order.
inline std::vector<double> randomValues(uint64_t num_samples, double max_value,
                                        std::mt19937 &randomEngine) {
  std::uniform_real_distribution<double> dist(0.0, max_value);
  std::vector<double> samples(num_samples);
  std::generate(samples.begin(), samples.end(),
                [&]() { return dist(randomEngine); });
  std::sort(samples.begin(), samples.end());
  return samples;
}
/// @brief Struct to allocate and clean up device memory scratch space.
/// Owns the device allocation and releases it in the destructor (RAII).
/// Copying is disabled: the implicitly-defaulted copy operations would leave
/// two objects holding the same `d_scratch`, causing a double `cudaFree`.
struct ScratchDeviceMem {
  void *d_scratch = nullptr;      // device pointer; nullptr until allocate()
  std::size_t scratchSize = 0;    // bytes requested for the scratch buffer

  ScratchDeviceMem() = default;
  // Non-copyable: this type uniquely owns the device allocation.
  ScratchDeviceMem(const ScratchDeviceMem &) = delete;
  ScratchDeviceMem &operator=(const ScratchDeviceMem &) = delete;

  // Compute the scratch size to allocate.
  void computeScratchSize() {
    // Query the free memory on Device
    std::size_t freeSize{0}, totalSize{0};
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
    // Use half of the currently-free device memory, rounded down to a
    // 4096-byte multiple for alignment.
    scratchSize = (freeSize - (freeSize % 4096)) / 2;
  }

  // Allocate scratch device memory based on available memory.
  // Throws std::runtime_error if called while an allocation is already live.
  void allocate() {
    if (d_scratch)
      throw std::runtime_error(
          "Multiple scratch device memory allocations is not allowed.");
    computeScratchSize();
    // Try allocate device memory
    auto errCode = cudaMalloc(&d_scratch, scratchSize);
    if (errCode == cudaErrorMemoryAllocation) {
      // This indicates race condition whereby other GPU code is allocating
      // memory while we are calling cudaMemGetInfo.
      // Attempt to redo the allocation with an updated cudaMemGetInfo data.
      computeScratchSize();
      HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
    } else {
      HANDLE_CUDA_ERROR(errCode);
    }
  }

  ~ScratchDeviceMem() {
    // Guard on the owning pointer (not the size) so nothing is freed when
    // allocate() was never called or never succeeded.
    if (d_scratch)
      HANDLE_CUDA_ERROR(cudaFree(d_scratch));
  }
};
/// Initialize `cutensornet` MPI Comm
/// If MPI is not available, fallback to an empty implementation.
void initCuTensornetComm(cutensornetHandle_t cutnHandle);
/// Reset `cutensornet` MPI Comm, e.g., in preparation for shutdown.
/// Note: this will make sure no further MPI activities from `cutensornet` can
/// occur once MPI has been finalized by CUDA-Q.
void resetCuTensornetComm(cutensornetHandle_t cutnHandle);