forked from PaddlePaddle/Paddle
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCUDAContextLight.h
More file actions
130 lines (102 loc) · 4.19 KB
/
CUDAContextLight.h
File metadata and controls
130 lines (102 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The file has been adapted from pytorch project
// Licensed under BSD-style license -
// https://github.com/pytorch/pytorch/blob/main/LICENSE
#pragma once
// Light-weight version of CUDAContext.h with fewer transitive includes
// cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
// added bf16 support
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
defined(USE_CUDSS)
#include <cudss.h>
#endif
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <cstdint>
#include <map>
#include <shared_mutex>
#include <tuple>
#include "paddle/phi/backends/gpu/forwards.h"
// Forward declaration of c10::Allocator, returned by getCUDADeviceAllocator()
// below.
// NOTE(review): <c10/core/Allocator.h> is already included above, so this
// forward declaration looks redundant -- confirm before removing it.
namespace c10 {
struct Allocator;
}
namespace at::cuda {
// Map the handle/property types used by this interface onto the active
// backend's native types: HIP (ROCm) builds go through the phi wrapper
// typedefs, plain CUDA builds use the CUDA library types directly.
#if defined(PADDLE_WITH_HIP)
using CUDAContextDeviceProp = phi::gpuDeviceProp;
using CUDAContextSparseHandle = phi::sparseHandle_t;
using CUDAContextBlasHandle = phi::blasHandle_t;
using CUDAContextBlasLtHandle = phi::blasLtHandle_t;
using CUDAContextSolverHandle = phi::solverHandle_t;
#else
using CUDAContextDeviceProp = cudaDeviceProp;
using CUDAContextSparseHandle = cusparseHandle_t;
using CUDAContextBlasHandle = cublasHandle_t;
using CUDAContextBlasLtHandle = cublasLtHandle_t;
using CUDAContextSolverHandle = cusolverDnHandle_t;
#endif
/*
A common CUDA interface for ATen.
This interface is distinct from CUDAHooks, which defines an interface that links
to both CPU-only and CUDA builds. That interface is intended for runtime
dispatch and should be used from files that are included in both CPU-only and
CUDA builds.
CUDAContext, on the other hand, should be preferred by files only included in
CUDA builds. It is intended to expose CUDA functionality in a consistent
manner.
This means there is some overlap between the CUDAContext and CUDAHooks, but
the choice of which to use is simple: use CUDAContext when in a CUDA-only file,
use CUDAHooks otherwise.
Note that CUDAContext simply defines an interface with no associated class.
It is expected that the modules whose functions compose this interface will
manage their own state. There is only a single CUDA context/state.
*/
/**
 * DEPRECATED: use device_count() instead
 *
 * Returns the number of visible CUDA devices, as reported by
 * c10::cuda::device_count().
 *
 * The deprecation was previously comment-only; the [[deprecated]] attribute
 * makes the compiler warn at every call site so callers actually migrate.
 */
[[deprecated("use c10::cuda::device_count() instead")]] inline int64_t
getNumGPUs() {
  return c10::cuda::device_count();
}
/**
 * Reports whether CUDA can actually be used right now: the build has CUDA
 * support and at least one device is visible. A CUDA build with a broken
 * driver (or no devices) yields false rather than an error.
 */
inline bool is_available() {
  const auto visible_devices = c10::cuda::device_count();
  return visible_devices > 0;
}
// Properties of the current device (pointer owned by the implementation).
CUDAContextDeviceProp* getCurrentDeviceProperties();
// Warp size of the current device.
// NOTE(review): presumably 32 on CUDA and 32/64 on HIP -- confirm against the
// implementation.
int warp_size();
// Properties of the given device (pointer owned by the implementation).
CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device);
// Whether `device` can directly access memory on `peer_device` (P2P).
bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device);
/* Handles */
// Per-backend library handles bound to the current device/stream state.
CUDAContextSparseHandle getCurrentCUDASparseHandle();
CUDAContextBlasHandle getCurrentCUDABlasHandle();
CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle();
// Releases the cached cuBLAS/cuBLASLt workspaces (see the maps below).
void clearCublasWorkspaces();
// A workspace cache plus the shared_mutex that guards it. The key is a pair
// of opaque pointers; NOTE(review): based on the accessor names
// (cublas_handle_stream_to_workspace / cublaslt_handle_stream_to_workspace)
// it is presumably (handle, stream) -- confirm against the implementation.
struct WorkspaceMapWithMutex {
  std::map<std::tuple<void*, void*>, at::DataPtr> map;
  std::shared_mutex mutex;
};
// Process-wide workspace caches for cuBLAS and cuBLASLt respectively; cleared
// by clearCublasWorkspaces() above.
WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
// Configured workspace sizes, in bytes.
size_t getChosenWorkspaceSize();
size_t getCUDABlasLtWorkspaceSize();
// Pointer to the cuBLASLt workspace for the current handle/stream.
void* getCUDABlasLtWorkspace();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cuSOLVER dense handle for the current device/stream state.
CUDAContextSolverHandle getCurrentCUDASolverDnHandle();
// Get the CUDA device allocator for the current device.
// Returns a pointer to a c10::Allocator that allocates GPU memory.
c10::Allocator* getCUDADeviceAllocator();
#endif
#if defined(USE_CUDSS)
// cuDSS (sparse direct solver) handle; only built when USE_CUDSS is defined.
cudssHandle_t getCurrentCudssHandle();
#endif
} // namespace at::cuda