Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions paddle/phi/api/include/compat/ATen/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include <algorithm>

#include "paddle/common/macros.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/include/tensor.h"

Expand Down Expand Up @@ -71,24 +72,24 @@ Tensor tensor_complex_backend(ArrayRef<T> values,

} // namespace detail

#define TENSOR(T, _1) \
Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
if (options.device().type() != c10::DeviceType::CPU) { \
return at::detail::tensor_backend(values, options); \
} else { \
return at::detail::tensor_cpu(values, options); \
} \
#define TENSOR(T, _1) \
PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
if (options.device().type() != c10::DeviceType::CPU) { \
return at::detail::tensor_backend(values, options); \
} else { \
return at::detail::tensor_cpu(values, options); \
} \
}
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR)
#undef TENSOR

#define TENSOR(T, _1) \
Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
if (options.device().type() != c10::DeviceType::CPU) { \
return at::detail::tensor_complex_backend(values, options); \
} else { \
return at::detail::tensor_complex_cpu(values, options); \
} \
#define TENSOR(T, _1) \
PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
if (options.device().type() != c10::DeviceType::CPU) { \
return at::detail::tensor_complex_backend(values, options); \
} else { \
return at::detail::tensor_complex_cpu(values, options); \
} \
}
AT_FORALL_COMPLEX_TYPES(TENSOR)
#undef TENSOR
Expand Down
16 changes: 10 additions & 6 deletions paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include <ATen/OpMathType.h>
#include <ATen/cuda/CUDAContext.h>

#include "paddle/common/macros.h"

namespace at::cuda::blas {

/* LEVEL 3 BLAS FUNCTIONS */
Expand All @@ -54,16 +56,18 @@ inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
}

template <>
void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
PADDLE_API void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
template <>
void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float));
PADDLE_API void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float));
template <>
void gemm<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>));
PADDLE_API void gemm<c10::complex<double>>(
CUDABLAS_GEMM_ARGTYPES(c10::complex<double>));
template <>
void gemm<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>));
PADDLE_API void gemm<c10::complex<float>>(
CUDABLAS_GEMM_ARGTYPES(c10::complex<float>));
template <>
void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
PADDLE_API void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <>
void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
PADDLE_API void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));

} // namespace at::cuda::blas
34 changes: 18 additions & 16 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include <shared_mutex>
#include <tuple>

#include "paddle/common/macros.h"
#include "paddle/phi/backends/gpu/forwards.h"

namespace c10 {
Expand Down Expand Up @@ -95,40 +96,41 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); }
inline bool is_available() { return c10::cuda::device_count() > 0; }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextDeviceProp* getCurrentDeviceProperties();
PADDLE_API CUDAContextDeviceProp* getCurrentDeviceProperties();

int warp_size();
PADDLE_API int warp_size();

CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device);
PADDLE_API CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device);

bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device);
PADDLE_API bool canDeviceAccessPeer(c10::DeviceIndex device,
c10::DeviceIndex peer_device);

/* Handles */
CUDAContextSparseHandle getCurrentCUDASparseHandle();
CUDAContextBlasHandle getCurrentCUDABlasHandle();
CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle();
PADDLE_API CUDAContextSparseHandle getCurrentCUDASparseHandle();
PADDLE_API CUDAContextBlasHandle getCurrentCUDABlasHandle();
PADDLE_API CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle();

void clearCublasWorkspaces();
PADDLE_API void clearCublasWorkspaces();
struct WorkspaceMapWithMutex {
std::map<std::tuple<void*, void*>, at::DataPtr> map;
std::shared_mutex mutex;
};

WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
size_t getChosenWorkspaceSize();
size_t getCUDABlasLtWorkspaceSize();
void* getCUDABlasLtWorkspace();
PADDLE_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
PADDLE_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
PADDLE_API size_t getChosenWorkspaceSize();
PADDLE_API size_t getCUDABlasLtWorkspaceSize();
PADDLE_API void* getCUDABlasLtWorkspace();

CUDAContextSolverHandle getCurrentCUDASolverDnHandle();
PADDLE_API CUDAContextSolverHandle getCurrentCUDASolverDnHandle();

#if defined(USE_CUDSS)
cudssHandle_t getCurrentCudssHandle();
PADDLE_API cudssHandle_t getCurrentCudssHandle();
#endif

// Get the CUDA device allocator for the current device.
// Returns a pointer to a c10::Allocator that allocates GPU memory.
c10::Allocator* getCUDADeviceAllocator();
PADDLE_API c10::Allocator* getCUDADeviceAllocator();
#endif

} // namespace at::cuda
14 changes: 9 additions & 5 deletions paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
#pragma once
#include <ATen/core/TensorBody.h>

#include "paddle/common/macros.h"

namespace at::detail {

using at::Tensor;
at::Tensor empty_cuda(IntArrayRef size,
ScalarType dtype,
std::optional<Device> device_opt,
std::optional<c10::MemoryFormat> memory_format_opt);
PADDLE_API at::Tensor empty_cuda(
IntArrayRef size,
ScalarType dtype,
std::optional<Device> device_opt,
std::optional<c10::MemoryFormat> memory_format_opt);

at::Tensor empty_cuda(IntArrayRef size, const TensorOptions &options);
PADDLE_API at::Tensor empty_cuda(IntArrayRef size,
const TensorOptions &options);

} // namespace at::detail
32 changes: 17 additions & 15 deletions paddle/phi/api/include/compat/ATen/ops/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,25 @@
#include <ATen/core/Tensor.h>
#include <c10/core/ScalarType.h>

#include "paddle/common/macros.h"

namespace at {

#define TENSOR(T, S) \
Tensor tensor(ArrayRef<T> values, const TensorOptions& options); \
inline Tensor tensor(std::initializer_list<T> values, \
const TensorOptions& options) { \
return at::tensor(ArrayRef<T>(values), options); \
} \
inline Tensor tensor(T value, const TensorOptions& options) { \
return at::tensor(ArrayRef<T>(value), options); \
} \
inline Tensor tensor(ArrayRef<T> values) { \
return at::tensor(std::move(values), at::dtype(k##S)); \
} \
inline Tensor tensor(std::initializer_list<T> values) { \
return at::tensor(ArrayRef<T>(values)); \
} \
#define TENSOR(T, S) \
PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options); \
inline Tensor tensor(std::initializer_list<T> values, \
const TensorOptions& options) { \
return at::tensor(ArrayRef<T>(values), options); \
} \
inline Tensor tensor(T value, const TensorOptions& options) { \
return at::tensor(ArrayRef<T>(value), options); \
} \
inline Tensor tensor(ArrayRef<T> values) { \
return at::tensor(std::move(values), at::dtype(k##S)); \
} \
inline Tensor tensor(std::initializer_list<T> values) { \
return at::tensor(ArrayRef<T>(values)); \
} \
inline Tensor tensor(T value) { return at::tensor(ArrayRef<T>(value)); }
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR)
AT_FORALL_COMPLEX_TYPES(TENSOR)
Expand Down
1 change: 1 addition & 0 deletions paddle/phi/api/include/compat/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
collect_srcs(api_srcs SRCS c10/core/Device.cpp)
collect_srcs(api_srcs SRCS c10/core/DefaultDtype.cpp)
collect_srcs(api_srcs SRCS c10/core/Stream.cpp)
collect_srcs(api_srcs SRCS c10/cuda/CUDAFunctions.cpp)
collect_srcs(api_srcs SRCS c10/cuda/CUDAStream.cpp)
Expand Down
50 changes: 50 additions & 0 deletions paddle/phi/api/include/compat/c10/core/DefaultDtype.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <c10/core/DefaultDtype.h>
#include <c10/util/complex.h>
#include <c10/util/typeid.h>

namespace c10 {

// Process-wide default dtype state for this translation unit. Mirrors
// PyTorch's c10 defaults: float for real tensors, complex<float> for
// complex tensors. The scalar-type cache is derived from the TypeMeta so
// the two never disagree.
static caffe2::TypeMeta g_default_dtype = caffe2::TypeMeta::Make<float>();
static ScalarType g_default_scalar_type = g_default_dtype.toScalarType();
static caffe2::TypeMeta g_default_complex_dtype =
    caffe2::TypeMeta::Make<c10::complex<float>>();

// Sets the global default dtype and keeps the cached scalar type and the
// matching complex dtype in sync (Half -> ComplexHalf, Double ->
// ComplexDouble, everything else -> ComplexFloat).
void set_default_dtype(caffe2::TypeMeta dtype) {
  g_default_dtype = dtype;
  g_default_scalar_type = g_default_dtype.toScalarType();
  if (g_default_scalar_type == ScalarType::Half) {
    g_default_complex_dtype = ScalarType::ComplexHalf;
  } else if (g_default_scalar_type == ScalarType::Double) {
    g_default_complex_dtype = ScalarType::ComplexDouble;
  } else {
    g_default_complex_dtype = ScalarType::ComplexFloat;
  }
}

// Returns the current default dtype as a caffe2::TypeMeta (canonical form).
const caffe2::TypeMeta get_default_dtype() { return g_default_dtype; }

// Returns the current default dtype as a ScalarType (cached, no conversion).
ScalarType get_default_dtype_as_scalartype() { return g_default_scalar_type; }

// Returns the complex dtype paired with the current default real dtype.
const caffe2::TypeMeta get_default_complex_dtype() {
  return g_default_complex_dtype;
}
} // namespace c10
23 changes: 9 additions & 14 deletions paddle/phi/api/include/compat/c10/core/DefaultDtype.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,16 @@
#pragma once

#include <c10/core/ScalarType.h>
#include <c10/util/typeid.h>

namespace c10 {
static auto default_dtype = ScalarType::Float;
static auto default_complex_dtype = ScalarType::ComplexFloat;

void inline set_default_dtype(ScalarType dtype) { default_dtype = dtype; }
#include "paddle/common/macros.h"

ScalarType inline get_default_dtype_as_scalartype() { return default_dtype; }
namespace caffe2 {
class TypeMeta;
} // namespace caffe2

ScalarType inline get_default_complex_dtype() { return default_complex_dtype; }

/// Returns default dtype as caffe2::TypeMeta (the canonical form, mirrors
/// PyTorch).
inline caffe2::TypeMeta get_default_dtype() {
return caffe2::TypeMeta::fromScalarType(default_dtype);
}
namespace c10 {
// Sets the process-wide default dtype; also updates the scalar-type cache
// and the matching default complex dtype (see DefaultDtype.cpp).
PADDLE_API void set_default_dtype(caffe2::TypeMeta dtype);
// Current default dtype in canonical caffe2::TypeMeta form.
PADDLE_API const caffe2::TypeMeta get_default_dtype();
// Current default dtype as a ScalarType (no conversion at call time).
PADDLE_API ScalarType get_default_dtype_as_scalartype();
// Complex dtype paired with the current default real dtype.
PADDLE_API const caffe2::TypeMeta get_default_complex_dtype();
} // namespace c10
5 changes: 3 additions & 2 deletions paddle/phi/api/include/compat/c10/core/Device.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@ using gpuStream_t = hipStream_t;
#include <string>
#include <utility>

#include "paddle/common/macros.h"
#include "paddle/phi/core/platform/device/gpu/gpu_info.h"
#include "paddle/phi/core/platform/device_event_base.h"

namespace c10 {
using DeviceIndex = int8_t;

struct Device final {
struct PADDLE_API Device final {
using Type = DeviceType;
Device() = default;
Device(phi::Place place)
Expand Down Expand Up @@ -161,7 +162,7 @@ struct Device final {
}
};

std::ostream& operator<<(std::ostream& stream, const Device& device);
PADDLE_API std::ostream& operator<<(std::ostream& stream, const Device& device);

} // namespace c10

Expand Down
4 changes: 3 additions & 1 deletion paddle/phi/api/include/compat/c10/core/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <functional>
#include <ostream>

#include "paddle/common/macros.h"

namespace c10 {

using StreamId = int64_t;
Expand All @@ -32,7 +34,7 @@ struct StreamData3 {
DeviceType device_type;
};

class Stream final {
class PADDLE_API Stream final {
private:
Device device_;
StreamId id_;
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

namespace c10::cuda {

c10::DeviceIndex device_count();
PADDLE_API c10::DeviceIndex device_count();

void device_synchronize();
PADDLE_API void device_synchronize();

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void __inline__ stream_synchronize(gpuStream_t stream) {
Expand Down
Loading
Loading