|
| 1 | +/* |
| 2 | + * Copyright 2025 The llm-d Authors. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +#pragma once |
| 18 | + |
#include <dlfcn.h>
#include <sys/types.h>

#include <cstddef>
#include <cstdint>
| 22 | + |
// cuFile operation status codes.
//
// Only the success value is named in this shim. The underlying type is fixed
// to int so that any non-zero status stored in a value of this type is still
// inside the enum's range and can be compared against CU_FILE_SUCCESS
// without relying on out-of-range enum values.
enum CUfileOpError : int { CU_FILE_SUCCESS = 0 };
| 25 | + |
// Kinds of file handle that can be described to cuFileHandleRegister.
// This shim names only the opaque file-descriptor kind.
enum CUfileFileHandleType {
  CU_FILE_HANDLE_TYPE_OPAQUE_FD = 1,
};
| 28 | + |
// Registration flag identifying RDMA-capable buffer memory.
// Kept as a preprocessor macro so that #ifdef checks keep working.
#define CU_FILE_RDMA_REGISTER 1
| 31 | + |
| 32 | +// Return type for most cuFile API calls |
| 33 | +struct CUfileError_t { |
| 34 | + CUfileOpError err; |
| 35 | +}; |
| 36 | + |
// Incomplete type standing in for the filesystem operations table; this
// header only ever refers to it through a pointer.
struct CUfileFSOps_t;
| 39 | + |
| 40 | +// File descriptor passed to cuFileHandleRegister (layout must match cufile.h) |
| 41 | +struct CUfileDescr_t { |
| 42 | + CUfileFileHandleType type; |
| 43 | + union { |
| 44 | + int fd; |
| 45 | + void* handle; |
| 46 | + } handle; |
| 47 | + const CUfileFSOps_t* fs_ops; |
| 48 | +}; |
| 49 | + |
// Opaque handle type produced by cuFileHandleRegister.
using CUfileHandle_t = void*;
| 52 | + |
// Driver properties filled in by cuFileDriverGetProperties.
//
// The library writes into this struct using its own CUfileDrvProps_t layout,
// so the members here mirror that definition: a leading nested `nvfs` block
// followed by a run of unsigned int fields. The two fields this shim exposed
// before (max_device_cache_size, max_device_pinned_mem_size) keep their
// names but now sit at the offsets the library actually writes to.
// NOTE(review): layout transcribed from the cuFile API reference — confirm
// against the cufile.h shipped with the targeted GDS release.
struct CUfileDrvProps_t {
  struct {
    unsigned int major_version;
    unsigned int minor_version;
    size_t poll_thresh_size;
    size_t max_direct_io_size;
    unsigned int dstatusflags;
    unsigned int dcontrolflags;
  } nvfs;
  unsigned int fflags;
  unsigned int max_device_cache_size;
  unsigned int per_buffer_cache_size;
  unsigned int max_device_pinned_mem_size;
  unsigned int max_batch_io_size;
  unsigned int max_batch_io_timeout_msecs;
  // Headroom so a library that appends fields does not write past the end.
  char _reserved[256];
};
| 59 | + |
| 60 | +// CuFileApi is a runtime wrapper for the NVIDIA cuFile (GDS) library, |
| 61 | +// so the same wheel works with or without GDS. Loads libcufile.so via dlopen. |
| 62 | +// Function pointers are resolved via dlsym by symbol name (e.g. "cuFileRead"). |
| 63 | +// Singleton — library is loaded once and function pointers are reused. |
| 64 | +class CuFileApi { |
| 65 | + public: |
| 66 | + static CuFileApi& instance() { |
| 67 | + static CuFileApi loader; |
| 68 | + return loader; |
| 69 | + } |
| 70 | + |
| 71 | + bool is_loaded() const { return m_handle != nullptr; } |
| 72 | + |
| 73 | + // Function signature types — define what each cuFile function looks like |
| 74 | + using FnDriverOpen = CUfileError_t (*)(); |
| 75 | + using FnDriverClose = CUfileError_t (*)(); |
| 76 | + using FnGetVersion = CUfileError_t (*)(int*); |
| 77 | + using FnDriverGetProperties = CUfileError_t (*)(CUfileDrvProps_t*); |
| 78 | + using FnBufRegister = CUfileError_t (*)(const void*, size_t, int); |
| 79 | + using FnBufDeregister = CUfileError_t (*)(const void*); |
| 80 | + using FnHandleRegister = CUfileError_t (*)(CUfileHandle_t*, CUfileDescr_t*); |
| 81 | + using FnHandleDeregister = void (*)(CUfileHandle_t); |
| 82 | + using FnRead = ssize_t (*)(CUfileHandle_t, void*, size_t, off_t, off_t); |
| 83 | + using FnWrite = |
| 84 | + ssize_t (*)(CUfileHandle_t, const void*, size_t, off_t, off_t); |
| 85 | + |
| 86 | + // Resolved function pointers — null if library not loaded, filled by |
| 87 | + // constructor via dlsym |
| 88 | + FnDriverOpen cuFileDriverOpen = nullptr; |
| 89 | + FnDriverClose cuFileDriverClose = nullptr; |
| 90 | + FnGetVersion cuFileGetVersion = nullptr; |
| 91 | + FnDriverGetProperties cuFileDriverGetProperties = nullptr; |
| 92 | + FnBufRegister cuFileBufRegister = nullptr; |
| 93 | + FnBufDeregister cuFileBufDeregister = nullptr; |
| 94 | + FnHandleRegister cuFileHandleRegister = nullptr; |
| 95 | + FnHandleDeregister cuFileHandleDeregister = nullptr; |
| 96 | + FnRead cuFileRead = nullptr; |
| 97 | + FnWrite cuFileWrite = nullptr; |
| 98 | + |
| 99 | + private: |
| 100 | + void* m_handle = nullptr; |
| 101 | + |
| 102 | + // Attempts to load libcufile.so and resolve all function symbols. |
| 103 | + // If the library or any symbol is missing, m_handle stays null. |
| 104 | + CuFileApi() { |
| 105 | + m_handle = dlopen("libcufile.so", RTLD_NOW); |
| 106 | + if (!m_handle) { |
| 107 | + // Try versioned name |
| 108 | + m_handle = dlopen("libcufile.so.0", RTLD_NOW); |
| 109 | + } |
| 110 | + if (!m_handle) return; |
| 111 | + |
| 112 | + cuFileDriverOpen = |
| 113 | + reinterpret_cast<FnDriverOpen>(dlsym(m_handle, "cuFileDriverOpen")); |
| 114 | + cuFileDriverClose = |
| 115 | + reinterpret_cast<FnDriverClose>(dlsym(m_handle, "cuFileDriverClose")); |
| 116 | + cuFileGetVersion = |
| 117 | + reinterpret_cast<FnGetVersion>(dlsym(m_handle, "cuFileGetVersion")); |
| 118 | + cuFileDriverGetProperties = reinterpret_cast<FnDriverGetProperties>( |
| 119 | + dlsym(m_handle, "cuFileDriverGetProperties")); |
| 120 | + cuFileBufRegister = |
| 121 | + reinterpret_cast<FnBufRegister>(dlsym(m_handle, "cuFileBufRegister")); |
| 122 | + cuFileBufDeregister = reinterpret_cast<FnBufDeregister>( |
| 123 | + dlsym(m_handle, "cuFileBufDeregister")); |
| 124 | + cuFileHandleRegister = reinterpret_cast<FnHandleRegister>( |
| 125 | + dlsym(m_handle, "cuFileHandleRegister")); |
| 126 | + cuFileHandleDeregister = reinterpret_cast<FnHandleDeregister>( |
| 127 | + dlsym(m_handle, "cuFileHandleDeregister")); |
| 128 | + cuFileRead = reinterpret_cast<FnRead>(dlsym(m_handle, "cuFileRead")); |
| 129 | + cuFileWrite = reinterpret_cast<FnWrite>(dlsym(m_handle, "cuFileWrite")); |
| 130 | + |
| 131 | + // Verify all symbols resolved |
| 132 | + if (!cuFileDriverOpen || !cuFileDriverClose || !cuFileGetVersion || |
| 133 | + !cuFileBufRegister || !cuFileBufDeregister || !cuFileHandleRegister || |
| 134 | + !cuFileHandleDeregister || !cuFileRead || !cuFileWrite) { |
| 135 | + dlclose(m_handle); |
| 136 | + m_handle = nullptr; |
| 137 | + } |
| 138 | + } |
| 139 | + |
| 140 | + // Unloads the library when the process exits |
| 141 | + ~CuFileApi() { |
| 142 | + if (m_handle) { |
| 143 | + dlclose(m_handle); |
| 144 | + } |
| 145 | + } |
| 146 | +}; |
0 commit comments