Skip to content

Commit 9b260fc

Browse files
authored
sycl: Add optional USM system allocations (#22526)
This introduces an optional feature to allocate large GPU buffers (≥ 1GB) using USM system allocations if supported by the device. It allows using buffers from the system allocator then letting the system manage memory migrations between host and device as necessary. This feature is disabled by default and requires the GGML_SYCL_USM_SYSTEM environment variable to enable. If USM system allocations are not supported by the device or the system, we fallback to regular allocations. This feature can allow VRAM overcommit. For example, the test below fails on B580 due to lack of memory for allocation, but it passes when enabling USM system allocations: ./examples/sycl/test.sh -m Qwen3.5-27B-Q3_K_M.gguf -lv 4 Signed-off-by: Francois Dugast <francois.dugast@intel.com>
1 parent 74ade52 commit 9b260fc

3 files changed

Lines changed: 69 additions & 25 deletions

File tree

docs/backend/SYCL.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
720720
| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
721721
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
722722
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
723+
| GGML_SYCL_USM_SYSTEM | 0 (default) or 1 | Enable experimental support for [USM system allocations](https://github.khronos.org/SYCL_Reference/iface/usm_basic_concept.html#system-allocations) for large GPU buffers. This requires enough host memory for model weights and caches, an Intel Xe2+ GPU such as BMG or newer and supported on Linux only, with CONFIG_DRM_XE_GPUSVM enabled. |
723724

724725
## Compile-time Flags
725726

ggml/src/ggml-sycl/common.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ struct sycl_device_info {
230230
size_t total_vram;
231231
sycl_hw_info hw_info;
232232
optimize_feature opt_feature;
233+
bool usm_system_support; // support for USM system allocations
233234
};
234235

235236

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@
7272
#include "ggml-sycl/gated_delta_net.hpp"
7373
#include "ggml-sycl/pool.hpp"
7474

75+
#define MEM_SIZE_2M 0x00200000
76+
#define MEM_SIZE_1G 0x40000000
77+
7578
static bool g_sycl_loaded = false;
7679
int g_ggml_sycl_debug = 0;
7780
int g_ggml_sycl_disable_optimize = 0;
@@ -83,7 +86,7 @@ int g_ggml_sycl_use_async_mem_op = 0;
8386
int g_ggml_sycl_use_async_mem_op_requested = 1;
8487
int g_ggml_sycl_enable_level_zero = 0;
8588
int g_ggml_sycl_enable_flash_attention = 1;
86-
89+
int g_ggml_sycl_usm_system = 0;
8790

8891
static ggml_sycl_device_info ggml_sycl_init() {
8992
ggml_sycl_device_info info = {};
@@ -137,6 +140,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
137140
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
138141
info.devices[i].smpbo = prop.get_local_mem_size();
139142
info.devices[i].warp_size = WARP_SIZE;
143+
info.devices[i].usm_system_support = device.has(sycl::aspect::usm_system_allocations);
140144

141145
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
142146
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
@@ -274,6 +278,8 @@ static void ggml_check_sycl() try {
274278
g_ggml_sycl_enable_flash_attention = 0;
275279
#endif
276280

281+
g_ggml_sycl_usm_system = ggml_sycl_get_env("GGML_SYCL_USM_SYSTEM", 0);
282+
277283
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
278284

279285
GGML_LOG_INFO("Build with Macros:\n");
@@ -342,6 +348,8 @@ static void ggml_check_sycl() try {
342348
g_ggml_sycl_enable_flash_attention);
343349
#endif
344350

351+
GGML_LOG_INFO(" GGML_SYCL_USM_SYSTEM: %d\n", g_ggml_sycl_usm_system);
352+
345353
/* NOT REMOVE, keep it for next optimize for XMX.
346354
#if defined(SYCL_USE_XMX)
347355
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
@@ -417,6 +425,14 @@ catch (sycl::exception const &exc) {
417425
std::exit(1);
418426
}
419427

428+
inline void free_aligned_mem_host(void * memblock) {
429+
#ifdef _WIN32
430+
_aligned_free(memblock);
431+
#else
432+
free(memblock);
433+
#endif
434+
}
435+
420436
// sycl buffer
421437

422438
struct ggml_backend_sycl_buffer_context {
@@ -426,9 +442,10 @@ struct ggml_backend_sycl_buffer_context {
426442
std::string name;
427443
optimize_feature opt_feature;
428444
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
445+
bool is_usm_system;
429446

430-
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
431-
device(device), dev_ptr(dev_ptr), stream(stream) {
447+
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream, bool is_usm_system) :
448+
device(device), dev_ptr(dev_ptr), stream(stream), is_usm_system(is_usm_system) {
432449
check_allow_gpu_index(device);
433450
name = (GGML_SYCL_NAME + std::to_string(device));
434451
opt_feature = ggml_sycl_info().devices[device].opt_feature;
@@ -437,7 +454,10 @@ struct ggml_backend_sycl_buffer_context {
437454
~ggml_backend_sycl_buffer_context() {
438455
if (dev_ptr != nullptr) {
439456
ggml_sycl_set_device(device);
440-
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
457+
if (is_usm_system)
458+
free_aligned_mem_host(dev_ptr);
459+
else
460+
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
441461
}
442462

443463
//release extra used by tensors
@@ -759,21 +779,59 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t
759779
return ctx->name.c_str();
760780
}
761781

782+
static bool check_usm_system(int device, size_t size) {
783+
bool use_usm_system = g_ggml_sycl_usm_system && size >= MEM_SIZE_1G;
784+
785+
if (use_usm_system && !ggml_sycl_info().devices[device].usm_system_support) {
786+
GGML_LOG_INFO("Device does not support USM system allocations\n");
787+
use_usm_system = false;
788+
}
789+
790+
return use_usm_system;
791+
}
792+
793+
inline void * aligned_malloc_host(size_t alignment, size_t size) {
794+
#ifdef _WIN32
795+
return _aligned_malloc(size, alignment);
796+
#else
797+
return aligned_alloc(alignment, size);
798+
#endif
799+
}
800+
762801
static ggml_backend_buffer_t
763802
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
764803
size_t size) try {
804+
ggml_check_sycl();
805+
765806
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
766807
ggml_sycl_set_device(buft_ctx->device);
767808
const queue_ptr stream = buft_ctx->stream;
768809
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
810+
/*
811+
Alignment below ensures best performance. While in theory it could lead to
812+
wasting memory, this is acceptable because in practice only few buffers are
813+
allocated and even less exceed the minimum size accepted here for USM system
814+
allocations.
815+
*/
816+
size_t alignment = MEM_SIZE_2M;
817+
size_t aligned_size = ((size + alignment - 1) / alignment) * alignment;
818+
bool use_usm_system = check_usm_system(buft_ctx->device, aligned_size);
769819

770820
void * dev_ptr;
771-
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
772-
if (!dev_ptr) {
773-
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
774-
return nullptr;
821+
if (use_usm_system) {
822+
dev_ptr = (void *)aligned_malloc_host(alignment, aligned_size);
823+
if (!dev_ptr) {
824+
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
825+
return nullptr;
826+
}
827+
} else {
828+
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
829+
if (!dev_ptr) {
830+
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
831+
return nullptr;
832+
}
775833
}
776-
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
834+
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream, use_usm_system);
777835
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
778836
}
779837
catch (sycl::exception const &exc) {
@@ -1300,22 +1358,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
13001358
GGML_UNUSED(buft);
13011359
}
13021360

1303-
inline void * aligned_malloc_host(size_t alignment, size_t size) {
1304-
#ifdef _WIN32
1305-
return _aligned_malloc(size, alignment);
1306-
#else
1307-
return aligned_alloc(alignment, size);
1308-
#endif
1309-
}
1310-
1311-
inline void free_aligned_mem_host(void * memblock) {
1312-
#ifdef _WIN32
1313-
_aligned_free(memblock);
1314-
#else
1315-
free(memblock);
1316-
#endif
1317-
}
1318-
13191361
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
13201362
free_aligned_mem_host((void *)buffer->context);
13211363
}

0 commit comments

Comments
 (0)