Skip to content

Commit 74a80dd

Browse files
authored
[SYCL] add dev2dev memcpy by SYCL API (#24476)
* add dev2dev memcpy by SYCL API * mv GGML_SYCL_DEV2DEV_MEMCPY to runntime table * update the detect method for p2p comm * fix the erro created during fix confilct --------- Co-authored-by: Neo Zhang <NA>
1 parent d1759e4 commit 74a80dd

4 files changed

Lines changed: 61 additions & 21 deletions

File tree

docs/backend/SYCL.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
712712
| Name | Value | Function |
713713
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
714714
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
715+
| GGML_SYCL_DEV2DEV_MEMCPY | 0 (default) or 1 | Choose the SYCL or L0 API in dev2dev memory copy.<br>Value: <br>* 0: SYCL API (default)<br>* 1: L0 API -- L0 API is found to lead to abnormal crash in some case. This debug flag is used to check the issue.|
715716
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
716717
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) |
717718
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
@@ -731,6 +732,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
731732
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
732733
| DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |
733734

735+
734736
## Design Rule
735737

736738
- Open to all contributors.

ggml/src/ggml-sycl/common.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ extern int g_ggml_sycl_debug;
6262
extern int g_ggml_sycl_disable_optimize;
6363
extern int g_ggml_sycl_prioritize_dmmv;
6464
extern int g_ggml_sycl_enable_flash_attention;
65+
extern int g_ggml_sycl_dev2dev_memcpy;
6566

6667

6768
#if defined(__clang__) && __has_builtin(__builtin_expect)
@@ -126,6 +127,11 @@ enum ggml_sycl_backend_gpu_mode {
126127
SYCL_MUL_GPU_MODE
127128
};
128129

130+
enum ggml_sycl_dev2dev_memcpy_mode {
131+
DEV2DEV_MEMCPY_SYCL = 0,
132+
DEV2DEV_MEMCPY_L0 = 1,
133+
};
134+
129135
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
130136

131137
static void crash() {

ggml/src/ggml-sycl/dpct/helper.hpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313
#ifndef GGML_SYCL_DPCT_HELPER_HPP
1414
#define GGML_SYCL_DPCT_HELPER_HPP
1515

16+
#include <cstdlib>
17+
#include <iostream>
18+
#include <map>
19+
1620
#include <sycl/sycl.hpp>
1721
#include <sycl/half_type.hpp>
1822
#include <oneapi/mkl.hpp>
1923

20-
#include <map>
21-
22-
#include "ggml.h"
23-
2424
#if defined(__linux__)
2525
#include <sys/mman.h>
2626
#elif defined(_WIN64)
@@ -43,6 +43,7 @@
4343
#include <windows.h>
4444
#endif
4545

46+
4647
#define DPCT_COMPATIBILITY_TEMP (900)
4748

4849
#if defined(_MSC_VER)
@@ -59,6 +60,13 @@
5960
#define __dpct_noinline__ __attribute__((noinline))
6061
#endif
6162

63+
#define DPCT_UNUSED(x) (void)(x)
64+
65+
inline void _abort(const char * str) {
66+
std::cerr << str << std::endl;
67+
std::abort();
68+
}
69+
6270
inline std::string get_device_type_name(const sycl::device &Device) {
6371
auto DeviceType = Device.get_info<sycl::info::device::device_type>();
6472
switch (DeviceType) {
@@ -1017,7 +1025,7 @@ namespace dpct
10171025
if (backend == "opencl:cpu") return 4;
10181026
if (backend == "opencl:acc") return 5;
10191027
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
1020-
GGML_ABORT("fatal error");
1028+
_abort("fatal error");
10211029
}
10221030
static bool compare_backend(std::string &backend1, std::string &backend2) {
10231031
return convert_backend_index(backend1) < convert_backend_index(backend2);
@@ -1426,7 +1434,7 @@ namespace dpct
14261434
if (!size)
14271435
return sycl::event{};
14281436
return q.memcpy(to_ptr, from_ptr, size, dep_events);
1429-
GGML_UNUSED(direction);
1437+
DPCT_UNUSED(direction);
14301438
}
14311439

14321440
// Get actual copy range and make sure it will not exceed range.
@@ -2092,7 +2100,7 @@ namespace dpct
20922100
if (!size)
20932101
return sycl::event{};
20942102
return q.memcpy(to_ptr, from_ptr, size, dep_events);
2095-
GGML_UNUSED(direction);
2103+
DPCT_UNUSED(direction);
20962104
}
20972105

20982106
// Get actual copy range and make sure it will not exceed range.

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ int g_ggml_sycl_use_async_mem_op = 0;
8686
int g_ggml_sycl_use_async_mem_op_requested = 1;
8787
int g_ggml_sycl_enable_level_zero = 0;
8888
int g_ggml_sycl_enable_flash_attention = 1;
89+
int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
8990
int g_ggml_sycl_usm_system = 0;
9091

9192
static ggml_sycl_device_info ggml_sycl_init() {
@@ -272,6 +273,11 @@ static void ggml_check_sycl() try {
272273
g_ggml_sycl_enable_vmm = ggml_sycl_get_env("GGML_SYCL_ENABLE_VMM", 1);
273274
g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
274275

276+
g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL);
277+
if (g_ggml_sycl_enable_level_zero == 0) {
278+
g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL;
279+
}
280+
275281
#ifdef SYCL_FLASH_ATTN
276282
g_ggml_sycl_enable_flash_attention = ggml_sycl_get_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
277283
#else
@@ -324,8 +330,11 @@ static void ggml_check_sycl() try {
324330
#endif
325331
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
326332
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
333+
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy);
327334
#else
328335
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
336+
GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO\n",
337+
g_ggml_sycl_dev2dev_memcpy);
329338
#endif
330339
#if GGML_SYCL_DNNL
331340
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
@@ -598,27 +607,42 @@ static bool ggml_sycl_is_l0_discrete_gpu(int device) {
598607

599608
static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst,
600609
const void *ptr_src, size_t size) {
610+
601611
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
602-
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
603-
const bool l0_copy_supported = g_ggml_sycl_enable_level_zero &&
604-
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
605-
if (l0_copy_supported) {
606-
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
607-
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
608-
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
609-
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
610-
ze_command_list_handle_t cl;
611-
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
612-
if (r == ZE_RESULT_SUCCESS) {
613-
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
614-
zeCommandListDestroy(cl);
612+
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) {
613+
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
614+
const bool l0_copy_supported =
615+
ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src);
616+
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
617+
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
618+
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
619+
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
620+
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
621+
ze_command_list_handle_t cl;
622+
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
615623
if (r == ZE_RESULT_SUCCESS) {
616-
return;
624+
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by L0\n");
625+
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
626+
zeCommandListDestroy(cl);
627+
if (r == ZE_RESULT_SUCCESS) {
628+
return;
629+
}
617630
}
618631
}
619632
}
620633
#endif
634+
635+
if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_SYCL) {
636+
if (q_dst.get_device().ext_oneapi_can_access_peer(q_src.get_device(),
637+
sycl::ext::oneapi::peer_access::access_supported)) {
638+
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by SYCL\n");
639+
SYCL_CHECK(CHECK_TRY_ERROR(q_dst.memcpy(ptr_dst, ptr_src, size).wait()));
640+
return;
641+
}
642+
}
643+
621644
// Host-staged copy
645+
GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by host forward\n");
622646
char *host_buf = (char *)malloc(size);
623647
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
624648
q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();

0 commit comments

Comments
 (0)