Commit d804641

Merge branch 'ikawrakow:main' into main
2 parents 49605e1 + 1d7d022

File tree

12 files changed: +870, -256 lines

cmake/FindNCCL.cmake

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+# Find the nccl libraries
+#
+# The following variables are optionally searched for defaults
+#  NCCL_ROOT: Base directory where all NCCL components are found
+#  NCCL_INCLUDE_DIR: Directory where NCCL header is found
+#  NCCL_LIB_DIR: Directory where NCCL library is found
+#
+# The following are set after configuration is done:
+#  NCCL_FOUND
+#  NCCL_INCLUDE_DIRS
+#  NCCL_LIBRARIES
+#
+# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
+# install NCCL in the same location as the CUDA toolkit.
+# See https://github.com/caffe2/caffe2/issues/1601
+
+set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
+set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
+set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
+
+if ($ENV{NCCL_ROOT_DIR})
+  message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
+endif()
+list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
+# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
+list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
+
+find_path(NCCL_INCLUDE_DIRS
+  NAMES nccl.h
+  HINTS ${NCCL_INCLUDE_DIR})
+
+if (USE_STATIC_NCCL)
+  MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
+  SET(NCCL_LIBNAME "nccl_static")
+  if (NCCL_VERSION)  # Prefer the versioned library if a specific NCCL version is specified
+    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  endif()
+else()
+  SET(NCCL_LIBNAME "nccl")
+  if (NCCL_VERSION)  # Prefer the versioned library if a specific NCCL version is specified
+    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
+  endif()
+endif()
+
+find_library(NCCL_LIBRARIES
+  NAMES ${NCCL_LIBNAME}
+  HINTS ${NCCL_LIB_DIR})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
+
+if(NCCL_FOUND)  # obtaining NCCL version and some sanity checks
+  set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
+  message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
+  set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
+  list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
+  include(CheckCXXSymbolExists)
+  check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
+
+  if (NCCL_VERSION_DEFINED)
+    set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
+    file(WRITE ${file} "
+      #include <iostream>
+      #include <nccl.h>
+      int main()
+      {
+        std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
+        int x;
+        ncclGetVersion(&x);
+        return x == NCCL_VERSION_CODE;
+      }
+")
+    try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
+          RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
+          CMAKE_FLAGS  "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
+          LINK_LIBRARIES ${NCCL_LIBRARIES})
+    if (NOT NCCL_VERSION_MATCHED)
+      message(FATAL_ERROR "Found NCCL header version and library version do not match! \
+(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
+    endif()
+    message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
+  else()
+    message(STATUS "NCCL version < 2.3.5-5")
+  endif ()
+  set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
+
+  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
+  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
+endif()
+

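Editor's note: for reference, below is a minimal standalone C++ sketch of the same header-vs-library consistency check that FindNCCL.cmake compiles and runs through try_run(). The file name and build command are assumptions, and unlike the embedded detect_nccl_version.cc this sketch exits with 0 on a match; only nccl.h and the NCCL library are required (e.g. g++ check_nccl.cc -lnccl).

// check_nccl.cc - illustration only, not part of the commit
#include <cstdio>
#include <nccl.h>

int main() {
    // Version advertised by the header this program was compiled against.
    std::printf("header : %d.%d.%d (code %d)\n", NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH, NCCL_VERSION_CODE);

    // Version reported by the library actually linked at run time.
    int lib_version = 0;
    ncclGetVersion(&lib_version);
    std::printf("library: %d\n", lib_version);

    // A non-zero exit means the header and the linked library disagree,
    // which is the situation the find module turns into a FATAL_ERROR.
    return lib_version == NCCL_VERSION_CODE ? 0 : 1;
}
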
ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -97,6 +97,7 @@ endif()
 option(GGML_LASX      "ggml: enable lasx"       ON)
 option(GGML_LSX       "ggml: enable lsx"        ON)
 option(GGML_SVE       "ggml: enable SVE"        OFF)
+option(GGML_NCCL      "ggml: enable NCCL"       ON)
 
 if (WIN32)
     set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")

ggml/include/ggml.h

Lines changed: 14 additions & 0 deletions
@@ -689,6 +689,9 @@ extern "C" {
 
         GGML_OP_GLU,
 
+        GGML_OP_REDUCE,
+        GGML_OP_FAKE_CPY,
+
         GGML_OP_COUNT,
     };
 
@@ -3034,6 +3037,17 @@ extern "C" {
         struct ggml_tensor ** splits;
     } ggml_split_tensor_t;
 
+    GGML_API struct ggml_tensor * ggml_reduce(
+            struct ggml_context * ctx,
+            struct ggml_tensor ** a,
+            int n,
+            enum ggml_op op);
+
+    GGML_API struct ggml_tensor * ggml_fake_cpy(
+            struct ggml_context * ctx,
+            struct ggml_tensor * dst,
+            struct ggml_tensor * src);
+
 #ifdef __cplusplus
 }
 #endif

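Editor's note: a hedged sketch of how these two new entry points might be wired into a graph follows. The tensor shapes, the choice of GGML_OP_ADD as the reduction op, and the reading of ggml_reduce as "combine the n inputs with op" (mirroring an NCCL all-reduce) and of ggml_fake_cpy as "record a copy into an existing destination without allocating" are assumptions based only on the declarations above, not confirmed semantics.

// Hypothetical graph-construction sketch; illustration only, not part of the commit.
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // One partial result per device, e.g. the per-GPU pieces of a tensor-parallel matmul.
    struct ggml_tensor * parts[2] = {
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096),
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096),
    };

    // Presumed: combine the n inputs element-wise with the given op (here: add),
    // which is what an NCCL all-reduce would do across GPUs.
    struct ggml_tensor * sum = ggml_reduce(ctx, parts, 2, GGML_OP_ADD);

    // Presumed: mark dst as receiving the reduced result without a real data copy.
    struct ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    ggml_fake_cpy(ctx, dst, sum);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, dst);

    ggml_free(ctx);
    return 0;
}
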
ggml/src/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
@@ -497,6 +497,21 @@ if (GGML_CUDA)
             set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
         endif()
     endif()
+
+    if (GGML_NCCL)
+        find_package(NCCL)
+        if (NCCL_FOUND)
+            message("==================== NCCL found!")
+            message("NCCL_LIBRARIES = ${NCCL_LIBRARIES}")
+            message("NCCL_INCLUDE_DIRS = ${NCCL_INCLUDE_DIRS}")
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${NCCL_LIBRARIES})
+            set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${NCCL_INCLUDE_DIRS})
+            add_compile_definitions(GGML_USE_NCCL)
+        else()
+            message("==================== NCCL NOT found -> building without NCCL support")
+        endif()
+    endif()
+
     if (NOT GGML_MUSA)
         set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0)
         set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0)

ggml/src/ggml-backend.cpp

Lines changed: 57 additions & 73 deletions
@@ -1414,13 +1414,59 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         // do not overwrite user assignments
         if (*leaf_backend_id == -1) {
             *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
-            //printf("Pass 1: assigned backend %d to leaf %d, %s\n", *leaf_backend_id, i, graph->leafs[i]->name);
         }
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
+        if (node->op == GGML_OP_REDUCE) {
+            auto view_src = node->view_src;
+            int src_id = -1;
+            for (int j = 0; j < node->op_params[1]; ++j) {
+                if (node->src[j]) {
+                    int * this_node_backend_id = &tensor_backend_id(node->src[j]);
+                    if (*this_node_backend_id == -1) {
+                        *this_node_backend_id = j;
+                    } else {
+                        GGML_ASSERT(*this_node_backend_id == j);
+                    }
+                    if (view_src == node->src[j]) {
+                        src_id = j;
+                    }
+                }
+            }
+            if (src_id >= 0) {
+                int * this_node_backend_id = &tensor_backend_id(view_src);
+                *this_node_backend_id = tensor_backend_id(node->src[src_id]);
+                *node_backend_id = *this_node_backend_id;
+            }
+        }
+        else if (node->op == GGML_OP_MUL && node->src[0]->op == GGML_OP_NORM) {
+            // This is a hack for Cohere2. Without this hack the scheduler creates
+            // totally nonsensical splits for that arch
+            int * src1_id = &tensor_backend_id(node->src[1]);
+            if (*src1_id >= 0) {
+                int * src0_id = &tensor_backend_id(node->src[0]);
+                int * dst_id  = &tensor_backend_id(node);
+                *src0_id = *src1_id;
+                *dst_id  = *src1_id;
+                // For some reason that I don't understand, we can have norm backend already assigned
+                // at this point. How? That's why this more logical approach of first checking is commented out
+                //if (*src0_id < 0) {
+                //    *src0_id = *src1_id;
+                //} else {
+                //    printf("Oops: backend_id_src0(%s) = %d, backend_id_src1(%s) = %d\n", node->src[0]->name, *src0_id, node->src[1]->name, *src1_id);
+                //    //GGML_ASSERT(*src0_id == *src1_id);
+                //}
+                //if (*dst_id < 0) {
+                //    *dst_id = *src1_id;
+                //} else {
+                //    printf("Oops: backend_id_dst(%s) = %d, backend_id_src1(%s) = %d\n", node->name, *dst_id, node->src[1]->name, *src1_id);
+                //    //GGML_ASSERT(*dst_id == *src1_id);
+                //}
+            }
+        }
         // do not overwrite user assignments
         if (*node_backend_id == -1) {
             *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
@@ -1652,6 +1698,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
         if ((node->op == GGML_OP_ADD && node->op_params[0] == 0xff) ||
+            node->op == GGML_OP_REDUCE ||
+            node->op == GGML_OP_FAKE_CPY ||
             node->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] == 0xff) {
             need_new_split = true;
         }
@@ -1739,6 +1787,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                     // create a copy of the input in the split's backend
                     if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
+                        if (node->op == GGML_OP_REDUCE) {
+                            //printf("setting tensor_id_copy(reduce, %zu, %d, %s) to %s\n", src_id, cur_backend_id, node->name, src->name);
+                            tensor_id_copy(src_id, cur_backend_id, 0) = src;
+                        } else if (node->op == GGML_OP_FAKE_CPY && src->op == GGML_OP_REDUCE) {
+                            //printf("setting tensor_id_copy(fake_cpy, %zu, %d, %s) to %s\n", src_id, cur_backend_id, node->name, src->src[j]->name);
+                            tensor_id_copy(src_id, cur_backend_id, 0) = src->src[j];
+                        } else {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1753,6 +1808,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                         int n_inputs = split->n_inputs++;
                         GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         split->inputs[n_inputs] = src;
+                        }
                     }
                     node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
                 }
@@ -2027,80 +2083,8 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
     }
 }
 
-static ggml_status ggml_backend_sched_compute_splits_sm_graph(ggml_backend_sched_t sched) {
-    std::vector<int32_t> ids;
-    std::vector<uint32_t> unique_ids;
-    ggml_tensor * last_ids_tensor = nullptr;
-
-    std::array<bool, GGML_SCHED_MAX_BACKENDS> needs_sync{{true}};
-
-    auto splits = sched->splits;
-
-    std::vector<ggml_backend_sched_split *> this_split;
-    for (int i = 0; i < sched->n_splits; ++i) {
-        auto split_i = &splits[i];
-        this_split.clear();
-        this_split.push_back(split_i);
-        for (int j = i+1; j < sched->n_splits; ++j) {
-            auto split_j = &splits[j];
-            if (split_i->backend_id == split_j->backend_id) {
-                break;
-            }
-            int n_nodes = std::min(split_i->graph.n_nodes, split_j->graph.n_nodes);
-            bool same = true;
-            for (int k = 0; k < n_nodes; ++k) {
-                if (split_i->graph.nodes[k]->op != split_j->graph.nodes[k]->op) {
-                    same = false; break;
-                }
-            }
-            if (!same) {
-                break;
-            }
-            this_split.push_back(split_j);
-        }
-        if (false) {
-            auto split = this_split.front();
-            if (this_split.size() == 1) {
-                printf("=== Split %d with %d inputs on backend %d\n", i, split->n_inputs, split->backend_id);
-            } else {
-                printf("=== Split %d with %d inputs on backends", i, split->n_inputs);
-                for (int j = 0; j < (int)this_split.size(); ++j) printf(" %d", this_split[j]->backend_id);
-                printf("\n");
-            }
-            for (int j = 0; j < split->graph.n_nodes; ++j) {
-                printf("  %d %s(%s)\n", j, ggml_op_name(split->graph.nodes[j]->op), split->graph.nodes[j]->name);
-            }
-        }
-        for (auto split : this_split) {
-            ggml_backend_sched_copy_inputs(sched, split, needs_sync, ids, unique_ids, last_ids_tensor);
-        }
-        for (auto split : this_split) {
-            auto split_backend_id = split->backend_id;
-            if (split->n_inputs > 0) {
-                needs_sync[split_backend_id] = true;
-            }
-            auto split_backend = sched->backends[split_backend_id];
-            auto ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
-            if (ec != GGML_STATUS_SUCCESS) {
-                return ec;
-            }
-            if (split->n_inputs > 0) {
-                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                    ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
-                }
-            }
-        }
-        i += this_split.size() - 1;
-    }
-    return GGML_STATUS_SUCCESS;
-}
-
 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
 
-    if (false && sched->split_mode_graph) {
-        return ggml_backend_sched_compute_splits_sm_graph(sched);
-    }
-
     std::array<bool, GGML_SCHED_MAX_BACKENDS> needs_sync{{true}};
     std::array<bool, GGML_SCHED_MAX_BACKENDS> own_cpy{{false}};
 