Skip to content

Commit b4c31ba

Browse files
authored
Merge pull request #93 from ksenyako/release/ccl_2021.10
Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.10
2 parents 0db8329 + 3f9d767 commit b4c31ba

File tree

204 files changed

+11962
-3130
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

204 files changed

+11962
-3130
lines changed

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/kernels.spv")
302302
endif()
303303

304304
set(CCL_MAJOR_VERSION "2021")
305-
set(CCL_MINOR_VERSION "9")
305+
set(CCL_MINOR_VERSION "10")
306306
set(CCL_UPDATE_VERSION "0")
307307
set(CCL_PRODUCT_STATUS "Gold")
308308
string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")

INSTALL.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
- Ubuntu* 18
66
- GNU*: C, C++ 4.8.5 or higher.
77

8-
Refer to [System Requirements](https://software.intel.com/content/www/us/en/develop/articles/oneapi-collective-communication-library-system-requirements.html) for more details.
8+
Refer to [System Requirements](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/oneapi-collective-communication-library-system-requirements.html) for more details.
99

1010
### SYCL support <!-- omit in toc -->
1111
Intel(R) oneAPI DPC++/C++ Compiler with L0 v1.0 support

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# oneAPI Collective Communications Library (oneCCL) <!-- omit in toc --> <img align="right" width="100" height="100" src="https://spec.oneapi.io/oneapi-logo-white-scaled.jpg">
22

3-
[Installation](#installation)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Usage](#usage)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Release Notes](https://software.intel.com/content/www/us/en/develop/articles/oneapi-collective-communication-library-ccl-release-notes.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://oneapi-src.github.io/oneCCL/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[How to Contribute](CONTRIBUTING.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[License](LICENSE)
3+
[Installation](#installation)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Usage](#usage)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Release Notes](https://www.intel.com/content/www/us/en/developer/articles/release-notes/oneapi-collective-communication-library-ccl-release-notes.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://oneapi-src.github.io/oneCCL/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[How to Contribute](CONTRIBUTING.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[License](LICENSE)
44

55
oneAPI Collective Communications Library (oneCCL) provides an efficient implementation of communication patterns used in deep learning.
66

@@ -30,7 +30,7 @@ oneCCL is part of [oneAPI](https://oneapi.io).
3030
- Ubuntu* 18
3131
- GNU*: C, C++ 4.8.5 or higher.
3232

33-
Refer to [System Requirements](https://software.intel.com/content/www/us/en/develop/articles/oneapi-collective-communication-library-system-requirements.html) for more details.
33+
Refer to [System Requirements](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/oneapi-collective-communication-library-system-requirements.html) for more details.
3434

3535
### SYCL support <!-- omit in toc -->
3636
Intel(R) oneAPI DPC++/C++ Compiler with Level Zero v1.0 support.

deps/hwloc/include/hwloc.h

+42-17
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright © 2009 CNRS
3-
* Copyright © 2009-2021 Inria. All rights reserved.
3+
* Copyright © 2009-2022 Inria. All rights reserved.
44
* Copyright © 2009-2012 Université Bordeaux
55
* Copyright © 2009-2020 Cisco Systems, Inc. All rights reserved.
66
* See COPYING in top-level directory.
@@ -29,7 +29,7 @@
2929
* THAT IS IN THE PDF/HTML THAT IS ***NOT*** IN hwloc.h!
3030
*
3131
* There are entire paragraph-length descriptions, discussions, and
32-
* pretty prictures to explain subtle corner cases, provide concrete
32+
* pretty pictures to explain subtle corner cases, provide concrete
3333
* examples, etc.
3434
*
3535
* Please, go read the documentation. :-)
@@ -93,7 +93,7 @@ extern "C" {
9393
* Two stable releases of the same series usually have the same ::HWLOC_API_VERSION
9494
* even if their HWLOC_VERSION are different.
9595
*/
96-
#define HWLOC_API_VERSION 0x00020500
96+
#define HWLOC_API_VERSION 0x00020800
9797

9898
/** \brief Indicate at runtime which hwloc API version was used at build time.
9999
*
@@ -517,7 +517,7 @@ struct hwloc_obj {
517517
* objects).
518518
*
519519
* If the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED configuration flag is set,
520-
* some of these CPUs may not be allowed for binding,
520+
* some of these CPUs may be online but not allowed for binding,
521521
* see hwloc_topology_get_allowed_cpuset().
522522
*
523523
* \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
@@ -549,7 +549,7 @@ struct hwloc_obj {
549549
* nodes more precisely.
550550
*
551551
* If the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED configuration flag is set,
552-
* some of these nodes may not be allowed for allocation,
552+
* some of these nodes may be online but not allowed for allocation,
553553
* see hwloc_topology_get_allowed_nodeset().
554554
*
555555
* If there are no NUMA nodes in the machine, all the memory is close to this
@@ -642,7 +642,7 @@ union hwloc_obj_attr_u {
642642
unsigned char revision;
643643
float linkspeed; /* in GB/s */
644644
} pcidev;
645-
/** \brief Bridge specific Object Attribues */
645+
/** \brief Bridge specific Object Attributes */
646646
struct hwloc_bridge_attr_s {
647647
union {
648648
struct hwloc_pcidev_attr_s pci;
@@ -971,7 +971,7 @@ HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwlo
971971
*
972972
* If \p size is 0, \p string may safely be \c NULL.
973973
*
974-
* \return the number of character that were actually written if not truncating,
974+
* \return the number of characters that were actually written if not truncating,
975975
* or that would have been written (not including the ending \\0).
976976
*/
977977
HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size,
@@ -986,7 +986,7 @@ HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_
986986
*
987987
* If \p size is 0, \p string may safely be \c NULL.
988988
*
989-
* \return the number of character that were actually written if not truncating,
989+
* \return the number of characters that were actually written if not truncating,
990990
* or that would have been written (not including the ending \\0).
991991
*/
992992
HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size,
@@ -1089,7 +1089,7 @@ HWLOC_DECLSPEC int hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const c
10891089
*
10901090
* Some operating systems only support binding threads or processes to a single PU.
10911091
* Others allow binding to larger sets such as entire Cores or Packages or
1092-
* even random sets of invididual PUs. In such operating system, the scheduler
1092+
* even random sets of individual PUs. In such operating systems, the scheduler
10931093
* is free to run the task on one of these PUs, then migrate it to another PU, etc.
10941094
* It is often useful to call hwloc_bitmap_singlify() on the target CPU set before
10951095
* passing it to the binding function to avoid these expensive migrations.
@@ -1167,7 +1167,7 @@ typedef enum {
11671167
* CPUs are idle, operating systems may execute the thread/process
11681168
* on those other CPUs instead of the designated CPUs, to let them
11691169
* progress anyway. Strict binding means that the thread/process
1170-
* will _never_ execute on other cpus than the designated CPUs, even
1170+
* will _never_ execute on other CPUs than the designated CPUs, even
11711171
* when those are busy with other tasks and other CPUs are idle.
11721172
*
11731173
* \note Depending on the operating system, strict binding may not
@@ -1204,7 +1204,7 @@ typedef enum {
12041204
HWLOC_CPUBIND_NOMEMBIND = (1<<3)
12051205
} hwloc_cpubind_flags_t;
12061206

1207-
/** \brief Bind current process or thread on cpus given in physical bitmap \p set.
1207+
/** \brief Bind current process or thread on CPUs given in physical bitmap \p set.
12081208
*
12091209
* \return -1 with errno set to ENOSYS if the action is not supported
12101210
* \return -1 with errno set to EXDEV if the binding cannot be enforced
@@ -1219,7 +1219,7 @@ HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpus
12191219
*/
12201220
HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
12211221

1222-
/** \brief Bind a process \p pid on cpus given in physical bitmap \p set.
1222+
/** \brief Bind a process \p pid on CPUs given in physical bitmap \p set.
12231223
*
12241224
* \note \p hwloc_pid_t is \p pid_t on Unix platforms,
12251225
* and \p HANDLE on native Windows platforms.
@@ -1250,7 +1250,7 @@ HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t
12501250
HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
12511251

12521252
#ifdef hwloc_thread_t
1253-
/** \brief Bind a thread \p thread on cpus given in physical bitmap \p set.
1253+
/** \brief Bind a thread \p thread on CPUs given in physical bitmap \p set.
12541254
*
12551255
* \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
12561256
* and \p HANDLE on native Windows platforms.
@@ -1914,8 +1914,9 @@ HWLOC_DECLSPEC int hwloc_topology_set_components(hwloc_topology_t __hwloc_restri
19141914
enum hwloc_topology_flags_e {
19151915
/** \brief Detect the whole system, ignore reservations, include disallowed objects.
19161916
*
1917-
* Gather all resources, even if some were disabled by the administrator.
1917+
* Gather all online resources, even if some were disabled by the administrator.
19181918
* For instance, ignore Linux Cgroup/Cpusets and gather all processors and memory nodes.
1919+
* However offline PUs and NUMA nodes are still ignored.
19191920
*
19201921
* When this flag is not set, PUs and NUMA nodes that are disallowed are not added to the topology.
19211922
* Parent objects (package, core, cache, etc.) are added only if some of their children are allowed.
@@ -2059,24 +2060,48 @@ enum hwloc_topology_flags_e {
20592060
* not change due to thread binding changes on Windows
20602061
* (see ::HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING).
20612062
*/
2062-
HWLOC_TOPOLOGY_FLAG_DONT_CHANGE_BINDING = (1UL<<6)
2063+
HWLOC_TOPOLOGY_FLAG_DONT_CHANGE_BINDING = (1UL<<6),
2064+
2065+
/** \brief Ignore distances.
2066+
*
2067+
* Ignore distance information from the operating systems (and from XML)
2068+
* and hence do not use distances for grouping.
2069+
*/
2070+
HWLOC_TOPOLOGY_FLAG_NO_DISTANCES = (1UL<<7),
2071+
2072+
/** \brief Ignore memory attributes.
2073+
*
2074+
* Ignore memory attributes from the operating systems (and from XML).
2075+
*/
2076+
HWLOC_TOPOLOGY_FLAG_NO_MEMATTRS = (1UL<<8),
2077+
2078+
/** \brief Ignore CPU Kinds.
2079+
*
2080+
* Ignore CPU kind information from the operating systems (and from XML).
2081+
*/
2082+
HWLOC_TOPOLOGY_FLAG_NO_CPUKINDS = (1UL<<9)
20632083
};
20642084

20652085
/** \brief Set OR'ed flags to non-yet-loaded topology.
20662086
*
20672087
* Set an OR'ed set of ::hwloc_topology_flags_e onto a topology that was not yet loaded.
20682088
*
2069-
* If this function is called multiple times, the last invokation will erase
2089+
* If this function is called multiple times, the last invocation will erase
20702090
* and replace the set of flags that was previously set.
20712091
*
2072-
* The flags set in a topology may be retrieved with hwloc_topology_get_flags()
2092+
* By default, no flags are set (\c 0).
2093+
*
2094+
* The flags set in a topology may be retrieved with hwloc_topology_get_flags().
20732095
*/
20742096
HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned long flags);
20752097

20762098
/** \brief Get OR'ed flags of a topology.
20772099
*
20782100
* Get the OR'ed set of ::hwloc_topology_flags_e of a topology.
20792101
*
2102+
* If hwloc_topology_set_flags() was not called earlier,
2103+
* no flags are set (\c 0 is returned).
2104+
*
20802105
* \return the flags previously set with hwloc_topology_set_flags().
20812106
*/
20822107
HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology);

deps/hwloc/include/hwloc/autogen/config.h

+13-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* include/hwloc/autogen/config.h. Generated from config.h.in by configure. */
22
/* -*- c -*-
33
* Copyright © 2009 CNRS
4-
* Copyright © 2009-2020 Inria. All rights reserved.
4+
* Copyright © 2009-2022 Inria. All rights reserved.
55
* Copyright © 2009-2012 Université Bordeaux
66
* Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
77
* See COPYING in top-level directory.
@@ -12,11 +12,20 @@
1212
#ifndef HWLOC_CONFIG_H
1313
#define HWLOC_CONFIG_H
1414

15-
#define HWLOC_VERSION "2.7.0rc1-git"
15+
#define HWLOC_VERSION "2.9.0rc2-git"
1616
#define HWLOC_VERSION_MAJOR 2
17-
#define HWLOC_VERSION_MINOR 7
17+
#define HWLOC_VERSION_MINOR 9
1818
#define HWLOC_VERSION_RELEASE 0
19-
#define HWLOC_VERSION_GREEK "rc1"
19+
#define HWLOC_VERSION_GREEK "rc2"
20+
21+
/* #undef HWLOC_PCI_COMPONENT_BUILTIN */
22+
/* #undef HWLOC_OPENCL_COMPONENT_BUILTIN */
23+
/* #undef HWLOC_CUDA_COMPONENT_BUILTIN */
24+
/* #undef HWLOC_NVML_COMPONENT_BUILTIN */
25+
/* #undef HWLOC_RSMI_COMPONENT_BUILTIN */
26+
/* #undef HWLOC_LEVELZERO_COMPONENT_BUILTIN */
27+
/* #undef HWLOC_GL_COMPONENT_BUILTIN */
28+
/* #undef HWLOC_XML_LIBXML_COMPONENT_BUILTIN */
2029

2130
#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
2231
# define __hwloc_restrict __restrict

deps/hwloc/include/hwloc/bitmap.h

+7-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright © 2009 CNRS
3-
* Copyright © 2009-2020 Inria. All rights reserved.
3+
* Copyright © 2009-2022 Inria. All rights reserved.
44
* Copyright © 2009-2012 Université Bordeaux
55
* Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
66
* See COPYING in top-level directory.
@@ -112,7 +112,7 @@ HWLOC_DECLSPEC int hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t sr
112112
*
113113
* If \p buflen is 0, \p buf may safely be \c NULL.
114114
*
115-
* \return the number of character that were actually written if not truncating,
115+
* \return the number of characters that were actually written if not truncating,
116116
* or that would have been written (not including the ending \\0).
117117
*/
118118
HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
@@ -137,7 +137,7 @@ HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwl
137137
*
138138
* If \p buflen is 0, \p buf may safely be \c NULL.
139139
*
140-
* \return the number of character that were actually written if not truncating,
140+
* \return the number of characters that were actually written if not truncating,
141141
* or that would have been written (not including the ending \\0).
142142
*/
143143
HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
@@ -161,7 +161,7 @@ HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char *
161161
*
162162
* If \p buflen is 0, \p buf may safely be \c NULL.
163163
*
164-
* \return the number of character that were actually written if not truncating,
164+
* \return the number of characters that were actually written if not truncating,
165165
* or that would have been written (not including the ending \\0).
166166
*/
167167
HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
@@ -357,11 +357,11 @@ HWLOC_DECLSPEC int hwloc_bitmap_last_unset(hwloc_const_bitmap_t bitmap) __hwloc_
357357
* The loop must start with hwloc_bitmap_foreach_begin() and end
358358
* with hwloc_bitmap_foreach_end() followed by a terminating ';'.
359359
*
360-
* \p index is the loop variable; it should be an unsigned int. The
361-
* first iteration will set \p index to the lowest index in the bitmap.
360+
* \p id is the loop variable; it should be an unsigned int. The
361+
* first iteration will set \p id to the lowest index in the bitmap.
362362
* Successive iterations will iterate through, in order, all remaining
363363
* indexes set in the bitmap. To be specific: each iteration will return a
364-
* value for \p index such that hwloc_bitmap_isset(bitmap, index) is true.
364+
* value for \p id such that hwloc_bitmap_isset(bitmap, id) is true.
365365
*
366366
* The assert prevents the loop from being infinite if the bitmap is infinitely set.
367367
*

deps/hwloc/include/hwloc/deprecated.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright © 2009 CNRS
3-
* Copyright © 2009-2021 Inria. All rights reserved.
3+
* Copyright © 2009-2022 Inria. All rights reserved.
44
* Copyright © 2009-2012 Université Bordeaux
55
* Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved.
66
* See COPYING in top-level directory.
@@ -55,7 +55,7 @@ hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj
5555
*
5656
* If \p size is 0, \p string may safely be \c NULL.
5757
*
58-
* \return the number of character that were actually written if not truncating,
58+
* \return the number of characters that were actually written if not truncating,
5959
* or that would have been written (not including the ending \\0).
6060
*/
6161
static __hwloc_inline int

deps/hwloc/include/hwloc/distances.h

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright © 2010-2021 Inria. All rights reserved.
2+
* Copyright © 2010-2022 Inria. All rights reserved.
33
* See COPYING in top-level directory.
44
*/
55

@@ -35,8 +35,8 @@ extern "C" {
3535
* from a core in another node.
3636
* The corresponding kind is ::HWLOC_DISTANCES_KIND_FROM_OS | ::HWLOC_DISTANCES_KIND_FROM_USER.
3737
* The name of this distances structure is "NUMALatency".
38-
* Others distance structures include and "XGMIBandwidth", "XGMIHops"
39-
* and "NVLinkBandwidth".
38+
* Other distance structures include "XGMIBandwidth", "XGMIHops",
39+
* "XeLinkBandwidth" and "NVLinkBandwidth".
4040
*
4141
* The matrix may also contain bandwidths between random sets of objects,
4242
* possibly provided by the user, as specified in the \p kind attribute.
@@ -160,7 +160,8 @@ hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type,
160160
* Usually only one distances structure may match a given name.
161161
*
162162
* The name of the most common structure is "NUMALatency".
163-
* Others include "XGMIBandwidth", "XGMIHops" and "NVLinkBandwidth".
163+
* Others include "XGMIBandwidth", "XGMIHops", "XeLinkBandwidth",
164+
* and "NVLinkBandwidth".
164165
*/
165166
HWLOC_DECLSPEC int
166167
hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name,

deps/hwloc/include/hwloc/helper.h

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright © 2009 CNRS
3-
* Copyright © 2009-2021 Inria. All rights reserved.
3+
* Copyright © 2009-2022 Inria. All rights reserved.
44
* Copyright © 2009-2012 Université Bordeaux
55
* Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved.
66
* See COPYING in top-level directory.
@@ -886,9 +886,6 @@ enum hwloc_distrib_flags_e {
886886
* \p flags should be 0 or an OR'ed set of ::hwloc_distrib_flags_e.
887887
*
888888
* \note This function requires the \p roots objects to have a CPU set.
889-
*
890-
* \note This function replaces the now deprecated hwloc_distribute()
891-
* and hwloc_distributev() functions.
892889
*/
893890
static __hwloc_inline int
894891
hwloc_distrib(hwloc_topology_t topology,

0 commit comments

Comments
 (0)