diff --git a/src/ucp/Makefile.am b/src/ucp/Makefile.am index b4f55c6fdaf..0d570535234 100644 --- a/src/ucp/Makefile.am +++ b/src/ucp/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2021. ALL RIGHTS RESERVED. +# Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. # Copyright (c) UT-Battelle, LLC. 2017. ALL RIGHTS RESERVED. # Copyright (C) Los Alamos National Security, LLC. 2019. ALL RIGHTS RESERVED. # See file LICENSE for terms. @@ -50,6 +50,7 @@ noinst_HEADERS = \ dt/dt.inl \ dt/dt_contig.h \ dt/dt_iov.h \ + dt/dt_sgl.h \ dt/dt_generic.h \ proto/lane_type.h \ proto/proto_am.h \ @@ -120,6 +121,7 @@ libucp_la_SOURCES = \ core/ucp_device.c \ dt/datatype_iter.c \ dt/dt_iov.c \ + dt/dt_sgl.c \ dt/dt_generic.c \ dt/dt.c \ proto/lane_type.c \ diff --git a/src/ucp/am/eager_multi.c b/src/ucp/am/eager_multi.c index cf1440137a9..de619cd5c48 100644 --- a/src/ucp/am/eager_multi.c +++ b/src/ucp/am/eager_multi.c @@ -1,5 +1,5 @@ /** - * Copyright (C) 2022, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * Copyright (C) 2022-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ @@ -175,6 +175,7 @@ ucp_proto_t ucp_am_eager_multi_bcopy_proto = { .name = "am/egr/multi/bcopy", .desc = UCP_PROTO_MULTI_FRAG_DESC " " UCP_PROTO_COPY_IN_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_am_eager_multi_bcopy_proto_probe, .query = ucp_proto_multi_query, .progress = {ucp_am_eager_multi_bcopy_proto_progress}, @@ -349,6 +350,7 @@ ucp_proto_t ucp_am_eager_multi_zcopy_proto = { .name = "am/egr/multi/zcopy", .desc = UCP_PROTO_MULTI_FRAG_DESC " " UCP_PROTO_ZCOPY_DESC, .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_am_eager_multi_zcopy_proto_probe, .query = ucp_proto_multi_query, .progress = {ucp_am_eager_multi_zcopy_proto_progress}, @@ -449,6 +451,7 @@ ucp_proto_t ucp_am_eager_multi_zcopy_psn_proto = { .name = "am/egr/multi/zcopy/psn", .desc = UCP_PROTO_MULTI_FRAG_DESC " " UCP_PROTO_ZCOPY_DESC " psn", .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_am_eager_multi_zcopy_psn_proto_probe, .query = ucp_proto_multi_query, .progress = {ucp_am_eager_multi_zcopy_psn_proto_progress}, diff --git a/src/ucp/am/eager_single.c b/src/ucp/am/eager_single.c index ff5e9d71c98..04eaa25e72e 100644 --- a/src/ucp/am/eager_single.c +++ b/src/ucp/am/eager_single.c @@ -1,5 +1,5 @@ /** - * Copyright (C) 2021, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * Copyright (C) 2021-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ @@ -139,6 +139,7 @@ ucp_proto_t ucp_am_eager_short_proto = { .name = "am/egr/short", .desc = UCP_PROTO_SHORT_DESC, .flags = UCP_PROTO_FLAG_AM_SHORT, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_am_eager_short_probe, .query = ucp_proto_single_query, .progress = {ucp_am_eager_short_proto_progress}, @@ -162,6 +163,7 @@ ucp_proto_t ucp_am_eager_short_reply_proto = { .name = "am/egr/short/reply", .desc = UCP_PROTO_SHORT_DESC, .flags = UCP_PROTO_FLAG_AM_SHORT, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_am_eager_short_reply_probe, .query = ucp_proto_single_query, .progress = {ucp_am_eager_short_reply_proto_progress}, @@ -264,6 +266,7 @@ ucp_proto_t ucp_am_eager_single_bcopy_proto = { .name = "am/egr/single/bcopy", .desc = UCP_PROTO_COPY_IN_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_am_eager_single_bcopy_probe, .query = ucp_proto_single_query, .progress = {ucp_am_eager_single_bcopy_proto_progress}, @@ -301,6 +304,7 @@ ucp_proto_t ucp_am_eager_single_bcopy_reply_proto = { .name = "am/egr/single/bcopy/reply", .desc = UCP_PROTO_COPY_IN_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_am_eager_single_bcopy_reply_probe, .query = ucp_proto_single_query, .progress = {ucp_am_eager_single_bcopy_reply_proto_progress}, @@ -339,8 +343,7 @@ static void ucp_am_eager_single_zcopy_probe_common( }; if (!ucp_am_check_init_params(init_params, UCS_BIT(op_id), - UCP_PROTO_SELECT_OP_FLAG_AM_RNDV) || - (init_params->select_param->dt_class != UCP_DATATYPE_CONTIG)) { + UCP_PROTO_SELECT_OP_FLAG_AM_RNDV)) { return; } @@ -402,6 +405,7 @@ ucp_proto_t ucp_am_eager_single_zcopy_proto = { .name = "am/egr/single/zcopy", .desc = UCP_PROTO_ZCOPY_DESC, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_am_eager_single_zcopy_probe, .query = ucp_proto_single_query, .progress = {ucp_am_eager_single_zcopy_proto_progress}, @@ -458,6 +462,7 @@ ucp_proto_t ucp_am_eager_single_zcopy_reply_proto = { .name = 
"am/egr/single/zcopy/reply", .desc = UCP_PROTO_ZCOPY_DESC, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_am_eager_single_zcopy_reply_probe, .query = ucp_proto_single_query, .progress = {ucp_am_eager_single_zcopy_reply_proto_progress}, diff --git a/src/ucp/am/rndv.c b/src/ucp/am/rndv.c index 06703d4e7a8..1ba4237f8ac 100644 --- a/src/ucp/am/rndv.c +++ b/src/ucp/am/rndv.c @@ -1,5 +1,5 @@ /** - * Copyright (C) 2022, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * Copyright (C) 2022-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -73,6 +73,7 @@ ucp_proto_t ucp_am_rndv_proto = { .name = "am/rndv", .desc = NULL, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_am_rndv_rts_probe, .query = ucp_proto_rndv_rts_query, .progress = {ucp_am_rndv_proto_progress}, diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h index 7dbcf68f644..0df0ae504ba 100644 --- a/src/ucp/api/ucp.h +++ b/src/ucp/api/ucp.h @@ -1,5 +1,5 @@ /* -* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2020. ALL RIGHTS RESERVED. +* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. * Copyright (C) Los Alamos National Security, LLC. 2018 ALL RIGHTS RESERVED. @@ -941,6 +941,10 @@ enum ucp_dt_remote_sgl_field { * * @note Currently only N->N mapping is supported: both sides must use * the SGL datatype with equal counts and matching lengths. + * + * @note All buffers in the descriptor must share the same memory type + * (see @ref ucs_memory_type_t), otherwise @ref UCS_ERR_INVALID_PARAM + * is returned. 
*/ typedef struct { uint64_t field_mask; /**< Valid fields, using bits from diff --git a/src/ucp/core/ucp_am.c b/src/ucp/core/ucp_am.c index 9ff64ef88ee..d08ad4b23b7 100644 --- a/src/ucp/core/ucp_am.c +++ b/src/ucp/core/ucp_am.c @@ -1,6 +1,6 @@ /** * Copyright (C) Los Alamos National Security, LLC. 2019 ALL RIGHTS RESERVED. -* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2019. ALL RIGHTS RESERVED. +* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2019-2026. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2024. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -1132,6 +1132,7 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_am_recv_data_nbx, UCP_CONTEXT_CHECK_FEATURE_FLAGS(context, UCP_FEATURE_AM, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + UCP_REQUEST_CHECK_PARAM(param); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); diff --git a/src/ucp/core/ucp_mm.inl b/src/ucp/core/ucp_mm.inl index 8c8c748092a..87beceaf3f7 100644 --- a/src/ucp/core/ucp_mm.inl +++ b/src/ucp/core/ucp_mm.inl @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. */ @@ -113,6 +113,16 @@ static UCS_F_ALWAYS_INLINE int ucp_memh_is_user_memh(ucp_mem_h memh) return (memh->parent != NULL) && !ucp_memh_is_zero_length(memh); } +static UCS_F_ALWAYS_INLINE ucp_memory_info_t +ucp_memory_info_from_memh(ucp_mem_h memh) +{ + ucp_memory_info_t mem_info; + + mem_info.type = memh->mem_type; + mem_info.sys_dev = memh->sys_dev; + return mem_info; +} + static UCS_F_ALWAYS_INLINE int ucp_memh_is_buffer_in_range(const ucp_mem_h memh, const void *buffer, size_t length) diff --git a/src/ucp/core/ucp_request.inl b/src/ucp/core/ucp_request.inl index 5eb90289be5..007ccd828b9 100644 --- a/src/ucp/core/ucp_request.inl +++ b/src/ucp/core/ucp_request.inl @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2019. 
ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -186,23 +186,49 @@ UCS_PTR_MAP_IMPL(request, 0); } -#define UCP_REQUEST_CHECK_PARAM(_param) \ - if (ENABLE_PARAMS_CHECK) { \ - if (((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_MEMORY_TYPE) && \ - ((_param)->memory_type > UCS_MEMORY_TYPE_LAST)) { \ - ucs_error("invalid memory type parameter: %d", \ - (_param)->memory_type); \ - return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ +#define UCP_REQUEST_CHECK_PARAM_COMMON(_param) \ + do { \ + if (ENABLE_PARAMS_CHECK) { \ + if (((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_MEMORY_TYPE) && \ + ((_param)->memory_type > UCS_MEMORY_TYPE_LAST)) { \ + ucs_error("invalid memory type parameter: %d", \ + (_param)->memory_type); \ + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + } \ + \ + if (ucs_test_all_flags((_param)->op_attr_mask, \ + (UCP_OP_ATTR_FLAG_FAST_CMPL | \ + UCP_OP_ATTR_FLAG_MULTI_SEND))) { \ + ucs_error("UCP_OP_ATTR_FLAG_FAST_CMPL and " \ + "UCP_OP_ATTR_FLAG_MULTI_SEND are mutually exclusive"); \ + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + } \ } \ - \ - if (ucs_test_all_flags((_param)->op_attr_mask, \ - (UCP_OP_ATTR_FLAG_FAST_CMPL | \ - UCP_OP_ATTR_FLAG_MULTI_SEND))) { \ - ucs_error("UCP_OP_ATTR_FLAG_FAST_CMPL and " \ - "UCP_OP_ATTR_FLAG_MULTI_SEND are mutually exclusive"); \ + } while (0) + + +#define UCP_REQUEST_CHECK_PARAM_NO_REMOTE(_param) \ + do { \ + if (ENABLE_PARAMS_CHECK && \ + (((_param)->op_attr_mask & \ + (UCP_OP_ATTR_FIELD_REMOTE | \ + UCP_OP_ATTR_FIELD_REMOTE_DATATYPE | \ + UCP_OP_ATTR_FIELD_REMOTE_COUNT)) || \ + (((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_DATATYPE) && \ + (((_param)->datatype & UCP_DATATYPE_CLASS_MASK) == \ + UCP_DATATYPE_SGL)))) { \ + ucs_error("SGL datatype and remote descriptor parameters are " \ + "only supported for ucp_put_nbx"); \ return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ } \ - } + } while (0) + + +#define 
UCP_REQUEST_CHECK_PARAM(_param) \ + do { \ + UCP_REQUEST_CHECK_PARAM_COMMON(_param); \ + UCP_REQUEST_CHECK_PARAM_NO_REMOTE(_param); \ + } while (0) #if UCS_ENABLE_ASSERT diff --git a/src/ucp/core/ucp_rkey.c b/src/ucp/core/ucp_rkey.c index 9e8349d1112..eee3bb550ae 100644 --- a/src/ucp/core/ucp_rkey.c +++ b/src/ucp/core/ucp_rkey.c @@ -1,5 +1,5 @@ /** -* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2015. ALL RIGHTS RESERVED. +* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -674,9 +674,8 @@ static ssize_t ucp_memh_do_pack(ucp_mem_h memh, uint64_t flags, int rkey_compat, ucs_fatal("packing rkey using ucp_memh_pack() is unsupported"); } - mem_info.type = memh->mem_type; - mem_info.sys_dev = memh->sys_dev; - sys_distance = sys_dev_distances; + mem_info = ucp_memory_info_from_memh(memh); + sys_distance = sys_dev_distances; ucs_for_each_bit(ep_sys_dev, sys_dev_map) { status = ucs_topo_get_distance(memh->sys_dev, ep_sys_dev, sys_distance); diff --git a/src/ucp/dt/datatype_iter.c b/src/ucp/dt/datatype_iter.c index a014768cf87..39606b38895 100644 --- a/src/ucp/dt/datatype_iter.c +++ b/src/ucp/dt/datatype_iter.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -9,6 +9,7 @@ #endif #include "datatype_iter.inl" +#include "dt_sgl.h" #define ucp_datatype_iter_iov_for_each(_iov_index, _length, _dt_iter) \ @@ -282,6 +283,139 @@ size_t ucp_datatype_iter_iov_next_iov(const ucp_datatype_iter_t *dt_iter, return dst_iov_index; } +ucs_status_t ucp_datatype_iter_sgl_init(ucp_context_h context, + ucp_datatype_iter_t *dt_iter, + const ucp_dt_local_sgl_t *local, + const ucp_dt_remote_sgl_t *remote, + size_t count, + const ucp_request_param_t *param) +{ + ucs_status_t status; + + /* For Coverity */ + ucs_assert(remote != NULL); + + dt_iter->dt_class = UCP_DATATYPE_SGL; + dt_iter->length = count; + dt_iter->offset = 0; + dt_iter->type.sgl.buffers = local->buffers; + dt_iter->type.sgl.lengths = local->lengths; + dt_iter->type.sgl.remote_addrs = remote->remote_addrs; + dt_iter->type.sgl.rkeys = remote->rkeys; + + if (ucs_unlikely(count == 0)) { + dt_iter->type.sgl.memhs = NULL; + ucp_memory_info_set_host(&dt_iter->mem_info); + } else if (local->field_mask & UCP_DT_LOCAL_SGL_FIELD_MEMHS) { + ucs_assertv(ucp_memh_is_user_memh(local->memhs[0]), "memh=%p", + local->memhs[0]); + dt_iter->type.sgl.memhs = (ucp_mem_h*)local->memhs; + + status = ucp_datatype_iter_init_mem_info_from_user_memh(dt_iter, + local->memhs[0]); + if (status != UCS_OK) { + return status; + } + } else { + dt_iter->type.sgl.memhs = NULL; + ucp_datatype_iter_detect_mem_info(context, local->buffers[0], + local->lengths[0], dt_iter, param); + if (ENABLE_PARAMS_CHECK && (count > 1)) { + status = ucp_dt_sgl_check_same_mem_info(context, + local->buffers + 1, + local->lengths + 1, + count - 1, + &dt_iter->mem_info); + if (status != UCS_OK) { + return status; + } + } + } + + if (ENABLE_PARAMS_CHECK && (count > 1)) { + status = ucp_dt_sgl_check_same_rkey_config(remote->rkeys, count); + if (status != UCS_OK) { + return status; + } + } + + return UCS_OK; +} + +ucs_status_t ucp_datatype_iter_sgl_mem_reg(ucp_context_h context, + ucp_datatype_iter_t *dt_iter, + ucp_md_map_t md_map, + 
unsigned uct_flags) +{ + size_t count = dt_iter->length; + ucs_status_t status; + ucp_mem_h *memhs; + size_t i; + + if ((md_map == 0) || (dt_iter->type.sgl.memhs != NULL)) { + return UCS_OK; + } + + memhs = ucs_calloc(count, sizeof(*memhs), "dt_sgl_memh"); + if (memhs == NULL) { + return UCS_ERR_NO_MEMORY; + } + + for (i = 0; i < count; ++i) { + status = ucp_datatype_iter_mem_reg_single( + context, dt_iter->type.sgl.buffers[i], + dt_iter->type.sgl.lengths[i], dt_iter->mem_info.type, + md_map, uct_flags, &memhs[i]); + if (status != UCS_OK) { + while (i-- > 0) { + ucp_datatype_iter_mem_dereg_single(&memhs[i]); + } + ucs_free(memhs); + return status; + } + } + + dt_iter->type.sgl.memhs = memhs; + return UCS_OK; +} + +void ucp_datatype_iter_sgl_mem_dereg(ucp_datatype_iter_t *dt_iter) +{ + size_t count = dt_iter->length; + size_t i; + + ucs_assert(dt_iter->type.sgl.memhs != NULL); + for (i = 0; i < count; ++i) { + ucp_datatype_iter_mem_dereg_single(&dt_iter->type.sgl.memhs[i]); + } +} + +static UCS_F_ALWAYS_INLINE int +ucp_datatype_iter_sgl_owns_memhs(const ucp_datatype_iter_t *dt_iter) +{ + return (dt_iter->type.sgl.memhs != NULL) && + !ucp_memh_is_user_memh(dt_iter->type.sgl.memhs[0]); +} + +void ucp_datatype_iter_sgl_cleanup(ucp_datatype_iter_t *dt_iter, int dereg) +{ + size_t i; + + if (!ucp_datatype_iter_sgl_owns_memhs(dt_iter)) { + return; + } + + if (dereg) { + ucp_datatype_iter_sgl_mem_dereg(dt_iter); + } else if (UCS_ENABLE_ASSERT) { + for (i = 0; i < dt_iter->length; ++i) { + ucp_datatatype_iter_memh_cleanup_check(dt_iter->type.sgl.memhs[i]); + } + } + + ucs_free(dt_iter->type.sgl.memhs); +} + void ucp_datatype_iter_str(const ucp_datatype_iter_t *dt_iter, ucs_string_buffer_t *strb) { @@ -341,7 +475,11 @@ int ucp_datatype_iter_is_user_memh_valid(const ucp_datatype_iter_t *dt_iter, const ucp_mem_h memh) { UCS_STRING_BUFFER_ONSTACK(err_msg, 256); + ucp_mem_h err_memh = memh; + ucp_memory_info_t cur, ref; + ucp_mem_h sgl_memh; size_t iov_count; + size_t i; if 
(memh == NULL) { ucs_error("got NULL memory handle"); @@ -365,6 +503,33 @@ int ucp_datatype_iter_is_user_memh_valid(const ucp_datatype_iter_t *dt_iter, goto err_memh_mismatch; } break; + case UCP_DATATYPE_SGL: + ref = ucp_memory_info_from_memh(memh); + for (i = 0; i < dt_iter->length; ++i) { + sgl_memh = dt_iter->type.sgl.memhs[i]; + if (sgl_memh == NULL) { + ucs_error("sgl[%zu]: got NULL memory handle", i); + return 0; + } + + if (!ucp_memh_is_buffer_in_range(sgl_memh, + dt_iter->type.sgl.buffers[i], + dt_iter->type.sgl.lengths[i])) { + err_memh = sgl_memh; + ucs_string_buffer_appendf(&err_msg, + "sgl[%zu] [buffer %p length %zu]", + i, dt_iter->type.sgl.buffers[i], + dt_iter->type.sgl.lengths[i]); + goto err_memh_mismatch; + } + + cur = ucp_memory_info_from_memh(sgl_memh); + if (ucp_dt_mem_info_verify("sgl", i, &cur, &ref, + dt_iter->length) != UCS_OK) { + return 0; + } + } + break; default: ucs_error("unsupported memory handle datatype: [%s]", ucp_datatype_class_names[dt_iter->dt_class]); @@ -375,7 +540,7 @@ int ucp_datatype_iter_is_user_memh_valid(const ucp_datatype_iter_t *dt_iter, err_memh_mismatch: ucs_error("mismatched memory handle %p [address %p length %zu] for %s", - memh, ucp_memh_address(memh), ucp_memh_length(memh), + err_memh, ucp_memh_address(err_memh), ucp_memh_length(err_memh), ucs_string_buffer_cstr(&err_msg)); return 0; } diff --git a/src/ucp/dt/datatype_iter.h b/src/ucp/dt/datatype_iter.h index 1651a4805d0..5bd1026828a 100644 --- a/src/ucp/dt/datatype_iter.h +++ b/src/ucp/dt/datatype_iter.h @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -27,6 +27,14 @@ #define UCP_DT_MASK_CONTIG_IOV \ (UCS_BIT(UCP_DATATYPE_CONTIG) | UCS_BIT(UCP_DATATYPE_IOV)) +/* + * dt_mask argument which contains contiguous, iov and generic datatypes + */ +#define UCP_PROTO_DT_MASK_DEFAULT \ + (UCS_BIT(UCP_DATATYPE_CONTIG) | \ + UCS_BIT(UCP_DATATYPE_IOV) | \ + UCS_BIT(UCP_DATATYPE_GENERIC)) + /* * Iterator on a datatype, used to produce data from send buffer or consume data @@ -65,6 +73,14 @@ typedef struct { * iov_offset = iter.length - iter.iov[iter.iov_index].start_offset */ } iov; + struct { + void * const *buffers; + const size_t *lengths; + ucp_mem_h *memhs; + const uint64_t *remote_addrs; + ucp_rkey_h const *rkeys; + /* length = element count, offset = current element index */ + } sgl; } type; } ucp_datatype_iter_t; @@ -99,4 +115,20 @@ void ucp_datatype_iter_str(const ucp_datatype_iter_t *dt_iter, int ucp_datatype_iter_is_user_memh_valid(const ucp_datatype_iter_t *dt_iter, const ucp_mem_h memh); +ucs_status_t ucp_datatype_iter_sgl_init(ucp_context_h context, + ucp_datatype_iter_t *dt_iter, + const ucp_dt_local_sgl_t *local, + const ucp_dt_remote_sgl_t *remote, + size_t count, + const ucp_request_param_t *param); + +ucs_status_t ucp_datatype_iter_sgl_mem_reg(ucp_context_h context, + ucp_datatype_iter_t *dt_iter, + ucp_md_map_t md_map, + unsigned uct_flags); + +void ucp_datatype_iter_sgl_mem_dereg(ucp_datatype_iter_t *dt_iter); + +void ucp_datatype_iter_sgl_cleanup(ucp_datatype_iter_t *dt_iter, int dereg); + #endif diff --git a/src/ucp/dt/datatype_iter.inl b/src/ucp/dt/datatype_iter.inl index 609c1f44c6d..f95734d4fea 100644 --- a/src/ucp/dt/datatype_iter.inl +++ b/src/ucp/dt/datatype_iter.inl @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2024. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
@@ -44,8 +44,7 @@ ucp_datatype_iter_init_mem_info_from_user_memh(ucp_datatype_iter_t *dt_iter, return UCS_ERR_INVALID_PARAM; } - dt_iter->mem_info.type = memh->mem_type; - dt_iter->mem_info.sys_dev = memh->sys_dev; + dt_iter->mem_info = ucp_memory_info_from_memh(memh); return UCS_OK; } @@ -142,6 +141,12 @@ ucp_datatype_iter_init(ucp_context_h context, void *buffer, size_t count, length = ucp_dt_iov_length((const ucp_dt_iov_t*)buffer, count); return ucp_datatype_iov_iter_init(context, buffer, count, length, dt_iter, param); + } else if (dt_iter->dt_class == UCP_DATATYPE_SGL) { + *sg_count = 0; + return ucp_datatype_iter_sgl_init(context, dt_iter, + (const ucp_dt_local_sgl_t*)buffer, + (const ucp_dt_remote_sgl_t*)param->remote, + count, param); } else if (!ENABLE_PARAMS_CHECK || (dt_iter->dt_class == UCP_DATATYPE_GENERIC)) { *sg_count = 0; @@ -335,6 +340,8 @@ ucp_datatype_iter_cleanup(ucp_datatype_iter_t *dt_iter, int dereg, ucp_datatatype_iter_memh_cleanup_check(dt_iter->type.contig.memh); } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_IOV, dt_mask)) { ucp_datatype_iter_iov_cleanup(dt_iter, dereg); + } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_SGL, dt_mask)) { + ucp_datatype_iter_sgl_cleanup(dt_iter, dereg); } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_GENERIC, dt_mask)) { dt_iter->type.generic.dt_gen->ops.finish(dt_iter->type.generic.state); @@ -600,6 +607,15 @@ ucp_datatype_iter_next_iov(const ucp_datatype_iter_t *dt_iter, } } +static UCS_F_ALWAYS_INLINE size_t +ucp_datatype_iter_next_sgl(const ucp_datatype_iter_t *dt_iter, + size_t max_elem_count, + ucp_datatype_iter_t *next_iter) +{ + ucs_assert(dt_iter->dt_class == UCP_DATATYPE_SGL); + return ucp_datatype_iter_next(dt_iter, max_elem_count, next_iter); +} + /* * Copy iterator position only. 
* 'src_dt_iter' must be initialized from the same datatype object as 'dt_iter', @@ -670,6 +686,9 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucp_datatype_iter_mem_reg( } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_IOV, dt_mask)) { return ucp_datatype_iter_iov_mem_reg(context, dt_iter, md_map, uct_flags); + } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_SGL, + dt_mask)) { + return ucp_datatype_iter_sgl_mem_reg(context, dt_iter, md_map, uct_flags); } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_GENERIC, dt_mask)) { return UCS_OK; @@ -692,6 +711,10 @@ ucp_datatype_iter_mem_dereg(ucp_datatype_iter_t *dt_iter, unsigned dt_mask) if (dt_iter->type.iov.memh != NULL) { ucp_datatype_iter_iov_mem_dereg(dt_iter); } + } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_SGL, dt_mask)) { + if (dt_iter->type.sgl.memhs != NULL) { + ucp_datatype_iter_sgl_mem_dereg(dt_iter); + } } } diff --git a/src/ucp/dt/dt.c b/src/ucp/dt/dt.c index d29a946a6ab..dd7dfc9a38e 100644 --- a/src/ucp/dt/dt.c +++ b/src/ucp/dt/dt.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2017. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2024. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
@@ -23,10 +23,42 @@ const char * ucp_datatype_class_names[] = { [UCP_DATATYPE_CONTIG] = "contiguous", [UCP_DATATYPE_STRIDED] = "strided", [UCP_DATATYPE_IOV] = "iov", + [UCP_DATATYPE_SGL] = "sgl", [UCP_DATATYPE_GENERIC] = "generic" }; +ucs_status_t ucp_dt_mem_info_check_elem(ucp_context_h context, + const void *buffer, size_t length, + const ucp_memory_info_t *ref, + const char *dt_name, size_t index, + size_t count) +{ + ucp_memory_info_t memory_info; + + ucp_memory_detect(context, buffer, length, &memory_info); + return ucp_dt_mem_info_verify(dt_name, index, &memory_info, ref, count); +} + +ucs_status_t ucp_dt_mem_info_verify(const char *dt_name, size_t index, + const ucp_memory_info_t *cur, + const ucp_memory_info_t *ref, + size_t count) +{ + if (ucp_memory_info_equal(cur, ref)) { + return UCS_OK; + } + + ucs_error("inconsistent %s mem_info: [%zu]=%s-%s [0]=%s-%s count=%zu", + dt_name, index, + ucs_memory_type_names[cur->type], + ucs_topo_sys_device_get_name(cur->sys_dev), + ucs_memory_type_names[ref->type], + ucs_topo_sys_device_get_name(ref->sys_dev), count); + return UCS_ERR_INVALID_PARAM; +} + + UCS_PROFILE_FUNC_VOID(ucp_mem_type_unpack, (worker, buffer, recv_data, recv_length, mem_type), ucp_worker_h worker, void *buffer, const void *recv_data, diff --git a/src/ucp/dt/dt.h b/src/ucp/dt/dt.h index a0a8f59e111..34f99bbb313 100644 --- a/src/ucp/dt/dt.h +++ b/src/ucp/dt/dt.h @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2016. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2016-2026. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2024. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
@@ -59,6 +59,23 @@ size_t ucp_dt_pack(ucp_worker_h worker, ucp_datatype_t datatype, ucs_memory_type_t mem_type, void *dest, const void *src, ucp_dt_state_t *state, size_t length); +ucs_status_t ucp_dt_mem_info_check_elem(ucp_context_h context, + const void *buffer, size_t length, + const ucp_memory_info_t *ref, + const char *dt_name, size_t index, + size_t count); + +ucs_status_t ucp_dt_mem_info_verify(const char *dt_name, size_t index, + const ucp_memory_info_t *cur, + const ucp_memory_info_t *ref, + size_t count); + +static UCS_F_ALWAYS_INLINE int +ucp_memory_info_equal(const ucp_memory_info_t *a, const ucp_memory_info_t *b) +{ + return (a->type == b->type) && (a->sys_dev == b->sys_dev); +} + void ucp_mem_type_pack(ucp_worker_h worker, void *dest, const void *src, size_t length, ucs_memory_type_t mem_type); diff --git a/src/ucp/dt/dt_iov.c b/src/ucp/dt/dt_iov.c index cee11b9eed7..b1df342a6b9 100644 --- a/src/ucp/dt/dt_iov.c +++ b/src/ucp/dt/dt_iov.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2015. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2024. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
@@ -127,21 +127,15 @@ ucs_status_t ucp_dt_iov_memtype_check(ucp_context_h context, const ucp_dt_iov_t *iov, size_t iovcnt, const ucp_memory_info_t *mem_info) { - ucp_memory_info_t mem_info_iter; + ucs_status_t status; size_t i; for (i = 0; i < iovcnt; ++i) { - ucp_memory_detect(context, iov[i].buffer, iov[i].length, - &mem_info_iter); - if ((mem_info_iter.type != mem_info->type) || - (mem_info_iter.sys_dev != mem_info->sys_dev)) { - ucs_error("inconsistent iov memtypes: iov[%zu]=%s-%s iov[0]=%s-%s" - " iovcnt=%zu", - i, ucs_memory_type_names[mem_info_iter.type], - ucs_topo_sys_device_get_name(mem_info_iter.sys_dev), - ucs_memory_type_names[mem_info->type], - ucs_topo_sys_device_get_name(mem_info->sys_dev), iovcnt); - return UCS_ERR_INVALID_PARAM; + status = ucp_dt_mem_info_check_elem(context, iov[i].buffer, + iov[i].length, mem_info, "iov", i, + iovcnt); + if (status != UCS_OK) { + return status; } } diff --git a/src/ucp/dt/dt_sgl.c b/src/ucp/dt/dt_sgl.c new file mode 100644 index 00000000000..ffef7641824 --- /dev/null +++ b/src/ucp/dt/dt_sgl.c @@ -0,0 +1,78 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2026. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "dt_sgl.h" + +#include +#include +#include + + +ucs_status_t ucp_dt_sgl_check_same_mem_info(ucp_context_h context, + void * const *buffers, + const size_t *lengths, + size_t count, + const ucp_memory_info_t *mem_info) +{ + ucs_status_t status; + size_t i; + + for (i = 0; i < count; ++i) { + status = ucp_dt_mem_info_check_elem(context, buffers[i], lengths[i], + mem_info, "sgl", i, count); + if (status != UCS_OK) { + return status; + } + } + + return UCS_OK; +} + +ucs_status_t ucp_dt_sgl_check_same_rkey_config(const ucp_rkey_h *rkeys, + size_t count) +{ + ucp_rkey_h ref_rkey = rkeys[0]; + ucp_rkey_h rkey; + size_t i; + + if (ucs_unlikely(ref_rkey == NULL)) { + ucs_error("sgl[0]: rkey is NULL"); + return UCS_ERR_INVALID_PARAM; + } + + for (i = 1; i < count; ++i) { + rkey = rkeys[i]; + if (ucs_unlikely(rkey == NULL)) { + ucs_error("sgl[%zu]: rkey is NULL", i); + return UCS_ERR_INVALID_PARAM; + } + +#if ENABLE_PARAMS_CHECK + if (ucs_unlikely(rkey->ep != ref_rkey->ep)) { + ucs_error("sgl[%zu]: rkey %p was unpacked on ep %p, but sgl[0] " + "rkey %p was unpacked on ep %p, all rkeys must belong " + "to the same endpoint", + i, rkey, rkey->ep, ref_rkey, ref_rkey->ep); + return UCS_ERR_INVALID_PARAM; + } +#endif + + if (ucs_unlikely(rkey->cfg_index != ref_rkey->cfg_index)) { + ucs_error("sgl[%zu]: rkey %p has cfg_index %u, but sgl[0] rkey %p " + "has cfg_index %u, all rkeys must map to the same " + "remote endpoint configuration", + i, rkey, rkey->cfg_index, ref_rkey, + ref_rkey->cfg_index); + return UCS_ERR_INVALID_PARAM; + } + } + + return UCS_OK; +} diff --git a/src/ucp/dt/dt_sgl.h b/src/ucp/dt/dt_sgl.h new file mode 100644 index 00000000000..19d58993343 --- /dev/null +++ b/src/ucp/dt/dt_sgl.h @@ -0,0 +1,50 @@ +/** + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2026. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + + +#ifndef UCP_DT_SGL_H_ +#define UCP_DT_SGL_H_ + +#include +#include + + +#define UCP_DT_IS_SGL(_datatype) \ + (((_datatype) & UCP_DATATYPE_CLASS_MASK) == UCP_DATATYPE_SGL) + + +/** + * Check that all SGL entries match the given memory info + * + * @param [in] context Context for memory detection + * @param [in] buffers Array of buffer pointers to check + * @param [in] lengths Array of buffer lengths + * @param [in] count Number of entries in @a buffers / @a lengths + * @param [in] mem_info Compare the SGL entries to this memory info + * + * @return UCS_OK if all SGL entries match the given memory info, otherwise + * return UCS_ERR_INVALID_PARAM + */ +ucs_status_t ucp_dt_sgl_check_same_mem_info(ucp_context_h context, + void * const *buffers, + const size_t *lengths, + size_t count, + const ucp_memory_info_t *mem_info); + + +/** + * Check that all rkeys in an SGL map to the same remote endpoint configuration + * + * @param [in] rkeys Array of remote keys to check + * @param [in] count Number of entries in the @a rkeys array + * + * @return UCS_OK if all rkeys share the same configuration, otherwise return + * UCS_ERR_INVALID_PARAM + */ +ucs_status_t ucp_dt_sgl_check_same_rkey_config(const ucp_rkey_h *rkeys, + size_t count); + +#endif diff --git a/src/ucp/proto/proto.c b/src/ucp/proto/proto.c index 7cb232873c0..4eb5af6cd9e 100644 --- a/src/ucp/proto/proto.c +++ b/src/ucp/proto/proto.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -29,6 +29,7 @@ _macro(ucp_put_offload_short_proto) \ _macro(ucp_put_offload_bcopy_proto) \ _macro(ucp_put_offload_zcopy_proto) \ + _macro(ucp_put_sgl_offload_proto) \ _macro(ucp_eager_bcopy_multi_proto) \ _macro(ucp_eager_sync_bcopy_multi_proto) \ _macro(ucp_eager_zcopy_multi_proto) \ diff --git a/src/ucp/proto/proto.h b/src/ucp/proto/proto.h index 2600a07e665..ae7227b1998 100644 --- a/src/ucp/proto/proto.h +++ b/src/ucp/proto/proto.h @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -201,6 +201,11 @@ struct ucp_proto { const char *desc; /* Protocol description */ unsigned flags; /* Protocol flags for special handling */ + /* Bitmap of UCS_BIT(UCP_DATATYPE_xxx) classes this protocol supports. + * Probe is skipped for any other dt_class. Must be non-zero. + */ + unsigned dt_mask; + /* Probe and add protocol instances */ ucp_proto_probe_func_t probe; diff --git a/src/ucp/proto/proto_common.c b/src/ucp/proto/proto_common.c index 4bafde5baa2..63b470d0b31 100644 --- a/src/ucp/proto/proto_common.c +++ b/src/ucp/proto/proto_common.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -560,7 +560,8 @@ ucp_proto_common_filter_min_frag(const ucp_proto_init_params_t *params, ucp_lane_index_t ucp_proto_common_find_lanes(const ucp_proto_init_params_t *params, unsigned flags, ucp_lane_type_t lane_type, - uint64_t tl_cap_flags, ucp_lane_index_t max_lanes, + uint64_t tl_cap_flags, uint64_t tl_v2_cap_flags, + ucp_lane_index_t max_lanes, ucp_lane_map_t exclude_map, ucp_proto_common_filter_lane_cb_t filter, ucp_lane_index_t *lanes) @@ -573,6 +574,7 @@ ucp_proto_common_find_lanes(const ucp_proto_init_params_t *params, const ucp_lane_map_t failed_lanes = ucp_ep_config_get_failed_lanes(ep_config_key); const uct_iface_attr_t *iface_attr; + uct_iface_attr_v2_t iface_attr_v2; ucp_lane_index_t lane, num_lanes; const uct_md_attr_v2_t *md_attr; const uct_component_attr_t *cmpt_attr; @@ -581,6 +583,7 @@ ucp_proto_common_find_lanes(const ucp_proto_init_params_t *params, ucp_lane_map_t lane_map; char lane_desc[64]; ucs_sys_device_t lane_sys_dev; + ucs_status_t status; if (max_lanes == 0) { return 0; @@ -637,6 +640,26 @@ ucp_proto_common_find_lanes(const ucp_proto_init_params_t *params, continue; } + /* Check v2 iface capabilities */ + if (tl_v2_cap_flags != 0) { + iface_attr_v2.field_mask = UCT_IFACE_ATTR_FIELD_CAP_FLAGS; + status = uct_iface_query_v2( + ucp_worker_iface(params->worker, rsc_index)->iface, + &iface_attr_v2); + if (status != UCS_OK) { + ucs_trace("%s: iface_query_v2 failed: %s", lane_desc, + ucs_status_string(status)); + continue; + } + + if (!ucs_test_all_flags(iface_attr_v2.cap.flags, + tl_v2_cap_flags)) { + ucs_trace("%s: no v2 cap 0x%" PRIx64, lane_desc, + tl_v2_cap_flags); + continue; + } + } + md_index = context->tl_rscs[rsc_index].md_index; md_attr = &context->tl_mds[md_index].attr; cmpt_attr = ucp_cmpt_attr_by_md_index(context, md_index); diff --git a/src/ucp/proto/proto_common.h b/src/ucp/proto/proto_common.h index a3325d902af..c7aaa9da473 100644 --- a/src/ucp/proto/proto_common.h +++ b/src/ucp/proto/proto_common.h @@ -1,5 +1,5 @@ /** - * 
Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -301,7 +301,8 @@ ucp_proto_common_filter_min_frag(const ucp_proto_init_params_t *params, ucp_lane_index_t ucp_proto_common_find_lanes(const ucp_proto_init_params_t *params, unsigned flags, ucp_lane_type_t lane_type, - uint64_t tl_cap_flags, ucp_lane_index_t max_lanes, + uint64_t tl_cap_flags, uint64_t tl_v2_cap_flags, + ucp_lane_index_t max_lanes, ucp_lane_map_t exclude_map, ucp_proto_common_filter_lane_cb_t filter, ucp_lane_index_t *lanes); diff --git a/src/ucp/proto/proto_common.inl b/src/ucp/proto/proto_common.inl index 8cb0747355e..6def997b254 100644 --- a/src/ucp/proto/proto_common.inl +++ b/src/ucp/proto/proto_common.inl @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -97,7 +97,8 @@ static UCS_F_ALWAYS_INLINE void ucp_proto_request_zcopy_complete(ucp_request_t *req, ucs_status_t status) { ucp_datatype_iter_cleanup(&req->send.state.dt_iter, 1, - UCP_DT_MASK_CONTIG_IOV); + UCP_DT_MASK_CONTIG_IOV | + UCS_BIT(UCP_DATATYPE_SGL)); if (ucp_proto_select_op_id(&req->send.proto_config->select_param) == UCP_OP_ID_TAG_SEND) { UCP_EP_STAT_TAG_OP(req->send.ep, EAGER) diff --git a/src/ucp/proto/proto_multi.c b/src/ucp/proto/proto_multi.c index e1efa16d4b9..edb40299fba 100644 --- a/src/ucp/proto/proto_multi.c +++ b/src/ucp/proto/proto_multi.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -240,6 +240,8 @@ ucs_status_t ucp_proto_multi_init(const ucp_proto_multi_init_params_t *params, uint32_t weight_sum; ucs_status_t status; int fixed_first_lane; + uct_iface_attr_v2_t iface_attr_v2; + ucp_rsc_index_t rsc_index; ucs_assert(params->max_lanes <= UCP_PROTO_MAX_LANES); @@ -261,8 +263,8 @@ ucs_status_t ucp_proto_multi_init(const ucp_proto_multi_init_params_t *params, /* Find first lane */ num_lanes = ucp_proto_common_find_lanes( ¶ms->super.super, params->super.flags, params->first.lane_type, - params->first.tl_cap_flags, 1, 0, ucp_proto_common_filter_min_frag, - lanes); + params->first.tl_cap_flags, params->first.tl_v2_cap_flags, + 1, 0, ucp_proto_common_filter_min_frag, lanes); if (num_lanes == 0) { ucs_trace("no lanes for %s", ucp_proto_id_field(params->super.super.proto_id, name)); @@ -272,8 +274,9 @@ ucs_status_t ucp_proto_multi_init(const ucp_proto_multi_init_params_t *params, /* Find rest of the lanes */ num_lanes += ucp_proto_common_find_lanes( ¶ms->super.super, params->super.flags, params->middle.lane_type, - params->middle.tl_cap_flags, UCP_PROTO_MAX_LANES - 1, - UCS_BIT(lanes[0]), ucp_proto_common_filter_min_frag, lanes + 1); + params->middle.tl_cap_flags, params->middle.tl_v2_cap_flags, + UCP_PROTO_MAX_LANES - 1, UCS_BIT(lanes[0]), + ucp_proto_common_filter_min_frag, lanes + 1); /* Get bandwidth of all lanes and max_bandwidth */ max_bandwidth = 0; @@ -460,6 +463,26 @@ ucs_status_t ucp_proto_multi_init(const ucp_proto_multi_init_params_t *params, mpriv->align_thresh = ucs_max(mpriv->align_thresh, lpriv->opt_align); lpriv->flush_sys_dev_mask = ucp_proto_multi_init_flush_sys_dev_mask(params, lane); + + if ((params->first.tl_v2_cap_flags | params->middle.tl_v2_cap_flags) & + UCT_IFACE_FLAG_V2_PUT_SGL_ZCOPY) { + rsc_index = ucp_proto_common_get_rsc_index(¶ms->super.super, + lane); + iface_attr_v2.field_mask = + UCT_IFACE_ATTR_FIELD_MAX_PUT_SGL_ZCOPY_COUNT; + status = uct_iface_query_v2( + ucp_worker_iface(params->super.super.worker, + 
rsc_index)->iface, + &iface_attr_v2); + if (status != UCS_OK) { + return status; + } + + lpriv->max_put_sgl_zcopy_count = + iface_attr_v2.max_put_sgl_zcopy_count; + } else { + lpriv->max_put_sgl_zcopy_count = 0; + } } ucs_assert(mpriv->num_lanes == ucs_popcount(selection.lane_map)); diff --git a/src/ucp/proto/proto_multi.h b/src/ucp/proto/proto_multi.h index a129fd08bb5..11003503dd0 100644 --- a/src/ucp/proto/proto_multi.h +++ b/src/ucp/proto/proto_multi.h @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -84,6 +84,11 @@ typedef struct { /* Map of system devices that require a flush operation */ ucp_sys_dev_map_t flush_sys_dev_mask; + + /* Maximal number of local SGL buffers per uct_ep_put_sgl_zcopy on this + * lane, cached from uct_iface_attr_v2 at protocol init when PUT SGL zcopy + * is selected, otherwise zero */ + size_t max_put_sgl_zcopy_count; } ucp_proto_multi_lane_priv_t; @@ -131,6 +136,9 @@ typedef struct { /* Required iface capabilities */ uint64_t tl_cap_flags; + /* Required v2 iface capabilities */ + uint64_t tl_v2_cap_flags; + /* Required lane type */ ucp_lane_type_t lane_type; } first, middle; diff --git a/src/ucp/proto/proto_reconfig.c b/src/ucp/proto/proto_reconfig.c index bc5846f2417..5cca5085d5d 100644 --- a/src/ucp/proto/proto_reconfig.c +++ b/src/ucp/proto/proto_reconfig.c @@ -155,6 +155,7 @@ ucp_proto_t ucp_reconfig_proto = { .name = "reconfig", .desc = "stub protocol", .flags = UCP_PROTO_FLAG_INVALID, + .dt_mask = UCP_DT_MASK_ALL, .probe = ucp_proto_reconfig_probe, .query = ucp_proto_default_query, .progress = {ucp_proto_reconfig_progress}, diff --git a/src/ucp/proto/proto_select.c b/src/ucp/proto/proto_select.c index ec66179280c..b8c74c691d6 100644 --- a/src/ucp/proto/proto_select.c +++ b/src/ucp/proto/proto_select.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & 
AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -218,10 +218,18 @@ ucp_proto_select_init_protocols(ucp_worker_h worker, ucs_array_init_dynamic(&proto_init->priv_buf); ucs_for_each_bit(init_params.proto_id, worker->context->proto_bitmap) { + const ucp_proto_t *proto; + ucs_assert(init_params.proto_id < ucp_protocols_count()); /* Coverity */ - ucs_trace("probing %s", ucp_proto_id_field(init_params.proto_id, name)); + proto = ucp_protocols[init_params.proto_id]; + ucs_assertv(proto->dt_mask != 0, "%s: dt_mask must be set", proto->name); + if (!(UCS_BIT(select_param->dt_class) & proto->dt_mask)) { + continue; + } + + ucs_trace("probing %s", proto->name); ucs_log_indent(1); - ucp_proto_id_call(init_params.proto_id, probe, &init_params); + proto->probe(&init_params); ucs_log_indent(-1); } diff --git a/src/ucp/proto/proto_single.c b/src/ucp/proto/proto_single.c index 866d62771c0..5cd3f28efd4 100644 --- a/src/ucp/proto/proto_single.c +++ b/src/ucp/proto/proto_single.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -36,7 +36,8 @@ ucs_status_t ucp_proto_single_init(const ucp_proto_single_init_params_t *params, num_lanes = ucp_proto_common_find_lanes( ¶ms->super.super, params->super.flags, params->lane_type, - params->tl_cap_flags, 1, params->super.exclude_map, + params->tl_cap_flags, params->tl_v2_cap_flags, + 1, params->super.exclude_map, ucp_proto_common_filter_min_frag, &lane); if (num_lanes == 0) { ucs_trace("no lanes for %s", diff --git a/src/ucp/proto/proto_single.h b/src/ucp/proto/proto_single.h index 923b6afc4c0..347c19513ab 100644 --- a/src/ucp/proto/proto_single.h +++ b/src/ucp/proto/proto_single.h @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2021. 
ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -19,8 +19,9 @@ typedef struct { typedef struct { ucp_proto_common_init_params_t super; - ucp_lane_type_t lane_type; /* Type of lane to select */ - uint64_t tl_cap_flags; /* Required iface capabilities */ + ucp_lane_type_t lane_type; /* Type of lane to select */ + uint64_t tl_cap_flags; /* Required iface capabilities */ + uint64_t tl_v2_cap_flags; /* Required v2 iface capabilities */ } ucp_proto_single_init_params_t; diff --git a/src/ucp/rma/amo_offload.c b/src/ucp/rma/amo_offload.c index b8a7ccfd0d7..7f216a4122d 100644 --- a/src/ucp/rma/amo_offload.c +++ b/src/ucp/rma/amo_offload.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. * * See file LICENSE for terms. @@ -182,8 +182,7 @@ static void ucp_proto_amo_probe(const ucp_proto_init_params_t *init_params, .tl_cap_flags = 0 }; - if ((init_params->select_param->dt_class != UCP_DATATYPE_CONTIG) || - !ucp_proto_init_check_op(init_params, UCS_BIT(op_id))) { + if (!ucp_proto_init_check_op(init_params, UCS_BIT(op_id))) { return; } @@ -268,6 +267,7 @@ static void ucp_proto_amo_query(const ucp_proto_query_params_t *params, ucp_proto_t ucp_amo##_bits##_##_id##_proto = { \ .name = "amo" #_bits "/" _name, \ .desc = NULL, \ + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), \ .probe = ucp_proto_amo##_bits##_##_id##_probe, \ .query = ucp_proto_amo##_bits##_##_id##_query, \ .progress = {ucp_proto_amo##_bits##_id##_progress}, \ diff --git a/src/ucp/rma/amo_sw.c b/src/ucp/rma/amo_sw.c index 327cd0ccde1..45569b3968c 100644 --- a/src/ucp/rma/amo_sw.c +++ b/src/ucp/rma/amo_sw.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2018. ALL RIGHTS RESERVED. 
+ * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * Copyright (C) Huawei Technologies Co., Ltd. 2021. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2024. ALL RIGHTS RESERVED. * @@ -462,8 +462,7 @@ static ucs_status_t ucp_proto_amo_sw_progress_post(uct_pending_req_t *self) static void ucp_proto_amo_sw_post_probe(const ucp_proto_init_params_t *init_params) { - if (!ucp_proto_init_check_op(init_params, UCS_BIT(UCP_OP_ID_AMO_POST)) || - (init_params->select_param->dt_class != UCP_DATATYPE_CONTIG)) { + if (!ucp_proto_init_check_op(init_params, UCS_BIT(UCP_OP_ID_AMO_POST))) { return; } @@ -474,6 +473,7 @@ ucp_proto_t ucp_get_amo_post_proto = { .name = "amo/post/sw", .desc = UCP_PROTO_RMA_EMULATION_DESC, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_amo_sw_post_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_amo_sw_progress_post}, @@ -491,8 +491,7 @@ ucp_proto_amo_sw_fetch_probe(const ucp_proto_init_params_t *init_params) { if (!ucp_proto_init_check_op(init_params, UCS_BIT(UCP_OP_ID_AMO_FETCH) | - UCS_BIT(UCP_OP_ID_AMO_CSWAP)) || - (init_params->select_param->dt_class != UCP_DATATYPE_CONTIG)) { + UCS_BIT(UCP_OP_ID_AMO_CSWAP))) { return; } @@ -503,6 +502,7 @@ ucp_proto_t ucp_get_amo_fetch_proto = { .name = "amo/fetch/sw", .desc = UCP_PROTO_RMA_EMULATION_DESC, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_amo_sw_fetch_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_amo_sw_progress_fetch}, diff --git a/src/ucp/rma/get_am.c b/src/ucp/rma/get_am.c index cee11975b62..3c3af1e5de4 100644 --- a/src/ucp/rma/get_am.c +++ b/src/ucp/rma/get_am.c @@ -122,6 +122,7 @@ ucp_proto_t ucp_get_am_bcopy_proto = { .name = "get/am/bcopy", .desc = UCP_PROTO_RMA_EMULATION_DESC, .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_proto_get_am_bcopy_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_get_am_bcopy_progress}, diff --git 
a/src/ucp/rma/get_offload.c b/src/ucp/rma/get_offload.c index 7bec24804cf..f599a04a1ce 100644 --- a/src/ucp/rma/get_offload.c +++ b/src/ucp/rma/get_offload.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2024. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -118,8 +118,7 @@ ucp_proto_get_offload_bcopy_probe(const ucp_proto_init_params_t *init_params) .opt_align_offs = UCP_PROTO_COMMON_OFFSET_INVALID }; - if ((init_params->select_param->dt_class != UCP_DATATYPE_CONTIG) || - !ucp_proto_init_check_op(init_params, UCS_BIT(UCP_OP_ID_GET))) { + if (!ucp_proto_init_check_op(init_params, UCS_BIT(UCP_OP_ID_GET))) { return; } @@ -143,6 +142,7 @@ ucp_proto_t ucp_get_offload_bcopy_proto = { .name = "get/bcopy", .desc = UCP_PROTO_COPY_OUT_DESC, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_get_offload_bcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_get_offload_bcopy_progress}, @@ -246,6 +246,7 @@ ucp_proto_t ucp_get_offload_zcopy_proto = { .name = "get/zcopy", .desc = UCP_PROTO_ZCOPY_DESC, .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_proto_get_offload_zcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_get_offload_zcopy_progress}, diff --git a/src/ucp/rma/put_am.c b/src/ucp/rma/put_am.c index 1d80c650f52..f8ca4b5da37 100644 --- a/src/ucp/rma/put_am.c +++ b/src/ucp/rma/put_am.c @@ -130,6 +130,7 @@ ucp_proto_t ucp_put_am_bcopy_proto = { .name = "put/am/bcopy", .desc = UCP_PROTO_RMA_EMULATION_DESC, .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_proto_put_am_bcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_put_am_bcopy_progress}, diff --git a/src/ucp/rma/put_offload.c b/src/ucp/rma/put_offload.c index e522296a893..693ae53bc56 100644 --- a/src/ucp/rma/put_offload.c +++ 
b/src/ucp/rma/put_offload.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -101,6 +101,7 @@ ucp_proto_t ucp_put_offload_short_proto = { .name = "put/offload/short", .desc = UCP_PROTO_SHORT_DESC, .flags = UCP_PROTO_FLAG_PUT_SHORT, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_put_offload_short_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_put_offload_short_progress}, @@ -235,6 +236,7 @@ ucp_proto_t ucp_put_offload_bcopy_proto = { .name = "put/offload/bcopy", .desc = UCP_PROTO_COPY_IN_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_put_offload_bcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_put_offload_bcopy_progress}, @@ -335,9 +337,152 @@ ucp_proto_t ucp_put_offload_zcopy_proto = { .name = "put/offload/zcopy", .desc = UCP_PROTO_ZCOPY_DESC, .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_proto_put_offload_zcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_put_offload_zcopy_progress}, .abort = ucp_proto_request_zcopy_abort, .reset = ucp_proto_offload_zcopy_reset }; + +static void +ucp_proto_put_sgl_offload_probe(const ucp_proto_init_params_t *init_params) +{ + ucp_context_t *context = init_params->worker->context; + ucp_proto_multi_init_params_t params = { + .super.super = *init_params, + .super.latency = 0, + .super.overhead = context->config.ext.proto_overhead_multi, + .super.cfg_thresh = context->config.ext.zcopy_thresh, + .super.cfg_priority = 30, + .super.min_length = 0, + .super.max_length = SIZE_MAX, + .super.min_iov = 1, + .super.min_frag_offs = ucs_offsetof(uct_iface_attr_t, + cap.put.min_zcopy), + .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, + cap.put.max_zcopy), + .super.max_iov_offs = ucs_offsetof(uct_iface_attr_t, + cap.put.max_iov), + .super.hdr_size = 0, + 
.super.send_op = UCT_EP_OP_PUT_ZCOPY, + .super.memtype_op = UCT_EP_OP_LAST, + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY | + UCP_PROTO_COMMON_INIT_FLAG_RECV_ZCOPY | + UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS | + UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING, + .super.exclude_map = 0, + .super.reg_mem_info = ucp_proto_common_select_param_mem_info( + init_params->select_param), + .max_lanes = context->config.ext.max_rma_lanes, + .min_chunk = context->config.ext.min_rma_chunk_size, + .initial_reg_md_map = 0, + .first.tl_cap_flags = UCT_IFACE_FLAG_PUT_ZCOPY, + .first.tl_v2_cap_flags = UCT_IFACE_FLAG_V2_PUT_SGL_ZCOPY, + .first.lane_type = UCP_LANE_TYPE_RMA_BW, + .middle.tl_cap_flags = UCT_IFACE_FLAG_PUT_ZCOPY, + .middle.tl_v2_cap_flags = UCT_IFACE_FLAG_V2_PUT_SGL_ZCOPY, + .middle.lane_type = UCP_LANE_TYPE_RMA_BW, + .opt_align_offs = UCP_PROTO_COMMON_OFFSET_INVALID, + }; + + if (!ucp_proto_init_check_op(init_params, UCS_BIT(UCP_OP_ID_PUT))) { + return; + } + + ucp_proto_multi_probe(¶ms); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_put_sgl_offload_send_func(ucp_request_t *req, + const ucp_proto_multi_lane_priv_t *lpriv, + ucp_datatype_iter_t *next_iter, + ucp_lane_index_t *lane_shift) +{ + ucp_ep_t *ep = req->send.ep; + ucp_datatype_iter_t *dt_iter = &req->send.state.dt_iter; + ucp_lane_index_t lane = lpriv->super.lane; + uct_ep_h uct_ep = ucp_ep_get_lane(ep, lane); + ucp_md_index_t md_index = ucp_ep_md_index(ep, lane); + ucp_rsc_index_t rkey_index = lpriv->super.rkey_index; + size_t start_index = dt_iter->offset; + size_t max_sgl_count = lpriv->max_put_sgl_zcopy_count; + size_t elem_count = ucp_datatype_iter_next_sgl(dt_iter, + max_sgl_count, + next_iter); + size_t rkeys_size = elem_count * sizeof(uct_rkey_t); + size_t memhs_size = elem_count * sizeof(uct_mem_h); + ucp_mem_h *sgl_memhs = dt_iter->type.sgl.memhs; + uct_mem_h *uct_memhs; + uct_rkey_t *uct_rkeys; + ucs_status_t status; + size_t i; + + uct_rkeys = ucs_alloc_on_stack(rkeys_size, 
"uct_sgl_rkeys"); + if (uct_rkeys == NULL) { + return UCS_ERR_NO_MEMORY; + } + + uct_memhs = ucs_alloc_on_stack(memhs_size, "uct_sgl_memhs"); + if (uct_memhs == NULL) { + ucs_free_on_stack(uct_rkeys, rkeys_size); + return UCS_ERR_NO_MEMORY; + } + + for (i = 0; i < elem_count; i++) { + uct_memhs[i] = (sgl_memhs != NULL) ? + sgl_memhs[start_index + i]->uct[md_index] : + UCT_MEM_HANDLE_NULL; + uct_rkeys[i] = ucp_rkey_get_tl_rkey( + dt_iter->type.sgl.rkeys[start_index + i], rkey_index); + } + + status = uct_ep_put_sgl_zcopy( + uct_ep, + &dt_iter->type.sgl.buffers[start_index], + &dt_iter->type.sgl.lengths[start_index], + uct_memhs, + &dt_iter->type.sgl.remote_addrs[start_index], + uct_rkeys, + elem_count, &req->send.state.uct_comp); + + ucs_free_on_stack(uct_memhs, memhs_size); + ucs_free_on_stack(uct_rkeys, rkeys_size); + + if (!UCS_STATUS_IS_ERR(status)) { + ucp_proto_put_offload_update_remote_flush( + ep, lpriv->flush_sys_dev_mask, + ucp_rkey_get_tl_rkey(dt_iter->type.sgl.rkeys[start_index], + rkey_index), + uct_ep, + dt_iter->type.sgl.remote_addrs[start_index]); + } + + return status; +} + +static ucs_status_t +ucp_proto_put_sgl_offload_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + + /* coverity[tainted_data_downcast] */ + return ucp_proto_multi_zcopy_progress( + req, req->send.proto_config->priv, ucp_proto_multi_rma_init_func, + UCT_MD_MEM_ACCESS_LOCAL_READ, UCS_BIT(UCP_DATATYPE_SGL), + ucp_proto_put_sgl_offload_send_func, + ucp_request_invoke_uct_completion_success, + ucp_proto_request_zcopy_completion); +} + +ucp_proto_t ucp_put_sgl_offload_proto = { + .name = "put/sgl/offload", + .desc = "sgl", + .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_SGL), + .probe = ucp_proto_put_sgl_offload_probe, + .query = ucp_proto_multi_query, + .progress = {ucp_proto_put_sgl_offload_progress}, + .abort = ucp_proto_request_zcopy_abort, + .reset = ucp_proto_offload_zcopy_reset +}; diff --git a/src/ucp/rma/rma_send.c 
b/src/ucp/rma/rma_send.c index 2d76c6aacc7..8835b8c5e9d 100644 --- a/src/ucp/rma/rma_send.c +++ b/src/ucp/rma/rma_send.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2018. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -12,11 +12,103 @@ #include "rma.inl" #include +#include #include #include #include #include +#include +#include + + +#define UCP_PUT_SGL_CHECK_PARAMS(_buffer, _count, _remote_addr, _rkey, \ + _param) \ + do { \ + if (!ENABLE_PARAMS_CHECK) { \ + ucs_assert((_param)->remote != NULL); /* For Coverity */ \ + } else { \ + const ucp_dt_local_sgl_t *_local; \ + const ucp_dt_remote_sgl_t *_remote; \ + \ + if (ucs_unlikely((_remote_addr) != UCP_REMOTE_ADDR_INVALID)) { \ + ucs_error("sgl put: remote_addr must be " \ + "UCP_REMOTE_ADDR_INVALID, got 0x%" PRIx64, \ + (_remote_addr)); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + if (ucs_unlikely((_rkey) != UCP_RKEY_INVALID)) { \ + ucs_error("sgl put: rkey must be UCP_RKEY_INVALID, got %p", \ + (_rkey)); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + if (ucs_unlikely(((_param)->op_attr_mask & \ + UCP_OP_ATTR_FIELD_REMOTE) == 0)) { \ + ucs_error("sgl put: UCP_OP_ATTR_FIELD_REMOTE must be set"); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + if (ucs_unlikely((_param)->remote == NULL)) { \ + ucs_error("sgl put: remote descriptor must not be NULL"); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + _local = (const ucp_dt_local_sgl_t*)(_buffer); \ + \ + if (ucs_unlikely((_local->field_mask & \ + UCP_DT_LOCAL_SGL_FIELD_BUFFERS) == 0)) { \ + ucs_error("sgl put: local buffers field must be set"); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + if (ucs_unlikely((_local->field_mask & \ + UCP_DT_LOCAL_SGL_FIELD_LENGTHS) == 
0)) { \ + ucs_error("sgl put: local lengths field must be set"); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + _remote = (_param)->remote; \ + \ + if (ucs_unlikely((_remote->field_mask & \ + UCP_DT_REMOTE_SGL_FIELD_REMOTE_ADDRS) == 0)) { \ + ucs_error("sgl put: remote addrs field must be set"); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + if (ucs_unlikely((_remote->field_mask & \ + UCP_DT_REMOTE_SGL_FIELD_LENGTHS) == 0)) { \ + ucs_error("sgl put: remote lengths field must be set"); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + if (ucs_unlikely((_remote->field_mask & \ + UCP_DT_REMOTE_SGL_FIELD_RKEYS) == 0)) { \ + ucs_error("sgl put: remote rkeys field must be set"); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + \ + if (ucs_unlikely(((_param)->op_attr_mask & \ + UCP_OP_ATTR_FIELD_REMOTE_COUNT) && \ + ((_param)->remote_count != (_count)))) { \ + ucs_error("sgl put: local count %zu != remote count %zu" \ + " (only N->N mapping is supported)", \ + (_count), (_param)->remote_count); \ + ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + goto out_unlock; \ + } \ + } \ + } while (0) #define UCP_RMA_CHECK_BUFFER(_buffer, _action) \ @@ -118,11 +210,12 @@ ucs_status_ptr_t ucp_put_nbx(ucp_ep_h ep, const void *buffer, size_t count, ucp_worker_h worker = ep->worker; size_t contig_length = 0; ucp_datatype_t datatype = ucp_dt_make_contig(1); + const ucp_dt_remote_sgl_t *remote; ucs_status_ptr_t ret; ucs_status_t status; ucp_request_t *req; - UCP_REQUEST_CHECK_PARAM(param); + UCP_REQUEST_CHECK_PARAM_COMMON(param); UCP_RMA_CHECK_PTR(worker->context, buffer, count); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); @@ -143,21 +236,26 @@ ucs_status_ptr_t ucp_put_nbx(ucp_ep_h ep, const void *buffer, size_t count, goto out_unlock; } - req = ucp_request_get_param(worker, param, - {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - goto out_unlock;}); - 
req->send.rma.rkey = rkey; - req->send.rma.remote_addr = remote_addr; if (ucs_unlikely(param->op_attr_mask & UCP_OP_ATTR_FIELD_DATATYPE)) { datatype = param->datatype; - if (UCP_DT_IS_CONTIG(datatype)) { + if (UCP_DT_IS_SGL(datatype)) { + UCP_PUT_SGL_CHECK_PARAMS(buffer, count, remote_addr, rkey, param); + remote = param->remote; + rkey = remote->rkeys[0]; + } else if (UCP_DT_IS_CONTIG(datatype)) { contig_length = ucp_contig_dt_length(datatype, count); } } else { contig_length = count; } + req = ucp_request_get_param(worker, param, + {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out_unlock;}); + req->send.rma.rkey = rkey; + req->send.rma.remote_addr = remote_addr; + ret = ucp_proto_request_send_op_rma( ep, rkey, req, ucp_ep_rma_get_fence_flag(ep), UCP_OP_ID_PUT, buffer, count, datatype, contig_length, param, 0, 0); diff --git a/src/ucp/rndv/proto_rndv.c b/src/ucp/rndv/proto_rndv.c index d68bba2a514..3b4f2937ee7 100644 --- a/src/ucp/rndv/proto_rndv.c +++ b/src/ucp/rndv/proto_rndv.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -513,8 +513,8 @@ ucp_proto_rndv_find_ctrl_lane(const ucp_proto_init_params_t *params) num_lanes = ucp_proto_common_find_lanes(params, UCP_PROTO_COMMON_INIT_FLAG_HDR_ONLY, UCP_LANE_TYPE_AM, - UCT_IFACE_FLAG_AM_BCOPY, 1, 0, NULL, - &lane); + UCT_IFACE_FLAG_AM_BCOPY, 0, 1, 0, + NULL, &lane); if (num_lanes == 0) { ucs_debug("no active message lane for %s", ucp_proto_id_field(params->proto_id, name)); diff --git a/src/ucp/rndv/rndv_am.c b/src/ucp/rndv/rndv_am.c index adc94ce1c8b..245dc788d2e 100644 --- a/src/ucp/rndv/rndv_am.c +++ b/src/ucp/rndv/rndv_am.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -135,6 +135,7 @@ ucp_proto_t ucp_rndv_am_bcopy_proto = { .name = "rndv/am/bcopy", .desc = "fragmented " UCP_PROTO_COPY_IN_DESC " " UCP_PROTO_COPY_OUT_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_rndv_am_bcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_rndv_am_bcopy_progress}, @@ -210,6 +211,7 @@ ucp_proto_t ucp_rndv_am_zcopy_proto = { .name = "rndv/am/zcopy", .desc = UCP_PROTO_ZCOPY_DESC, .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_rndv_am_zcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_rndv_am_zcopy_proto_progress}, diff --git a/src/ucp/rndv/rndv_ats.c b/src/ucp/rndv/rndv_ats.c index 9d5f96a22d1..db22a96c81f 100644 --- a/src/ucp/rndv/rndv_ats.c +++ b/src/ucp/rndv/rndv_ats.c @@ -1,5 +1,5 @@ /** -* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2022. ALL RIGHTS RESERVED. +* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2022-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -64,6 +64,7 @@ ucp_proto_t ucp_rndv_ats_proto = { .name = "rndv/ats", .desc = "no data fetch", .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_rndv_ats_probe, .query = ucp_proto_rndv_ats_query, .progress = {ucp_proto_rndv_ats_progress}, diff --git a/src/ucp/rndv/rndv_get.c b/src/ucp/rndv/rndv_get.c index 9a8fb8c28f4..67c60290428 100644 --- a/src/ucp/rndv/rndv_get.c +++ b/src/ucp/rndv/rndv_get.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -60,8 +60,7 @@ ucp_proto_rndv_get_common_probe(const ucp_proto_init_params_t *init_params, ucp_proto_perf_t *perf; ucs_status_t status; - if ((init_params->select_param->dt_class != UCP_DATATYPE_CONTIG) || - !ucp_proto_rndv_op_check(init_params, UCP_OP_ID_RNDV_RECV, + if (!ucp_proto_rndv_op_check(init_params, UCP_OP_ID_RNDV_RECV, support_ppln)) { return; } @@ -227,6 +226,7 @@ ucp_proto_t ucp_rndv_get_zcopy_proto = { .name = "rndv/get/zcopy", .desc = UCP_PROTO_ZCOPY_DESC " " UCP_PROTO_RNDV_GET_DESC, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_rndv_get_zcopy_probe, .query = ucp_proto_rndv_get_zcopy_query, .progress = { @@ -376,6 +376,7 @@ ucp_proto_t ucp_rndv_get_mtype_proto = { .name = "rndv/get/mtype", .desc = NULL, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_rndv_get_mtype_probe, .query = ucp_proto_rndv_get_mtype_query, .progress = { diff --git a/src/ucp/rndv/rndv_mtype.inl b/src/ucp/rndv/rndv_mtype.inl index 93a958be7ac..4b11edb3fd8 100644 --- a/src/ucp/rndv/rndv_mtype.inl +++ b/src/ucp/rndv/rndv_mtype.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * Copyright (C) 2021-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ @@ -32,8 +32,7 @@ ucp_proto_rndv_mtype_init(const ucp_proto_init_params_t *init_params, ucp_context_h context = worker->context; ucs_memory_type_t mem_type = init_params->select_param->mem_type; - if ((init_params->select_param->dt_class != UCP_DATATYPE_CONTIG) || - (ucp_proto_rndv_mtype_ep(worker, frag_mem_type, mem_type) == NULL) || + if ((ucp_proto_rndv_mtype_ep(worker, frag_mem_type, mem_type) == NULL) || !init_params->worker->context->config.ext.memtype_copy_enable || !ucp_proto_init_check_op(init_params, UCP_PROTO_RNDV_OP_ID_MASK)) { return UCS_ERR_UNSUPPORTED; diff --git a/src/ucp/rndv/rndv_ppln.c b/src/ucp/rndv/rndv_ppln.c index 17fc9e8b16b..3fdfeeeae03 100644 --- a/src/ucp/rndv/rndv_ppln.c +++ b/src/ucp/rndv/rndv_ppln.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * Copyright (C) 2021-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -74,8 +74,7 @@ ucp_proto_rndv_ppln_probe(const ucp_proto_init_params_t *init_params) ack_params.flags |= UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING; } - if ((select_param->dt_class != UCP_DATATYPE_CONTIG) || - !ucp_proto_init_check_op(init_params, UCP_PROTO_RNDV_OP_ID_MASK) || + if (!ucp_proto_init_check_op(init_params, UCP_PROTO_RNDV_OP_ID_MASK) || !ucp_proto_common_init_check_err_handling(&ack_params) || ucp_proto_rndv_init_params_is_ppln_frag(init_params) || !ucp_proto_common_check_memtype_copy(&ack_params)) { @@ -344,6 +343,7 @@ ucp_proto_t ucp_rndv_send_ppln_proto = { .name = "rndv/send/ppln", .desc = NULL, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_rndv_send_ppln_probe, .query = ucp_proto_rndv_ppln_query, .progress = { @@ -397,6 +397,7 @@ ucp_proto_t ucp_rndv_recv_ppln_proto = { .name = "rndv/recv/ppln", .desc = NULL, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_rndv_recv_ppln_probe, .query = ucp_proto_rndv_ppln_query, .progress = { diff --git 
a/src/ucp/rndv/rndv_put.c b/src/ucp/rndv/rndv_put.c index 68989a923ef..95ed67baf42 100644 --- a/src/ucp/rndv/rndv_put.c +++ b/src/ucp/rndv/rndv_put.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -281,8 +281,7 @@ ucp_proto_rndv_put_common_probe(const ucp_proto_init_params_t *init_params, ucs_status_t status; unsigned atp_map; - if ((init_params->select_param->dt_class != UCP_DATATYPE_CONTIG) || - !ucp_proto_rndv_op_check(init_params, UCP_OP_ID_RNDV_SEND, + if (!ucp_proto_rndv_op_check(init_params, UCP_OP_ID_RNDV_SEND, support_ppln) || !ucp_proto_common_init_check_err_handling(¶ms.super)) { return; @@ -475,6 +474,7 @@ ucp_proto_t ucp_rndv_put_zcopy_proto = { .name = "rndv/put/zcopy", .desc = NULL, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_rndv_put_zcopy_probe, .query = ucp_proto_rndv_put_zcopy_query, .progress = { @@ -661,6 +661,7 @@ ucp_proto_t ucp_rndv_put_mtype_proto = { .name = "rndv/put/mtype", .desc = NULL, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_rndv_put_mtype_probe, .query = ucp_proto_rndv_put_mtype_query, .progress = { diff --git a/src/ucp/rndv/rndv_rkey_ptr.c b/src/ucp/rndv/rndv_rkey_ptr.c index 7878e44234c..f18e00c8e52 100644 --- a/src/ucp/rndv/rndv_rkey_ptr.c +++ b/src/ucp/rndv/rndv_rkey_ptr.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See file LICENSE for terms. 
*/ @@ -237,6 +237,7 @@ ucp_proto_t ucp_rndv_rkey_ptr_proto = { .name = "rndv/rkey_ptr", .desc = "copy from mapped remote memory", .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_rndv_rkey_ptr_probe, .query = ucp_proto_rndv_rkey_ptr_query, .progress = { @@ -396,7 +397,8 @@ ucp_proto_t ucp_rndv_rkey_ptr_mtype_proto = { .name = "rndv/rkey_ptr/mtype", .desc = "copy to mapped remote memory", .flags = 0, - .probe = ucp_proto_rndv_rkey_ptr_mtype_probe, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), + .probe = ucp_proto_rndv_rkey_ptr_mtype_probe, .query = ucp_proto_rndv_rkey_ptr_mtype_query, .progress = { [UCP_PROTO_RNDV_RKEY_PTR_STAGE_COPY] = ucp_proto_rndv_rkey_ptr_mtype_copy_progress, diff --git a/src/ucp/rndv/rndv_rtr.c b/src/ucp/rndv/rndv_rtr.c index 11493a511c9..5c160c7db94 100644 --- a/src/ucp/rndv/rndv_rtr.c +++ b/src/ucp/rndv/rndv_rtr.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -242,6 +242,7 @@ ucp_proto_t ucp_rndv_rtr_proto = { .name = "rndv/rtr", .desc = NULL, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_rndv_rtr_probe, .query = ucp_proto_rndv_rtr_query, .progress = {ucp_proto_rndv_rtr_progress}, @@ -478,6 +479,7 @@ ucp_proto_t ucp_rndv_rtr_mtype_proto = { .name = "rndv/rtr/mtype", .desc = NULL, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_rndv_rtr_mtype_probe, .query = ucp_proto_rndv_rtr_mtype_query, .progress = {ucp_proto_rndv_rtr_mtype_progress}, diff --git a/src/ucp/stream/stream_multi.c b/src/ucp/stream/stream_multi.c index 6ba0fafd95a..b36dcadab97 100644 --- a/src/ucp/stream/stream_multi.c +++ b/src/ucp/stream/stream_multi.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2023. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2023-2026. ALL RIGHTS RESERVED. 
* * See file LICENSE for terms. */ @@ -109,6 +109,7 @@ ucp_proto_t ucp_stream_multi_bcopy_proto = { .name = "stream/multi/bcopy", .desc = UCP_PROTO_MULTI_FRAG_DESC " " UCP_PROTO_STREAM_BCOPY_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_stream_multi_bcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_stream_multi_bcopy_progress}, @@ -184,6 +185,7 @@ ucp_proto_t ucp_stream_multi_zcopy_proto = { .name = "stream/multi/zcopy", .desc = UCP_PROTO_MULTI_FRAG_DESC " " UCP_PROTO_STREAM_ZCOPY_DESC, .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_stream_multi_zcopy_probe, .query = ucp_proto_multi_query, .progress = {ucp_stream_multi_zcopy_progress}, diff --git a/src/ucp/tag/eager_multi.c b/src/ucp/tag/eager_multi.c index 3e006c07c88..2dab0eb53cd 100644 --- a/src/ucp/tag/eager_multi.c +++ b/src/ucp/tag/eager_multi.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -134,6 +134,7 @@ ucp_proto_t ucp_eager_bcopy_multi_proto = { .name = "egr/multi/bcopy", .desc = UCP_PROTO_MULTI_FRAG_DESC " " UCP_PROTO_EAGER_BCOPY_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_eager_bcopy_multi_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_eager_bcopy_multi_progress}, @@ -217,6 +218,7 @@ ucp_proto_t ucp_eager_sync_bcopy_multi_proto = { .name = "egrsnc/multi/bcopy", .desc = UCP_PROTO_MULTI_FRAG_DESC " " UCP_PROTO_EAGER_BCOPY_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_eager_sync_bcopy_multi_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_eager_sync_bcopy_multi_progress}, @@ -292,6 +294,7 @@ ucp_proto_t ucp_eager_zcopy_multi_proto = { .name = "egr/multi/zcopy", .desc = UCP_PROTO_MULTI_FRAG_DESC " " UCP_PROTO_EAGER_ZCOPY_DESC, .flags = 0, + .dt_mask = UCP_DT_MASK_CONTIG_IOV, .probe = ucp_proto_eager_zcopy_multi_probe, .query = ucp_proto_multi_query, .progress = {ucp_proto_eager_zcopy_multi_progress}, diff --git a/src/ucp/tag/eager_single.c b/src/ucp/tag/eager_single.c index 9d66c7e8b3c..2a2d91c740a 100644 --- a/src/ucp/tag/eager_single.c +++ b/src/ucp/tag/eager_single.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -85,6 +85,7 @@ ucp_proto_t ucp_eager_short_proto = { .name = "egr/short", .desc = "eager " UCP_PROTO_SHORT_DESC, .flags = UCP_PROTO_FLAG_AM_SHORT, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_eager_short_probe, .query = ucp_proto_single_query, .progress = {ucp_eager_short_progress}, @@ -158,6 +159,7 @@ ucp_proto_t ucp_eager_bcopy_single_proto = { .name = "egr/single/bcopy", .desc = UCP_PROTO_EAGER_BCOPY_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_eager_bcopy_single_probe, .query = ucp_proto_single_query, .progress = {ucp_eager_bcopy_single_progress}, @@ -196,8 +198,7 @@ ucp_proto_eager_zcopy_single_probe(const ucp_proto_init_params_t *init_params) }; /* AM based proto can not be used if tag offload lane configured */ - if (!ucp_tag_eager_check_op_id(init_params, UCP_OP_ID_TAG_SEND, 0) || - (init_params->select_param->dt_class != UCP_DATATYPE_CONTIG)) { + if (!ucp_tag_eager_check_op_id(init_params, UCP_OP_ID_TAG_SEND, 0)) { return; } @@ -234,6 +235,7 @@ ucp_proto_t ucp_eager_zcopy_single_proto = { .name = "egr/single/zcopy", .desc = UCP_PROTO_EAGER_ZCOPY_DESC, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_eager_zcopy_single_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_eager_zcopy_single_progress}, diff --git a/src/ucp/tag/offload/eager.c b/src/ucp/tag/offload/eager.c index 84290ba5c2c..c95966e6432 100644 --- a/src/ucp/tag/offload/eager.c +++ b/src/ucp/tag/offload/eager.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2021-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -82,6 +82,7 @@ ucp_proto_t ucp_eager_tag_offload_short_proto = { .name = "egr/offload/short", .desc = UCP_PROTO_EAGER_OFFLOAD_DESC " " UCP_PROTO_SHORT_DESC, .flags = UCP_PROTO_FLAG_TAG_SHORT, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_eager_tag_offload_short_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_eager_tag_offload_short_progress}, @@ -178,6 +179,7 @@ ucp_proto_t ucp_tag_offload_eager_bcopy_single_proto = { .name = "egr/offload/bcopy", .desc = UCP_PROTO_EAGER_OFFLOAD_DESC " " UCP_PROTO_COPY_IN_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_eager_tag_offload_bcopy_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_eager_tag_offload_bcopy_progress}, @@ -218,6 +220,7 @@ ucp_proto_t ucp_eager_sync_bcopy_single_proto = { .name = "egrsnc/offload/bcopy", .desc = UCP_PROTO_EAGER_OFFLOAD_DESC " " UCP_PROTO_COPY_IN_DESC, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_proto_eager_sync_tag_offload_bcopy_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_eager_sync_tag_offload_bcopy_progress}, @@ -258,8 +261,7 @@ static void ucp_proto_eager_tag_offload_zcopy_probe_common( }; /* offload proto can not be used if no tag offload lane configured */ - if (!ucp_tag_eager_check_op_id(init_params, op_id, 1) || - (init_params->select_param->dt_class != UCP_DATATYPE_CONTIG)) { + if (!ucp_tag_eager_check_op_id(init_params, op_id, 1)) { return; } @@ -300,6 +302,7 @@ ucp_proto_t ucp_tag_offload_eager_zcopy_single_proto = { .name = "egr/offload/zcopy", .desc = UCP_PROTO_EAGER_OFFLOAD_DESC " " UCP_PROTO_ZCOPY_DESC, .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_eager_tag_offload_zcopy_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_eager_tag_offload_zcopy_progress}, @@ -361,6 +364,7 @@ ucp_proto_t ucp_eager_sync_zcopy_single_proto = { .name = "egrsnc/offload/zcopy", .desc = UCP_PROTO_EAGER_OFFLOAD_DESC " " UCP_PROTO_ZCOPY_DESC, .flags 
= 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_proto_eager_sync_tag_offload_zcopy_probe, .query = ucp_proto_single_query, .progress = {ucp_proto_eager_sync_tag_offload_zcopy_progress}, diff --git a/src/ucp/tag/offload/rndv.c b/src/ucp/tag/offload/rndv.c index c82f18249ce..109b6eeeefa 100644 --- a/src/ucp/tag/offload/rndv.c +++ b/src/ucp/tag/offload/rndv.c @@ -1,5 +1,5 @@ /** - * Copyright (C) 2023, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * Copyright (C) 2023-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -51,8 +51,7 @@ ucp_tag_rndv_offload_proto_probe(const ucp_proto_init_params_t *init_params) .tl_cap_flags = UCT_IFACE_FLAG_TAG_RNDV_ZCOPY }; - if (!ucp_tag_rndv_check_op_id(init_params) || - (init_params->select_param->dt_class != UCP_DATATYPE_CONTIG)) { + if (!ucp_tag_rndv_check_op_id(init_params)) { return; } @@ -139,6 +138,7 @@ ucp_proto_t ucp_tag_rndv_offload_proto = { .name = "tag/rndv/offload", .desc = "rendezvous tag offload", .flags = 0, + .dt_mask = UCS_BIT(UCP_DATATYPE_CONTIG), .probe = ucp_tag_rndv_offload_proto_probe, .query = ucp_proto_single_query, .progress = {ucp_tag_rndv_offload_proto_progress}, @@ -231,6 +231,7 @@ ucp_proto_t ucp_tag_rndv_offload_sw_proto = { .name = "tag/rndv/offload_sw", .desc = NULL, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_tag_rndv_offload_sw_proto_probe, .query = ucp_proto_rndv_rts_query, .progress = {ucp_tag_rndv_offload_sw_proto_progress}, diff --git a/src/ucp/tag/tag_recv.c b/src/ucp/tag/tag_recv.c index 66c04d470c1..b110a13c7e3 100644 --- a/src/ucp/tag/tag_recv.c +++ b/src/ucp/tag/tag_recv.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2015. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -200,6 +200,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_recv_nbr, }; ucs_status_ptr_t status; + /* coverity[var_deref_model] */ status = ucp_tag_recv_nbx(worker, buffer, count, tag, tag_mask, &param); return UCS_PTR_IS_ERR(status) ? UCS_PTR_STATUS(status) : UCS_OK; } @@ -218,6 +219,7 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_recv_nb, .datatype = datatype }; + /* coverity[var_deref_model] */ return ucp_tag_recv_nbx(worker, buffer, count, tag, tag_mask, &param); } @@ -263,6 +265,7 @@ ucs_status_ptr_t ucp_tag_msg_recv_nb(ucp_worker_h worker, void *buffer, size_t c .cb.recv = (ucp_tag_recv_nbx_callback_t)cb }; + /* coverity[var_deref_model] */ return ucp_tag_msg_recv_nbx(worker, buffer, count, message, &param); } diff --git a/src/ucp/tag/tag_rndv.c b/src/ucp/tag/tag_rndv.c index 3e9343c2459..4af19f96933 100644 --- a/src/ucp/tag/tag_rndv.c +++ b/src/ucp/tag/tag_rndv.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2020-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -167,6 +167,7 @@ ucp_proto_t ucp_tag_rndv_proto = { .name = "tag/rndv", .desc = NULL, .flags = 0, + .dt_mask = UCP_PROTO_DT_MASK_DEFAULT, .probe = ucp_tag_rndv_rts_probe, .query = ucp_proto_rndv_rts_query, .progress = {ucp_tag_rndv_rts_progress}, diff --git a/test/gtest/common/test_obj_size.cc b/test/gtest/common/test_obj_size.cc index 7f0c228c2cd..076cf38ab1c 100644 --- a/test/gtest/common/test_obj_size.cc +++ b/test/gtest/common/test_obj_size.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2019. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -57,7 +57,7 @@ UCS_TEST_F(test_obj_size, size) { EXPECTED_SIZE(ucp_rkey_t, 24); #endif /* TODO reduce request size to 240 or less after removing old protocols state */ - EXPECTED_SIZE(ucp_request_t, 272); + EXPECTED_SIZE(ucp_request_t, 280); EXPECTED_SIZE(ucp_recv_desc_t, 48); EXPECTED_SIZE(ucp_mem_t, 160); EXPECTED_SIZE(uct_ep_t, 8); diff --git a/test/gtest/ucp/test_ucp_dt.cc b/test/gtest/ucp/test_ucp_dt.cc index 06275c9c0e2..00953aff6e7 100644 --- a/test/gtest/ucp/test_ucp_dt.cc +++ b/test/gtest/ucp/test_ucp_dt.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2018. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -12,6 +12,7 @@ extern "C" { #include #include +#include } class test_ucp_dt_iov : public ucs::test { @@ -280,3 +281,137 @@ INSTANTIATE_TEST_SUITE_P(iov, test_ucp_dt_iter, INSTANTIATE_TEST_SUITE_P(generic, test_ucp_dt_iter, testing::ValuesIn(test_ucp_dt_iter::enum_dt_generic_params())); + +class test_ucp_dt_sgl : public ucs::test { +protected: + virtual void init() { + ucp_params_t ctx_params; + ctx_params.field_mask = UCP_PARAM_FIELD_FEATURES; + ctx_params.features = UCP_FEATURE_RMA; + UCS_TEST_CREATE_HANDLE(ucp_context_h, m_ucph, ucp_cleanup, ucp_init, + &ctx_params, NULL); + } + + virtual void cleanup() { + m_ucph.reset(); + } + + void init_sgl_iter(size_t count) { + m_buffers.resize(count); + m_lengths.resize(count); + m_remote_addrs.resize(count); + m_rkeys.resize(count); + + m_dummy_rkey = {}; + m_dummy_rkey.cfg_index = 0; +#if ENABLE_PARAMS_CHECK + m_dummy_rkey.ep = nullptr; +#endif + + for (size_t i = 0; i < count; i++) { + m_buffers[i] = &m_buffers[i]; + m_lengths[i] = (i + 1) * 64; + m_remote_addrs[i] = 0x1000 + i * 0x100; + m_rkeys[i] = &m_dummy_rkey; + } + + m_local = {}; + m_local.field_mask = UCP_DT_LOCAL_SGL_FIELD_BUFFERS | + UCP_DT_LOCAL_SGL_FIELD_LENGTHS; + m_local.buffers = m_buffers.data(); + m_local.lengths = 
m_lengths.data(); + + m_remote = {}; + m_remote.field_mask = UCP_DT_REMOTE_SGL_FIELD_REMOTE_ADDRS | + UCP_DT_REMOTE_SGL_FIELD_RKEYS; + m_remote.remote_addrs = m_remote_addrs.data(); + m_remote.rkeys = m_rkeys.data(); + + ucp_request_param_t param = {}; + param.op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_REMOTE; + param.datatype = ucp_dt_make_sgl(); + param.remote = &m_remote; + + ucs_status_t status = ucp_datatype_iter_sgl_init(m_ucph.get(), + &m_dt_iter, &m_local, + &m_remote, count, + &param); + ASSERT_UCS_OK(status); + } + +private: + ucs::handle m_ucph; + std::vector m_buffers; + std::vector m_lengths; + std::vector m_remote_addrs; + std::vector m_rkeys; + ucp_rkey_t m_dummy_rkey; + ucp_dt_local_sgl_t m_local; + ucp_dt_remote_sgl_t m_remote; + +protected: + ucp_datatype_iter_t m_dt_iter; +}; + +UCS_TEST_F(test_ucp_dt_sgl, iter_next_chunked) { + static constexpr size_t NUM_ELEMS = 10; + static constexpr size_t MAX_PER_STEP = 3; + + init_sgl_iter(NUM_ELEMS); + + size_t total_advanced = 0; + while (!ucp_datatype_iter_is_end(&m_dt_iter)) { + ucp_datatype_iter_t next_iter = {}; + size_t advanced = ucp_datatype_iter_next_sgl(&m_dt_iter, + MAX_PER_STEP, + &next_iter); + EXPECT_LE(advanced, MAX_PER_STEP); + EXPECT_GT(advanced, 0u); + total_advanced += advanced; + ucp_datatype_iter_copy_position(&m_dt_iter, &next_iter, UINT_MAX); + } + + EXPECT_EQ(NUM_ELEMS, total_advanced); + EXPECT_TRUE(ucp_datatype_iter_is_end(&m_dt_iter)); +} + +UCS_TEST_F(test_ucp_dt_sgl, iter_next_single_step) { + static constexpr size_t counts[] = {1, 5, 10}; + for (size_t count : counts) { + init_sgl_iter(count); + + ucp_datatype_iter_t next_iter = {}; + size_t advanced = ucp_datatype_iter_next_sgl(&m_dt_iter, count, + &next_iter); + EXPECT_EQ(count, advanced); + ucp_datatype_iter_copy_position(&m_dt_iter, &next_iter, UINT_MAX); + EXPECT_TRUE(ucp_datatype_iter_is_end(&m_dt_iter)); + } +} + +UCS_TEST_F(test_ucp_dt_sgl, iter_next_one_by_one) { + static constexpr size_t NUM_ELEMS = 
5; + + init_sgl_iter(NUM_ELEMS); + + for (size_t i = 0; i < NUM_ELEMS; i++) { + EXPECT_FALSE(ucp_datatype_iter_is_end(&m_dt_iter)); + ucp_datatype_iter_t next_iter = {}; + size_t advanced = ucp_datatype_iter_next_sgl(&m_dt_iter, 1, + &next_iter); + EXPECT_EQ(1u, advanced); + EXPECT_EQ(i + 1, next_iter.offset); + ucp_datatype_iter_copy_position(&m_dt_iter, &next_iter, UINT_MAX); + } + + EXPECT_TRUE(ucp_datatype_iter_is_end(&m_dt_iter)); +} + +UCS_TEST_F(test_ucp_dt_sgl, init_zero_count) { + init_sgl_iter(0); + + EXPECT_TRUE(ucp_datatype_iter_is_end(&m_dt_iter)); + EXPECT_EQ(UCS_MEMORY_TYPE_HOST, m_dt_iter.mem_info.type); + EXPECT_EQ(UCS_SYS_DEVICE_ID_UNKNOWN, m_dt_iter.mem_info.sys_dev); +} diff --git a/test/gtest/ucp/test_ucp_rma.cc b/test/gtest/ucp/test_ucp_rma.cc index e9606bafbc2..750ae1d9cbd 100755 --- a/test/gtest/ucp/test_ucp_rma.cc +++ b/test/gtest/ucp/test_ucp_rma.cc @@ -11,7 +11,9 @@ extern "C" { #include #include /* for UCP_MEM_IS_ACCESSIBLE_FROM_CPU */ #include +#include #include +#include } @@ -830,3 +832,524 @@ UCS_TEST_P(test_ucp_ep_based_fence, test_ep_based_fence_before_atomic) { } UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_ep_based_fence, all, "all") + +class test_ucp_rma_sgl : public test_ucp_rma { +public: + static void get_base_variants(std::vector& variants) { + add_variant(variants, UCP_FEATURE_RMA); + } + + static void get_test_variants(std::vector& variants) { + add_variant_memtypes(variants, get_base_variants, + UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + UCS_BIT(UCS_MEMORY_TYPE_HOST)); + } + + virtual void init() override { + modify_config("MAX_RMA_RAILS", "2"); + test_ucp_rma::init(); + } + + ucs_memory_type_t mem_type() const { + return static_cast(get_variant_value()); + } + +protected: + struct sgl_ctx { + std::vector src; + std::vector dst; + std::vector> rkey_handles; + std::vector buffers; + std::vector remote_addrs; + std::vector lengths; + std::vector remote_lengths; + std::vector memhs; + std::vector rkeys; + }; + + void init_sgl_ctx(sgl_ctx 
&ctx, const std::vector &elem_sizes) { + ucs_memory_type_t mtype = mem_type(); + size_t num = elem_sizes.size(); + + ctx.rkey_handles.resize(num); + ctx.buffers.resize(num); + ctx.remote_addrs.resize(num); + ctx.lengths.resize(num); + ctx.memhs.resize(num); + ctx.rkeys.resize(num); + ctx.src.reserve(num); + ctx.dst.reserve(num); + + for (size_t i = 0; i < num; i++) { + ctx.src.emplace_back(elem_sizes[i], sender(), 0, mtype); + ctx.dst.emplace_back(elem_sizes[i], receiver(), 0, mtype); + } + + for (size_t i = 0; i < num; i++) { + ctx.src[i].memset(static_cast(i + 1)); + ctx.dst[i].memset(0); + ctx.dst[i].rkey(sender(), ctx.rkey_handles[i]); + + ctx.buffers[i] = ctx.src[i].ptr(); + ctx.memhs[i] = ctx.src[i].memh(); + ctx.remote_addrs[i] = reinterpret_cast(ctx.dst[i].ptr()); + ctx.lengths[i] = elem_sizes[i]; + ctx.rkeys[i] = ctx.rkey_handles[i]; + } + + ctx.remote_lengths = ctx.lengths; + } + + void init_sgl_ctx(sgl_ctx &ctx, size_t num_elems, size_t buf_size) { + init_sgl_ctx(ctx, std::vector(num_elems, buf_size)); + } + + void init_sgl_ctx_mixed_mem_types(sgl_ctx &ctx) { + static constexpr size_t buf_size = 64; + static constexpr size_t num = 2; + const ucs_memory_type_t types[] = {UCS_MEMORY_TYPE_HOST, + UCS_MEMORY_TYPE_CUDA}; + + ctx.rkey_handles.resize(num); + ctx.buffers.resize(num); + ctx.remote_addrs.resize(num); + ctx.lengths.resize(num); + ctx.memhs.resize(num); + ctx.rkeys.resize(num); + ctx.src.reserve(num); + ctx.dst.reserve(num); + + for (size_t i = 0; i < num; i++) { + ctx.src.emplace_back(buf_size, sender(), 0, types[i]); + ctx.dst.emplace_back(buf_size, receiver(), 0, types[i]); + } + + for (size_t i = 0; i < num; i++) { + ctx.src[i].memset(static_cast(i + 1)); + ctx.dst[i].memset(0); + ctx.dst[i].rkey(sender(), ctx.rkey_handles[i]); + + ctx.buffers[i] = ctx.src[i].ptr(); + ctx.memhs[i] = ctx.src[i].memh(); + ctx.remote_addrs[i] = reinterpret_cast(ctx.dst[i].ptr()); + ctx.lengths[i] = buf_size; + ctx.rkeys[i] = ctx.rkey_handles[i]; + } + + 
ctx.remote_lengths = ctx.lengths; + } + + static ucp_dt_local_sgl_t + make_local_sgl(sgl_ctx &ctx, uint64_t field_mask) { + ucp_dt_local_sgl_t sgl = {}; + sgl.field_mask = field_mask; + sgl.buffers = ctx.buffers.data(); + sgl.lengths = ctx.lengths.data(); + if (field_mask & UCP_DT_LOCAL_SGL_FIELD_MEMHS) { + sgl.memhs = ctx.memhs.data(); + } + return sgl; + } + + static ucp_dt_remote_sgl_t + make_remote_sgl(sgl_ctx &ctx, uint64_t field_mask) { + ucp_dt_remote_sgl_t sgl = {}; + sgl.field_mask = field_mask; + sgl.remote_addrs = ctx.remote_addrs.data(); + sgl.lengths = ctx.remote_lengths.data(); + sgl.rkeys = ctx.rkeys.data(); + return sgl; + } + + ucp_request_param_t + make_sgl_param(ucp_dt_remote_sgl_t *remote, size_t remote_count, + uint32_t extra_mask = 0) { + ucp_request_param_t param = {}; + param.op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_REMOTE_DATATYPE | + UCP_OP_ATTR_FIELD_REMOTE | + UCP_OP_ATTR_FIELD_REMOTE_COUNT | + extra_mask; + param.datatype = ucp_dt_make_sgl(); + param.remote_datatype = ucp_dt_make_sgl(); + param.remote = remote; + param.remote_count = remote_count; + return param; + } + + void test_put_sgl(const std::vector &elem_sizes, + bool use_memhs = true, bool use_callback = false, + bool set_remote_count = true, + bool expect_immediate_completion = false) { + ASSERT_FALSE(expect_immediate_completion && use_callback); + + if (!sender().has_lane_with_caps(0, + UCT_IFACE_FLAG_V2_PUT_SGL_ZCOPY)) { + UCS_TEST_SKIP_R("put_sgl_zcopy is not supported"); + } + + sgl_ctx ctx; + init_sgl_ctx(ctx, elem_sizes); + + uint64_t local_mask = LOCAL_MASK_DEFAULT; + if (use_memhs) { + local_mask |= UCP_DT_LOCAL_SGL_FIELD_MEMHS; + } + + size_t num = elem_sizes.size(); + ucp_dt_local_sgl_t local = make_local_sgl(ctx, local_mask); + ucp_dt_remote_sgl_t remote = make_remote_sgl(ctx, REMOTE_MASK_DEFAULT); + ucp_request_param_t param = make_sgl_param(&remote, num); + + if (!set_remote_count) { + param.op_attr_mask &= ~UCP_OP_ATTR_FIELD_REMOTE_COUNT; + 
} + + struct cb_state { + bool completed; + ucs_status_t status; + } cb = {false, UCS_INPROGRESS}; + + if (use_callback) { + param.op_attr_mask |= UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA; + param.cb.send = [](void *request, ucs_status_t status, + void *user_data) { + cb_state *s = static_cast(user_data); + s->status = status; + s->completed = true; + }; + param.user_data = &cb; + } + + ucs_status_ptr_t sptr = ucp_put_nbx(sender().ep(), &local, num, + UCP_REMOTE_ADDR_INVALID, + UCP_RKEY_INVALID, &param); + if (expect_immediate_completion) { + EXPECT_FALSE(UCS_PTR_IS_ERR(sptr)); + EXPECT_FALSE(UCS_PTR_IS_PTR(sptr)); + EXPECT_EQ(UCS_OK, UCS_PTR_STATUS(sptr)); + return; + } + + ASSERT_TRUE(UCS_PTR_IS_PTR(sptr)); + + auto verify_sgl_put_buffers = [&]() { + ucs_memory_type_t mtype = mem_type(); + for (size_t i = 0; i < num; i++) { + uint8_t expected = static_cast(i + 1); + std::vector host_buf(ctx.lengths[i]); + mem_buffer::copy_from(host_buf.data(), ctx.dst[i].ptr(), + ctx.lengths[i], mtype); + for (size_t j = 0; j < ctx.lengths[i]; j++) { + ASSERT_EQ(expected, host_buf[j]) + << "Mismatch at element " << i << " byte " << j; + } + } + }; + + if (use_callback) { + while (!cb.completed) { + ucp_worker_progress(sender().worker()); + ucp_worker_progress(receiver().worker()); + } + EXPECT_UCS_OK(cb.status); + } else { + while (!ucp_request_is_completed(sptr)) { + ucp_worker_progress(sender().worker()); + ucp_worker_progress(receiver().worker()); + } + } + + ucp_request_release(sptr); + flush_ep(sender()); + verify_sgl_put_buffers(); + } + + void test_put_sgl(size_t num_elems, size_t buf_size, + bool use_memhs = true, bool use_callback = false, + bool set_remote_count = true, + bool expect_immediate_completion = false) { + test_put_sgl(std::vector(num_elems, buf_size), + use_memhs, use_callback, set_remote_count, + expect_immediate_completion); + } + + static constexpr uint64_t LOCAL_MASK_DEFAULT = + UCP_DT_LOCAL_SGL_FIELD_BUFFERS | 
UCP_DT_LOCAL_SGL_FIELD_LENGTHS; + + static constexpr uint64_t REMOTE_MASK_DEFAULT = + UCP_DT_REMOTE_SGL_FIELD_REMOTE_ADDRS | + UCP_DT_REMOTE_SGL_FIELD_LENGTHS | + UCP_DT_REMOTE_SGL_FIELD_RKEYS; + + void expect_sgl_put_status_ctx( + sgl_ctx &ctx, uint64_t local_mask, uint64_t remote_mask, + size_t count, ucs_status_t expected_status, + uint32_t extra_param_mask = 0, + uint64_t remote_addr = UCP_REMOTE_ADDR_INVALID, + ucp_rkey_h rkey = UCP_RKEY_INVALID, + uint32_t clear_param_mask = 0, bool null_remote = false, + size_t remote_count = 0) + { + size_t effective_remote_count = remote_count ? remote_count : count; + ucp_dt_local_sgl_t local = make_local_sgl(ctx, local_mask); + ucp_dt_remote_sgl_t remote = make_remote_sgl(ctx, remote_mask); + ucp_request_param_t param = make_sgl_param(&remote, + effective_remote_count, + extra_param_mask); + param.op_attr_mask &= ~clear_param_mask; + + if (null_remote) { + param.remote = nullptr; + } + + scoped_log_handler wrap_err(wrap_errors_logger); + ucs_status_ptr_t sptr = ucp_put_nbx(sender().ep(), &local, count, + remote_addr, rkey, &param); + EXPECT_EQ(expected_status, UCS_PTR_STATUS(sptr)); + } + + void expect_sgl_put_invalid_param_ctx( + sgl_ctx &ctx, uint64_t local_mask, uint64_t remote_mask, + size_t count, uint64_t remote_addr = UCP_REMOTE_ADDR_INVALID, + ucp_rkey_h rkey = UCP_RKEY_INVALID, + uint32_t clear_param_mask = 0, bool null_remote = false, + size_t remote_count = 0) + { + expect_sgl_put_status_ctx(ctx, local_mask, remote_mask, count, + UCS_ERR_INVALID_PARAM, 0, remote_addr, rkey, + clear_param_mask, null_remote, remote_count); + } + + void expect_sgl_put_invalid_param(uint64_t local_mask, + uint64_t remote_mask, + uint64_t remote_addr = UCP_REMOTE_ADDR_INVALID, + ucp_rkey_h rkey = UCP_RKEY_INVALID, + uint32_t clear_param_mask = 0, + bool null_remote = false, + size_t count = 2, + size_t remote_count = 0) + { + sgl_ctx ctx; + init_sgl_ctx(ctx, count, 64); + expect_sgl_put_invalid_param_ctx(ctx, local_mask, remote_mask, 
count, + remote_addr, rkey, clear_param_mask, + null_remote, remote_count); + } +}; + +UCS_TEST_P(test_ucp_rma_sgl, put_various_counts) { + static const size_t counts[] = {1, 2, 4, 10, 1024}; + for (size_t count : counts) { + test_put_sgl(count, 2 * UCS_KBYTE); + if (HasFailure() || (num_errors() > 0)) { + break; + } + } + + if (!RUNNING_ON_VALGRIND && !HasFailure() && (num_errors() == 0)) { + test_put_sgl(1000, 2 * UCS_KBYTE); + if (!HasFailure() && (num_errors() == 0)) { + test_put_sgl(4096, 2 * UCS_KBYTE); + } + } +} + +UCS_TEST_P(test_ucp_rma_sgl, put_various_sizes) { + static const size_t buf_sizes[] = {1, 256, UCS_KBYTE, 4 * UCS_KBYTE}; + for (size_t buf_size : buf_sizes) { + test_put_sgl(4, buf_size); + if (HasFailure() || (num_errors() > 0)) { + break; + } + } +} + +UCS_TEST_P(test_ucp_rma_sgl, put_various_lengths) { + test_put_sgl({64, 256, UCS_KBYTE, 4 * UCS_KBYTE, 512}); +} + +UCS_TEST_P(test_ucp_rma_sgl, put_with_callback) { + test_put_sgl(10, UCS_KBYTE, true, true); +} + +UCS_TEST_P(test_ucp_rma_sgl, put_no_memhs) { + test_put_sgl(4, 2 * UCS_KBYTE, false); +} + +UCS_TEST_P(test_ucp_rma_sgl, put_no_remote_count) { + test_put_sgl(4, 2 * UCS_KBYTE, true, false, false); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_multi_rail, + RUNNING_ON_VALGRIND) { + static const char *rail_counts[] = {"1", "4", "6", "8"}; + for (const char *rails : rail_counts) { + cleanup(); + modify_config("MAX_RMA_RAILS", rails); + test_ucp_rma::init(); + test_put_sgl(100, 2 * UCS_KBYTE); + if (HasFailure() || (num_errors() > 0)) { + break; + } + } +} + +UCS_TEST_P(test_ucp_rma_sgl, put_force_imm_cmpl) { + static constexpr size_t NUM_ELEMS = 4; + + sgl_ctx ctx; + init_sgl_ctx(ctx, NUM_ELEMS, UCS_KBYTE); + expect_sgl_put_status_ctx( + ctx, LOCAL_MASK_DEFAULT | UCP_DT_LOCAL_SGL_FIELD_MEMHS, + REMOTE_MASK_DEFAULT, NUM_ELEMS, UCS_ERR_NO_RESOURCE, + UCP_OP_ATTR_FLAG_FORCE_IMM_CMPL); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_invalid_remote_addr, + !ENABLE_PARAMS_CHECK) { + 
expect_sgl_put_invalid_param(LOCAL_MASK_DEFAULT, REMOTE_MASK_DEFAULT, + 0x1234, UCP_RKEY_INVALID); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_invalid_rkey, + !ENABLE_PARAMS_CHECK) { + sgl_ctx ctx; + init_sgl_ctx(ctx, 2, 64); + expect_sgl_put_invalid_param_ctx(ctx, LOCAL_MASK_DEFAULT, + REMOTE_MASK_DEFAULT, 2, + UCP_REMOTE_ADDR_INVALID, ctx.rkeys[0]); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_missing_remote_field, + !ENABLE_PARAMS_CHECK) { + expect_sgl_put_invalid_param(LOCAL_MASK_DEFAULT, REMOTE_MASK_DEFAULT, + UCP_REMOTE_ADDR_INVALID, UCP_RKEY_INVALID, + UCP_OP_ATTR_FIELD_REMOTE); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_null_remote, + !ENABLE_PARAMS_CHECK) { + expect_sgl_put_invalid_param(LOCAL_MASK_DEFAULT, REMOTE_MASK_DEFAULT, + UCP_REMOTE_ADDR_INVALID, UCP_RKEY_INVALID, + 0, true); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_missing_local_buffers, + !ENABLE_PARAMS_CHECK) { + expect_sgl_put_invalid_param(UCP_DT_LOCAL_SGL_FIELD_LENGTHS, + REMOTE_MASK_DEFAULT); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_missing_local_lengths, + !ENABLE_PARAMS_CHECK) { + expect_sgl_put_invalid_param(UCP_DT_LOCAL_SGL_FIELD_BUFFERS, + REMOTE_MASK_DEFAULT); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_missing_remote_addrs, + !ENABLE_PARAMS_CHECK) { + expect_sgl_put_invalid_param(LOCAL_MASK_DEFAULT, + UCP_DT_REMOTE_SGL_FIELD_LENGTHS | + UCP_DT_REMOTE_SGL_FIELD_RKEYS); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_missing_remote_rkeys, + !ENABLE_PARAMS_CHECK) { + expect_sgl_put_invalid_param(LOCAL_MASK_DEFAULT, + UCP_DT_REMOTE_SGL_FIELD_REMOTE_ADDRS | + UCP_DT_REMOTE_SGL_FIELD_LENGTHS); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_missing_remote_lengths, + !ENABLE_PARAMS_CHECK) { + expect_sgl_put_invalid_param(LOCAL_MASK_DEFAULT, + UCP_DT_REMOTE_SGL_FIELD_REMOTE_ADDRS | + UCP_DT_REMOTE_SGL_FIELD_RKEYS); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_count_mismatch, + !ENABLE_PARAMS_CHECK) { + expect_sgl_put_invalid_param(LOCAL_MASK_DEFAULT, 
REMOTE_MASK_DEFAULT, + UCP_REMOTE_ADDR_INVALID, UCP_RKEY_INVALID, + 0, false, 4, 3); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_mixed_mem_types, + !ENABLE_PARAMS_CHECK || + !mem_buffer::is_mem_type_supported( + UCS_MEMORY_TYPE_CUDA)) { + sgl_ctx ctx; + init_sgl_ctx_mixed_mem_types(ctx); + expect_sgl_put_invalid_param_ctx(ctx, LOCAL_MASK_DEFAULT, + REMOTE_MASK_DEFAULT, 2); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_memhs_null_memh, + !ENABLE_PARAMS_CHECK) { + sgl_ctx ctx; + init_sgl_ctx(ctx, 2, 64); + ctx.memhs[1] = NULL; + expect_sgl_put_invalid_param_ctx( + ctx, LOCAL_MASK_DEFAULT | UCP_DT_LOCAL_SGL_FIELD_MEMHS, + REMOTE_MASK_DEFAULT, 2); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_memhs_buffer_out_of_range, + !ENABLE_PARAMS_CHECK) { + sgl_ctx ctx; + init_sgl_ctx(ctx, 2, 64); + ctx.buffers[1] = reinterpret_cast(0xdeadbeefUL); + expect_sgl_put_invalid_param_ctx( + ctx, LOCAL_MASK_DEFAULT | UCP_DT_LOCAL_SGL_FIELD_MEMHS, + REMOTE_MASK_DEFAULT, 2); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_memhs_inconsistent_mem_info, + !ENABLE_PARAMS_CHECK || + !mem_buffer::is_mem_type_supported( + UCS_MEMORY_TYPE_CUDA)) { + sgl_ctx ctx; + init_sgl_ctx_mixed_mem_types(ctx); + expect_sgl_put_invalid_param_ctx( + ctx, LOCAL_MASK_DEFAULT | UCP_DT_LOCAL_SGL_FIELD_MEMHS, + REMOTE_MASK_DEFAULT, 2); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_rkeys_null, + !ENABLE_PARAMS_CHECK) { + sgl_ctx ctx; + init_sgl_ctx(ctx, 2, 64); + ctx.rkeys[1] = NULL; + expect_sgl_put_invalid_param_ctx(ctx, LOCAL_MASK_DEFAULT, + REMOTE_MASK_DEFAULT, 2); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_rkeys_mismatched_cfg, + !ENABLE_PARAMS_CHECK) { + sgl_ctx ctx; + init_sgl_ctx(ctx, 2, 64); + + ucp_worker_cfg_index_t saved_cfg_index = ctx.rkeys[1]->cfg_index; + ctx.rkeys[1]->cfg_index = saved_cfg_index + 1; + expect_sgl_put_invalid_param_ctx(ctx, LOCAL_MASK_DEFAULT, + REMOTE_MASK_DEFAULT, 2); + ctx.rkeys[1]->cfg_index = saved_cfg_index; +} + +UCS_TEST_P(test_ucp_rma_sgl, 
put_zero_count) { + test_put_sgl(0, 64, true, false, true, true); +} + +UCS_TEST_SKIP_COND_P(test_ucp_rma_sgl, put_without_proto, + !ENABLE_PARAMS_CHECK, "PROTO_ENABLE=n") +{ + sgl_ctx ctx; + init_sgl_ctx(ctx, 2, 64); + expect_sgl_put_status_ctx( + ctx, LOCAL_MASK_DEFAULT | UCP_DT_LOCAL_SGL_FIELD_MEMHS, + REMOTE_MASK_DEFAULT, 2, UCS_ERR_UNSUPPORTED); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_rma_sgl, all, "all") diff --git a/test/gtest/ucp/ucp_test.cc b/test/gtest/ucp/ucp_test.cc index d1f849f7734..e73bbe52b81 100644 --- a/test/gtest/ucp/ucp_test.cc +++ b/test/gtest/ucp/ucp_test.cc @@ -1,5 +1,5 @@ /** -* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2014. ALL RIGHTS RESERVED. +* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -1170,17 +1170,32 @@ void ucp_test_base::entity::ep_destructor(ucp_ep_h ep, entity *e) ucp_request_release(req); } -bool ucp_test_base::entity::has_lane_with_caps(uint64_t caps) const +bool ucp_test_base::entity::has_lane_with_caps(uint64_t caps, + uint64_t v2_caps) const { ucp_ep_h ep = this->ep(); ucp_worker_h worker = this->worker(); ucp_lane_index_t lane; uct_iface_attr_t *iface_attr; + uct_iface_attr_v2_t iface_attr_v2; for (lane = 0; lane < ucp_ep_config(ep)->key.num_lanes; lane++) { iface_attr = ucp_worker_iface_get_attr(worker, ucp_ep_get_rsc_index(ep, lane)); - if (ucs_test_all_flags(iface_attr->cap.flags, caps)) { + if (!ucs_test_all_flags(iface_attr->cap.flags, caps)) { + continue; + } + + if (v2_caps == 0) { + return true; + } + + iface_attr_v2.field_mask = UCT_IFACE_ATTR_FIELD_CAP_FLAGS; + ASSERT_UCS_OK(uct_iface_query_v2( + ucp_worker_iface(worker, + ucp_ep_get_rsc_index(ep, lane))->iface, + &iface_attr_v2)); + if (ucs_test_all_flags(iface_attr_v2.cap.flags, v2_caps)) { return true; } } diff --git a/test/gtest/ucp/ucp_test.h b/test/gtest/ucp/ucp_test.h index 190b93c4470..53bc2c830e9 100644 --- a/test/gtest/ucp/ucp_test.h +++ b/test/gtest/ucp/ucp_test.h @@ 
-1,5 +1,5 @@ /** -* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2014. ALL RIGHTS RESERVED. +* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2001-2026. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -154,7 +154,8 @@ class ucp_test_base : public ucs::test_base { static void ep_destructor(ucp_ep_h ep, entity *e); - bool has_lane_with_caps(uint64_t caps) const; + bool has_lane_with_caps(uint64_t caps, + uint64_t v2_caps = 0) const; bool is_rndv_put_ppln_supported() const;