Skip to content

Commit c2855a4

Browse files
authored
UCT: Shorter info about unreachable address (#11228)
1 parent fe8d828 commit c2855a4

File tree

18 files changed

+144
-135
lines changed

18 files changed

+144
-135
lines changed

src/ucs/sys/sys.c

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,8 @@ uint32_t ucs_file_checksum(const char *filename)
166166
return crc;
167167
}
168168

169-
ucs_status_t ucs_ifname_to_index(const char *ndev_name, unsigned *ndev_index_p)
169+
ucs_status_t
170+
ucs_ifname_to_ndev_index(const char *ndev_name, unsigned *ndev_index_p)
170171
{
171172
unsigned ndev_index = if_nametoindex(ndev_name);
172173
if (ndev_index == 0) {
@@ -178,6 +179,33 @@ ucs_status_t ucs_ifname_to_index(const char *ndev_name, unsigned *ndev_index_p)
178179
return UCS_OK;
179180
}
180181

182+
/*
 * Write a printable name for network interface ndev_index into ndev_name and
 * return ndev_name. When if_indextoname() cannot resolve the index, the
 * placeholder "ndev[<index>]" is written instead, so the returned string can
 * always be used directly in a log/info message. max is passed as the bound
 * for both writes into ndev_name.
 */
const char *
183+
ucs_ndev_index_to_ifname(unsigned ndev_index, char *ndev_name, size_t max)
184+
{
185+
/* Resolve into a local IFNAMSIZ buffer first: the caller's buffer may be
 * smaller than what if_indextoname() requires */
char tmp_ndev_name[IFNAMSIZ];
186+
187+
if (if_indextoname(ndev_index, tmp_ndev_name) == NULL) {
188+
/* Unknown index: fall back to a placeholder that still shows the index */
snprintf(ndev_name, max, "ndev[%u]", ndev_index);
189+
} else {
190+
/* NOTE(review): presumably a bounded, NUL-terminating copy - confirm
 * against the ucs_strncpy_safe definition */
ucs_strncpy_safe(ndev_name, tmp_ndev_name, max);
191+
}
192+
193+
return ndev_name;
194+
}
195+
196+
/*
 * Return the interface index of the "lo" device through ndev_index_p.
 * The lookup is performed on the first call only; its result and status are
 * kept in function-local statics and returned as-is by later calls.
 * *ndev_index_p is always written, also when the returned status is an error.
 *
 * NOTE(review): the statics are read and written without any synchronization
 * visible here - confirm that concurrent first calls are not a concern.
 */
ucs_status_t ucs_get_loopback_ndev_index(unsigned *ndev_index_p)
197+
{
198+
/* UINT_MAX / UCS_ERR_LAST serve as "lookup not attempted yet" markers */
static unsigned lo_ndev_index = UINT_MAX;
199+
static ucs_status_t init_status = UCS_ERR_LAST;
200+
201+
if ((init_status == UCS_ERR_LAST) && (lo_ndev_index == UINT_MAX)) {
202+
/* The status is cached too, so a failed lookup is not retried (assuming
 * the lookup never returns UCS_ERR_LAST itself - TODO confirm) */
init_status = ucs_ifname_to_ndev_index("lo", &lo_ndev_index);
203+
}
204+
205+
/* NOTE(review): after a failed lookup this presumably still holds UINT_MAX;
 * verify that ucs_ifname_to_ndev_index leaves its output untouched on error */
*ndev_index_p = lo_ndev_index;
206+
return init_status;
207+
}
208+
181209
static uint64_t ucs_get_mac_address()
182210
{
183211
static uint64_t mac_address = 0;
@@ -1411,9 +1439,10 @@ void ucs_sys_cpuset_copy(ucs_cpu_set_t *dst, const ucs_sys_cpuset_t *src)
14111439
}
14121440
}
14131441

1414-
ucs_sys_ns_t ucs_sys_get_default_ns(ucs_sys_namespace_type_t ns)
1442+
ucs_sys_ns_t ucs_sys_get_default_ns(ucs_sys_namespace_type_t name)
14151443
{
1416-
return (ns < UCS_SYS_NS_TYPE_LAST) ? ucs_sys_namespace_info[ns].dflt : 0;
1444+
return (name < UCS_SYS_NS_TYPE_LAST) ? ucs_sys_namespace_info[name].dflt :
1445+
0;
14171446
}
14181447

14191448
ucs_sys_ns_t ucs_sys_get_ns(ucs_sys_namespace_type_t ns)

src/ucs/sys/sys.h

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,28 @@ uint32_t ucs_file_checksum(const char *filename);
189189
/**
190190
* Get interface index for a given interface name.
191191
*/
192-
ucs_status_t ucs_ifname_to_index(const char *ndev_name, unsigned *ndev_index_p);
192+
ucs_status_t
193+
ucs_ifname_to_ndev_index(const char *ndev_name, unsigned *ndev_index_p);
194+
195+
196+
/**
197+
* Get interface name for a given interface index.
198+
* Wrapper around if_indextoname.
199+
*
200+
* @param [in] ndev_index Interface index.
201+
* @param [out] ndev_name Interface name.
202+
* @param [in] max Size of the ndev_name buffer, in bytes.
203+
*
204+
* @return ndev_name, holding the interface name, or "ndev[<index>]" if the index could not be resolved.
205+
*/
206+
const char *
207+
ucs_ndev_index_to_ifname(unsigned ndev_index, char *ndev_name, size_t max);
208+
209+
210+
/**
211+
* Get interface index for the loopback interface.
212+
*/
213+
ucs_status_t ucs_get_loopback_ndev_index(unsigned *ndev_index_p);
193214

194215

195216
/**
@@ -587,11 +608,11 @@ ucs_sys_ns_t ucs_sys_get_ns(ucs_sys_namespace_type_t name);
587608
/**
588609
* Get default namespace value for a given namespace type.
589610
*
590-
* @param [in] type Namespace type to get default value for
611+
* @param [in] name Namespace to get default value for
591612
*
592613
* @return default namespace value or 0 if type is not supported
593614
*/
594-
ucs_sys_ns_t ucs_sys_get_default_ns(ucs_sys_namespace_type_t type);
615+
ucs_sys_ns_t ucs_sys_get_default_ns(ucs_sys_namespace_type_t name);
595616

596617

597618
/**

src/uct/base/uct_iface.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -920,7 +920,7 @@ int uct_iface_local_is_reachable(uct_iface_local_addr_ns_t *addr_ns,
920920
const uct_iface_is_reachable_params_t *params);
921921

922922
void uct_iface_fill_info_str_buf(const uct_iface_is_reachable_params_t *params,
923-
const char *fmt, ...);
923+
const char *fmt, ...) UCS_F_PRINTF(2, 3);
924924

925925
int uct_iface_is_reachable_params_valid(
926926
const uct_iface_is_reachable_params_t *params, uint64_t flags);

src/uct/cuda/cuda_copy/cuda_copy_iface.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,7 @@ static int uct_cuda_copy_iface_is_reachable_v2(
8787
}
8888

8989
if (iface->id != *addr) {
90-
uct_iface_fill_info_str_buf(
91-
params, "different iface id %"PRIx64" vs %"PRIx64"",
92-
iface->id, *addr);
90+
uct_iface_fill_info_str_buf(params, "iface mismatch");
9391
return 0;
9492
}
9593

src/uct/cuda/cuda_ipc/cuda_ipc_iface.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface,
147147
return uct_iface_scope_is_reachable(tl_iface, params);
148148
}
149149

150-
uct_iface_fill_info_str_buf(params, "MNNVL is not supported");
150+
uct_iface_fill_info_str_buf(params, "different machine and no MNNVL");
151151
return 0;
152152
}
153153

src/uct/cuda/gdr_copy/gdr_copy_iface.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,7 @@ uct_gdr_copy_iface_is_reachable_v2(const uct_iface_h tl_iface,
8080
}
8181

8282
if (iface->id != *addr) {
83-
uct_iface_fill_info_str_buf(params,
84-
"different iface id %"PRIx64" vs %"PRIx64"",
85-
iface->id, *addr);
83+
uct_iface_fill_info_str_buf(params, "iface mismatch");
8684
return 0;
8785
}
8886

src/uct/ib/base/ib_device.c

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
#include <rdma/rdma_netlink.h>
3232
#endif
3333

34-
#define UCT_IB_DEVICE_LOOPBACK_NDEV_INDEX_INVALID 0
35-
3634

3735
/* This table is according to "Encoding for RNR NAK Timer Field"
3836
* in IBTA specification */
@@ -1594,22 +1592,6 @@ uct_ib_device_get_roce_ndev_name(uct_ib_device_t *dev, uint8_t port_num,
15941592
return UCS_OK;
15951593
}
15961594

1597-
ucs_status_t uct_ib_iface_get_loopback_ndev_index(unsigned *ndev_index_p)
1598-
{
1599-
static unsigned loopback_ndev_index = UCT_IB_DEVICE_LOOPBACK_NDEV_INDEX_INVALID;
1600-
ucs_status_t status;
1601-
1602-
if (loopback_ndev_index == UCT_IB_DEVICE_LOOPBACK_NDEV_INDEX_INVALID) {
1603-
status = ucs_ifname_to_index("lo", &loopback_ndev_index);
1604-
if (status != UCS_OK) {
1605-
return status;
1606-
}
1607-
}
1608-
1609-
*ndev_index_p = loopback_ndev_index;
1610-
return UCS_OK;
1611-
}
1612-
16131595
ucs_status_t
16141596
uct_ib_device_get_roce_ndev_index(uct_ib_device_t *dev, uint8_t port_num,
16151597
uint8_t gid_index, unsigned *ndev_index_p)
@@ -1640,7 +1622,7 @@ uct_ib_device_get_roce_ndev_index(uct_ib_device_t *dev, uint8_t port_num,
16401622
goto out_unlock;
16411623
}
16421624

1643-
status = ucs_ifname_to_index(ndev_name, &ndev_index);
1625+
status = ucs_ifname_to_ndev_index(ndev_name, &ndev_index);
16441626
if (status != UCS_OK) {
16451627
goto out_unlock;
16461628
}

src/uct/ib/base/ib_device.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -412,8 +412,6 @@ ucs_status_t uct_ib_device_get_roce_ndev_name(uct_ib_device_t *dev,
412412
uint8_t gid_index,
413413
char *ndev_name, size_t max);
414414

415-
ucs_status_t uct_ib_iface_get_loopback_ndev_index(unsigned *ndev_index_p);
416-
417415
ucs_status_t
418416
uct_ib_device_get_roce_ndev_index(uct_ib_device_t *dev, uint8_t port_num,
419417
uint8_t gid_index, unsigned *ndev_index_p);

src/uct/ib/base/ib_iface.c

Lines changed: 51 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <ucs/time/time.h>
2626
#include <ucs/sys/netlink.h>
2727
#include <ucs/sys/sock.h>
28+
#include <net/if.h>
2829
#include <string.h>
2930
#include <stdlib.h>
3031
#include <poll.h>
@@ -699,14 +700,14 @@ uct_ib_iface_roce_is_routable(uct_ib_iface_t *iface, uint8_t gid_index,
699700
{
700701
uct_ib_device_t *dev = uct_ib_iface_device(iface);
701702
uint8_t port_num = iface->config.port_num;
703+
char ndev_ifname[IFNAMSIZ], lo_ifname[IFNAMSIZ];
702704
char remote_str[128];
703705
unsigned ndev_index, lo_ndev_index;
704706

705707
if (uct_ib_device_get_roce_ndev_index(dev, port_num, gid_index,
706708
&ndev_index) != UCS_OK) {
707709
uct_iface_fill_info_str_buf(params,
708-
"iface index is not found for "
709-
UCT_IB_IFACE_FMT ", gid index %u",
710+
"no ndev for " UCT_IB_IFACE_FMT " gid[%u]",
710711
UCT_IB_IFACE_ARG(iface), gid_index);
711712
return 0;
712713
}
@@ -720,23 +721,23 @@ uct_ib_iface_roce_is_routable(uct_ib_iface_t *iface, uint8_t gid_index,
720721
* because it may be used for routing in case of an interface with
721722
* VRF is configured and a RoCE IP interface uses this VRF table for
722723
* routing. */
723-
if ((uct_ib_iface_get_loopback_ndev_index(&lo_ndev_index) == UCS_OK) &&
724+
if ((ucs_get_loopback_ndev_index(&lo_ndev_index) == UCS_OK) &&
724725
ucs_netlink_route_exists(lo_ndev_index, sa_remote, NULL)) {
725726
ucs_trace(UCT_IB_IFACE_FMT ": found specific route via loopback to %s",
726727
UCT_IB_IFACE_ARG(iface),
727728
ucs_sockaddr_str(sa_remote, remote_str, sizeof(remote_str)));
728729
return 1;
729730
}
730731

731-
uct_iface_fill_info_str_buf(params,
732-
"remote address %s is not routable "
733-
"neither by interface " UCT_IB_IFACE_FMT
734-
" (ifname_index=%u) nor by loopback "
735-
"interface (ifname_index=%u)",
736-
ucs_sockaddr_str(sa_remote, remote_str,
737-
sizeof(remote_str)),
738-
UCT_IB_IFACE_ARG(iface), ndev_index,
739-
lo_ndev_index);
732+
uct_iface_fill_info_str_buf(
733+
params, "no route to %s from %s (idx %u) or %s (idx %u)",
734+
ucs_sockaddr_str(sa_remote, remote_str, sizeof(remote_str)),
735+
ucs_ndev_index_to_ifname(ndev_index, ndev_ifname,
736+
sizeof(ndev_ifname)),
737+
ndev_index,
738+
ucs_ndev_index_to_ifname(lo_ndev_index, lo_ifname,
739+
sizeof(lo_ifname)),
740+
lo_ndev_index);
740741
return 0;
741742
}
742743

@@ -762,11 +763,9 @@ uct_ib_iface_roce_is_local_subnet(int prefix_bits,
762763

763764
if (!matched) {
764765
uct_iface_fill_info_str_buf(
765-
params,
766-
"IP addresses do not match with a %u-bit prefix. local IP"
767-
" is %s, remote IP is %s",
768-
prefix_bits, ucs_sockaddr_str(sa_local, local_str, 128),
769-
ucs_sockaddr_str(sa_remote, remote_str, 128));
766+
params, "subnet local %s/%u remote %s/%u",
767+
ucs_sockaddr_str(sa_local, local_str, 128), prefix_bits,
768+
ucs_sockaddr_str(sa_remote, remote_str, 128), prefix_bits);
770769
}
771770

772771
return matched;
@@ -780,7 +779,8 @@ uct_ib_iface_roce_is_reachable(uct_ib_iface_t *iface,
780779
uct_ib_device_gid_info_t local_gid_info = iface->gid_info;
781780
sa_family_t local_ib_addr_af = local_gid_info.roce_info.addr_family;
782781
uct_ib_roce_version_t local_roce_ver = local_gid_info.roce_info.ver;
783-
uint8_t remote_ib_addr_flags = remote_ib_addr->flags;
782+
const union ibv_gid *remote_gid = (union ibv_gid*)(remote_ib_addr + 1);
783+
uint8_t remote_ib_addr_flags = remote_ib_addr->flags;
784784
struct sockaddr_storage sa_local, sa_remote;
785785
uct_ib_roce_version_t remote_roce_ver;
786786
sa_family_t remote_ib_addr_af;
@@ -799,16 +799,14 @@ uct_ib_iface_roce_is_reachable(uct_ib_iface_t *iface,
799799
ucs_assert(local_roce_ver != UCT_IB_DEVICE_ROCE_ANY);
800800

801801
if (local_roce_ver != remote_roce_ver) {
802-
uct_iface_fill_info_str_buf(
803-
params,
804-
"different RoCE versions detected. local %s (gid=%s) "
805-
"remote %s (gid=%s)",
806-
uct_ib_roce_version_str(local_roce_ver),
807-
uct_ib_gid_str(&local_gid_info.gid, local_str,
808-
sizeof(local_str)),
809-
uct_ib_roce_version_str(remote_roce_ver),
810-
uct_ib_gid_str((union ibv_gid*)(remote_ib_addr + 1), remote_str,
811-
sizeof(remote_str)));
802+
uct_iface_fill_info_str_buf(params, "local %s/%s remote %s/%s",
803+
uct_ib_gid_str(&local_gid_info.gid,
804+
local_str,
805+
sizeof(local_str)),
806+
uct_ib_roce_version_str(local_roce_ver),
807+
uct_ib_gid_str(remote_gid, remote_str,
808+
sizeof(remote_str)),
809+
uct_ib_roce_version_str(remote_roce_ver));
812810
return 0;
813811
}
814812

@@ -819,25 +817,27 @@ uct_ib_iface_roce_is_reachable(uct_ib_iface_t *iface,
819817
remote_ib_addr_af = uct_ib_address_flags_get_roce_af(remote_ib_addr_flags);
820818
if (local_ib_addr_af != remote_ib_addr_af) {
821819
uct_iface_fill_info_str_buf(
822-
params, "different IP versions, local %s vs remote %s\n",
823-
local_ib_addr_af == AF_INET ? "IPv4": "IPv6",
824-
remote_ib_addr_af == AF_INET ? "IPv4": "IPv6");
820+
params, "local %s remote %s",
821+
ucs_sockaddr_address_family_str(local_ib_addr_af),
822+
ucs_sockaddr_address_family_str(remote_ib_addr_af));
825823
return 0;
826824
}
827825

828826
if ((uct_ib_device_roce_gid_to_sockaddr(local_ib_addr_af,
829827
&local_gid_info.gid,
830828
&sa_local) != UCS_OK)) {
831-
uct_iface_fill_info_str_buf(
832-
params, "couldn't convert local RoCE address to socket address");
829+
uct_iface_fill_info_str_buf(params, "invalid local GID %s",
830+
uct_ib_gid_str(&local_gid_info.gid,
831+
local_str,
832+
sizeof(local_str)));
833833
return 0;
834834
}
835835

836-
if (uct_ib_device_roce_gid_to_sockaddr(remote_ib_addr_af,
837-
remote_ib_addr + 1,
836+
if (uct_ib_device_roce_gid_to_sockaddr(remote_ib_addr_af, remote_gid,
838837
&sa_remote) != UCS_OK) {
839-
uct_iface_fill_info_str_buf(
840-
params, "couldn't convert remote RoCE address to socket address");
838+
uct_iface_fill_info_str_buf(params, "invalid remote GID %s",
839+
uct_ib_gid_str(remote_gid, remote_str,
840+
sizeof(remote_str)));
841841
return 0;
842842
}
843843

@@ -911,22 +911,18 @@ static int uct_ib_iface_dev_addr_is_reachable(
911911
return 0;
912912
}
913913

914-
/* at least one PKEY has to be with full membership */
915-
if (!((params.pkey | iface->pkey) & UCT_IB_PKEY_MEMBERSHIP_MASK)) {
916-
uct_iface_fill_info_str_buf(
917-
is_reachable_params,
918-
"both local and remote pkeys (0x%x, 0x%x) "
919-
"have partial membership",
920-
iface->pkey, params.pkey);
914+
/* PKEY values have to be equal */
915+
if ((params.pkey ^ iface->pkey) & UCT_IB_PKEY_PARTITION_MASK) {
916+
uct_iface_fill_info_str_buf(is_reachable_params,
917+
"pkey local 0x%x remote 0x%x", iface->pkey,
918+
params.pkey);
921919
return 0;
922920
}
923921

924-
/* PKEY values have to be equal */
925-
if ((params.pkey ^ iface->pkey) & UCT_IB_PKEY_PARTITION_MASK) {
926-
uct_iface_fill_info_str_buf(
927-
is_reachable_params,
928-
"local pkey 0x%x differs from remote pkey 0x%x",
929-
iface->pkey, params.pkey);
922+
/* At least one PKEY has to be with full membership */
923+
if (!((params.pkey | iface->pkey) & UCT_IB_PKEY_MEMBERSHIP_MASK)) {
924+
uct_iface_fill_info_str_buf(is_reachable_params,
925+
"partial member pkey 0x%x", params.pkey);
930926
return 0;
931927
}
932928

@@ -948,8 +944,7 @@ static int uct_ib_iface_dev_addr_is_reachable(
948944

949945
uct_iface_fill_info_str_buf(
950946
is_reachable_params,
951-
"different subnet prefix 0x%" PRIx64 "/0x%" PRIx64
952-
" and FLID is %s",
947+
"IB subnet local %" PRIx64 " remote %" PRIx64 " FLID %s",
953948
be64toh(iface->gid_info.gid.global.subnet_prefix),
954949
be64toh(params.gid.global.subnet_prefix), flid_info_str);
955950
return 0;
@@ -962,11 +957,10 @@ static int uct_ib_iface_dev_addr_is_reachable(
962957
} else {
963958
/* local and remote have different link layers and therefore are unreachable */
964959
uct_iface_fill_info_str_buf(
965-
is_reachable_params,
966-
"link layers differ %s (local) vs %s (remote)",
967-
is_local_eth ? "RoCE" : "IB",
968-
ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH ?
969-
"RoCE" : "IB");
960+
is_reachable_params, "local %s remote %s",
961+
is_local_eth ? "RoCE" : "IB",
962+
(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH) ? "RoCE" :
963+
"IB");
970964
return 0;
971965
}
972966
}

0 commit comments

Comments
 (0)