Skip to content

Assertion `worker->keepalive.iter == &worker->all_eps' failed #11217

@ivanallen

Description

@ivanallen

Describe the bug

ucx: 1.18.0

(gdb) bt
#0  0x00007f97e87fa02c in __pthread_kill_implementation () from /lib64/libc.so.6
#1  0x00007f97e87acb86 in raise () from /lib64/libc.so.6
#2  0x00007f97e8796873 in abort () from /lib64/libc.so.6
#3  0x0000000002c7624e in ucs_fatal_error_message (file=file@entry=0x320748b "core/ucp_worker.c", line=line@entry=3488, function=function@entry=0x3209300 <__func__.12> "ucp_worker_do_keepalive_progress",
    message_buf=message_buf@entry=0x7f97d6fa5420 "Assertion `worker->keepalive.iter == &worker->all_eps' failed") at debug/assert.c:38
#4  0x0000000002c76329 in ucs_fatal_error_format (file=file@entry=0x320748b "core/ucp_worker.c", line=line@entry=3488, function=function@entry=0x3209300 <__func__.12> "ucp_worker_do_keepalive_progress",
    format=format@entry=0x3207475 "Assertion `%s' failed") at debug/assert.c:53
#5  0x0000000002b0e6f3 in ucp_worker_do_keepalive_progress (worker=0x54d38000) at core/ucp_worker.c:3488
#6  0x0000000002b15d3a in ucs_callbackq_dispatch (cbq=<optimized out>) at /ucx/1.18.0/source/ucx/src/ucs/datastruct/callbackq.h:215
#7  uct_worker_progress (worker=<optimized out>) at /ucx/1.18.0/source/ucx/src/uct/api/uct.h:2813
#8  ucp_worker_progress (worker=0x54d38000) at core/ucp_worker.c:3033
#9  0x0000000002251e13 in xrpc::Worker::progress (this=0x53526a00) at src/worker.cc:88
#10 0x0000000002236f65 in xrpc::Server::progress (this=<optimized out>, i=<optimized out>) at src/server.cc:390
#11 0x0000000001d4c02d in bs::xrpc_server::PollContext::poll (this=<optimized out>, id=<optimized out>) at src/xrpc_server/xrpc_server.cc:21
#12 0x00000000012463e5 in spdk::Poller::Impl::run (this=this@entry=0x77021340) at src/util/spdk_util.cc:140
#13 0x00000000012443ab in spdk::Poller::poller_fn (arg=<optimized out>) at src/util/spdk_util.cc:236
#14 0x000000000291d6a0 in thread_execute_poller (poller=0x77028b60, thread=0xbfffc00) at thread.c:993
#15 thread_poll (thread=thread@entry=0xbfffc00, max_msgs=max_msgs@entry=0, now=now@entry=2890344622443933) at thread.c:1119
#16 0x000000000291ed32 in spdk_thread_poll (thread=thread@entry=0xbfffc00, max_msgs=max_msgs@entry=0, now=2890344622443933) at thread.c:1228
#17 0x0000000002951e61 in _reactor_run (reactor=0xbd5e440) at reactor.c:914
#18 reactor_run (arg=0xbd5e440) at reactor.c:952
#19 0x000000000297e2e6 in eal_thread_loop (arg=<optimized out>) at ../lib/eal/common/eal_common_thread.c:212
#20 0x000000000298fbc9 in eal_worker_thread_loop (arg=<optimized out>) at ../lib/eal/linux/eal.c:916
#21 0x00007f97e87f82ea in start_thread () from /lib64/libc.so.6
#22 0x00007f97e887d3c0 in clone3 () from /lib64/libc.so.6
(gdb) f 5
#5  0x0000000002b0e6f3 in ucp_worker_do_keepalive_progress (worker=0x54d38000) at core/ucp_worker.c:3488
3488    core/ucp_worker.c: No such file or directory.
(gdb) p worker
$1 = (ucp_worker_h) 0x54d38000
(gdb) p worker[0]
value of type `ucp_worker' requires 141952 bytes, which is more than max-value-size
(gdb) set max-value-size unlimited
(gdb) p worker[0]
$2 = {flags = 68719476736, async = {{thread = {{spinlock = {super = {lock = 0}, count = 1, owner = 140290123839040}, mutex = {lock = {__data = {__lock = 0, __count = 1, __owner = -687917504, __nusers = 32663, __kind = 0, __spins = 0,
                __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\000\000\000\000\001\000\000\000@6\377֗\177", '\000' <repeats 25 times>, __align = 4294967296}, owner = 0, count = 0}}}, signal = {tid = 0,
        block_count = 1, pthread = 140290123839040, timer = 0x0}, poll_block = 0}, mode = UCS_ASYNC_MODE_THREAD, missed = {lock = {lock = 1}, queue = {head = 0x0, ptail = 0x54d38050}}, last_wakeup = 2890323697152257,
    owner = 140290123839040}, context = 0x54cdbe00, uuid = 14362267146549230667, client_id = 79, uct = 0x544d7b80, req_mp = {freelist = 0x400cf2bb8, data = 0x544b9320}, rkey_mp = {freelist = 0x0, data = 0x544b9380}, atomic_tls = {
    bits = {0, 0}}, inprogress = 1, name = "0x54d38000", '\000' <repeats 21 times>, address_name = "H04-200G-ASW017-M08:14\000\000\000\000\000\000\000\000\000", flush_ops_count = 0, event_fd = -1, event_set = 0x0, eventfd = -1,
  uct_events = 0, arm_ifaces = {prev = 0x54d7a728, next = 0x54d7a728}, user_data = 0x0, ep_alloc = {freelist = 0x7f9314264100, chunks = {head = 0x7f9314281ff8, ptail = 0x7f9314281ff8}, elem_size = 88, stride_count = 1, inuse_count = 0},
  stream_ready_eps = {prev = 0x54d38160, next = 0x54d38160}, num_all_eps = 0, all_eps = {prev = 0x54d38178, next = 0x54d38178}, internal_eps = {prev = 0x54d38188, next = 0x54d38188}, conn_match_ctx = {hash = {n_buckets = 0, size = 0,
      n_occupied = 0, upper_bound = 0, flags = 0x0, keys = 0x0, vals = 0x0}, max_conn_sn = 65535, address_length = 8, ops = {get_address = 0x2b9dda0 <ucp_ep_match_get_address>, get_conn_sn = 0x2b9ddb0 <ucp_ep_match_get_conn_sn>,
      address_str = 0x2b9ddc0 <ucp_ep_match_address_str>, purge_cb = 0x0}}, ifaces = 0x4df7e880, num_ifaces = 3, num_active_ifaces = 1, scalable_tl_bitmap = {bits = {7, 0}}, cms = 0x4df36c00, am_mps = {bitmap = 2147484736, map = {
      0x4df660e0 <repeats 21 times>, 0x4df660d0, 0x4df660d0, 0x4df660d0, 0x4df660d0, 0x4df660c0, 0x4df660c0, 0x4df660c0, 0x4df660c0, 0x4df660c0, 0x4df660c0, 0x4df660c0}, data = 0x4df660c0}, reg_mp = {freelist = 0x0, data = 0x544b93e0},
  mpool_hash = {n_buckets = 0, size = 0, n_occupied = 0, upper_bound = 0, flags = 0x0, keys = 0x0, vals = 0x0}, rkey_ptr_reqs = {head = 0x0, ptail = 0x54d38360}, rkey_ptr_cb_id = -1, tm = {expected = {wildcard = {queue = {head = 0x0,
          ptail = 0x54d38378}, sw_count = 0, block_count = 0}, hash = 0x550e2000, sn = 0, sw_all_count = 0}, unexpected = {all = {prev = 0x54d383a8, next = 0x54d383a8}, hash = 0x54ce8000}, frag_hash = {n_buckets = 0, size = 0,
      n_occupied = 0, upper_bound = 0, flags = 0x0, keys = 0x0, vals = 0x0}, offload = {sync_reqs = {head = 0x0, ptail = 0x54d383e8}, tag_hash = {n_buckets = 0, size = 0, n_occupied = 0, upper_bound = 0, flags = 0x0, keys = 0x0,
        vals = 0x0}, iface = 0x0, thresh = 18446744073709551615, zcopy_thresh = 18446744073709551615}}, am = {alignment = 1, cbs = {buffer = 0x78248a80, length = 34, capacity = 34, is_fixed = 0 '\000'}},
  am_message_id = 5661291001548429202, max_am_header = 8131, mem_type_ep = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, cpu_mask = {ucs_bits = {0 <repeats 16 times>}}, rkey_config_hash = {n_buckets = 8, size = 6, n_occupied = 6,
    upper_bound = 6, flags = 0x358fd1a50, keys = 0x359098d80, vals = 0x2a7241900 "\210"}, discard_uct_ep_hash = {n_buckets = 1024, size = 0, n_occupied = 440, upper_bound = 788, flags = 0x273a68b00, keys = 0x2ecc36000,
    vals = 0x31b058000}, ep_map = {ptr_map = {next_id = 904, hash = {n_buckets = 256, size = 0, n_occupied = 98, upper_bound = 197, flags = 0x2d3791600, keys = 0x29aebc800, vals = 0x2a1e92000}}, safe = {{hash = {n_buckets = 256,
          size = 0, n_occupied = 146, upper_bound = 197, flags = 0x2743bd0c0, keys = 0x271b9b800, vals = 0x271b9b000}, lock = {lock = 1}}}}, request_map = {ptr_map = {next_id = 0, hash = {n_buckets = 0, size = 0, n_occupied = 0,
        upper_bound = 0, flags = 0x0, keys = 0x0, vals = 0x0}}, safe = 0x54d38608}, ep_config = {buffer = 0x54d84000, length = 6, capacity = 32, is_fixed = 0 '\000'}, rkey_config_count = 6, rkey_config = {{key = {md_map = 1,
        ep_cfg_index = 0 '\000', sys_dev = 255 '\377', mem_type = UCS_MEMORY_TYPE_HOST, unreachable_md_map = 0}, put_short = {max_length_host_mem = 587, max_length_unknown_mem = 587, lane = 1 '\001', rkey_index = 0 '\000'},
      lanes_distance = {{latency = 0, bandwidth = inf}, {latency = 0, bandwidth = inf}, {latency = 0, bandwidth = 0} <repeats 62 times>}, proto_select = {hash = 0x296e68a80, cache = {key = 18446744073709551615, value = 0x0}}}, {key = {
        md_map = 1, ep_cfg_index = 1 '\001', sys_dev = 255 '\377', mem_type = UCS_MEMORY_TYPE_HOST, unreachable_md_map = 0}, put_short = {max_length_host_mem = 587, max_length_unknown_mem = 587, lane = 1 '\001', rkey_index = 0 '\000'},
      lanes_distance = {{latency = 0, bandwidth = inf}, {latency = 0, bandwidth = inf}, {latency = 0, bandwidth = 0} <repeats 62 times>}, proto_select = {hash = 0x2a6c9cd80, cache = {key = 18446744073709551615, value = 0x0}}}, {key = {
        md_map = 3, ep_cfg_index = 2 '\002', sys_dev = 255 '\377', mem_type = UCS_MEMORY_TYPE_HOST, unreachable_md_map = 0}, put_short = {max_length_host_mem = 587, max_length_unknown_mem = 587, lane = 1 '\001', rkey_index = 0 '\000'},
      lanes_distance = {{latency = 0, bandwidth = inf}, {latency = 0, bandwidth = inf}, {latency = 0, bandwidth = inf}, {latency = 0, bandwidth = 0} <repeats 61 times>}, proto_select = {hash = 0x29c447ef0, cache = {
          key = 18446744073709551615, value = 0x0}}}, {key = {md_map = 1, ep_cfg_index = 3 '\003', sys_dev = 255 '\377', mem_type = UCS_MEMORY_TYPE_HOST, unreachable_md_map = 0}, put_short = {max_length_host_mem = 587,
        max_length_unknown_mem = 587, lane = 1 '\001', rkey_index = 0 '\000'}, lanes_distance = {{latency = 0, bandwidth = inf}, {latency = 0, bandwidth = inf}, {latency = 0, bandwidth = 0} <repeats 62 times>}, proto_select = {
        hash = 0x2a5d64060, cache = {key = 18446744073709551615, value = 0x0}}}, {key = {md_map = 3, ep_cfg_index = 4 '\004', sys_dev = 255 '\377', mem_type = UCS_MEMORY_TYPE_HOST, unreachable_md_map = 0}, put_short = {
        max_length_host_mem = 587, max_length_unknown_mem = 587, lane = 1 '\001', rkey_index = 0 '\000'}, lanes_distance = {{latency = 0, bandwidth = inf}, {latency = 0, bandwidth = inf}, {latency = 0, bandwidth = inf}, {latency = 0,
          bandwidth = 0} <repeats 61 times>}, proto_select = {hash = 0x35dae2f00, cache = {key = 18446744073709551615, value = 0x0}}}, {key = {md_map = 3, ep_cfg_index = 5 '\005', sys_dev = 255 '\377', mem_type = UCS_MEMORY_TYPE_HOST,
        unreachable_md_map = 0}, put_short = {max_length_host_mem = 587, max_length_unknown_mem = 587, lane = 1 '\001', rkey_index = 0 '\000'}, lanes_distance = {{latency = 0, bandwidth = inf}, {latency = 0, bandwidth = inf}, {
          latency = 0, bandwidth = inf}, {latency = 0, bandwidth = 0} <repeats 61 times>}, proto_select = {hash = 0x367563da0, cache = {key = 18446744073709551615, value = 0x0}}}, {key = {md_map = 0, ep_cfg_index = 0 '\000',
        sys_dev = 0 '\000', mem_type = UCS_MEMORY_TYPE_HOST, unreachable_md_map = 0}, put_short = {max_length_host_mem = 0, max_length_unknown_mem = 0, lane = 0 '\000', rkey_index = 0 '\000'}, lanes_distance = {{latency = 0,
          bandwidth = 0} <repeats 64 times>}, proto_select = {hash = 0x0, cache = {key = 0, value = 0x0}}} <repeats 122 times>}, keepalive = {timerfd = -1, cb_id = 2, last_round = 2890302657459615, iter = 0x4cca7cb50, ep_count = 0,
    iter_count = 4843457, round_count = 2}, counters = {ep_creations = 452, ep_creation_failures = 0, ep_closures = 452, ep_failures = 452}, usage_tracker = {handle = 0x0, iter_count = 0, rounds_count = 0, last_round = 0}}
(gdb) p worker[0].keepalive.iter
$3 = (ucs_list_link_t *) 0x4cca7cb50

Claude Code Analysis

Context

A coredump occurs in ucp_worker_do_keepalive_progress at ucp_worker.c:3488 with:
Assertion `worker->keepalive.iter == &worker->all_eps' failed

The assertion fires when the all_eps list is empty but keepalive.iter (= 0x4cca7cb50) still points to a freed EP's ep_list instead of &worker->all_eps. All 452 endpoints failed and were closed (ep_failures = 452).

Root Cause

Bug in ucp_worker_keepalive_remove_ep() (src/ucp/core/ucp_worker.c:3553-3576).

The function has an early return that skips the iterator check:

 void ucp_worker_keepalive_remove_ep(ucp_ep_h ep)
 {
     ucp_worker_h worker = ep->worker;

     // BUG: early return WITHOUT checking if keepalive.iter points to this EP
     if ((ep->cfg_index == UCP_WORKER_CFG_INDEX_NULL) ||
         (ucp_ep_config(ep)->key.keepalive_lane == UCP_NULL_LANE)) {
         return;
     }
     ...
     if (worker->keepalive.iter == &ep->ext->ep_list) {
         // update iter...
     }
 }

However, keepalive.iter CAN point to EPs with cfg_index == NULL or no keepalive lane. This is because ucp_worker_do_ep_keepalive() (lines 3411-3414) skips such EPs but still advances the iterator past them (line 3459). Between progress calls, the iterator can therefore be left pointing at such an EP.

Crash scenario:

  1. Keepalive progress runs, processes max_ep_count EPs, stops with iter pointing to the next EP (which has cfg_index == NULL or no keepalive lane)
  2. That EP is destroyed → ucp_worker_keepalive_remove_ep() returns early due to the cfg_index/keepalive_lane check
  3. ucs_list_del() removes the EP from the list
  4. keepalive.iter is now a dangling pointer
  5. Next keepalive progress: list is empty, but iter != &all_eps → assertion failure

Fix

File: src/ucp/core/ucp_worker.c (function ucp_worker_keepalive_remove_ep, lines 3552-3576)

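Perform the iterator check regardless of the EP's cfg_index/keepalive_lane state: the iterator must always be updated when removing an EP from all_eps. To do so, the cfg_index/keepalive_lane early return is replaced with a UCP_EP_FLAG_INTERNAL check, so the iterator update is no longer skipped: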

 void ucp_worker_keepalive_remove_ep(ucp_ep_h ep)
 {
     ucp_worker_h worker = ep->worker;

     if (ep->flags & UCP_EP_FLAG_INTERNAL) {
         return;  /* Internal EPs are in internal_eps, not all_eps */
     }

     if (worker->keepalive.iter == &ep->ext->ep_list) {
         ucs_debug("worker %p: removed keepalive current ep %p, moving to next",
                   worker, ep);
         worker->keepalive.iter = worker->keepalive.iter->next;
         ucs_assert(worker->keepalive.iter != &ep->ext->ep_list);

         if (worker->keepalive.iter == &worker->all_eps) {
             ucs_debug("worker %p: all_eps was reached after %p was removed -"
                       "complete keepalive", worker, ep);
             ucp_worker_keepalive_complete(worker, ucs_get_time());
         }
     }
 }

Key changes:

  1. Replace the cfg_index == UCP_WORKER_CFG_INDEX_NULL / keepalive_lane == UCP_NULL_LANE early return with a UCP_EP_FLAG_INTERNAL check (internal EPs use the internal_eps list, never all_eps, so keepalive.iter can never point to them)
  2. Always check and update keepalive.iter for non-internal EPs being removed from all_eps

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions