diff --git a/src/ctrip_swap.h b/src/ctrip_swap.h index 2803e28907b..edb83648250 100644 --- a/src/ctrip_swap.h +++ b/src/ctrip_swap.h @@ -976,6 +976,8 @@ long swapListTypeLength(robj *list, objectMeta *object_meta); void swapListTypePush(robj *subject, robj *value, int where, redisDb *db, robj *key); robj *swapListTypePop(robj *subject, int where, redisDb *db, robj *key); void swapListMetaDelRange(redisDb *db, robj *key, long ltrim, long rtrim); +void swapListMetaTrimCold(redisDb *db, robj *key, long ltrim, long rtrim, + long long orig_lstart, long long orig_lend); /* zset */ typedef struct zsetSwapData { swapData sd; diff --git a/src/ctrip_swap_list.c b/src/ctrip_swap_list.c index 335962db12a..27584ade534 100644 --- a/src/ctrip_swap_list.c +++ b/src/ctrip_swap_list.c @@ -1019,6 +1019,9 @@ int metaListDelete(metaList *main, int direction, long ridx) { return delete; } +/* Forward declaration: defined after swapListMetaDelRange (below metaListMerge). */ +static void listMetaTrimToLen(listMeta *meta, long desired_len); + long metaListMerge(metaList *main, metaList *delta) { long merged = 0, ridx, main_mean_ridx, delta_mean_ridx; int segtype; @@ -1028,6 +1031,55 @@ long metaListMerge(metaList *main, metaList *delta) { serverAssert(metaListIsValid(main,LIST_META_STRICT_NOEMPTY|LIST_META_STRICT_CONTINOUS)); serverAssert(metaListIsValid(delta,0)); + /* Clip delta to main's ridx range before merging. + * + * Race condition: swapListMetaTrimCold() may shrink the in-memory meta + * (reducing its last ridx) AFTER a SWAP_IN was dispatched but BEFORE it + * completes. The decoded delta then contains elements at ridx positions + * beyond the current main range, which would violate listMetaAlign's + * invariant and cause an assertion failure. + * + * Fix: discard any delta elements (and the corresponding meta segments) + * whose ridx positions fall outside main's range. These are "orphaned" + * entries – the meta trim made them logically deleted even though the + * RocksDB entries were not yet physically removed. */ + { + segment *main_last = listMetaLastSegment(main->meta); + segment *delta_last_seg = listMetaLastSegment(delta->meta); + if (main_last && delta_last_seg) { + long main_end = main_last->index + main_last->len; + long delta_end = delta_last_seg->index + delta_last_seg->len; + if (delta_end > main_end) { + /* Count HOT elements in delta at ridx >= main_end. + * LIST_HEAD direction iterates from tail (high ridx) to head. */ + long excess_hot = 0; + { + metaListIterator iter; + metaListIterInit(&iter, delta, LIST_HEAD); + while (!metaListIterFinished(&iter)) { + long cur_ridx = metaListIterCur(&iter, NULL, NULL); + if (cur_ridx >= main_end) { + excess_hot++; + metaListIterNext(&iter); + } else { + break; + } + } + metaListIterDeinit(&iter); + } + if (excess_hot > 0) { + /* Remove excess elements from the tail of delta's list + * (high-ridx elements are stored at the tail). */ + listTypeDelRange(delta->list, -(excess_hot), excess_hot); + /* Trim delta's meta to match: remove the ridx range + * [main_end .. delta_end) from the tail. */ + listMetaTrimToLen(delta->meta, + delta->meta->len - (delta_end - main_end)); + } + } + } + } + /* always merge small inst into big one */ if (metaListLen(main,SEGMENT_TYPE_HOT) < metaListLen(delta,SEGMENT_TYPE_HOT)) { @@ -2022,6 +2074,52 @@ void swapListMetaDelRange(redisDb *db, robj *key, long ltrim, long rtrim) { if (meta) listMetaExtend(meta,-ltrim,-rtrim); } +/* Remove excess elements from the tail of the meta to reach desired_len. + * Used to clean up cold segments that are logically outside the LTRIM keep range. */ +static void listMetaTrimToLen(listMeta *meta, long desired_len) { + if (desired_len >= meta->len || desired_len < 0) return; + long excess = meta->len - desired_len; + while (excess > 0 && meta->num > 0) { + segment *last = listMetaLastSegment(meta); + if (excess >= last->len) { + excess -= last->len; + meta->len -= last->len; + meta->num--; + } else { + last->len -= excess; + meta->len -= excess; + excess = 0; + } + } +} + +/* After LTRIM on a warm/cold list, the meta may retain cold segments that are + * logically outside the master's keep range [orig_lstart..orig_lend]. + * This happens because listMetaExtend only removes hot elements from head/tail, + * leaving cold elements in the middle intact even if they're out of range. + * This function trims the meta to match the master's desired list length. */ +void swapListMetaTrimCold(redisDb *db, robj *key, long ltrim, long rtrim, + long long orig_lstart, long long orig_lend) { + listMeta *meta = lookupListMeta(db, key); + if (!meta) return; + /* Reconstruct total logical length before this LTRIM ran. */ + long total_llen_before = meta->len + ltrim + rtrim; + /* Normalize negative indices (same as Redis ltrimCommand logic). */ + if (orig_lstart < 0) orig_lstart += total_llen_before; + if (orig_lend < 0) orig_lend += total_llen_before; + if (orig_lstart < 0) orig_lstart = 0; + long desired_len; + if (orig_lend < orig_lstart || orig_lstart >= total_llen_before) { + desired_len = 0; + } else { + if (orig_lend >= total_llen_before) orig_lend = total_llen_before - 1; + desired_len = orig_lend - orig_lstart + 1; + } + if (meta->len > desired_len) { + listMetaTrimToLen(meta, desired_len); + } +} + /* List rdb save, note that: * - hot lists are saved as RDB_TYPE_LIST_QUICKLIST (same as origin redis) * - warm/cold list are saved as RDB_TYPE_LIST, which are more suitable @@ -2872,6 +2970,53 @@ int swapListMetaTest(int argc, char *argv[], int accurate) { metaListDestroy(main); } + TEST("meta-list: merge clips delta exceeding main ridx range") { + /* Regression test for the listMetaAlign assertion failure that occurs + * when swapListMetaTrimCold() shrinks main's ridx range after a SWAP_IN + * has been dispatched but before metaListMerge() runs. + * + * The decoded delta still covers the old (larger) ridx range. Without + * the fix, metaListMerge() would call + * listMetaAlign(main(last=40), orig_delta_meta(last=60)) + * which triggers the serverAssert at line 575 because 40 < 60. + * + * With the fix, the delta is clipped to main's ridx range before the + * merge, so the assertion is never violated. */ + + /* main: HOT(0,10) | COLD(10,20) | HOT(30,10) => ridx 0..39, 20 HOT */ + listMeta *main_meta = listMetaCreate(); + listMetaAppendSegment(main_meta, SEGMENT_TYPE_HOT, 0, 10); + listMetaAppendSegment(main_meta, SEGMENT_TYPE_COLD, 10, 20); + listMetaAppendSegment(main_meta, SEGMENT_TYPE_HOT, 30, 10); + robj *main_list = createQuicklistObject(-2, 0); + metaList *main = metaListBuild(main_meta, main_list); + metaListPopulateList(main); /* populate hot elements ridx 0..9, 30..39 */ + test_assert(listTypeLength(main_list) == 20); + + /* delta: HOT(0,60) => ridx 0..59, 60 HOT elements. + * Simulates a SWAP_IN decoded based on the OLD (untrimmed) meta that + * covered ridx 0..59. After swapListMetaTrimCold trimmed main to + * ridx 0..39, this delta would violate listMetaAlign's invariant. */ + listMeta *delta_meta = listMetaCreate(); + listMetaAppendSegment(delta_meta, SEGMENT_TYPE_HOT, 0, 60); + robj *delta_list = createQuicklistObject(-2, 0); + metaList *delta = metaListBuild(delta_meta, delta_list); + metaListPopulateList(delta); /* populate ridx 0..59 */ + test_assert(listTypeLength(delta_list) == 60); + + /* metaListMerge must NOT crash (the fix clips delta to ridx 0..39). */ + metaListMerge(main, delta); + + /* After merge: main covers ridx 0..39, all HOT (40 elements). */ + segment *last = listMetaLastSegment(main->meta); + test_assert(last != NULL); + test_assert(last->index + last->len == 40); + test_assert(listTypeLength(main->list) == 40); + + metaListDestroy(delta); + metaListDestroy(main); + } + TEST("meta-list: exclude") { listMeta *meta = listMetaCreate(); robj *list = createQuicklistObject(-2, 0); diff --git a/src/ctrip_swap_repl.c b/src/ctrip_swap_repl.c index 681bd6a959c..33f8298eb4a 100644 --- a/src/ctrip_swap_repl.c +++ b/src/ctrip_swap_repl.c @@ -28,6 +28,87 @@ #include "ctrip_swap.h" +typedef struct replDeferredBeforeCall { + keyRequest key_request; + swapData *data; + void *datactx; +} replDeferredBeforeCall; + +typedef struct replDeferredBeforeCallQueue { + client *wc; + list *items; /* list of replDeferredBeforeCall* */ +} replDeferredBeforeCallQueue; + +static list *replDeferredBeforeCallQueues = NULL; + +static replDeferredBeforeCallQueue *replDeferredBeforeCallGetQueue(client *wc, int create) { + if (!replDeferredBeforeCallQueues) { + if (!create) return NULL; + replDeferredBeforeCallQueues = listCreate(); + } + + listIter li; + listNode *ln; + listRewind(replDeferredBeforeCallQueues, &li); + while ((ln = listNext(&li))) { + replDeferredBeforeCallQueue *q = listNodeValue(ln); + if (q->wc == wc) return q; + } + + if (!create) return NULL; + replDeferredBeforeCallQueue *q = zmalloc(sizeof(*q)); + q->wc = wc; + q->items = listCreate(); + listAddNodeTail(replDeferredBeforeCallQueues, q); + return q; +} + +static void replDeferredBeforeCallEnqueue(client *wc, swapCtx *ctx) { + if (!ctx->data) return; + replDeferredBeforeCallQueue *q = replDeferredBeforeCallGetQueue(wc, 1); + replDeferredBeforeCall *item = zmalloc(sizeof(*item)); + memset(item, 0, sizeof(*item)); + copyKeyRequest(&item->key_request, ctx->key_request); + item->data = ctx->data; + item->datactx = ctx->datactx; + ctx->data = NULL; + ctx->datactx = NULL; + listAddNodeTail(q->items, item); +} + +static void replDeferredBeforeCallFlush(client *wc, int apply_before_call) { + if (!replDeferredBeforeCallQueues) return; + listNode *qln = listFirst(replDeferredBeforeCallQueues); + while (qln) { + listNode *next = listNextNode(qln); + replDeferredBeforeCallQueue *q = listNodeValue(qln); + if (q->wc == wc) { + listNode *ln = listFirst(q->items); + while (ln) { + listNode *ln_next = listNextNode(ln); + replDeferredBeforeCall *item = listNodeValue(ln); + if (apply_before_call && item->data && swapDataAlreadySetup(item->data)) { + swapDataBeforeCall(item->data, &item->key_request, wc, item->datactx); + } + if (item->data && item->data->value != NULL) { + decrRefCount(item->data->value); + item->data->value = NULL; + } + keyRequestDeinit(&item->key_request); + if (item->data) swapDataFree(item->data, item->datactx); + zfree(item); + listDelNode(q->items, ln); + ln = ln_next; + } + listRelease(q->items); + zfree(q); + listDelNode(replDeferredBeforeCallQueues, qln); + break; + } + qln = next; + } +} + /* See replicationHandleMasterDisconnection for more details */ void replicationHandleMasterDisconnectionWithoutReconnect(void) { /* Fire the master link modules event. */ @@ -130,6 +211,7 @@ static void replClientUpdateSelectedDb(client *c) { static void replCommandDispatch(client *wc, client *c) { /* wc may still have argv from last dispatched command, free it safely. */ if (wc->argv) freeClientArgv(wc); + replDeferredBeforeCallFlush(wc, 0); wc->db = c->db; @@ -169,7 +251,9 @@ static void processFinishedReplCommands() { while ((ln = listFirst(server.swap_repl_worker_clients_used))) { wc = listNodeValue(ln); - if (wc->CLIENT_REPL_SWAPPING) break; + if (wc->CLIENT_REPL_SWAPPING) { + break; + } c = wc->swap_repl_client; wc->flags &= ~CLIENT_SWAPPING; @@ -190,9 +274,11 @@ static void processFinishedReplCommands() { } if (wc->swap_errcode) { + replDeferredBeforeCallFlush(wc, 0); rejectCommandFormat(c,"Swap failed (code=%d)",wc->swap_errcode); wc->swap_errcode = 0; } else { + replDeferredBeforeCallFlush(wc, 1); call(wc, CMD_CALL_FULL); /* post call */ @@ -252,19 +338,13 @@ void replWorkerClientKeyRequestFinished(client *wc, swapCtx *ctx) { client *c; listNode *ln; list *repl_swapping_clients; - UNUSED(ctx); - serverLog(LL_DEBUG, "> replWorkerClientSwapFinished client(id=%ld,cmd=%s,key=%s)", wc->id,wc->cmd->fullname,wc->argc <= 1 ? "": (sds)wc->argv[1]->ptr); DEBUG_MSGS_APPEND(&ctx->msgs, "request-finished", "errcode=%d",ctx->errcode); if (ctx->errcode) clientSwapError(wc,ctx->errcode); - keyRequestBeforeCall(wc,ctx); - if (ctx->data && ctx->data->value != NULL) { - decrRefCount(ctx->data->value); - ctx->data->value = NULL; - } + replDeferredBeforeCallEnqueue(wc, ctx); /* Flag swap finished, note that command processing will be defered to * processFinishedReplCommands becasue there might be unfinished preceeding swap. */ wc->keyrequests_count--; diff --git a/src/iothread.c b/src/iothread.c index 0f52f753d38..322f53476ea 100644 --- a/src/iothread.c +++ b/src/iothread.c @@ -942,8 +942,13 @@ void ioThreadsScaleDownTryEnd(void) { if (thread_state == THREAD_STATE_STOPPED) { destroyIOThread(t); server.io_threads_num--; - /* scaling down is not urgent - (it doesn't consume time from the main thread) */ + /* If all threads have been scaled down to the target + * count, transition immediately so callers see NONE + * in the same beforeSleep pass. */ + if (server.io_threads_num <= server.config_io_threads_num) { + server.io_threads_scale_status = IO_THREAD_SCALE_STATUS_NONE; + serverLog(LL_NOTICE, "IO threads scale-down end"); + } } } else { pauseIOThread(t->id); diff --git a/src/t_list.c b/src/t_list.c index 64e105cbd8c..c03c9335d09 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -817,7 +817,6 @@ void listElementsRemoved(client *c, robj *key, int where, robj *o, long count, i updateKeysizesHist(c->db, getKeySlot(key->ptr), OBJ_LIST, llen + count, llen); if (llen == 0) { if (deleted) *deleted = 1; - dbDelete(c->db, key); notifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, c->db->id); } else { @@ -1012,6 +1011,24 @@ void ltrimCommand(client *c) { notifyKeyspaceEvent(NOTIFY_LIST,"ltrim",c->argv[1],c->db->id); #ifdef ENABLE_SWAP objectMeta *om = lookupMeta(c->db,c->argv[1]); + /* After trimming the hot list, also trim cold meta segments that are logically + * outside the master's keep range. This corrects a divergence that occurs when + * eviction happens between LTRIM dispatch (when the key may be hot/meta-less) + * and LTRIM apply (when the key is warm with cold segments). + * We detect this case by checking whether a beforeCall rewrite occurred + * (c->swap_arg_rewrites->num == 2), which means the original logical indices + * are saved in the rewrites and the meta was present at apply time. */ + if (om && c->swap_arg_rewrites && c->swap_arg_rewrites->num == 2 && + c->swap_arg_rewrites->rewrites[0].arg_req.arg_idx == 2 && + c->swap_arg_rewrites->rewrites[1].arg_req.arg_idx == 3) { + long long orig_lstart, orig_lend; + if (getLongLongFromObject(c->swap_arg_rewrites->rewrites[0].orig_arg, &orig_lstart) == C_OK && + getLongLongFromObject(c->swap_arg_rewrites->rewrites[1].orig_arg, &orig_lend) == C_OK) { + swapListMetaTrimCold(c->db, c->argv[1], ltrim, rtrim, orig_lstart, orig_lend); + /* Re-lookup om after potential meta modification */ + om = lookupMeta(c->db, c->argv[1]); + } + } notifyKeyspaceEventDirty(NOTIFY_LIST,"ltrim",c->argv[1],c->db->id,o,NULL); if ((llenNew = swapListTypeLength(o,om)) == 0) { #else diff --git a/tests/swap/unit/list_listmetaalign_guard.tcl b/tests/swap/unit/list_listmetaalign_guard.tcl new file mode 100644 index 00000000000..3ff21c8eb42 --- /dev/null +++ b/tests/swap/unit/list_listmetaalign_guard.tcl @@ -0,0 +1,139 @@ +# Guard test for: listMetaAlign assertion failure caused by swapListMetaTrimCold +# racing with a concurrent SWAP_IN on the replica. +# +# BUG (introduced in commit 08b1e6342 as a side-effect of swapListMetaTrimCold, +# fixed in commit edb7e7833): +# swapListMetaTrimCold() shrinks the in-memory meta (reduces last ridx) AFTER +# a SWAP_IN was already dispatched based on the OLD (larger) meta range. +# When the SWAP_IN completes on the swap thread, metaListMerge receives a +# delta whose last ridx extends beyond the current (trimmed) main ridx range, +# violating the invariant checked by listMetaAlign() at ctrip_swap_list.c:575: +# +# serverAssert(last->index+last->len >= delta_last->index+delta_last->len) +# +# This assertion failure crashes the swap thread → whole process coredumps. +# +# RACE CONDITION (replica-specific): +# On the REPLICA the replication command queue allows multiple commands to +# have SWAP_INs in-flight simultaneously: +# +# 1. RPOP / LRANGE arrives → SWAP_IN dispatched (based on current large +# meta ridx range); replica blocked waiting for swap. +# 2. The next command (LTRIM) arrives and is dispatched. If the list is HOT +# at dispatch time the swap is a NOP and LTRIM runs immediately. +# ltrimCommand calls swapListMetaTrimCold(), shrinking meta's ridx range. +# 3. Step-1 SWAP_IN completes on the swap thread → swapExecBatchExecuteIn +# → listCreateOrMergeObject → metaListMerge(main, delta). +# delta (decoded from RocksDB) still has ridx up to the OLD range; main +# (current in-memory meta) has already been trimmed → assertion failure. +# +# FIX (commit edb7e7833): +# At the start of metaListMerge, before the hot/cold swap logic, check +# whether delta_end > main_end. If so, iterate the delta from LIST_HEAD +# (high-ridx end) to count and discard excess HOT elements: +# - listTypeDelRange removes them from the quicklist tail. +# - listMetaTrimToLen removes the corresponding meta segments. +# This makes in-flight SWAP_INs safe even after swapListMetaTrimCold has +# already shrunk the main meta. +# +# HOW THIS TEST DETECTS THE BUG: +# - Master + replica pair with aggressive eviction on both sides: every key +# cycles through HOT→COLD→WARM states continuously, maximising the window +# where an in-flight SWAP_IN overlaps with an LTRIM apply on the replica. +# - propagation-error-behavior = panic-on-replicas: any replication error +# crashes the replica immediately, making the fault visible. +# - Workload mixes LRANGE (triggers large cold SWAP_IN), LTRIM (triggers +# swapListMetaTrimCold), and LPUSH/RPOP to keep the list alive. +# - After load: master/replica debug digest-value comparison catches any +# silent data corruption. +# +# REPRODUCTION RATE (without fix): crash within ~10 s in stress runs. +# WITH FIX: 0 crashes across 5+ runs; master/replica digest always matches. + +start_server {tags {"list" "repl"} overrides {save ""}} { + start_server {overrides {save ""}} { + set master_host [srv 0 host] + set master_port [srv 0 port] + set master [srv 0 client] + set slave [srv -1 client] + + $slave slaveof $master_host $master_port + wait_for_sync $slave + + # Aggressive eviction on both sides to maximise HOT→COLD→WARM cycling. + $master config set swap-debug-evict-keys -1 + $slave config set swap-debug-evict-keys -1 + + # Crash the replica on any replication error to surface the assertion + # failure immediately rather than continuing with diverged data. + $slave config set propagation-error-behavior panic-on-replicas + + test {swap-list listMetaAlign guard: no assertion failure under LTRIM+SWAP_IN race on replica} { + set num_keys 4 + set duration 10 + set num_loaders 4 + + # Seed each key with 100 elements so there is a large ridx range. + # swapListMetaTrimCold can then shrink it significantly, widening + # the window for the race between in-flight SWAP_IN and LTRIM apply. + for {set i 0} {$i < $num_keys} {incr i} { + $master del "align-guard-$i" + for {set j 0} {$j < 100} {incr j} { + $master rpush "align-guard-$i" "seed:$i:$j" + } + } + wait_for_ofs_sync $master $slave + + # Concurrent workload on the master (replicated to the replica): + # + # LRANGE 0 -1 — loads the entire list (cold + hot) via SWAP_IN + # on the replica, creating a large decoded delta. + # LTRIM — shrinks the list aggressively; calls + # swapListMetaTrimCold on the replica. + # LPUSH/RPUSH — re-grows the list to maintain HOT/COLD cycling. + # RPOP/LPOP — triggers small SWAP_INs for cold tail/head. + # LMOVE self — creates a warm head on an otherwise cold list, + # increasing the decoded ridx delta on merge. + set handles {} + for {set loader 0} {$loader < $num_loaders} {incr loader} { + lappend handles [start_run_load $master_host $master_port $duration 0 { + set mykey "align-guard-[randomInt 4]" + set mylen [$r1 llen $mykey] + randpath { + $r1 LRANGE $mykey 0 -1 + } { + if {$mylen > 4} { + $r1 LTRIM $mykey 2 [expr {$mylen - 3}] + } else { + $r1 LTRIM $mykey 0 $mylen + } + } { + for {set k 0} {$k < 10} {incr k} { + $r1 RPUSH $mykey "grow:[pid]:$k" + } + } { + $r1 RPOP $mykey + } { + $r1 LPOP $mykey + } { + $r1 LMOVE $mykey $mykey RIGHT LEFT + } + }] + } + + after [expr {$duration * 1000}] + wait_load_handlers_disconnected + + # Sync and verify master/replica content is identical for every key. + wait_for_ofs_sync $master $slave + + for {set i 0} {$i < $num_keys} {incr i} { + set key "align-guard-$i" + set md [$master debug digest-value $key] + set sd [$slave debug digest-value $key] + assert_equal $md $sd \ + "key $key: master/slave digest mismatch (possible data corruption)" + } + } + } +} diff --git a/tests/swap/unit/list_ltrim_cold_repl_diverge.tcl b/tests/swap/unit/list_ltrim_cold_repl_diverge.tcl new file mode 100644 index 00000000000..8083909776e --- /dev/null +++ b/tests/swap/unit/list_ltrim_cold_repl_diverge.tcl @@ -0,0 +1,121 @@ +# Guard test for: LTRIM cold-segment replica divergence causing LSET panic +# +# BUG (fixed in commit 08b1e6342): +# When a list key is fully hot on the replica at LTRIM dispatch time (SWAP_NOP, +# no meta), but background eviction makes it warm (meta with cold segments) +# before LTRIM is applied, two things go wrong: +# +# 1. listBeforeCall rewrites argv[2]/argv[3] from logical indices to hot memory +# indices. For a logical end-index that falls in a COLD segment, +# listMetaGetMidx returns the memory index of the next hot element *after* +# the cold gap, so the wrong elements are retained. +# +# 2. swapListMetaDelRange only trims HOT segments from the head/tail of the +# meta. Cold segments that are logically outside the retained range are +# left in the meta intact. +# +# Result: replica meta.len >> master list len. Subsequent RPOPs / LTRIMs +# slowly drain the replica list element-by-element (each cold RPOP triggers a +# swap-in), eventually deleting the key. When master issues LSET, replica +# finds no key → panic: +# "Guru Meditation: after processing command 'lset' #networking.c:615" +# +# FIX: +# 1. Deferred beforeCall: keyRequestBeforeCall is now executed immediately +# before call() in processFinishedReplCommands rather than at swap-completion +# time, so listBeforeCall always observes the current key state. +# 2. swapListMetaTrimCold(): after swapListMetaDelRange in ltrimCommand, +# compute the desired logical length from the original (pre-rewrite) LTRIM +# indices and trim any excess cold segments from the meta tail. +# +# HOW THIS TEST DETECTS THE BUG: +# - propagation-error-behavior = panic-on-replicas: replica panics on any +# replication error, causing the server to crash → test hangs / fails. +# - After load: debug digest-value comparison between master and replica +# catches silent content divergence even without a panic. +# +# REPRODUCTION RATE (without fix): ~60-80% panic within 10 s of load. +# WITH FIX: 0 panics across 5+ runs; master/replica digest always matches. + +start_server {tags {"list" "repl"} overrides {save ""}} { + start_server {overrides {save ""}} { + set master_host [srv 0 host] + set master_port [srv 0 port] + set master [srv 0 client] + set slave [srv -1 client] + + $slave slaveof $master_host $master_port + wait_for_sync $slave + + # Aggressive eviction on both sides: every key is evicted as soon as it + # becomes eligible, maximising the chance of the hot-at-dispatch / + # warm-at-apply race in LTRIM. + $master config set swap-debug-evict-keys -1 + $slave config set swap-debug-evict-keys -1 + + # Make the replica panic on any replication error so the test catches + # the original "lset on deleted key" failure immediately. + $slave config set propagation-error-behavior panic-on-replicas + + test {swap-list LTRIM cold-segment repl divergence: no panic and master/replica consistent} { + set num_keys 4 + set duration 10 + set num_loaders 4 + + # Seed each key with 80 elements so there is plenty of cold data + # once eviction starts. + for {set i 0} {$i < $num_keys} {incr i} { + $master del "ltrim-cold-guard-$i" + for {set j 0} {$j < 80} {incr j} { + $master rpush "ltrim-cold-guard-$i" "seed:$i:$j" + } + } + wait_for_ofs_sync $master $slave + + # Concurrent load that triggers the race: + # LMOVE self RIGHT-LEFT reorders the tail element to the head, + # creating a temporary hot head on an otherwise cold/warm list. + # LTRIM immediately after reduces the list, targeting indices that + # span cold segments. LSET on the highest valid index is the + # command that panics the replica when list lengths diverge. + set handles {} + for {set loader 0} {$loader < $num_loaders} {incr loader} { + lappend handles [start_run_load $master_host $master_port $duration 0 { + set mykey "ltrim-cold-guard-[randomInt 4]" + set mylen [$r1 llen $mykey] + randpath { + $r1 LMOVE $mykey $mykey RIGHT LEFT + } { + if {$mylen > 3} { + $r1 LTRIM $mykey 1 [expr {$mylen - 2}] + } else { + $r1 LTRIM $mykey 0 $mylen + } + } { + if {$mylen > 0} { + catch {$r1 LSET $mykey [expr {$mylen - 1}] "v-[pid]"} + } + } { + $r1 LPUSH $mykey [pid] [pid] + } { + $r1 RPOP $mykey + } + }] + } + + after [expr {$duration * 1000}] + wait_load_handlers_disconnected + + # Sync and verify master/replica content is identical for every key. + wait_for_ofs_sync $master $slave + + for {set i 0} {$i < $num_keys} {incr i} { + set key "ltrim-cold-guard-$i" + set md [$master debug digest-value $key] + set sd [$slave debug digest-value $key] + assert_equal $md $sd \ + "key $key: master/slave digest mismatch (replica list diverged)" + } + } + } +} diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl index cc66dfb2bfd..4a7cb269603 100644 --- a/tests/unit/expire.tcl +++ b/tests/unit/expire.tcl @@ -852,22 +852,28 @@ start_server {tags {"expire"}} { r debug set-active-expire 0 r flushall - r set foo1 bar PX 1 - if {$::swap} { - #TODO optimize wait_key_cold - catch {wait_key_cold r foo1} err - } - r set foo2 bar PX 1 - if {$::swap} { - #TODO optimize wait_key_cold - catch {wait_key_cold r foo2} err + if {!$::swap} { + r set foo1 bar PX 1 + r set foo2 bar PX 1 + after 2 + } else { + # Set both keys and submit both evictions simultaneously. + # TTL (PX 80) must be > eviction time (~50ms) so eviction completes + # before the key expires (SWAP_OUT, not SWAP_DEL). + # TTL < after 100 guarantees both keys are expired when SCAN runs, + # even if wait_key_cold returns immediately (0ms). + r set foo1 bar PX 80 + r set foo2 bar PX 80 + r swap.evict foo1 + r swap.evict foo2 + wait_key_cold r foo1 + wait_key_cold r foo2 + after 100 } - after 2 - set repl [attach_to_replication_stream] if {!$::swap} { - r scan 0 + r scan 0 } else { set next_cursor [lindex [r scan 0] 0] r scan $next_cursor @@ -878,7 +884,6 @@ start_server {tags {"expire"}} { fail "Keys did not actively expire." } } - assert_replication_stream $repl { {select *} @@ -893,17 +898,19 @@ start_server {tags {"expire"}} { r debug set-active-expire 0 r flushall - r set foo1 bar PX 1 - if {$::swap} { - #TODO optimize wait_key_cold - catch {wait_key_cold r foo1} err - } - r set foo2 bar PX 1 - if {$::swap} { - #TODO optimize wait_key_cold - catch {wait_key_cold r foo2} err + if {!$::swap} { + r set foo1 bar PX 1 + r set foo2 bar PX 1 + after 2 + } else { + r set foo1 bar PX 80 + r set foo2 bar PX 80 + r swap.evict foo1 + r swap.evict foo2 + wait_key_cold r foo1 + wait_key_cold r foo2 + after 100 } - after 2 set repl [attach_to_replication_stream] diff --git a/tests/unit/io_thread.tcl b/tests/unit/io_thread.tcl index ab298241321..a2c8de122f0 100644 --- a/tests/unit/io_thread.tcl +++ b/tests/unit/io_thread.tcl @@ -153,7 +153,15 @@ start_server {overrides {}} { } lappend clients $cli } - assert_equal [get_kv_value [get_info_field [r info threads] io_thread_1 ] clients] 101 + # Wait for all 100 new clients to be accepted and assigned by the server. + # When singledb=1 (e.g. swap build), no SELECT is sent so there is no + # synchronous round-trip to guarantee the server has accepted every + # connection before we query the thread info. + wait_for_condition 100 50 { + [get_kv_value [get_info_field [r info threads] io_thread_1 ] clients] == 101 + } else { + fail "Expected 101 clients on io_thread_1 after adding 100 connections" + } # set io-threads n # wait CLIENT_IO_PENDING_CRON ,load balancing @@ -181,12 +189,16 @@ start_server {overrides {}} { } r config set io-threads 2 assert_equal [get_info_field [r info threads] io_thread_scale_status] "down" + # ioThreadsScaleDownTryEnd() destroys the thread and decrements + # io_threads_num in one beforeSleep pass, but only transitions + # scale_status from DOWN to NONE in the *next* pass. Use + # wait_for_condition for both checks to avoid the one-iteration race. wait_for_condition 100 50 { - [get_info_field [r info threads] io_thread_2 ] eq "" + [get_info_field [r info threads] io_thread_2 ] eq "" && + [get_info_field [r info threads] io_thread_scale_status] eq "none" } else { fail "thread down n => 2 fail" } - assert_equal [get_info_field [r info threads] io_thread_scale_status] "none" if {!$::external} { verify_log_message 0 "*IO threads scale-down end*" $lines }