From 02deba7470fbb4c754e6c813695891bfb37ca0ed Mon Sep 17 00:00:00 2001 From: jjuleslasarte Date: Wed, 18 Mar 2026 14:31:17 -0700 Subject: [PATCH 1/8] aof: offload appendfsync=always flush+fsync to IO threads Move the expensive AOF write+fsync off the main thread when IO threads are available. This prevents the main thread from blocking on disk I/O when appendfsync is set to 'always'. Add a generic trySendJobToIOThreads() API to io_threads with round-robin distribution, and an aof IO flush state machine (IDLE/PENDING/DONE/ERR) with atomic coordination between main and IO threads. The adjustIOThreadsByEventLoad() function gains a has_background_work parameter to ensure IO threads stay active when AOF fsync work is pending, even during low-traffic periods. Signed-off-by: jjuleslasarte --- src/aof.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++ src/io_threads.c | 32 ++++++++++- src/io_threads.h | 3 +- 3 files changed, 177 insertions(+), 3 deletions(-) diff --git a/src/aof.c b/src/aof.c index bfebed1f47a..1a8fe26bd40 100644 --- a/src/aof.c +++ b/src/aof.c @@ -29,6 +29,7 @@ #include "server.h" #include "bio.h" +#include "io_threads.h" #include "rio.h" #include "functions.h" #include "module.h" @@ -51,6 +52,23 @@ aofManifest *aofLoadManifestFromFile(sds am_filepath); void aofManifestFreeAndUpdate(aofManifest *am); void aof_background_fsync_and_close(int fd); +enum { + AOF_IO_FLUSH_IDLE = 0, + AOF_IO_FLUSH_PENDING, + AOF_IO_FLUSH_DONE, + AOF_IO_FLUSH_ERR, +}; + +typedef struct aofIOFlushJob { + int fd; + sds buf; + size_t len; + long long reploff; +} aofIOFlushJob; + +static void processAofIOThreadFlushResult(void); +static int tryOffloadAofAlwaysFlushToIOThreads(void); + /* ---------------------------------------------------------------------------- * AOF Manifest file implementation. 
* @@ -952,6 +970,9 @@ void stopAppendOnly(void) { server.aof_last_incr_size = 0; server.aof_last_incr_fsync_offset = 0; server.fsynced_reploff = -1; + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_errno, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, 0, memory_order_relaxed); atomic_store_explicit(&server.fsynced_reploff_pending, 0, memory_order_relaxed); killAppendOnlyChild(); sdsfree(server.aof_buf); @@ -1002,6 +1023,9 @@ int startAppendOnly(void) { serverLog(LL_WARNING, "AOF reopen, just ignore the last error."); server.aof_last_write_status = C_OK; } + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_errno, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, 0, memory_order_relaxed); return C_OK; } @@ -1156,6 +1180,113 @@ ssize_t aofWrite(int fd, const char *buf, size_t len) { return totwritten; } +static void aofIOThreadFlushJobHandler(void *data) { + aofIOFlushJob *job = data; + int err = 0; + ssize_t nwritten = aofWrite(job->fd, job->buf, job->len); + if (nwritten != (ssize_t)job->len) { + err = (nwritten == -1) ? 
errno : ENOSPC; + goto done; + } + + if (valkey_fsync(job->fd) == -1) { + err = errno; + goto done; + } + + atomic_store_explicit(&server.fsynced_reploff_pending, job->reploff, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, job->len, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_DONE, memory_order_release); + sdsfree(job->buf); + zfree(job); + return; + +done: + atomic_store_explicit(&server.aof_io_flush_errno, err, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_ERR, memory_order_release); + sdsfree(job->buf); + zfree(job); +} + +int aofIOFlushInProgress(void) { + return atomic_load_explicit(&server.aof_io_flush_state, memory_order_acquire) == AOF_IO_FLUSH_PENDING; +} + +static void processAofIOThreadFlushResult(void) { + int state = atomic_load_explicit(&server.aof_io_flush_state, memory_order_acquire); + if (state == AOF_IO_FLUSH_IDLE || state == AOF_IO_FLUSH_PENDING) return; + + if (state == AOF_IO_FLUSH_DONE) { + off_t nwritten = atomic_load_explicit(&server.aof_io_flush_size, memory_order_relaxed); + server.aof_current_size += nwritten; + server.aof_last_incr_size += nwritten; + server.aof_last_incr_fsync_offset = server.aof_last_incr_size; + server.aof_last_fsync = server.mstime; + if (server.aof_last_write_status == C_ERR) { + serverLog(LL_NOTICE, "AOF write error looks solved. 
The server can write again."); + server.aof_last_write_status = C_OK; + } + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_release); + + /* Notify sync replication that AOF fsync completed so blocked clients can be unblocked */ + notifyDurabilityProgress(); + return; + } + + int err = atomic_load_explicit(&server.aof_io_flush_errno, memory_order_relaxed); + server.aof_last_write_errno = err; + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_release); + + if (server.aof_fsync == AOF_FSYNC_ALWAYS) { + serverLog(LL_WARNING, + "Can't persist AOF from IO thread when the " + "AOF fsync policy is 'always': %s. Exiting...", + strerror(err)); + exit(1); + } + server.aof_last_write_status = C_ERR; +} + +static int tryOffloadAofAlwaysFlushToIOThreads(void) { + if (server.aof_fsync != AOF_FSYNC_ALWAYS || sdslen(server.aof_buf) == 0 || aofIOFlushInProgress()) { + return C_ERR; + } + + /* If IO threads are configured but not active, we can't offload. + * Note: Thread activation based on AOF workload is handled by + * adjustIOThreadsByEventLoad() via the has_background_work parameter. */ + if (server.io_threads_num <= 1 || server.active_io_threads_num <= 1) { + return C_ERR; + } + + /* NOTE: With sync replication enabled, we still want to offload fsync to + * IO threads to avoid blocking the main thread. The notifyDurabilityProgress() + * callback will be invoked in beforeSleep() when we check for completed IO thread + * jobs, which will then unblock waiting clients. This adds at most one + * event loop iteration of latency but keeps the main thread responsive. 
*/ + + aofIOFlushJob *job = zmalloc(sizeof(*job)); + job->fd = server.aof_fd; + job->buf = server.aof_buf; + job->len = sdslen(job->buf); + job->reploff = server.primary_repl_offset; + + server.aof_buf = sdsempty(); + atomic_store_explicit(&server.aof_io_flush_errno, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_PENDING, memory_order_release); + if (trySendJobToIOThreads(aofIOThreadFlushJobHandler, job) == C_OK) { + server.aof_flush_postponed_start = 0; + return C_OK; + } + + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_release); + sdsfree(server.aof_buf); + server.aof_buf = job->buf; + zfree(job); + return C_ERR; +} + /* Write the append only file buffer on disk. * * Since we are required to write the AOF before replying to the client, @@ -1180,6 +1311,15 @@ void flushAppendOnlyFile(int force) { int sync_in_progress = 0; mstime_t latency; + processAofIOThreadFlushResult(); + if (aofIOFlushInProgress()) { + if (!force) return; + while (aofIOFlushInProgress()) { + usleep(100); + processAofIOThreadFlushResult(); + } + } + if (sdslen(server.aof_buf) == 0) { /* Check if we need to do fsync even the aof buffer is empty, * because previously in AOF_FSYNC_EVERYSEC mode, fsync is @@ -1234,6 +1374,11 @@ void flushAppendOnlyFile(int force) { "without waiting for fsync to complete, this may slow down the server."); } } + + if (server.aof_fsync == AOF_FSYNC_ALWAYS && !force && tryOffloadAofAlwaysFlushToIOThreads() == C_OK) { + return; + } + /* We want to perform a single write. This should be guaranteed atomic * at least if the filesystem we are writing is a real physical one. 
* While this will save us against the server being killed I don't think diff --git a/src/io_threads.c b/src/io_threads.c index 3fe5e14d5c1..63b3d48b5e4 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -9,6 +9,7 @@ static _Thread_local int thread_id = 0; /* Thread local var */ static pthread_t io_threads[IO_THREADS_MAX_NUM] = {0}; static pthread_mutex_t io_threads_mutex[IO_THREADS_MAX_NUM]; +static size_t next_generic_job_rr = 0; /* Main-thread round-robin counter for generic IO jobs. */ /* IO jobs queue functions - Used to send jobs from the main-thread to the IO thread. */ typedef void (*job_handler)(void *); @@ -131,6 +132,25 @@ int inMainThread(void) { return thread_id == 0; } +/* Attempts to offload a generic job to an IO thread. + * Returns C_OK if the job is enqueued, C_ERR otherwise. */ +int trySendJobToIOThreads(void (*handler)(void *), void *data) { + if (!inMainThread() || server.active_io_threads_num <= 1) return C_ERR; + size_t workers = (size_t)server.active_io_threads_num - 1; + size_t start = (next_generic_job_rr++ % workers) + 1; + + /* Distribute jobs across active IO threads and fall back to any + * available queue if the preferred one is full. */ + for (size_t i = 0; i < workers; i++) { + size_t tid = ((start - 1 + i) % workers) + 1; + IOJobQueue *jq = &io_jobs[tid]; + if (IOJobQueue_isFull(jq)) continue; + IOJobQueue_push(jq, handler, data); + return C_OK; + } + return C_ERR; +} + int getIOThreadID(void) { return thread_id; } @@ -167,14 +187,22 @@ void waitForClientIO(client *c) { } /** Adjusts the number of active I/O threads based on the current event load. - * If increase_only is non-zero, only allows increasing the number of threads.*/ -void adjustIOThreadsByEventLoad(int numevents, int increase_only) { + * If increase_only is non-zero, only allows increasing the number of threads. + * If has_background_work is non-zero, ensures at least one IO thread is active + * for background jobs like AOF fsync. 
*/ +void adjustIOThreadsByEventLoad(int numevents, int increase_only, int has_background_work) { if (server.io_threads_num == 1) return; /* All I/O is being done by the main thread. */ debugServerAssertWithInfo(NULL, NULL, server.io_threads_num > 1); /* When events_per_io_thread is set to 0, we offload all events to the IO threads. * This is used mainly for testing purposes. */ int target_threads = server.events_per_io_thread == 0 ? (numevents + 1) : numevents / server.events_per_io_thread; + /* If there's background work (like AOF fsync), ensure at least 2 threads are active + * so generic jobs can be offloaded to IO threads. */ + if (has_background_work && target_threads < 2) { + target_threads = 2; + } + target_threads = max(1, min(target_threads, server.io_threads_num)); if (target_threads == server.active_io_threads_num) return; diff --git a/src/io_threads.h b/src/io_threads.h index 308dc6dbff8..4fa9ac6f898 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -6,11 +6,12 @@ void initIOThreads(void); void killIOThreads(void); int inMainThread(void); +int trySendJobToIOThreads(void (*handler)(void *), void *data); int trySendReadToIOThreads(client *c); int trySendWriteToIOThreads(client *c); int tryOffloadFreeObjToIOThreads(robj *o); int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); -void adjustIOThreadsByEventLoad(int numevents, int increase_only); +void adjustIOThreadsByEventLoad(int numevents, int increase_only, int has_background_work); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); int trySendAcceptToIOThreads(connection *conn); From 75cc8de2d4e07fe03d5dc5df339a5689932bd221 Mon Sep 17 00:00:00 2001 From: jjuleslasarte Date: Wed, 18 Mar 2026 14:31:29 -0700 Subject: [PATCH 2/8] durability: add pluggable durability provider framework Introduce a provider registry that allows multiple durability backends (AOF fsync, replicas, etc.) to register and contribute to a consensus offset. 
The overall durability consensus is the MIN (AND) of all enabled providers' acknowledged offsets. Include the built-in AOF provider that tracks fsynced_reploff_pending when appendfsync=always, and transparently passes through when not. Add pause/resume support for providers (used via DEBUG commands) to enable deterministic testing by freezing a provider's acknowledged offset at a point-in-time snapshot. Signed-off-by: jjuleslasarte --- src/durability_provider.c | 186 ++++++++++++++++++++++++++++++++++++++ src/durability_provider.h | 56 ++++++++++++ 2 files changed, 242 insertions(+) create mode 100644 src/durability_provider.c create mode 100644 src/durability_provider.h diff --git a/src/durability_provider.c b/src/durability_provider.c new file mode 100644 index 00000000000..fd3c72a8095 --- /dev/null +++ b/src/durability_provider.c @@ -0,0 +1,186 @@ +#include "durability_provider.h" +#include "server.h" +#include <strings.h> +#include <stdatomic.h> + +/*================================= Durability Provider Registry ============= */ + +/* Provider registry: static array of registered providers */ +static durabilityProvider *durability_providers[MAX_DURABILITY_PROVIDERS]; +static int num_durability_providers = 0; + +/** + * Register a durability provider. Providers are checked in registration order. + * The overall durability consensus is the MIN (AND) of all enabled providers. + */ +void registerDurabilityProvider(durabilityProvider *provider) { + serverAssert(num_durability_providers < MAX_DURABILITY_PROVIDERS); + durability_providers[num_durability_providers++] = provider; + serverLog(LL_NOTICE, "Registered durability provider: %s", provider->name); +} + +/** + * Unregister a durability provider by pointer. 
+ */ +void unregisterDurabilityProvider(durabilityProvider *provider) { + for (int i = 0; i < num_durability_providers; i++) { + if (durability_providers[i] == provider) { + /* Shift remaining providers down */ + for (int j = i; j < num_durability_providers - 1; j++) { + durability_providers[j] = durability_providers[j + 1]; + } + num_durability_providers--; + serverLog(LL_NOTICE, "Unregistered durability provider: %s", provider->name); + return; + } + } +} + +bool anyDurabilityProviderEnabled(void) { + for (int i = 0; i < num_durability_providers; i++) { + if (durability_providers[i]->isEnabled()) return true; + } + return false; +} + +/** + * Reset the durability provider registry so it can be re-initialized. + */ +void resetDurabilityProviders(void) { + num_durability_providers = 0; +} + +/*================================= Built-in AOF Provider ==================== */ + +static bool aofProviderIsEnabled(void) { + return server.aof_state != AOF_OFF; +} + +static long long aofProviderGetAckedOffset(void) { + /* If appendfsync is not "always", we cannot guarantee data is on disk + * after each write. Return primary_repl_offset to indicate "no constraint", + * effectively making this provider a transparent pass-through that doesn't + * block consensus. When appendfsync is switched to "always", the provider + * immediately starts returning the actual fsynced offset. */ + if (server.aof_fsync != AOF_FSYNC_ALWAYS) { + return server.primary_repl_offset; + } + + /* Use fsynced_reploff_pending directly instead of fsynced_reploff. + * When async AOF flushing is used (IO threads), fsynced_reploff_pending + * is updated by the IO thread upon fsync completion, but fsynced_reploff + * is only updated in the next beforeSleep() iteration. Using the pending + * value ensures we see the most up-to-date fsync progress immediately. 
*/ + long long fsynced_offset = atomic_load_explicit(&server.fsynced_reploff_pending, memory_order_relaxed); + /* Handle the case where AOF is enabled but no data has been fsynced yet + * (fsynced_reploff_pending is 0 initially). In that case, use fsynced_reploff + * if it's been properly initialized. */ + if (fsynced_offset == 0 && server.fsynced_reploff > 0) { + fsynced_offset = server.fsynced_reploff; + } + return fsynced_offset; +} + +static durabilityProvider builtinAofProvider = { + .name = "aof", + .isEnabled = aofProviderIsEnabled, + .getAckedOffset = aofProviderGetAckedOffset, + .paused = false, + .pausedOffset = 0, +}; + +/** + * Register the built-in durability providers. Called from durabilityInit(). + * + * Currently only the AOF provider is built-in. Replica-based durability + * (e.g. raft consensus) should be registered externally as a provider + * via registerDurabilityProvider(). + */ +void registerBuiltinDurabilityProviders(void) { + /* Only register if not already registered (idempotent) */ + if (num_durability_providers == 0) { + registerDurabilityProvider(&builtinAofProvider); + } +} + +/*================================= Consensus Calculation ==================== */ + +/** + * Returns the durability consensus offset by iterating all registered + * providers and returning the MIN of all enabled providers' acknowledged + * offsets (AND semantics: all must acknowledge). + * + * If a provider returns -1, it means the provider cannot make progress + * (e.g. insufficient replicas), which blocks consensus advancement. + * + * If no providers are enabled, returns server.primary_repl_offset + * (i.e. no blocking). 
+ */ +long long getDurabilityConsensusOffset(void) { + long long consensus = server.primary_repl_offset; + bool any_enabled = false; + + for (int i = 0; i < num_durability_providers; i++) { + durabilityProvider *p = durability_providers[i]; + if (!p->isEnabled()) continue; + any_enabled = true; + + long long offset; + if (p->paused) { + /* Paused provider (via DEBUG) returns the offset snapshot + * captured at pause time, freezing consensus at that point. */ + offset = p->pausedOffset; + } else { + offset = p->getAckedOffset(); + } + + if (offset == -1) { + /* Provider cannot make progress — block consensus. */ + return -1; + } + if (offset < consensus) consensus = offset; + } + + return any_enabled ? consensus : server.primary_repl_offset; +} + +/** + * Pause a durability provider by name (via DEBUG command). + * When paused, the provider's current acknowledged offset is captured and + * frozen — any writes after the pause point will block until the provider + * is resumed and catches up. + * Returns true if provider was found, false otherwise. + */ +bool pauseDurabilityProvider(const char *name) { + for (int i = 0; i < num_durability_providers; i++) { + if (!strcasecmp(durability_providers[i]->name, name)) { + /* Snapshot the current acked offset before pausing so that + * writes already acknowledged remain unblocked. */ + durability_providers[i]->pausedOffset = durability_providers[i]->getAckedOffset(); + durability_providers[i]->paused = true; + serverLog(LL_NOTICE, "Paused durability provider: %s (frozen at offset %lld)", + name, durability_providers[i]->pausedOffset); + return true; + } + } + return false; +} + +/** + * Resume a durability provider by name (via DEBUG command). + * After resuming, triggers a durability progress check to unblock + * any clients that can now proceed. + * Returns true if provider was found, false otherwise. 
+ */ +bool resumeDurabilityProvider(const char *name) { + for (int i = 0; i < num_durability_providers; i++) { + if (!strcasecmp(durability_providers[i]->name, name)) { + durability_providers[i]->paused = false; + /* Trigger a durability check to unblock any clients that can now proceed */ + notifyDurabilityProgress(); + serverLog(LL_NOTICE, "Resumed durability provider: %s", name); + return true; + } + } + return false; +} diff --git a/src/durability_provider.h b/src/durability_provider.h new file mode 100644 index 00000000000..e43885763f9 --- /dev/null +++ b/src/durability_provider.h @@ -0,0 +1,56 @@ +#ifndef DURABILITY_PROVIDER_H +#define DURABILITY_PROVIDER_H + +#include <stdbool.h> + +/*================================= Durability Provider Interface ============ */ + +/** + * Maximum number of durability providers that can be registered. + * Built-in providers: replica, aof. + */ +#define MAX_DURABILITY_PROVIDERS 4 + +/** + * A durability provider represents a source of durability acknowledgment. + * Each provider tracks progress independently and the overall durability + * consensus is the MIN (AND) of all enabled providers' acknowledged offsets. + * + * Examples: replica acknowledgments, AOF fsync. + */ +typedef struct durabilityProvider { + const char *name; /* Human-readable name, e.g. "replica", "aof" */ + bool (*isEnabled)(void); /* Is this provider currently active? */ + long long (*getAckedOffset)(void); /* What offset has this provider acknowledged? */ + bool paused; /* When true (via DEBUG), getAckedOffset() returns + * the offset captured at pause time to freeze + * consensus progress. Used for testing. */ + long long pausedOffset; /* Offset snapshot taken when provider is paused. 
*/ +} durabilityProvider; + +/* Provider registry */ +void registerDurabilityProvider(durabilityProvider *provider); +void unregisterDurabilityProvider(durabilityProvider *provider); +bool anyDurabilityProviderEnabled(void); +bool pauseDurabilityProvider(const char *name); +bool resumeDurabilityProvider(const char *name); + +/** + * Returns the durability consensus offset by iterating all registered + * providers and returning the MIN of all enabled providers' acknowledged + * offsets (AND semantics: all must acknowledge). + */ +long long getDurabilityConsensusOffset(void); + +/** + * Register the built-in durability providers (replica + AOF). + * Called from durabilityInit(). + */ +void registerBuiltinDurabilityProviders(void); + +/** + * Reset the durability provider registry (for cleanup/shutdown). + */ +void resetDurabilityProviders(void); + +#endif /* DURABILITY_PROVIDER_H */ From 05c14378be250b976cc8ff1bd44b71c993d8d904 Mon Sep 17 00:00:00 2001 From: jjuleslasarte Date: Wed, 18 Mar 2026 14:31:42 -0700 Subject: [PATCH 3/8] durability: add deferred task system for post-ack execution Add a task registry that defers side-effects (keyspace notifications, key invalidations, flush invalidations) until durability providers acknowledge the associated write offset. Each task type registers create/destroy/execute/onClientDestroy handlers. Tasks are created during command execution with a deferred offset, then moved to an official waiting list once the replication offset is known. When the consensus offset advances past a task's offset, the task is executed and freed. Key invalidation tasks track the originating client pointer and properly handle client disconnection before task execution. 
Signed-off-by: jjuleslasarte --- src/durable_task.c | 381 +++++++++++++++++++++++++++++++++++++++++++++ src/durable_task.h | 89 +++++++++++ 2 files changed, 470 insertions(+) create mode 100644 src/durable_task.c create mode 100644 src/durable_task.h diff --git a/src/durable_task.c b/src/durable_task.c new file mode 100644 index 00000000000..b9c07bb6ae3 --- /dev/null +++ b/src/durable_task.c @@ -0,0 +1,381 @@ +#include "durable_task.h" +#include "durability_provider.h" +#include "server.h" +#include "zmalloc.h" +#include <stdarg.h> +#include <assert.h> + +/* Forward declarations from module.h to avoid pulling in full module internals + * which has header dependency issues when included before server.h */ +void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid); + +/*================================= Internal Data structures ======================== */ + +/** + * Internal structure used to track replication offset and arguments needed in + * executing task when offset has been acked by required number of replicas. + */ +typedef struct taskWaitingAck { + int type; // Task type + int64_t offset; + void **argv; +} taskWaitingAck; + +/** + * Internal structure used to define all handlers for a task type + */ +typedef struct taskWaitingAckType { + taskWaitingAck *(*createTask)(va_list); + void (*destroyTask)(void *); + void (*executeTask)(const taskWaitingAck *); + void (*onClientDestroy)(void *); +} taskWaitingAckType; + +static taskWaitingAckType taskTypes[DURABLE_TASK_TYPE_MAX]; + +/*================================= Keyspace Notify Task ===================== */ + +/** + * Create the keyspace notify task. 
+ */ +static taskWaitingAck *createKeyspaceNotifyTask(va_list ap) { + int argc = 4; // 4 arguments for notify function: type, event, key, dbid + taskWaitingAck *task = zcalloc(sizeof(taskWaitingAck)); + task->argv = zmalloc(argc * sizeof(void *)); + for (int i = 0; i < argc; i++) { + task->argv[i] = va_arg(ap, void *); + } + + // Increase reference count to avoid the key from being deleted + robj *key = (robj *)task->argv[2]; + if (key) { + incrRefCount(key); + } + return task; +} + +/** + * Destroy the keyspace notify task. + */ +static void destroyKeyspaceNotifyTask(void *ptr) { + taskWaitingAck *task = (taskWaitingAck *)ptr; + if (task->argv[2]) { + robj *key = (robj *)task->argv[2]; + decrRefCount(key); + } + zfree(task->argv); + zfree(task); +} + +/** + * Execute the keyspace notify task. + */ +static void executeKeyspaceNotifyTask(const taskWaitingAck *task) { + static_assert(sizeof(long long) == sizeof(void *), "void* is not the same size as long long"); + notifyKeyspaceEvent((int)(long long)task->argv[0], + (char *)task->argv[1], + (robj *)task->argv[2], + (int)(long long)task->argv[3]); +} + +/*================================= Key Invalidation Task ==================== */ + +/** + * Create the key invalidation task. + */ +static taskWaitingAck *createKeyInvalidationTask(va_list ap) { + // A key invalidation task has 2 arguments: + // 1. client* which generated the modification on the key + // 2. 
serverObject* that is modified + int argc = 2; + taskWaitingAck *task = zcalloc(sizeof(taskWaitingAck)); + task->argv = zmalloc(argc * sizeof(void *)); + for (int i = 0; i < argc; i++) { + task->argv[i] = va_arg(ap, void *); + } + + // Track the pending notification task in the referenced client + client *c = (client *)task->argv[0]; + if (c != NULL) { + listAddNodeTail(c->clientDurabilityInfo.pending_notify_tasks, task); + } + + // Increase reference count to avoid the key from being deleted + robj *key = (robj *)task->argv[1]; + if (key) { + incrRefCount(key); + } + return task; +} + +/** + * Destroy the key invalidation task. + */ +static void destroyKeyInvalidationTask(void *ptr) { + taskWaitingAck *task = (taskWaitingAck *)ptr; + // Remove the current task from the list of pending tasks for the client. + // The tasks are tracked in FIFO order so we only need to look at the first one. + client *c = (client *)task->argv[0]; + if (c != NULL) { + serverAssert(listLength(c->clientDurabilityInfo.pending_notify_tasks) > 0); + listNode *first = listFirst(c->clientDurabilityInfo.pending_notify_tasks); + serverAssert(task == (taskWaitingAck *)listNodeValue(first)); + listDelNode(c->clientDurabilityInfo.pending_notify_tasks, first); + } + + // Decrement the refcount for the key + if (task->argv[1]) { + robj *key = (robj *)task->argv[1]; + decrRefCount(key); + } + zfree(task->argv); + zfree(task); +} + +/** + * De-reference the client argument from the key invalidation task + */ +static void destroyClientForKeyInvalidationTask(void *task_ptr) { + taskWaitingAck *task = (taskWaitingAck *)task_ptr; + // The first argument is the client pointer + task->argv[0] = NULL; +} + +/** + * Execute the key invalidation task. 
+ */ +static void executeKeyInvalidationTask(const taskWaitingAck *task) { + trackingInvalidateKey((client *)task->argv[0], (robj *)task->argv[1], 1); +} + +/*================================= Flush Invalidation Task ================== */ + +/** + * Create the flush invalidation task. + */ +static taskWaitingAck *createFlushInvalidationTask(va_list ap) { + // Flush invalidation task has database ID as argument + int argc = 1; + taskWaitingAck *task = zcalloc(sizeof(taskWaitingAck)); + task->argv = zmalloc(argc * sizeof(void *)); + for (int i = 0; i < argc; i++) { + task->argv[i] = va_arg(ap, void *); + } + return task; +} + +/** + * Destroy the flush invalidation task. + */ +static void destroyFlushInvalidationTask(void *ptr) { + taskWaitingAck *task = (taskWaitingAck *)ptr; + zfree(task->argv); + zfree(task); +} + +/** + * Execute the flush invalidation task. + */ +static void executeFlushInvalidationTask(const taskWaitingAck *task) { + bool is_flush_all = (bool)task->argv[0]; + // Use DBID -1 for FLUSHALL, otherwise use 0 for DBID + // Note: This assumes the OSS Redis code below doesn't operate on the actual + // DBID besides differentiating between FLUSHDB and FLUSHALL. + trackingInvalidateKeysOnFlush(is_flush_all ? 
-1 : 0); +} + +/*================================= Default callback ========================= */ + +/** + * Default callback on client destroy doing no-op + */ +static void destroyClientDefaultCallback(void *task) { + UNUSED(task); + return; +} + +/*================================= Task Type Registry ======================= */ + +void initTaskTypes(void) { + taskTypes[DURABLE_KEYSPACE_NOTIFY_TASK] = (taskWaitingAckType){ + createKeyspaceNotifyTask, + destroyKeyspaceNotifyTask, + executeKeyspaceNotifyTask, + destroyClientDefaultCallback}; + taskTypes[DURABLE_KEY_INVALIDATION_TASK] = (taskWaitingAckType){ + createKeyInvalidationTask, + destroyKeyInvalidationTask, + executeKeyInvalidationTask, + destroyClientForKeyInvalidationTask}; + // needed + taskTypes[DURABLE_FLUSH_INVALIDATION_TASK] = (taskWaitingAckType){ + createFlushInvalidationTask, + destroyFlushInvalidationTask, + executeFlushInvalidationTask, + destroyClientDefaultCallback}; +} + +/*================================= Task Registration ======================== */ + +/** + * Create task based on the given task type and arguments, and append the new + * task to the end of the linkedlist of the pending tasks of that task type. + * + * Note that at this point in time, we might not know about the replication + * offset we want to configure this task with so we put it onto a pending list. + * And at a later point in time, when we know the replication offset, we would + * set it and move the task to the official tasks list. + */ +bool durabilityRegisterDeferredTask(int type, ...) 
{ + /* Check durability is active and the type is valid */ + if (!isPrimaryDurabilityEnabled() || (type == DURABLE_TASK_TYPE_MAX)) { + return false; + } + + va_list ap; + bool return_code = false; + va_start(ap, type); + taskWaitingAck *task = taskTypes[type].createTask(ap); + if (task) { + task->type = type; + if (server.current_client != NULL) { + // Here the notification is triggered by an incoming client request when we + // don't yet know the actual replication offset after command is applied, + // so we need to put it onto a pending tasks list. + listAddNodeTail(server.durability.pending_tasks_waiting_ack[type], task); + } else { + /* This notification is triggered from a background job such as + * active expiry or eviction outside of a regular client command. + * The replication offset is already updated so we use it directly. */ + task->offset = server.primary_repl_offset; + listAddNodeTail(server.durability.tasks_waiting_ack[type], task); + } + return_code = true; + } + va_end(ap); + return return_code; +} + +/*================================= Signal Handlers ========================== */ + +bool durabilitySignalModifiedKey(struct client *c, struct serverDb *db, struct serverObject *key) { + UNUSED(db); + /* Defer key invalidation messages until the durability providers acknowledge. */ + return durabilityRegisterDeferredTask(DURABLE_KEY_INVALIDATION_TASK, + (void *)c, (void *)key); +} + + +bool durabilitySignalFlushedDb(int dbid) { + /* Defer flush invalidation messages until the durability providers acknowledge. */ + return durabilityRegisterDeferredTask(DURABLE_FLUSH_INVALIDATION_TASK, + (void *)(long long)(dbid == -1)); +} + +/*================================= Task Execution =========================== */ + +/** + * Find and execute deferred tasks when 'consensus_ack_offset' is acked. 
+ */ +void executeDeferredTasksForAck(const long long consensus_ack_offset) { + listIter li; + listNode *ln; + struct durable_t *durability = &server.durability; + + for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) { + listRewind(durability->tasks_waiting_ack[i], &li); + while ((ln = listNext(&li))) { + taskWaitingAck *task = listNodeValue(ln); + if (task->offset <= consensus_ack_offset) { + taskTypes[i].executeTask(task); + listDelNode(durability->tasks_waiting_ack[i], ln); + } else { + break; + } + } + } +} + +/** + * Move pending deferred tasks to the official list with the current replication offset. + */ +void certifyPendingDeferredTasks(void) { + listIter li; + listNode *ln; + for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) { + listRewind(server.durability.pending_tasks_waiting_ack[i], &li); + while ((ln = listNext(&li))) { + taskWaitingAck *task = listNodeValue(ln); + serverAssert(task->offset == 0); + task->offset = server.primary_repl_offset; + if (task->type == DURABLE_KEYSPACE_NOTIFY_TASK) { + moduleNotifyKeyspaceEvent( + /*type*/ (intptr_t)task->argv[0], + /*event*/ (char *)task->argv[1], + /*key*/ (robj *)task->argv[2], + /*dbid*/ (intptr_t)task->argv[3]); + } + } + if (listLength(server.durability.pending_tasks_waiting_ack[i]) > 0) { + listJoin(server.durability.tasks_waiting_ack[i], server.durability.pending_tasks_waiting_ack[i]); + } + serverAssert(listLength(server.durability.pending_tasks_waiting_ack[i]) == 0); + } +} + +/*================================= Client Lifecycle ========================= */ + +/** + * Notify the task system that a client is being destroyed so that + * any tasks referencing it can de-reference the client pointer. 
+ */ +void durableTaskNotifyClientDestroy(struct list *pending_notify_tasks) { + listIter li; + listNode *ln; + listRewind(pending_notify_tasks, &li); + while ((ln = listNext(&li))) { + taskWaitingAck *task = (taskWaitingAck *)listNodeValue(ln); + if (task) { + taskTypes[task->type].onClientDestroy(task); + } + } +} + +/*================================= Init / Cleanup =========================== */ + +/** + * Initialize the task lists in the durability structure. + * Called from durabilityInit(). + */ +void durableTaskInitLists(void) { + for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) { + server.durability.tasks_waiting_ack[i] = listCreate(); + server.durability.pending_tasks_waiting_ack[i] = listCreate(); + listSetFreeMethod(server.durability.tasks_waiting_ack[i], + taskTypes[i].destroyTask); + listSetFreeMethod(server.durability.pending_tasks_waiting_ack[i], + taskTypes[i].destroyTask); + } +} + +/** + * Release (free) all task lists. Called from durabilityCleanup(). + */ +void durableTaskCleanupLists(void) { + for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) { + listRelease(server.durability.tasks_waiting_ack[i]); + server.durability.tasks_waiting_ack[i] = NULL; + listRelease(server.durability.pending_tasks_waiting_ack[i]); + server.durability.pending_tasks_waiting_ack[i] = NULL; + } +} + +/** + * Empty (but don't free) all task lists. Called during primary state reset. 
+ */ +void durableTaskEmptyLists(void) { + for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) { + listEmpty(server.durability.tasks_waiting_ack[i]); + listEmpty(server.durability.pending_tasks_waiting_ack[i]); + } +} diff --git a/src/durable_task.h b/src/durable_task.h new file mode 100644 index 00000000000..f16cafa8a0c --- /dev/null +++ b/src/durable_task.h @@ -0,0 +1,89 @@ +#ifndef DURABLE_TASK_H +#define DURABLE_TASK_H + +#include +#include + +struct client; +struct serverDb; +struct serverObject; +struct list; + +/** + * Define the supported task types for deferred work that executes + * after durability has been confirmed (replica ACK). + */ +typedef enum { + DURABLE_KEYSPACE_NOTIFY_TASK = 0, /* KEYSPACE NOTIFY task */ + DURABLE_KEY_INVALIDATION_TASK, /* Key invalidation task for client side caching */ + DURABLE_FLUSH_INVALIDATION_TASK, /* FLUSH invalidation task for client side caching */ + DURABLE_TASK_TYPE_MAX /* Max task type */ +} durableTaskType; + +/** + * Initialize the task type registry (create/destroy/execute handlers). + * Must be called before any task registration. + */ +void initTaskTypes(void); + +/** + * Register a deferred task for execution after the current replication + * offset is acknowledged by durability providers. The task is created + * from the variadic arguments based on the given task type. + * + * Returns true if the task was successfully registered, false otherwise. + */ +bool durabilityRegisterDeferredTask(int type, ...); + +/** + * Find and execute all deferred tasks whose offset <= consensus_ack_offset. + */ +void executeDeferredTasksForAck(long long consensus_ack_offset); + +/** + * Move pending tasks (registered during command execution before the + * replication offset was known) to the official tasks list, setting + * their offset to server.primary_repl_offset. 
+ */ +void certifyPendingDeferredTasks(void); + +/** + * Notify the task system that a client is being destroyed so that + * any tasks referencing it can de-reference the client pointer. + * Iterates all tasks in the given pending_notify_tasks list. + */ +void durableTaskNotifyClientDestroy(struct list *pending_notify_tasks); + +/** + * Custom processing whenever a key gets modified. Invoked from signalModifiedKey(). + * + * Return true if no further processing are required in signalModifiedKey() such + * as some async tasks are created which need some time to finish, false otherwise. + */ +bool durabilitySignalModifiedKey(struct client *c, struct serverDb *db, struct serverObject *key); + +/** + * Custom processing whenever a FLUSH happens. Invoked from signalFlushedDb(). + * + * Return true if no further processing are required in signalFlushedDb() such + * as some async tasks are created which need some time to finish, false otherwise. + */ +bool durabilitySignalFlushedDb(int dbid); + +/** + * Initialize the task lists in the durability structure. + * Called from durabilityInit(). + */ +void durableTaskInitLists(void); + +/** + * Release (free) all task lists. Called from durabilityCleanup(). + */ +void durableTaskCleanupLists(void); + +/** + * Empty (but don't free) all task lists. Called during primary state reset. + */ +void durableTaskEmptyLists(void); + +#endif /* DURABLE_TASK_H */ From f15f181fdd9eda151a03f9b92286e6b7fab05395 Mon Sep 17 00:00:00 2001 From: jjuleslasarte Date: Wed, 18 Mar 2026 14:31:54 -0700 Subject: [PATCH 4/8] durability: add uncommitted key tracking per database Track which keys have been modified but not yet acknowledged by durability providers using a per-database hashtable. This enables rejecting reads of uncommitted keys to ensure clients only see durable data (zero-data-loss semantics). Each uncommitted key stores the replication offset at which it was last modified. 
Keys are purged when the durability consensus offset advances past their stored offset. Include incremental cleanup via serverCron that scans databases round-robin with a configurable time limit, plus immediate purging on read access (lazy cleanup). Also handle database-level modifications (FLUSHDB, FLUSHALL, SWAPDB) and function store dirty tracking for transactions. Signed-off-by: jjuleslasarte --- src/uncommitted_keys.c | 521 +++++++++++++++++++++++++++++++++++++++++ src/uncommitted_keys.h | 97 ++++++++ 2 files changed, 618 insertions(+) create mode 100644 src/uncommitted_keys.c create mode 100644 src/uncommitted_keys.h diff --git a/src/uncommitted_keys.c b/src/uncommitted_keys.c new file mode 100644 index 00000000000..3c83d1f360d --- /dev/null +++ b/src/uncommitted_keys.c @@ -0,0 +1,521 @@ +#include "uncommitted_keys.h" +#include "server.h" +#include "zmalloc.h" +#include "script.h" +#include +#include + +/*================================= Internal Data Structures ================= */ +typedef struct uncommittedKeyEntry { + sds key; + long long offset; +} uncommittedKeyEntry; + +typedef struct uncommittedKeyCleanupCtx { + hashtable *ht; + long long acked_offset; + unsigned long long *scan_count; +} uncommittedKeyCleanupCtx; + +/** + * Below are the data structures used to buffer intermediate dirty keys/DBs for multi-command + * blocks including MULTI/EXEC and Lua script. As we execute the individual commands in the + * transaction, we don't know the final replication offset so we store the updated keys and DBs + * in afterCommandTrackReplOffset(), and process them in postCommandExec() after the entire transaction is + * propagated to the replication buffer. 
+ */ +typedef struct pendingUncommittedKey { + robj *key; + hashtable *uncommitted_keys; +} pendingUncommittedKey; + +// Track the list of pending uncommitted keys for an ongoing multi-command block +static list *pending_uncommitted_keys; + +// Track the list of pending uncommitted databases for an ongoing multi-command block +static list *pending_uncommitted_dbs; + + +/*================================= Internal Prototypes ====================== */ + +static void addUncommittedKey(sds key, long long offset, hashtable *uncommittedKeys); +static void uncommittedKeysCleanupScanCallback(void *privdata, void *entry); +static void pendingUncommittedKeyDestructor(void *entry); +static uint64_t uncommittedKeysHash(const void *key); +static int uncommittedKeysKeyCompare(const void *key1, const void *key2); +static const void *uncommittedKeyEntryGetKey(const void *entry); +static void uncommittedKeyEntryDestructor(void *entry); +static void handleDirtyDatabase(client *c, serverDb *db); +static bool swapdbGetParams(robj **argv, int argc, int *id1_p, int *id2_p); +static bool selectGetParams(robj **argv, int argc, client *permission_client, int *dbid_p); +static bool getDbIdFromRobj(robj *obj, int *db_id); +static int isSingleCommandAccessingUncommittedKeys(const serverDb *db, struct serverCommand *cmd, robj **argv, int argc); +static int isAccessingUncommittedData(client *c); + +/*================================= Hashtable Type =========================== */ + +static hashtableType uncommittedKeysHashtableType = { + .entryGetKey = uncommittedKeyEntryGetKey, + .hashFunction = uncommittedKeysHash, + .keyCompare = uncommittedKeysKeyCompare, + .entryDestructor = uncommittedKeyEntryDestructor, +}; + +/*================================= Utility Functions ======================== */ + +static void pendingUncommittedKeyDestructor(void *entry) { + if (entry == NULL) return; + pendingUncommittedKey *uk = entry; + if (uk->key != NULL) decrRefCount(uk->key); + zfree(uk); +} + +static 
uint64_t uncommittedKeysHash(const void *key) { + const sds keystr = (const sds)key; + return hashtableGenHashFunction(keystr, sdslen(keystr)); +} + +static int uncommittedKeysKeyCompare(const void *key1, const void *key2) { + const sds s1 = (const sds)key1; + const sds s2 = (const sds)key2; + return sdslen(s1) != sdslen(s2) || memcmp(s1, s2, sdslen(s1)); +} + +static const void *uncommittedKeyEntryGetKey(const void *entry) { + return ((const uncommittedKeyEntry *)entry)->key; +} + +static void uncommittedKeyEntryDestructor(void *entry) { + if (entry == NULL) return; + uncommittedKeyEntry *uke = entry; + sdsfree(uke->key); + zfree(uke); +} + +unsigned long long getNumberOfUncommittedKeys(void) { + unsigned long long num_uncommitted_keys = 0; + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i] != NULL) { + num_uncommitted_keys += hashtableSize(server.db[i]->uncommitted_keys); + } + } + return num_uncommitted_keys; +} + +unsigned long long getUncommittedKeysCleanupTimeLimit(unsigned long long num_uncommitted_keys) { + unsigned long long time_limit_ms = 1; + if (num_uncommitted_keys > 0) { + time_limit_ms = ceil(server.durability.keys_cleanup_time_limit_ms * MIN(1, (double)(num_uncommitted_keys / 1000000.0))); + } + return time_limit_ms; +} + +/*================================= Key Tracking ============================= */ + +/** + * Mark a key as uncommitted at a particular replication offset. + */ +static void addUncommittedKey(const sds key, const long long offset, hashtable *uncommittedKeys) { + uncommittedKeyEntry *entry = zmalloc(sizeof(*entry)); + entry->key = sdsdup(key); + entry->offset = offset; + + void *existing = NULL; + if (hashtableAddOrFind(uncommittedKeys, entry, &existing)) { + return; + } + + uncommittedKeyEntry *existing_entry = existing; + existing_entry->offset = offset; + sdsfree(entry->key); + zfree(entry); +} + +/** + * Callback for hashtableScan for cleaning up uncommitted keys. 
+ */ +static void uncommittedKeysCleanupScanCallback(void *privdata, void *entry) { + uncommittedKeyCleanupCtx *ctx = privdata; + uncommittedKeyEntry *uke = entry; + if (uke->offset <= ctx->acked_offset) { + hashtableDelete(ctx->ht, uke->key); + } + (*ctx->scan_count)++; +} + +/** + * Retrieve the uncommitted replication offset for a given key, purge the given + * key from uncommitted keys set if the replication offset has been committed. + */ +long long durabilityPurgeAndGetUncommittedKeyOffset(const sds key, serverDb *db) { + serverAssert(iAmPrimary()); + uncommittedKeyEntry *entry = NULL; + if (!hashtableFind(db->uncommitted_keys, key, (void **)&entry)) { + return -1; + } + + long long key_offset = entry->offset; + + if (key_offset <= server.durability.previous_acked_offset) { + hashtableDelete(db->uncommitted_keys, key); + return -1; + } + + return key_offset; +} + +/** + * Handle a dirty key for a given client. + */ +void handleUncommittedKeyForClient(const client *c, robj *key, serverDb *db) { + if ((c != NULL) && ((c->flag.multi) || scriptIsRunning())) { + if (server.durability.all_dbs_dirty_in_current_cmd) return; + if (pending_uncommitted_keys == NULL) { + pending_uncommitted_keys = listCreate(); + listSetFreeMethod(pending_uncommitted_keys, pendingUncommittedKeyDestructor); + } + pendingUncommittedKey *dirty_key = (pendingUncommittedKey *)zmalloc(sizeof(pendingUncommittedKey)); + incrRefCount(key); + dirty_key->key = key; + dirty_key->uncommitted_keys = db->uncommitted_keys; + listAddNodeTail(pending_uncommitted_keys, dirty_key); + } else { + addUncommittedKey(objectGetVal(key), server.primary_repl_offset, db->uncommitted_keys); + } +} + +/*================================= Database Modification ==================== */ + +static void handleDirtyDatabase(client *c, serverDb *db) { + if ((c->flag.multi) || scriptIsRunning()) { + if (server.durability.all_dbs_dirty_in_current_cmd) return; + if (db != NULL) { + listAddNodeTail(pending_uncommitted_dbs, db); + 
} else { + server.durability.all_dbs_dirty_in_current_cmd = true; + listEmpty(pending_uncommitted_keys); + listEmpty(pending_uncommitted_dbs); + } + } else { + if (db != NULL) { + db->dirty_repl_offset = server.primary_repl_offset; + } else { + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i] != NULL) { + server.db[i]->dirty_repl_offset = server.primary_repl_offset; + } + } + } + } +} + +void handleDatabaseModification(client *c) { + if (c->cmd->proc == swapdbCommand && server.cluster_enabled == 0) { + int id1, id2; + if (swapdbGetParams(c->argv, c->argc, &id1, &id2)) { + handleDirtyDatabase(c, server.db[id1]); + handleDirtyDatabase(c, server.db[id2]); + } + } else if (c->cmd->proc == flushdbCommand) { + handleDirtyDatabase(c, c->db); + } else if (c->cmd->proc == flushallCommand) { + handleDirtyDatabase(c, NULL); + } +} + +/*================================= Command Parameter Helpers ================ */ + +static bool swapdbGetParams(robj **argv, int argc, int *id1_p, int *id2_p) { + long long dbid1, dbid2; + if (argc != 3) return false; + if (server.cluster_enabled) return false; + if (getLongLongFromObject(argv[1], &dbid1) != C_OK) return false; + if (getLongLongFromObject(argv[2], &dbid2) != C_OK) return false; + if (dbid1 < 0 || dbid1 >= server.dbnum) return false; + if (dbid2 < 0 || dbid2 >= server.dbnum) return false; + if (dbid1 == dbid2) return false; + + *id1_p = (int)dbid1; + *id2_p = (int)dbid2; + return true; +} + +static bool selectGetParams(robj **argv, int argc, client *permission_client, int *dbid_p) { + UNUSED(permission_client); + int dbid; + if (argc != 2) return false; + if (getIntFromObject(argv[1], &dbid) != C_OK) return false; + if (dbid < 0 || dbid >= server.dbnum) return false; + + *dbid_p = dbid; + return true; +} + +static bool getDbIdFromRobj(robj *obj, int *db_id) { + if ((getIntFromObject(obj, db_id) != C_OK) || (*db_id < 0) || (*db_id >= server.dbnum)) { + return false; + } + return true; +} + +bool 
getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) { + const int copy_command_optional_arg_start_index = 3; + + *target_dbid = selected_dbid; + + for (int j = copy_command_optional_arg_start_index; j < argc; j++) { + if (!strcasecmp(objectGetVal(argv[j]), "replace")) { + continue; + } else if (!strcasecmp(objectGetVal(argv[j]), "db") && (argc > j + 1)) { + if (!getDbIdFromRobj(argv[j + 1], target_dbid)) { + return false; + } + j++; + } else { + return false; + } + } + return true; +} + +/*================================= Cleanup ================================== */ + +/** + * Clears all uncommitted DBs and keys that are properly acknowledged. + */ +void clearUncommittedKeysAcknowledged(void) { + if (!isPrimaryDurabilityEnabled()) { + return; + } + + durable_t *durability = &server.durability; + const int TIME_CHECK_INTERVAL = 100; + unsigned long long scan_count = 0; + + unsigned long long num_uncommitted_keys = getNumberOfUncommittedKeys(); + if (num_uncommitted_keys == 0) return; + + unsigned long long time_limit_ms = getUncommittedKeysCleanupTimeLimit(num_uncommitted_keys); + unsigned long long start_time_ms = mstime(); + unsigned long long next_time_check = TIME_CHECK_INTERVAL; + while (durability->curr_db_scan_idx < server.dbnum) { + serverDb *db = server.db[durability->curr_db_scan_idx]; + if (db != NULL) { + if (db->dirty_repl_offset <= server.durability.previous_acked_offset) { + db->dirty_repl_offset = -1; + } + + if (hashtableSize(db->uncommitted_keys) > 0) { + uncommittedKeyCleanupCtx ctx = { + .ht = db->uncommitted_keys, + .acked_offset = server.durability.previous_acked_offset, + .scan_count = &scan_count, + }; + + if (!db->scan_in_progress) { + db->uncommitted_keys_cursor = 0; + db->scan_in_progress = 1; + } + + do { + db->uncommitted_keys_cursor = + hashtableScan(db->uncommitted_keys, db->uncommitted_keys_cursor, uncommittedKeysCleanupScanCallback, &ctx); + + if (time_limit_ms > 0 && scan_count >= 
next_time_check) { + const unsigned long long cur_time_ms = mstime(); + if (cur_time_ms - start_time_ms > time_limit_ms) { + return; + } + next_time_check += TIME_CHECK_INTERVAL; + } + } while (db->uncommitted_keys_cursor != 0); + } + + if (db->scan_in_progress) { + db->scan_in_progress = 0; + } + } + durability->curr_db_scan_idx++; + } + + if (durability->curr_db_scan_idx == server.dbnum) { + durability->curr_db_scan_idx = 0; + } +} + +/** + * Initialize sync replication related fields for a database. + */ +void durabilityInitDatabase(serverDb *db) { + db->uncommitted_keys = hashtableCreate(&uncommittedKeysHashtableType); + db->dirty_repl_offset = -1; + db->uncommitted_keys_cursor = 0; + db->scan_in_progress = 0; +} + +/** + * Clear all uncommitted keys for each database. + */ +void clearAllUncommittedKeys(void) { + serverLog(LL_NOTICE, "Clearing all uncommitted keys for sync replication"); + for (int i = 0; i < server.dbnum; i++) { + serverDb *db = server.db[i]; + if (db == NULL) continue; + hashtableRelease(db->uncommitted_keys); + durabilityInitDatabase(db); + } + server.durability.curr_db_scan_idx = 0; +} + +/*================================= Access Validation ======================== */ + +/** + * Determines if a single command is trying to access an uncommitted key. + */ +int isSingleCommandAccessingUncommittedKeys(const serverDb *db, struct serverCommand *cmd, robj **argv, int argc) { + if (hashtableSize(db->uncommitted_keys) == 0) return 0; + + getKeysResult keysResult; + initGetKeysResult(&keysResult); + const int numKeys = getKeysFromCommand(cmd, argv, argc, &keysResult); + const keyReference *keys = keysResult.keys; + + for (int i = 0; i < numKeys; i++) { + const sds keyStr = objectGetVal(argv[keys[i].pos]); + if (hashtableFind(db->uncommitted_keys, keyStr, NULL)) { + getKeysFreeResult(&keysResult); + return 1; + } + } + + getKeysFreeResult(&keysResult); + return 0; +} + +/** + * Determine if there are uncommitted keys in the server. 
+ */ +int hasUncommittedKeys(void) { + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i] && (hashtableSize(server.db[i]->uncommitted_keys) > 0)) + return 1; + } + return 0; +} + +/** + * Determine if a client is trying to access uncommitted data. + */ +int isAccessingUncommittedData(client *c) { + // Informational command handling + if (IS_KEYSPACE_INFORMATIONAL(c->cmd) && (hasUncommittedKeys() || isDurableFunctionStoreUncommitted())) { + return 1; + } + + // Single command handling + if (isSingleCommandAccessingUncommittedKeys(c->db, c->cmd, c->argv, c->argc) || (isFunctionStoreRWCommand(c) && isDurableFunctionStoreUncommitted())) { + return 1; + } + + int ret_val = 0; + if ((c->flag.multi) && c->cmd->proc == execCommand) { + serverDb *cur_db = c->db; + for (int i = 0; i < c->mstate->count; i++) { + multiCmd mc = c->mstate->commands[i]; + if (mc.cmd->proc == selectCommand) { + int db_id; + if (selectGetParams(mc.argv, mc.argc, c, &db_id)) { + c->db = server.db[db_id]; + continue; + } else { + discardTransaction(c); + ret_val = 1; + break; + } + } + if (isSingleCommandAccessingUncommittedKeys(c->db, mc.cmd, mc.argv, mc.argc) || (isFunctionStoreRWCommand(c) && isDurableFunctionStoreUncommitted())) { + discardTransaction(c); + ret_val = 1; + break; + } + } + c->db = cur_db; + } + return ret_val; +} + +/** + * Checks if we should reject a command that is accessing uncommitted data. 
+ */ +bool shouldRejectCommandWithUncommittedData(client *c) { + if (c->cmd == NULL || ((c->cmd->flags & CMD_ADMIN)) || c->flag.primary) { + return false; + } + + if ((!iAmPrimary()) && isAccessingUncommittedData(c)) { + return true; + } + + return false; +} + +/*================================= Pending Data Processing ================== */ + +void uncommittedKeysInitPending(void) { + pending_uncommitted_keys = listCreate(); + listSetFreeMethod(pending_uncommitted_keys, pendingUncommittedKeyDestructor); + pending_uncommitted_dbs = listCreate(); + server.durability.all_dbs_dirty_in_current_cmd = false; +} + +void uncommittedKeysCleanupPending(void) { + if (pending_uncommitted_keys != NULL) { + listRelease(pending_uncommitted_keys); + pending_uncommitted_keys = NULL; + } + if (pending_uncommitted_dbs != NULL) { + listRelease(pending_uncommitted_dbs); + pending_uncommitted_dbs = NULL; + } +} + +/** + * Marks keys, databases, and the function store dirty at the current + * replication offset if they were updated during a transaction. 
+ */ +void processPendingUncommittedData(long long blocking_repl_offset) { + if (listLength(pending_uncommitted_keys) > 0) { + listIter li; + listNode *key_node; + listRewind(pending_uncommitted_keys, &li); + while ((key_node = listNext(&li)) != NULL) { + const pendingUncommittedKey *uk = listNodeValue(key_node); + addUncommittedKey(objectGetVal(uk->key), blocking_repl_offset, uk->uncommitted_keys); + listDelNode(pending_uncommitted_keys, key_node); + } + } + + if (server.durability.all_dbs_dirty_in_current_cmd) { + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i] != NULL) { + server.db[i]->dirty_repl_offset = blocking_repl_offset; + } + } + server.durability.all_dbs_dirty_in_current_cmd = false; + } else if (listLength(pending_uncommitted_dbs) > 0) { + listIter li; + listNode *db_node; + listRewind(pending_uncommitted_dbs, &li); + while ((db_node = listNext(&li)) != NULL) { + serverDb *db = listNodeValue(db_node); + db->dirty_repl_offset = blocking_repl_offset; + listDelNode(pending_uncommitted_dbs, db_node); + } + } + + serverAssert(listLength(pending_uncommitted_keys) == 0); + serverAssert(listLength(pending_uncommitted_dbs) == 0); + serverAssert(server.durability.all_dbs_dirty_in_current_cmd == false); + + updateFuncStoreBlockingOffsetForWrite(blocking_repl_offset); +} diff --git a/src/uncommitted_keys.h b/src/uncommitted_keys.h new file mode 100644 index 00000000000..c8bad24df73 --- /dev/null +++ b/src/uncommitted_keys.h @@ -0,0 +1,97 @@ +#ifndef UNCOMMITTED_KEYS_H +#define UNCOMMITTED_KEYS_H + +#include +#include "sds.h" + +struct client; +struct serverObject; +struct serverDb; +struct serverCommand; + +/* Note: robj is typedef'd in server.h as `typedef struct serverObject robj;` + * We use struct serverObject * in declarations here to avoid duplicate typedefs. */ + +/*================================= Uncommitted Key Tracking ================= */ + +/** + * Initialize durability-related fields for a database. 
+ */ +void durabilityInitDatabase(struct serverDb *db); + +/** + * Handle a dirty key for a given client. + * @param c The calling client. NULL if the key becomes dirty outside a client command (i.e. expiry/eviction) + * @param key The key object + * @param db The database + */ +void handleUncommittedKeyForClient(const struct client *c, struct serverObject *key, struct serverDb *db); + +/** + * Retrieve the uncommitted replication offset for a given key, purge the given + * key from uncommitted keys set if the replication offset has been committed. + * @return the ACK offset of the key if key is uncommitted, returns -1 otherwise. + */ +long long durabilityPurgeAndGetUncommittedKeyOffset(sds key, struct serverDb *db); + +/** + * Clears all uncommitted DBs and keys that are properly acknowledged by + * sufficient number of replicas. + */ +void clearUncommittedKeysAcknowledged(void); + +/** + * Clear all uncommitted keys for each database. + */ +void clearAllUncommittedKeys(void); + +/** + * Get the number of uncommitted keys across all databases. + */ +unsigned long long getNumberOfUncommittedKeys(void); + +/** + * Calculate cleanup time limit based on number of uncommitted keys. + */ +unsigned long long getUncommittedKeysCleanupTimeLimit(unsigned long long num_uncommitted_keys); + +/*================================= Database Modification Tracking =========== */ + +/** + * Handle database-level modification commands (FLUSHDB, FLUSHALL, SWAPDB). + */ +void handleDatabaseModification(struct client *c); + +/** + * Process pending uncommitted data (keys, databases, function store) + * after a transaction completes. + */ +void processPendingUncommittedData(long long blocking_repl_offset); + +/** + * Initialize the pending uncommitted data structures. + */ +void uncommittedKeysInitPending(void); + +/** + * Clean up the pending uncommitted data structures. 
+ */ +void uncommittedKeysCleanupPending(void); + +/*================================= Uncommitted Data Access Checks =========== */ + +/** + * Checks if we should reject a command that is accessing uncommitted data. + */ +bool shouldRejectCommandWithUncommittedData(struct client *c); + +/** + * Determine if there are uncommitted keys in the server. + */ +int hasUncommittedKeys(void); + +/*================================= Command parameter helpers ================ */ + +bool getTargetDbIdForCopyCommand(int argc, struct serverObject **argv, int selected_dbid, int *target_dbid); + +#endif /* UNCOMMITTED_KEYS_H */ From 72a31e2ce18e4904079b0190c5c999952d9ec04a Mon Sep 17 00:00:00 2001 From: jjuleslasarte Date: Wed, 18 Mar 2026 14:32:23 -0700 Subject: [PATCH 5/8] durability: add reply blocking and wire into server subsystems Add the core orchestration layer that blocks client responses in the client output buffer (COB) until durability providers confirm the write offset, then unblocks and flushes responses to clients. 
reply_blocking.c/h contains: - durabilityInit/Cleanup/Reset lifecycle management - beforeCommandTrackReplOffset/afterCommandTrackReplOffset for tracking which replication offsets each command produces - preCommandExec: rejects commands accessing uncommitted keys - postCommandExec: blocks client responses until providers acknowledge - notifyDurabilityProgress: called from beforeSleep to unblock clients whose offsets have been acknowledged - blockClientOnReplOffset/unblockResponsesWithAckOffset - Function store dirty tracking for FUNCTION LOAD/DELETE - INFO durability stats generation Integration points across the server: - server.c: init/cleanup in server lifecycle, pre/post command hooks in call() and processCommand(), notifyDurabilityProgress in beforeSleep, uncommitted keys cleanup in serverCron, per-DB init, INFO section - server.h: durable_t in server struct, clientDurabilityInfo in client, uncommitted_keys/dirty_repl_offset in serverDb, new client flag - config.c: 'durability' bool config with dynamic update callback - db.c: durabilitySignalModifiedKey/durabilitySignalFlushedDb hooks - networking.c: client durability init/reset, COB reply limiting - notify.c: defer keyspace notifications when durability is enabled - script.c/module.c: pre-script checks for uncommitted data access - replication.c: clear durability state on primary change - debug.c: durability-provider-pause/resume DEBUG subcommands - object.c: getIntFromObject utility Signed-off-by: jjuleslasarte --- src/config.c | 7 + src/db.c | 7 + src/debug.c | 13 + src/module.c | 11 + src/networking.c | 22 +- src/notify.c | 40 ++- src/object.c | 10 + src/replication.c | 2 + src/reply_blocking.c | 798 +++++++++++++++++++++++++++++++++++++++++++ src/reply_blocking.h | 225 ++++++++++++ src/script.c | 7 + src/server.c | 36 +- src/server.h | 24 +- 13 files changed, 1189 insertions(+), 13 deletions(-) create mode 100644 src/reply_blocking.c create mode 100644 src/reply_blocking.h diff --git a/src/config.c 
b/src/config.c index 93ef289e328..fcc3be8c095 100644 --- a/src/config.c +++ b/src/config.c @@ -2697,6 +2697,12 @@ int updateAppendFsync(const char **err) { return 1; } +static int updateDurabilityEnabled(const char **err) { + UNUSED(err); + durabilityReset(); + return 1; +} + /* applyBind affects both TCP and TLS (if enabled) together */ static int applyBind(const char **err) { connListener *tcp_listener = listenerByType(CONN_TYPE_SOCKET); @@ -3293,6 +3299,7 @@ standardConfig static_configs[] = { createBoolConfig("hide-user-data-from-log", NULL, MODIFIABLE_CONFIG, server.hide_user_data_from_log, 1, NULL, NULL), createBoolConfig("lua-enable-insecure-api", "lua-enable-deprecated-api", MODIFIABLE_CONFIG | HIDDEN_CONFIG | PROTECTED_CONFIG, server.lua_enable_insecure_api, 0, NULL, updateLuaEnableInsecureApi), createBoolConfig("import-mode", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.import_mode, 0, NULL, NULL), + createBoolConfig("durability", "sync-replication", MODIFIABLE_CONFIG, server.durability.enabled, 0, NULL, updateDurabilityEnabled), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL), diff --git a/src/db.c b/src/db.c index ba9d25c2fa6..be91f3dcdf7 100644 --- a/src/db.c +++ b/src/db.c @@ -752,6 +752,9 @@ long long dbTotalServerKeyCount(void) { * a context of a client. 
*/ void signalModifiedKey(client *c, serverDb *db, robj *key) { touchWatchedKey(db, key); + if (durabilitySignalModifiedKey(c, db, key)) { + return; + } trackingInvalidateKey(c, key, 1); } @@ -770,6 +773,10 @@ void signalFlushedDb(int dbid, int async) { touchAllWatchedKeysInDb(server.db[j], NULL); } + if (durabilitySignalFlushedDb(dbid)) { + return; + } + trackingInvalidateKeysOnFlush(async); /* Changes in this method may take place in swapMainDbWithTempDb as well, diff --git a/src/debug.c b/src/debug.c index bbb02dc2d25..55ea08d49ff 100644 --- a/src/debug.c +++ b/src/debug.c @@ -39,6 +39,7 @@ #include "io_threads.h" #include "sds.h" #include "module.h" +#include "durability_provider.h" #include #include @@ -1066,6 +1067,18 @@ void debugCommand(client *c) { } else if (!strcasecmp(objectGetVal(c->argv[1]), "client-enforce-reply-list") && c->argc == 3) { server.debug_client_enforce_reply_list = atoi(objectGetVal(c->argv[2])); addReply(c, shared.ok); + } else if (!strcasecmp(objectGetVal(c->argv[1]), "durability-provider-pause") && c->argc == 3) { + if (pauseDurabilityProvider(objectGetVal(c->argv[2]))) { + addReply(c, shared.ok); + } else { + addReplyError(c, "No such durability provider"); + } + } else if (!strcasecmp(objectGetVal(c->argv[1]), "durability-provider-resume") && c->argc == 3) { + if (resumeDurabilityProvider(objectGetVal(c->argv[2]))) { + addReply(c, shared.ok); + } else { + addReplyError(c, "No such durability provider"); + } } else if (!handleDebugClusterCommand(c)) { addReplySubcommandSyntaxError(c); return; diff --git a/src/module.c b/src/module.c index de5a5510e40..2ac1b30a785 100644 --- a/src/module.c +++ b/src/module.c @@ -56,6 +56,7 @@ * function names. For details, see the script src/modules/gendoc.rb. 
* -------------------------------------------------------------------------- */ +#include "reply_blocking.h" #include "server.h" #include "cluster.h" #include "commandlog.h" @@ -6807,6 +6808,16 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const if (!(flags & VALKEYMODULE_ARGV_NO_AOF)) call_flags |= CMD_CALL_PROPAGATE_AOF; if (!(flags & VALKEYMODULE_ARGV_NO_REPLICAS)) call_flags |= CMD_CALL_PROPAGATE_REPL; } + + // check if we need to reject the execution due to access to dirty data + char *pre_script_err = preScriptCmd(c); + if (pre_script_err != NULL) { + if (error_as_call_replies) { + reply_error_msg = sdsnew(pre_script_err); + } + goto cleanup; + } + call(c, call_flags); /* Propagate database changes from the temporary client back to the context client diff --git a/src/networking.c b/src/networking.c index ddc6137e0ad..80a0b74671e 100644 --- a/src/networking.c +++ b/src/networking.c @@ -27,6 +27,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "reply_blocking.h" #include "server.h" #include "cluster.h" #include "cluster_slot_stats.h" @@ -369,6 +370,11 @@ client *createClient(connection *conn) { c->io_last_written.buf = NULL; c->io_last_written.bufpos = 0; c->io_last_written.data_len = 0; + + // init durability info like + // key blocking on primary + durabilityClientInit(c); + return c; } @@ -1710,6 +1716,18 @@ void copyReplicaOutputBuffer(client *dst, client *src) { /* Return true if the specified client has pending reply buffers to write to * the socket. */ int clientHasPendingReplies(client *c) { + if (isClientReplyBufferLimited(c)) { + // Check if our first allowed reply boundary is in a position that comes + // after the current position that valkey has written up to in the COB. 
+ const blockedResponse *n = listNodeValue(listFirst(c->clientDurabilityInfo.blocked_responses)); + if ((c->bufpos && n->disallowed_reply_block == NULL) || + (c->bufpos == 0 && n->disallowed_reply_block != NULL && listFirst(c->reply) == n->disallowed_reply_block)) { + // Both positions are pointing both at the initial 16KB buffer or the + // first reply block, compare the sentlen with the last allowed byte offset + return c->io_last_written.data_len < n->disallowed_byte_offset; + } + } + if (getClientType(c) == CLIENT_TYPE_REPLICA) { /* Replicas use global shared replication buffer instead of * private output buffer. */ @@ -1909,6 +1927,8 @@ void unlinkClient(client *c) { /* Wait for IO operations to be done before unlinking the client. */ waitForClientIO(c); + durabilityClientReset(c); + /* If this is marked as current client unset it. */ if (c->conn && server.current_client == c) server.current_client = NULL; @@ -3201,7 +3221,7 @@ int handleClientsWithPendingWrites(void) { /* Adjust the number of I/O threads based on the number of pending writes this is required in case pending_writes > * poll_events (for example in pubsub) */ - adjustIOThreadsByEventLoad(pending_writes, 1); + adjustIOThreadsByEventLoad(pending_writes, 1, 0); listIter li; listNode *ln; diff --git a/src/notify.c b/src/notify.c index d48c515b9da..636ef1e4107 100644 --- a/src/notify.c +++ b/src/notify.c @@ -27,6 +27,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "durable_task.h" +#include "reply_blocking.h" #include "server.h" #include "module.h" @@ -117,16 +119,38 @@ void notifyKeyspaceEvent(int type, char *event, robj *key, int dbid) { c->flag.keyspace_notified == 1 || c->id == UINT64_MAX || // AOF client getClientType(c) != CLIENT_TYPE_NORMAL); - /* If any modules are interested in events, notify the module system now. - * This bypasses the notifications configuration, but the module engine - * will only call event subscribers if the event type matches the types - * they are interested in. 
*/ - moduleNotifyKeyspaceEvent(type, event, key, dbid); - if (c) { - c->flag.keyspace_notified = 1; - commitDeferredReplyBuffer(c, 1); + + if (!(type & NOTIFY_IN_DURABLE_TASK)) { + if (isPrimaryDurabilityEnabled()) { + bool shouldSendDelayedNotificationToClients = (server.notify_keyspace_events & type); + + /* Defer client notifications until durability providers acknowledge the write. */ + if (shouldSendDelayedNotificationToClients) { + type = type | NOTIFY_IN_DURABLE_TASK; + /* Register deferred task, executed when offset is acknowledged + * by durability providers */ + durabilityRegisterDeferredTask( + DURABLE_KEYSPACE_NOTIFY_TASK, + (void *)(long)type, + (void *)event, + (void *)key, + (void *)(long)dbid); + } + + // At this point (ZDL branch), we have notified modules, or queued a task. For clients, + // there is never a direct notification (either queue the notification or nothing). + return; + } + moduleNotifyKeyspaceEvent(type, event, key, dbid); + } else { + if (c) { + c->flag.keyspace_notified = 1; + commitDeferredReplyBuffer(c, 1); + } } + type = type & ~NOTIFY_IN_DURABLE_TASK; + /* If notifications for this class of events are off, return ASAP. 
*/ if (!(server.notify_keyspace_events & type)) return; diff --git a/src/object.c b/src/object.c index 8d757b1c44b..8de29cbe38a 100644 --- a/src/object.c +++ b/src/object.c @@ -1160,6 +1160,16 @@ int getPositiveLongFromObjectOrReply(client *c, robj *o, long *target, const cha } } +int getIntFromObject(robj *o, int *target) { + long long value; + + if (getLongLongFromObject(o, &value) != C_OK) return C_ERR; + if (value < INT_MIN || value > INT_MAX) return C_ERR; + + *target = value; + return C_OK; +} + int getIntFromObjectOrReply(client *c, robj *o, int *target, const char *msg) { long value; diff --git a/src/replication.c b/src/replication.c index dd02c9a2814..aa44a5c7898 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1472,6 +1472,7 @@ void replconfCommand(client *c) { if (c->repl_data->repl_state == REPLICA_STATE_BG_RDB_LOAD) { replicaPutOnline(c); } + /* Note: this command does not reply anything! */ return; } else if (!strcasecmp(objectGetVal(c->argv[j]), "getack")) { @@ -4435,6 +4436,7 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required, bool disc freeClient(server.primary); } + durabilityClearPrimaryState(); /* Setting primary_host only after the call to freeClient since it calls * replicationHandlePrimaryDisconnection which can trigger a re-connect * directly from within that call. 
*/ diff --git a/src/reply_blocking.c b/src/reply_blocking.c new file mode 100644 index 00000000000..c9d92ed7a32 --- /dev/null +++ b/src/reply_blocking.c @@ -0,0 +1,798 @@ +#include "reply_blocking.h" +#include "durable_task.h" +#include "durability_provider.h" +#include "uncommitted_keys.h" +#include "expire.h" +#include "server.h" +#include "zmalloc.h" +#include "script.h" +#include + +/* Forward declarations from module.h to avoid pulling in full module internals + * which has header dependency issues when included before server.h */ +int moduleClientIsBlockedOnKeys(client *c); + +/*============================ Internal prototypes ========================= */ +static void resetPreExecutionOffset(struct client *c); +static void trackCommandPreExecutionPosition(struct client *c); +static int unblockClientWaitingAck(struct client *c); +static bool clientEligibleForResponseTracking(client *c); +static void unblockFirstResponse(const struct client *c); +static int isBlockingNeededForOffset(const struct client *c, long long offset); +static void blockClientAndMonitorsOnReplOffset(struct client *c, long long blockingReplOffset); +static long long getSingleCommandBlockingOffsetForReplicatingCommand(client *c); +static long long getSingleCommandBlockingOffsetForNonReplicatingCommand(client *c); +static long long getSingleCommandBlockingOffsetForConsistentWrites(client *c); +static void durabilityResetPrimaryState(bool is_free_clients_needed); + + +/*================================= Utility functions ======================== */ + +/** + * Utility function to determine whether the durability flag has been enabled. + */ +int isDurabilityEnabled(void) { + return server.durability.enabled; +} + +/** + * Utility function to determine whether durability is enabled on a primary node. 
+ */ +int isPrimaryDurabilityEnabled(void) { + return isDurabilityEnabled() && iAmPrimary(); +} + +/*================================= Client management ======================== */ + +/** + * Reset the pre-execution offset fields. + */ +static void resetPreExecutionOffset(struct client *c) { + c->clientDurabilityInfo.offset.recorded = false; + c->clientDurabilityInfo.offset.reply_block = NULL; + c->clientDurabilityInfo.offset.byte_offset = 0; +} + + +/** + * Track the pre-execution position in the client reply COB. + */ +static void trackCommandPreExecutionPosition(struct client *c) { + resetPreExecutionOffset(c); + list *reply = c->reply; + int bufpos = c->bufpos; + + if (reply != NULL && listLength(reply) > 0) { + listNode *last_reply_block = listLast(reply); + c->clientDurabilityInfo.offset.reply_block = last_reply_block; + c->clientDurabilityInfo.offset.byte_offset = ((clientReplyBlock *)listNodeValue(last_reply_block))->used; + } else if (bufpos > 0) { + c->clientDurabilityInfo.offset.reply_block = NULL; + c->clientDurabilityInfo.offset.byte_offset = bufpos; + } + c->clientDurabilityInfo.offset.recorded = true; +} + +/** + * If the client is currently waiting for durability acknowledgement, + * mark it unblocked and reset the client flags. + */ +static int unblockClientWaitingAck(struct client *c) { + if (c->clientDurabilityInfo.durability_blocked) { + listNode *node = listSearchKey(server.durability.clients_waiting_ack, c); + if (node != NULL) { + listDelNode(server.durability.clients_waiting_ack, node); + c->clientDurabilityInfo.durability_blocked = 0; + return 1; + } + } + return 0; +} + +/** + * Initialize the durability client attributes when client is created. 
+ */ +void durabilityClientInit(client *c) { + if (!isDurabilityEnabled()) { + return; + } + if (c->clientDurabilityInfo.blocked_responses == NULL) { + c->clientDurabilityInfo.blocked_responses = listCreate(); + listSetFreeMethod(c->clientDurabilityInfo.blocked_responses, zfree); + resetPreExecutionOffset(c); + c->clientDurabilityInfo.current_command_repl_offset = -1; + c->clientDurabilityInfo.module_cmd_blocking_offset = -1; + c->clientDurabilityInfo.pending_notify_tasks = listCreate(); + } +} + +/** + * Reset the client durability attributes during a client clean-up. + */ +void durabilityClientReset(client *c) { + if (unblockClientWaitingAck(c)) { + server.durability.clients_disconnected_before_unblocking++; + } + + if (c->clientDurabilityInfo.blocked_responses != NULL) { + listRelease(c->clientDurabilityInfo.blocked_responses); + c->clientDurabilityInfo.blocked_responses = NULL; + } + + if (c->clientDurabilityInfo.pending_notify_tasks != NULL) { + durableTaskNotifyClientDestroy(c->clientDurabilityInfo.pending_notify_tasks); + listRelease(c->clientDurabilityInfo.pending_notify_tasks); + c->clientDurabilityInfo.pending_notify_tasks = NULL; + } + + resetPreExecutionOffset(c); + c->clientDurabilityInfo.current_command_repl_offset = -1; + c->clientDurabilityInfo.module_cmd_blocking_offset = -1; +} + +/** + * Determines if a client is doing a transaction or not. + */ +static bool isClientDoingTransaction(client *c) { + return c->cmd->proc == execCommand || IS_SCRIPT_CALL_CMD(c->cmd); +} + +/** + * Returns true if the client is eligible for keyspace tracking on a primary node. 
+ */ +static bool clientEligibleForResponseTracking(client *c) { + serverAssert(iAmPrimary()); + + if (c->cmd == NULL) return false; + + bool is_keyspace_informational_cmd = IS_KEYSPACE_INFORMATIONAL(c->cmd); + + if ((c->cmd->flags & CMD_ADMIN) && !(c->cmd->flags & CMD_WRITE) && !is_keyspace_informational_cmd) { + return false; + } + + return ((c->cmd->flags & (CMD_WRITE | CMD_READONLY)) || isClientDoingTransaction(c) || is_keyspace_informational_cmd || isFunctionStoreRWCommand(c)); +} + +/** + * Check if we only allow client to receive up to a certain + * position in the client reply buffer. + */ +inline bool isClientReplyBufferLimited(client *c) { + return c->clientDurabilityInfo.blocked_responses != NULL && + listLength(c->clientDurabilityInfo.blocked_responses) > 0; +} + +/*================================= Response blocking ======================= */ + +/** + * Store the metrics for a command when blocking + * @param c The client that issued the command. + * @param br The Node at which commands are blocked. + */ +static inline void initCmdMetrics(const client *c, struct blockedResponse *br) { + if (!c->cmd) { + // If client command is NULL, eg Monitor clients, we do not start the timer + // because we do not emit metrics for this response. + br->blocked_command_timer = 0; + return; + } + + elapsedStart(&br->blocked_command_timer); + // For end-to-end latency measurement + + if (c->clientDurabilityInfo.durability_flags & DURABILITY_CLIENT_LAST_CMD_WRITE) { + server.durability.write_responses_blocked++; + br->cmd_type = DURABLE_BLOCKED_CMD_WRITE; + } else if (c->clientDurabilityInfo.durability_flags & DURABILITY_CLIENT_LAST_CMD_READONLY) { + server.durability.read_responses_blocked++; + br->cmd_type = DURABLE_BLOCKED_CMD_READ; + } else { + server.durability.other_responses_blocked++; + br->cmd_type = DURABLE_BLOCKED_CMD_OTHER; + } +} + + +/** + * Block the last response if it exists in the client output buffer. 
+ */ +static void blockLastResponseIfExist(const client *c, const long long blocked_offset) { + serverAssert(c->clientDurabilityInfo.offset.recorded); + + bool has_new_response = false; + listNode *disallowed_reply_block = + c->clientDurabilityInfo.offset.reply_block; + size_t disallowed_byte_offset = + c->clientDurabilityInfo.offset.byte_offset; + + if (disallowed_reply_block == NULL) { + if ((size_t)c->bufpos > disallowed_byte_offset) { + has_new_response = true; + } else if (listLength(c->reply) > 0) { + has_new_response = true; + disallowed_byte_offset = 0; + disallowed_reply_block = listFirst(c->reply); + } + } else { + const clientReplyBlock *last_reply_block = listNodeValue(disallowed_reply_block); + if (last_reply_block->used > disallowed_byte_offset) { + has_new_response = true; + } else if (disallowed_reply_block->next != NULL) { + has_new_response = true; + disallowed_byte_offset = 0; + disallowed_reply_block = disallowed_reply_block->next; + } + } + + if (has_new_response) { + blockedResponse *new_block = zcalloc(sizeof(blockedResponse)); + new_block->primary_repl_offset = blocked_offset; + new_block->disallowed_byte_offset = disallowed_byte_offset; + new_block->disallowed_reply_block = disallowed_reply_block; + initCmdMetrics(c, new_block); + listAddNodeTail(c->clientDurabilityInfo.blocked_responses, new_block); + } +} + + +/** + * Process the metrics of all commands blocked at a BlockedResponse while unblocking + * @param br The Node at which commands are blocked. 
+ */ +static inline void processCmdMetrics(struct blockedResponse *br) { + if (br->blocked_command_timer == 0) return; // Do not count the response if timer is not started + + unsigned long long duration = elapsedUs(br->blocked_command_timer); + + if (br->cmd_type == DURABLE_BLOCKED_CMD_WRITE) { + server.durability.write_responses_blocked_cumulative_time_us += duration; + server.durability.write_responses_unblocked++; + } else if (br->cmd_type == DURABLE_BLOCKED_CMD_READ) { + server.durability.read_responses_blocked_cumulative_time_us += duration; + server.durability.read_responses_unblocked++; + } else { + server.durability.other_responses_blocked_cumulative_time_us += duration; + server.durability.other_responses_unblocked++; + } +} +/** + * Unblocks the first response in the client's blocked responses list. + */ +static void unblockFirstResponse(const client *c) { + serverAssert(c->clientDurabilityInfo.blocked_responses != NULL); + if (listLength(c->clientDurabilityInfo.blocked_responses) > 0) { + listNode *first = listFirst(c->clientDurabilityInfo.blocked_responses); + processCmdMetrics(listNodeValue(first)); + listDelNode(c->clientDurabilityInfo.blocked_responses, first); + } +} + +/** + * Determines if we need to block on a given replication offset for a given client. + */ +static int isBlockingNeededForOffset(const client *c, const long long offset) { + if (offset == -1 || anyDurabilityProviderEnabled() == 0) { + return 0; + } + + if (listLength(c->clientDurabilityInfo.blocked_responses) == 0) + return 1; + + listNode *last_response = listLast(c->clientDurabilityInfo.blocked_responses); + long long previous_offset = ((blockedResponse *)listNodeValue(last_response))->primary_repl_offset; + return previous_offset < offset; +} + +/** + * Block a given client on the specified replication offset if applicable. 
+ */ +void blockClientOnReplOffset(client *c, const long long blockingReplOffset) { + serverAssert(isPrimaryDurabilityEnabled()); + + if (isBlockingNeededForOffset(c, blockingReplOffset)) { + serverLog(LL_DEBUG, "client should be blocked at offset %lld, cmd=%s, is_write=%d", + blockingReplOffset, c->cmd->declared_name, (c->cmd->flags & CMD_WRITE) ? 1 : 0); + blockLastResponseIfExist(c, blockingReplOffset); + if (!c->clientDurabilityInfo.durability_blocked) { + listAddNodeTail(server.durability.clients_waiting_ack, c); + c->clientDurabilityInfo.durability_blocked = 1; + server.durability.clients_blocked++; + } + } + + resetPreExecutionOffset(c); +} + +/** + * Utility function to determine whether a command should be replicated to monitors. + */ +static inline int isCommandReplicatedToMonitors(void) { + return listLength(server.monitors) && !server.loading; +} + +/** + * Block a client and all connected MONITOR clients on the specified replication offset. + */ +static void blockClientAndMonitorsOnReplOffset(client *c, long long blockingReplOffset) { + blockClientOnReplOffset(c, blockingReplOffset); + + if (isCommandReplicatedToMonitors()) { + listNode *ln; + listIter li; + listRewind(server.monitors, &li); + while ((ln = listNext(&li))) { + client *monitor = ln->value; + blockClientOnReplOffset(monitor, blockingReplOffset); + } + } +} + +/*================================= Unblocking ============================== */ + +/** + * Unblock responses and tasks of all blocked clients with a given consensus acked offset. 
+ */ +void unblockResponsesWithAckOffset(const durable_t *durability, const long long consensus_ack_offset) { + serverLog(LL_DEBUG, "unblocking clients for consensus offset %lld,", consensus_ack_offset); + listIter li, li_response; + listNode *ln, *ln_response; + listRewind(durability->clients_waiting_ack, &li); + blockedResponse *br = NULL; + while ((ln = listNext(&li))) { + client *c = ln->value; + + serverAssert(c->clientDurabilityInfo.blocked_responses != NULL); + listRewind(c->clientDurabilityInfo.blocked_responses, &li_response); + bool unblocked_responses = false; + + while ((ln_response = listNext(&li_response))) { + br = listNodeValue(ln_response); + if (br->primary_repl_offset <= consensus_ack_offset) { + unblockFirstResponse(c); + if (unblocked_responses == false) { + unblocked_responses = true; + } + } else { + break; + } + } + if (listLength(c->clientDurabilityInfo.blocked_responses) == 0) { + if (unblockClientWaitingAck(c)) { + server.durability.clients_unblocked++; + } + } + if (unblocked_responses) { + putClientInPendingWriteQueue(c); + } + } + + executeDeferredTasksForAck(consensus_ack_offset); +} + +/*================================= Post-ack handlers ======================= */ + +void notifyDurabilityProgress(void) { + if (!isPrimaryDurabilityEnabled()) { + return; + } + + durable_t *durability = &server.durability; + const long long consensus_ack_offset = getDurabilityConsensusOffset(); + if (consensus_ack_offset <= durability->previous_acked_offset) { + return; + } + + durability->previous_acked_offset = consensus_ack_offset; + unblockResponsesWithAckOffset(durability, consensus_ack_offset); +} + +/*================================= Function Store Tracking ================== */ + +bool isFunctionRWCommand(client *c) { + return (c->argc > 0 && (!strcasecmp(objectGetVal(c->argv[0]), "FUNCTION"))) && !(c->argc > 1 && !strcasecmp(objectGetVal(c->argv[1]), "HELP")); +} + +bool isFunctionStoreRWCommand(client *c) { + return isFunctionRWCommand(c) || 
c->cmd->proc == fcallCommand || c->cmd->proc == fcallroCommand; +} + +bool isDurableFunctionStoreUncommitted(void) { + return server.durability.func_store_blocking_offset > server.durability.previous_acked_offset; +} + +void handleUncommittedFunctionStore(void) { + if (server.execution_nesting) { + server.durability.processed_func_write_in_transaction = true; + } else { + server.durability.func_store_blocking_offset = server.primary_repl_offset; + } +} + +long long getFuncStoreBlockingOffset(void) { + return server.durability.func_store_blocking_offset; +} + +void updateFuncStoreBlockingOffsetForWrite(long long blocking_repl_offset) { + if (server.durability.processed_func_write_in_transaction) { + server.durability.func_store_blocking_offset = blocking_repl_offset; + server.durability.processed_func_write_in_transaction = false; + } +} + +/*========================== Command offset calculation ===================== */ + +/** + * Process a single replicating command for consistent write blocking. 
+ */ +static long long getSingleCommandBlockingOffsetForReplicatingCommand(client *c) { + if (!(c->cmd->flags & CMD_WRITE)) { + return -1; + } + + if (isFunctionRWCommand(c)) { + handleUncommittedFunctionStore(); + } else { + getKeysResult result; + initGetKeysResult(&result); + int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + keyReference *keys = result.keys; + if (numkeys > 0) { + if (c->cmd->proc == moveCommand) { + int dest_dbid = -1; + if (getIntFromObject(c->argv[2], &dest_dbid) == C_ERR) { + getKeysFreeResult(&result); + return -1; + } + handleUncommittedKeyForClient(c, c->argv[keys[0].pos], server.db[dest_dbid]); + } else if (c->cmd->proc == copyCommand) { + int dest_dbid; + if (!getTargetDbIdForCopyCommand(c->argc, c->argv, c->db->id, &dest_dbid)) { + getKeysFreeResult(&result); + return -1; + } + if (dest_dbid != c->db->id) { + handleUncommittedKeyForClient(c, c->argv[2], server.db[dest_dbid]); + } + } + + for (int i = 0; i < numkeys; i++) { + handleUncommittedKeyForClient(c, c->argv[keys[i].pos], c->db); + } + } + getKeysFreeResult(&result); + } + + if (!server.execution_nesting) { + return server.primary_repl_offset; + } + + return -1; +} + +/** + * Process a single non-replicating command for consistent write blocking. 
+ */ +static long long getSingleCommandBlockingOffsetForNonReplicatingCommand(client *c) { + long long blocking_repl_offset = -1; + + if (isFunctionStoreRWCommand(c)) { + blocking_repl_offset = getFuncStoreBlockingOffset(); + } else if (IS_SCRIPT_CALL_READONLY_CMD(c->cmd)) { + return -1; + } else if ((c->cmd->flags & CMD_MODULE) && (c->clientDurabilityInfo.module_cmd_blocking_offset != -1)) { + blocking_repl_offset = c->clientDurabilityInfo.module_cmd_blocking_offset; + } else if (c->cmd->flags & (CMD_READONLY | CMD_WRITE)) { + blocking_repl_offset = c->db->dirty_repl_offset; + getKeysResult result; + initGetKeysResult(&result); + int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + keyReference *keys = result.keys; + + for (int i = 0; i < numkeys; i++) { + sds keystr = objectGetVal(c->argv[keys[i].pos]); + long long offset = durabilityPurgeAndGetUncommittedKeyOffset(keystr, c->db); + if (offset > blocking_repl_offset) { + blocking_repl_offset = offset; + } + } + getKeysFreeResult(&result); + } + + return blocking_repl_offset; +} + +/** + * Process a single command for consistent write blocking. 
+ */ +static long long getSingleCommandBlockingOffsetForConsistentWrites(struct client *c) { + serverAssert(isPrimaryDurabilityEnabled()); + + if (!anyDurabilityProviderEnabled()) + return -1; + + long long blocking_repl_offset = -1; + + // we can't trust keyspace info if we have any dirty data + if (IS_KEYSPACE_INFORMATIONAL(c->cmd) && + (listLength(server.durability.clients_waiting_ack) > 0 || hasUncommittedKeys() || isDurableFunctionStoreUncommitted())) { + blocking_repl_offset = server.primary_repl_offset; + } else if ((server.primary_repl_offset > server.durability.pre_call_replication_offset) || (server.also_propagate.numops > server.durability.pre_call_num_ops_pending_propagation)) { + blocking_repl_offset = getSingleCommandBlockingOffsetForReplicatingCommand(c); + } else { + blocking_repl_offset = getSingleCommandBlockingOffsetForNonReplicatingCommand(c); + } + + if (blocking_repl_offset <= server.durability.previous_acked_offset) { + blocking_repl_offset = -1; + } + + return blocking_repl_offset; +} + +static void durabilitySetClientCmdFlags(client *c) { + // Transaction wrapper commands, e.g., eval, exec, fcall, should not interfere with the + // final classification of the transaction itself as read or write. Rather the commands + // executed inside the transaction will define if it is read or write or none. + if (isClientDoingTransaction(c)) return; + if (c->cmd->flags & CMD_WRITE) + c->clientDurabilityInfo.durability_flags |= DURABILITY_CLIENT_LAST_CMD_WRITE; + else if (c->cmd->flags & CMD_READONLY) + c->clientDurabilityInfo.durability_flags |= DURABILITY_CLIENT_LAST_CMD_READONLY; +} + +/*=========================== Command hook functions ======================= */ + +/** + * Record the starting replication offset of the command about to be executed. 
+ */ +void beforeCommandTrackReplOffset(struct client *c) { + if (!isPrimaryDurabilityEnabled()) return; + + durabilitySetClientCmdFlags(c); + + + server.durability.pre_call_replication_offset = server.primary_repl_offset; + server.durability.pre_call_num_ops_pending_propagation = server.also_propagate.numops; +} + +static bool isClientBlockedByModule(struct client *c) { + return c->flag.blocked && + c->bstate && + c->bstate->btype == BLOCKED_MODULE && + !moduleClientIsBlockedOnKeys(c); +} + +/** + * After processing a command, track the replication offset and update + * the blocking offset for the command block. + */ +void afterCommandTrackReplOffset(client *c) { + serverLog(LL_DEBUG, "afterCommandTrackReplOffset entered for command '%s'", c->cmd->declared_name); + if (!isPrimaryDurabilityEnabled() || (c->flag.blocked && !isClientBlockedByModule(c))) + return; + + long long current_cmd_blocking_offset = getSingleCommandBlockingOffsetForConsistentWrites(c); + + client *tracking_client = server.current_client ? server.current_client : c; + + if (current_cmd_blocking_offset > tracking_client->clientDurabilityInfo.current_command_repl_offset) { + tracking_client->clientDurabilityInfo.current_command_repl_offset = current_cmd_blocking_offset; + } + + handleDatabaseModification(c); +} + +char *preScriptCmd(client *c) { + if (!isDurabilityEnabled()) { + return NULL; + } + + if (shouldRejectCommandWithUncommittedData(c)) { + return DURABILITY_DATA_UNAVAILABLE; + } + + return NULL; +} + +/** + * Perform pre-processing before command execution for a given client. 
+ */ +int preCommandExec(client *c) { + c->clientDurabilityInfo.current_command_repl_offset = -1; + c->clientDurabilityInfo.module_cmd_blocking_offset = -1; + + if (shouldRejectCommandWithUncommittedData(c)) { + serverAssert(!(c->cmd->flags & CMD_WRITE)); + flagTransaction(c); + addReplyError(c, DURABILITY_DATA_UNAVAILABLE); + return CMD_FILTER_REJECT; + } + + if (iAmPrimary() && clientEligibleForResponseTracking(c)) { + trackCommandPreExecutionPosition(c); + + if (isCommandReplicatedToMonitors()) { + listNode *ln; + listIter li; + listRewind(server.monitors, &li); + while ((ln = listNext(&li))) { + client *monitor = ln->value; + trackCommandPreExecutionPosition(monitor); + } + } + } + + server.durability.pre_command_replication_offset = server.primary_repl_offset; + return CMD_FILTER_ALLOW; +} + +/** + * Perform post-processing after command execution for a given client. + */ +void postCommandExec(client *c) { + if (!isPrimaryDurabilityEnabled() || c->cmd == NULL || c->flag.multi) { + return; + } + + long long blocking_repl_offset = c->clientDurabilityInfo.current_command_repl_offset; + + if (server.primary_repl_offset > server.durability.pre_command_replication_offset && (c->cmd->flags & CMD_WRITE || isClientDoingTransaction(c)) && c->cmd->proc != syncCommand && c->cmd->proc != clusterCommand && c->cmd->proc != shutdownCommand) { + blocking_repl_offset = server.primary_repl_offset; + } + + if (blocking_repl_offset > server.durability.pre_command_replication_offset) { + serverAssert(clientEligibleForResponseTracking(c)); + } + + processPendingUncommittedData(server.primary_repl_offset); + + blockClientAndMonitorsOnReplOffset(c, blocking_repl_offset); + + certifyPendingDeferredTasks(); +} + +/*================================= Lifecycle =============================== */ + +/** + * Initialize the durability subsystem. 
+ */ +void durabilityInit(void) { + serverLog(LL_DEBUG, "Initializing durability subsystem"); + + /* Initialize uncommitted keys pending data */ + uncommittedKeysInitPending(); + + /* Have to init the handlers before using them. */ + initTaskTypes(); + server.durability.previous_acked_offset = -1; + server.durability.curr_db_scan_idx = 0; + server.durability.clients_waiting_ack = listCreate(); + durableTaskInitLists(); + server.durability.clients_blocked = 0; + server.durability.clients_unblocked = 0; + server.durability.clients_disconnected_before_unblocking = 0; + server.durability.read_responses_blocked = 0; + server.durability.write_responses_blocked = 0; + server.durability.other_responses_blocked = 0; + server.durability.read_responses_unblocked = 0; + server.durability.write_responses_unblocked = 0; + server.durability.other_responses_unblocked = 0; + server.durability.read_responses_blocked_cumulative_time_us = 0; + server.durability.write_responses_blocked_cumulative_time_us = 0; + server.durability.other_responses_blocked_cumulative_time_us = 0; + + /* Initialize function store blocking state */ + server.durability.all_dbs_dirty_in_current_cmd = false; + server.durability.func_store_blocking_offset = -1; + server.durability.processed_func_write_in_transaction = false; + + /* Register built-in durability providers (AOF) */ + registerBuiltinDurabilityProviders(); +} + +/** + * Clean up the durability subsystem on server shutdown. + */ +void durabilityCleanup(void) { + if (server.durability.clients_waiting_ack != NULL) { + listRelease(server.durability.clients_waiting_ack); + server.durability.clients_waiting_ack = NULL; + } + + uncommittedKeysCleanupPending(); + + /* Cleanup deferred tasks waiting for durability ack */ + durableTaskCleanupLists(); + + /* Reset the durability provider registry so it can be re-initialized */ + resetDurabilityProviders(); + + clearAllUncommittedKeys(); +} + +/** + * Disconnect and free clients waiting for durability ack. 
+ */ +static void freeClientsWaitingAck(const durable_t *durability) { + listIter li; + listNode *ln; + listRewind(durability->clients_waiting_ack, &li); + while ((ln = listNext(&li))) { + client *c = listNodeValue(ln); + freeClient(c); + } + listEmpty(durability->clients_waiting_ack); +} + +/** + * Reset primary state for the durability subsystem. + */ +static void durabilityResetPrimaryState(bool is_free_clients_needed) { + if (listLength(server.durability.clients_waiting_ack) > 0) { + if (is_free_clients_needed) { + freeClientsWaitingAck(&server.durability); + } else { + unblockResponsesWithAckOffset(&server.durability, LLONG_MAX); + } + serverAssert(listLength(server.durability.clients_waiting_ack) == 0); + } + durableTaskEmptyLists(); +} + +/** + * Clear the durability attributes specific to the primary. + * Invoked when a primary node becomes a replica. + */ +void durabilityClearPrimaryState(void) { + if (!isDurabilityEnabled()) return; + durabilityResetPrimaryState(true); +} + +/** + * Generate INFO string for durability stats. + */ +sds genDurabilityInfoString(sds info) { + if (!isDurabilityEnabled()) { + info = sdscatprintf(info, "durability_enabled:0\r\n"); + return info; + } + + info = sdscatprintf(info, + "durability_enabled:1\r\n" + "durability_read_blocked_count:%lld\r\n" + "durability_write_blocked_count:%lld\r\n" + "durability_clients_waiting_ack:%lu\r\n" + "durability_uncommitted_keys:%llu\r\n" + "durability_previous_acked_offset:%lld\r\n" + "durability_primary_repl_offset:%lld\r\n", + server.durability.read_responses_blocked, + server.durability.write_responses_blocked, + listLength(server.durability.clients_waiting_ack), + getNumberOfUncommittedKeys(), + server.durability.previous_acked_offset, + server.primary_repl_offset); + + return info; +} + +/** + * Reset related resources when enabling/disabling durability. 
+ */ +void durabilityReset(void) { + if (isDurabilityEnabled()) { + server.durability.pre_command_replication_offset = server.primary_repl_offset; + listIter li; + listNode *ln; + listRewind(server.clients, &li); + while ((ln = listNext(&li)) != NULL) { + client *c = listNodeValue(ln); + durabilityClientInit(c); + } + } else { + if (iAmPrimary()) { + durabilityResetPrimaryState(false); + } + clearAllUncommittedKeys(); + } +} diff --git a/src/reply_blocking.h b/src/reply_blocking.h new file mode 100644 index 00000000000..d4b94c2728e --- /dev/null +++ b/src/reply_blocking.h @@ -0,0 +1,225 @@ +#ifndef REPLY_BLOCKING_H +#define REPLY_BLOCKING_H + +#include +#include +#include +#include "expire.h" +#include "monotonic.h" +#include "sds.h" +#include "durability_provider.h" +#include "uncommitted_keys.h" +#include "durable_task.h" + +#define DURABILITY_DATA_UNAVAILABLE "Accessed data unavailable to be served" +/* Command filter codes that are used in pre execution stage of a command. */ +#define CMD_FILTER_ALLOW 0 +#define CMD_FILTER_REJECT 1 +// Returns true if the cmd is a script command that never replicates. +#define IS_SCRIPT_CALL_READONLY_CMD(cmd) ((cmd) && (((cmd)->proc == fcallroCommand) || ((cmd)->proc == evalRoCommand) || ((cmd)->proc == evalShaRoCommand))) + +// Returns true if the cmd is a script command +// (EVAL/EVAL_RO/EVALSHA/EVALSHA_RO/FCALL/FCALL_RO). +#define IS_SCRIPT_CALL_CMD(cmd) ((cmd) && (((cmd)->proc == fcallCommand) || ((cmd)->proc == fcallroCommand) || ((cmd)->proc == evalCommand) || ((cmd)->proc == evalRoCommand) || ((cmd)->proc == evalShaCommand) || ((cmd)->proc == evalShaRoCommand))) + +// Returns true if the cmd is a keyspace informational command — a command that is +// related to the keyspace (ACL_CATEGORY_KEYSPACE) but does not mutate it (not CMD_WRITE). +// These commands provide information about the keyspace and need to be tracked for +// durability response blocking even when they are admin or non-read/non-write commands. 
+#define IS_KEYSPACE_INFORMATIONAL(cmd) ((cmd) && ((cmd)->acl_categories & ACL_CATEGORY_KEYSPACE) && !((cmd)->flags & CMD_WRITE)) + +/* Flags below help in correctly classifying transactions as + * either read/write commands or non-keyspace commands. */ +// Indicates the client's last command was a mutative command. +#define DURABILITY_CLIENT_LAST_CMD_WRITE (1ULL << 20) +// Indicates the client's last command was a read-only command. +#define DURABILITY_CLIENT_LAST_CMD_READONLY (1ULL << 21) + +struct client; +struct serverObject; +struct serverDb; +struct list; +struct listNode; + +typedef long long mstime_t; + +/* Indicate this type of notification is called inside of a durable task, + * which is used by the durability feature to defer notifications. */ +#define NOTIFY_IN_DURABLE_TASK (1 << 30) +/** + * Durability container to house all the durability related fields. + */ +typedef struct durable_t { + /* Flag to enable/disable durability (response blocking until providers ack) */ + int enabled; + /* Uncommitted keys cleanup configuration time limit in milliseconds */ + unsigned int keys_cleanup_time_limit_ms; + /* The current scanning database index, starting from 0 */ + int curr_db_scan_idx; + + /* Clients waiting for offset acknowledgement from durability providers */ + struct list *clients_waiting_ack; + + /* Deferred tasks waiting for offset acknowledgement from durability providers */ + struct list *tasks_waiting_ack[DURABLE_TASK_TYPE_MAX]; + + /* Pending lists of tasks waiting for durability ack. This list is populated + * when the current command is under execution but before we know about the + * updated primary_repl_offset. 
After the command execution completes, the + * server.primary_repl_offset would get incremented and we need to update + * this list and move all the pending tasks to the official + * tasks_waiting_ack list as part of the post-execution logic + */ + struct list *pending_tasks_waiting_ack[DURABLE_TASK_TYPE_MAX]; + + /* Previously acknowledged replication offset by durability providers */ + long long previous_acked_offset; + + /* Track the replication offset prior to executing a single command in call() */ + long long pre_call_replication_offset; + + /* Track the replication offset prior to executing a command block + including single command and multi-command transactions */ + long long pre_command_replication_offset; + + /* Track the number of commands awaiting propagation prior to executing a single command in call() */ + int pre_call_num_ops_pending_propagation; + + /* Counters for stats / info */ + + /* Counter of how many clients are blocked for durability */ + unsigned long long clients_blocked; + /* Counter of how many clients are unblocked for durability */ + unsigned long long clients_unblocked; + /* Counter of how many clients are disconnected before being unblocked for durability */ + unsigned long long clients_disconnected_before_unblocking; + /* Counter of how many responses are blocked/unblocked by type */ + unsigned long long read_responses_blocked; + unsigned long long write_responses_blocked; + unsigned long long other_responses_blocked; + unsigned long long read_responses_unblocked; + unsigned long long write_responses_unblocked; + unsigned long long other_responses_unblocked; + + /* Cumulative times for all the blocked responses */ + unsigned long long read_responses_blocked_cumulative_time_us; + unsigned long long write_responses_blocked_cumulative_time_us; + unsigned long long other_responses_blocked_cumulative_time_us; + + /* Tracks whether all databases were dirtied during the current command + * within a multi-command block (MULTI/EXEC or Lua 
script). */ + bool all_dbs_dirty_in_current_cmd; + + /* Function store blocking offset: tracks the replication offset at which + * the function store was last modified and needs durability acknowledgement. */ + long long func_store_blocking_offset; + + /* Flag indicating a function write occurred inside a transaction, so the + * blocking offset should be updated when the transaction completes. */ + bool processed_func_write_in_transaction; +} durable_t; + +/** + * Define the type of command being blocked + */ +typedef enum { + DURABLE_BLOCKED_CMD_OTHER = 0, + DURABLE_BLOCKED_CMD_WRITE, + DURABLE_BLOCKED_CMD_READ +} durableBlockedCmdType; + +// Blocked response structure used by client to mark +// the blocking information associated with each response +typedef struct blockedResponse { + // Pointer to the client's reply node where the blocked response starts. + // NULL if the blocked response starts from the 16KB initial buffer + struct listNode *disallowed_reply_block; + // The boundary in the reply buffer where the blocked response starts. + size_t disallowed_byte_offset; + // The replication offset to wait for acknowledgement from durability providers + long long primary_repl_offset; + + // Enum to store the type of blocked command + durableBlockedCmdType cmd_type; + // Timer for blocked command + monotime blocked_command_timer; +} blockedResponse; + +// Describes a pre-execution COB offset for a client +typedef struct preExecutionOffsetPosition { + // True if the pre execution offset/reply block are initialized + bool recorded; + // Track initial client COB position for client blocking + struct listNode *reply_block; + // Byte position boundary within the pre-execution reply block + size_t byte_offset; +} preExecutionOffsetPosition; + +typedef struct clientDurabilityInfo { + // Blocked client responses list for durability + struct list *blocked_responses; + + /* Pre-execution data recorded before a command is executed + * to record the boundaries of the COB. 
*/ + preExecutionOffsetPosition offset; + + // Replication offset to block this current command response + long long current_command_repl_offset; + + // The list of async notification tasks that reference this client + struct list *pending_notify_tasks; + + // This client is waiting for durability providers to acknowledge + // the write before its response can be sent. + uint64_t durability_blocked : 1; + // Modules can set the blocking offset for read cmds + long long module_cmd_blocking_offset; + + uint64_t durability_flags; +} clientDurableInfo; + +/** + * Init / Lifecycle + */ +void durabilityInit(void); +void durabilityCleanup(void); +void durabilityReset(void); +void durabilityClientInit(struct client *c); +void durabilityClientReset(struct client *c); +void durabilityClearPrimaryState(void); + +/** + * Command processing hooks for offset and COB tracking + */ +void beforeCommandTrackReplOffset(client *c); +void afterCommandTrackReplOffset(client *c); +int preCommandExec(client *c); +char *preScriptCmd(client *c); +void postCommandExec(client *c); +void notifyDurabilityProgress(void); + +/** + * Response blocking + */ +void blockClientOnReplOffset(client *c, long long blockingReplOffset); +void unblockResponsesWithAckOffset(const durable_t *durability, long long consensus_ack_offset); + +/** + * Utils + */ +int isPrimaryDurabilityEnabled(void); +int isDurabilityEnabled(void); +bool isClientReplyBufferLimited(client *c); +sds genDurabilityInfoString(sds info); + +/** + * Function store dirty tracking (durability blocking for function store writes) + */ +bool isFunctionRWCommand(struct client *c); +bool isFunctionStoreRWCommand(struct client *c); +bool isDurableFunctionStoreUncommitted(void); +void handleUncommittedFunctionStore(void); +void updateFuncStoreBlockingOffsetForWrite(long long blocking_repl_offset); +long long getFuncStoreBlockingOffset(void); + +#endif /* REPLY_BLOCKING_H */ diff --git a/src/script.c b/src/script.c index 156fb12cf2b..a3781302d00 
100644 --- a/src/script.c +++ b/src/script.c @@ -202,6 +202,13 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, return C_ERR; } + // check if sync replication would want to stop the execution. + const char *pre_script_err = preScriptCmd(caller); + if (pre_script_err != NULL) { + addReplyError(caller, pre_script_err); + return C_ERR; + } + } else { /* Special handling for backwards compatibility (no shebang eval[sha]) mode */ if (running_stale) { diff --git a/src/server.c b/src/server.c index 69d0cd27ab0..fb5085d8c3b 100644 --- a/src/server.c +++ b/src/server.c @@ -1713,6 +1713,7 @@ long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientDa run_with_period(100) modulesCron(); } + run_with_period(1000) clearUncommittedKeysAcknowledged(); /* Fire the cron loop modules event. */ ValkeyModuleCronLoopV1 ei = {VALKEYMODULE_CRON_LOOP_VERSION, server.hz}; moduleFireServerEvent(VALKEYMODULE_EVENT_CRON_LOOP, 0, &ei); @@ -1828,7 +1829,9 @@ void beforeSleep(struct aeEventLoop *eventLoop) { processed += processIOThreadsReadDone(); processed += connTypeProcessPendingData(); if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE) flushAppendOnlyFile(0); - processed += handleClientsWithPendingWrites(); + if (!(server.aof_fsync == AOF_FSYNC_ALWAYS && aofIOFlushInProgress())) { + processed += handleClientsWithPendingWrites(); + } int last_processed = 0; do { /* Try to process all the pending IO events. */ @@ -1935,6 +1938,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { * wake them up ASAP. */ if (listLength(server.clients_waiting_acks) && prev_fsynced_reploff != server.fsynced_reploff) dont_sleep = 1; } + notifyDurabilityProgress(); /* Handle writes with pending output buffers. 
*/ int client_writes = handleClientsWithPendingWrites(); @@ -2048,7 +2052,11 @@ void afterSleep(struct aeEventLoop *eventLoop, int numevents) { server.cmd_time_snapshot = server.mstime; } - adjustIOThreadsByEventLoad(numevents, 0); + /* Check if AOF always-fsync needs IO threads for background work */ + int aof_needs_io = (server.aof_state != AOF_OFF && + server.aof_fsync == AOF_FSYNC_ALWAYS && + sdslen(server.aof_buf) > 0); + adjustIOThreadsByEventLoad(numevents, 0, aof_needs_io); } /* =========================== Server initialization ======================== */ @@ -2297,6 +2305,9 @@ void initServerConfig(void) { server.aof_flush_sleep = 0; server.aof_last_fsync = time(NULL) * 1000; server.aof_cur_timestamp = 0; + atomic_store_explicit(&server.aof_io_flush_state, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_errno, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, 0, memory_order_relaxed); atomic_store_explicit(&server.aof_bio_fsync_status, C_OK, memory_order_relaxed); server.aof_rewrite_time_last = -1; server.aof_rewrite_time_start = -1; @@ -2868,6 +2879,8 @@ serverDb *createDatabase(int id) { db->ready_keys = dictCreate(&objectKeyPointerValueDictType); db->watched_keys = dictCreate(&keylistDictType); db->id = id; + + durabilityInitDatabase(db); resetDbExpiryState(db); return db; } @@ -3096,6 +3109,8 @@ void initServer(void) { /* Initialize the EVAL scripting component. 
 */ evalInit(); + durabilityInit(); + applyWatchdogPeriod(); if (server.maxmemory_clients != 0) initServerClientMemUsageBuckets(); @@ -3833,6 +3848,7 @@ void call(client *c, int flags) { struct ClientFlags client_old_flags = c->flag; struct serverCommand *real_cmd = c->realcmd; + beforeCommandTrackReplOffset(c); client *prev_client = server.executing_client; server.executing_client = c; @@ -4030,7 +4046,9 @@ if (zmalloc_used > server.stat_peak_memory) server.stat_peak_memory = zmalloc_used; /* Do some maintenance job and cleanup */ + // TODO: could blocking postCall be moved into afterCommand? afterCommand(c); + afterCommandTrackReplOffset(c); /* Remember the replication offset of the client, right after its last * command that resulted in propagation. */ @@ -4564,8 +4582,12 @@ int processCommand(client *c) { queueMultiCommand(c, cmd_flags); addReply(c, shared.queued); } else { + if (preCommandExec(c) == CMD_FILTER_REJECT) { + return C_OK; + } int flags = CMD_CALL_FULL; call(c, flags); + postCommandExec(c); if (listLength(server.ready_keys) && !isInsideYieldingLongCommand()) handleClientsBlockedOnKeys(); } return C_OK; @@ -4852,6 +4874,9 @@ int finishShutdown(void) { /* Fire the shutdown modules event. */ moduleFireServerEvent(VALKEYMODULE_EVENT_SHUTDOWN, 0, NULL); + /* Cleanup durability tracking resources. */ + durabilityCleanup(); + /* Remove the pid file if possible and needed. 
*/ if (server.daemonize || server.pidfile) { serverLog(LL_NOTICE, "Removing the pid file."); @@ -6693,6 +6718,13 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "eventloop_cmd_per_cycle_max:%lld\r\n", server.el_cmd_cnt_max)); } + /* Sync replication / durability stats */ + if (all_sections || (dictFind(section_dict, "durability") != NULL)) { + if (sections++) info = sdscat(info, "\r\n"); + info = sdscatprintf(info, "# Durability\r\n"); + info = genDurabilityInfoString(info); + } + return info; } diff --git a/src/server.h b/src/server.h index b306db549e9..385716cc0b9 100644 --- a/src/server.h +++ b/src/server.h @@ -36,7 +36,9 @@ #include "rio.h" #include "commands.h" #include "allocator_defrag.h" +#include "reply_blocking.h" +#include #include #include #include @@ -946,6 +948,14 @@ typedef struct serverDb { long long avg_ttl; /* Average TTL, just for stats */ unsigned long cursor; /* Cursor of the active expire cycle. */ } expiry[ACTIVE_EXPIRY_TYPE_COUNT]; + + /* fields related to dirty key tracking + * for consistent writes with durability */ + hashtable *uncommitted_keys; /* Map of dirty keys to the offset required by replica acknowledgement */ + long long dirty_repl_offset; /* Replication offset for a dirty DB */ + size_t uncommitted_keys_cursor; /* Cursor for incremental cleanup scans */ + int scan_in_progress; /* Flag of showing whether db is in scan or not */ + rax *reply_duration; /* Radix tree tracking reply durations for durable blocked clients */ } serverDb; /* forward declaration for functions ctx */ @@ -1232,6 +1242,8 @@ typedef struct ClientFlags { or client::buf. */ uint64_t keyspace_notified : 1; /* Indicates that a keyspace notification was triggered during the execution of the current command. */ + uint64_t durable_blocked_client : 1; /* This is a durable blocked client that is waiting for the server to + * acknowledge the write of the command that caused it to be blocked. 
 */ } ClientFlags; typedef struct ClientPubSubData { @@ -1431,6 +1443,7 @@ typedef struct client { #ifdef LOG_REQ_RES clientReqResInfo reqres; #endif + struct clientDurabilityInfo clientDurabilityInfo; } client; /* Forward declaration */ @@ -1442,7 +1455,7 @@ bool isImportSlotMigrationJob(slotMigrationJob *job); * The function will return one of the following: * CLIENT_TYPE_NORMAL -> Normal client, including MONITOR * CLIENT_TYPE_REPLICA -> replica - * CLIENT_TYPE_PUBSUB -> Client subscribed to Pub/Sub channels + * CLIENT_TYPE_PUBSUB -> Client subscribed to Pub/Sub channels * CLIENT_TYPE_PRIMARY -> The client representing our replication primary. */ static inline int getClientType(client *c) { @@ -1455,6 +1468,7 @@ static inline int getClientType(client *c) { return CLIENT_TYPE_NORMAL; } + + /* When a command generates a lot of discrete elements to the client output buffer, it is much faster to * skip certain types of initialization. This type is used to indicate a client that has been initialized * and can be used with addWritePreparedReply* functions. A client can be cast into this type with @@ -1769,6 +1783,7 @@ typedef enum childInfoType { } childInfoType; struct valkeyServer { + durable_t durability; /* General */ pid_t pid; /* Main process pid. */ pthread_t main_thread_id; /* Main thread id */ @@ -2059,6 +2074,9 @@ struct valkeyServer { int aof_load_truncated; /* Don't stop on unexpected AOF EOF. */ int aof_use_rdb_preamble; /* Specify base AOF to use RDB encoding on AOF rewrites. */ int aof_rewrite_use_rdb_preamble; /* Base AOF to use RDB encoding on AOF rewrites start. */ + _Atomic int aof_io_flush_state; /* AOF always-fsync IO-thread flush state. */ + _Atomic int aof_io_flush_errno; /* Errno of AOF always-fsync IO-thread flush. */ + _Atomic off_t aof_io_flush_size; /* Bytes written by the last IO-thread flush. */ _Atomic(int) aof_bio_fsync_status; /* Status of AOF fsync in bio job. */ _Atomic(int) aof_bio_fsync_errno; /* Errno of AOF fsync in bio job. 
*/ aofManifest *aof_manifest; /* Used to track AOFs. */ @@ -2999,7 +3017,6 @@ size_t getClientOutputBufferMemoryUsage(client *c); size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage); int freeClientsInAsyncFreeQueue(void); int closeClientOnOutputBufferLimitReached(client *c, int async); -int getClientType(client *c); int getClientTypeByName(char *name); char *getClientTypeName(int client_class); void flushReplicasOutputBuffers(void); @@ -3047,6 +3064,8 @@ int processIOThreadsWriteDone(void); void releaseReplyReferences(client *c); void resetLastWrittenBuf(client *c); +int getIntFromObject(robj *o, int *target); + int parseExtendedCommandArgumentsOrReply(client *c, int command_type, int start_idx, int max_args, int *flags, int *unit, int *expire_idx, robj **expire, robj **compare_val); /* logreqres.c - logging of requests and responses */ @@ -3293,6 +3312,7 @@ void aofManifestFree(aofManifest *am); int aofDelHistoryFiles(void); int aofRewriteLimited(void); int rewriteSlotToAppendOnlyFileRio(rio *aof, int db_num, int hashslot, size_t *key_count); +int aofIOFlushInProgress(void); /* Child info */ void openChildInfoPipe(void); From 3253c5c8af5e8cfd2bb83169a4b22ff458f764a3 Mon Sep 17 00:00:00 2001 From: jjuleslasarte Date: Wed, 18 Mar 2026 14:32:34 -0700 Subject: [PATCH 6/8] build: add durability source files to Makefile and CMake Add reply_blocking.c, durable_task.c, durability_provider.c, and uncommitted_keys.c to the build system (both Makefile and CMake). Also fix a clang compatibility issue in unit test CMakeLists.txt: -fno-var-tracking-assignments is GCC-only, so guard it with a compiler ID check. 
Signed-off-by: jjuleslasarte --- cmake/Modules/SourceFiles.cmake | 4 ++++ src/Makefile | 4 ++++ src/unit/CMakeLists.txt | 5 ++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index 2845a350192..f9fd13d45a6 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -11,6 +11,10 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/ae.c ${CMAKE_SOURCE_DIR}/src/anet.c ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/reply_blocking.c + ${CMAKE_SOURCE_DIR}/src/durable_task.c + ${CMAKE_SOURCE_DIR}/src/durability_provider.c + ${CMAKE_SOURCE_DIR}/src/uncommitted_keys.c ${CMAKE_SOURCE_DIR}/src/hashtable.c ${CMAKE_SOURCE_DIR}/src/kvstore.c ${CMAKE_SOURCE_DIR}/src/sds.c diff --git a/src/Makefile b/src/Makefile index 73815ac2bcf..705cb7859b3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -524,6 +524,10 @@ ENGINE_SERVER_OBJ = \ rdma.o \ release.o \ replication.o \ + reply_blocking.o \ + durable_task.o \ + durability_provider.o \ + uncommitted_keys.o \ resp_parser.o \ rio.o \ script.o \ diff --git a/src/unit/CMakeLists.txt b/src/unit/CMakeLists.txt index 8a6156675c8..3e714cd0943 100644 --- a/src/unit/CMakeLists.txt +++ b/src/unit/CMakeLists.txt @@ -85,8 +85,11 @@ target_compile_options(valkey-unit-gtests PRIVATE -Og -g) target_compile_options(valkey-unit-gtests PRIVATE -Wno-deprecated-declarations -Wno-write-strings - -fno-var-tracking-assignments ) +# -fno-var-tracking-assignments is GCC-only, skip on clang +if (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(valkey-unit-gtests PRIVATE -fno-var-tracking-assignments) +endif() # Include directories for C++ compilation target_include_directories(valkey-unit-gtests PRIVATE From a76070b3825ffd0fabb3d79062a97420ec5cfeba Mon Sep 17 00:00:00 2001 From: jjuleslasarte Date: Wed, 18 Mar 2026 14:32:44 -0700 Subject: [PATCH 7/8] tests: add unit tests for durability reply blocking Add comprehensive 
gtest-based unit tests covering the reply blocking subsystem including: - Client output buffer blocking and unblocking mechanics - Offset tracking through command execution - Multi-command transaction (MULTI/EXEC) offset handling - Durability provider consensus calculations - Deferred task lifecycle (create, execute, cleanup) - Uncommitted key tracking and purging - Edge cases: client disconnection, provider pause/resume Signed-off-by: jjuleslasarte --- src/unit/test_entry.cpp | 2 +- src/unit/test_files.h | 344 ++++++++++++ src/unit/test_reply_blocking.cpp | 876 +++++++++++++++++++++++++++++++ 3 files changed, 1221 insertions(+), 1 deletion(-) create mode 100644 src/unit/test_files.h create mode 100644 src/unit/test_reply_blocking.cpp diff --git a/src/unit/test_entry.cpp b/src/unit/test_entry.cpp index 20c5575d2fa..57a1a7d8ed8 100644 --- a/src/unit/test_entry.cpp +++ b/src/unit/test_entry.cpp @@ -171,7 +171,7 @@ TEST_F(EntryTest, entryUpdate) { // Update the value so that memory usage is less than 3/4 of the current allocation size // Ensuring required_embedded_size < current_embedded_allocation_size * 3 / 4, which creates a new entry size_t current_embedded_allocation_size = entryMemUsage(e9); - sds value10 = sdsnew("xxxxxxxxxxxxxxxxxxxxx"); + sds value10 = sdsnew("xxxxxx"); sds value_copy10 = sdsdup(value10); long long expiry10 = expiry9; entry *e10 = entryUpdate(e9, value10, expiry10); diff --git a/src/unit/test_files.h b/src/unit/test_files.h new file mode 100644 index 00000000000..30042dbf4e3 --- /dev/null +++ b/src/unit/test_files.h @@ -0,0 +1,344 @@ +/* Do not modify this file, it's automatically generated from utils/generate-unit-test-header.py */ +typedef int unitTestProc(int argc, char **argv, int flags); + +typedef struct unitTest { + char *name; + unitTestProc *proc; +} unitTest; + +int test_popcount(int argc, char **argv, int flags); +int test_crc64(int argc, char **argv, int flags); +int test_crc64combine(int argc, char **argv, int flags); +int 
test_dictCreate(int argc, char **argv, int flags); +int test_dictAdd16Keys(int argc, char **argv, int flags); +int test_dictDisableResize(int argc, char **argv, int flags); +int test_dictAddOneKeyTriggerResize(int argc, char **argv, int flags); +int test_dictDeleteKeys(int argc, char **argv, int flags); +int test_dictDeleteOneKeyTriggerResize(int argc, char **argv, int flags); +int test_dictEmptyDirAdd128Keys(int argc, char **argv, int flags); +int test_dictDisableResizeReduceTo3(int argc, char **argv, int flags); +int test_dictDeleteOneKeyTriggerResizeAgain(int argc, char **argv, int flags); +int test_dictBenchmark(int argc, char **argv, int flags); +int test_endianconv(int argc, char *argv[], int flags); +int test_entryCreate(int argc, char **argv, int flags); +int test_entryUpdate(int argc, char **argv, int flags); +int test_entryHasexpiry_entrySetExpiry(int argc, char **argv, int flags); +int test_entryIsExpired(int argc, char **argv, int flags); +int test_entryMemUsage_entrySetExpiry_entryUpdate(int argc, char **argv, int flags); +int test_entryStringRef(int argc, char **argv, int flags); +int test_fifoEmptyPop(int argc, char *argv[], int flags); +int test_fifoEmptyPeek(int argc, char *argv[], int flags); +int test_fifoSimplePushPop(int argc, char *argv[], int flags); +int test_fifoTryVariousSizes(int argc, char *argv[], int flags); +int test_fifoPushPopTest(int argc, char *argv[], int flags); +int test_fifoJoinTest(int argc, char *argv[], int flags); +int test_fifoComparePerformance(int argc, char *argv[], int flags); +int test_cursor(int argc, char **argv, int flags); +int test_set_hash_function_seed(int argc, char **argv, int flags); +int test_add_find_delete(int argc, char **argv, int flags); +int test_add_find_delete_avoid_resize(int argc, char **argv, int flags); +int test_instant_rehashing(int argc, char **argv, int flags); +int test_bucket_chain_length(int argc, char **argv, int flags); +int test_two_phase_insert_and_pop(int argc, char **argv, int 
flags); +int test_replace_reallocated_entry(int argc, char **argv, int flags); +int test_incremental_find(int argc, char **argv, int flags); +int test_scan(int argc, char **argv, int flags); +int test_iterator(int argc, char **argv, int flags); +int test_safe_iterator(int argc, char **argv, int flags); +int test_compact_bucket_chain(int argc, char **argv, int flags); +int test_random_entry(int argc, char **argv, int flags); +int test_random_entry_with_long_chain(int argc, char **argv, int flags); +int test_random_entry_sparse_table(int argc, char **argv, int flags); +int test_safe_iterator_invalidation(int argc, char **argv, int flags); +int test_safe_iterator_empty_no_invalidation(int argc, char **argv, int flags); +int test_safe_iterator_reset_invalidation(int argc, char **argv, int flags); +int test_safe_iterator_reset_untracking(int argc, char **argv, int flags); +int test_safe_iterator_pause_resume_tracking(int argc, char **argv, int flags); +int test_null_hashtable_iterator(int argc, char **argv, int flags); +int test_hashtable_retarget_iterator(int argc, char **argv, int flags); +int test_intsetValueEncodings(int argc, char **argv, int flags); +int test_intsetBasicAdding(int argc, char **argv, int flags); +int test_intsetLargeNumberRandomAdd(int argc, char **argv, int flags); +int test_intsetUpgradeFromint16Toint32(int argc, char **argv, int flags); +int test_intsetUpgradeFromint16Toint64(int argc, char **argv, int flags); +int test_intsetUpgradeFromint32Toint64(int argc, char **argv, int flags); +int test_intsetStressLookups(int argc, char **argv, int flags); +int test_intsetStressAddDelete(int argc, char **argv, int flags); +int test_kvstoreAdd16Keys(int argc, char **argv, int flags); +int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char 
**argv, int flags); +int test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreHashtableExpand(int argc, char **argv, int flags); +int test_listpackCreateIntList(int argc, char **argv, int flags); +int test_listpackCreateList(int argc, char **argv, int flags); +int test_listpackLpPrepend(int argc, char **argv, int flags); +int test_listpackLpPrependInteger(int argc, char **argv, int flags); +int test_listpackGetELementAtIndex(int argc, char **argv, int flags); +int test_listpackPop(int argc, char **argv, int flags); +int test_listpackGetELementAtIndex2(int argc, char **argv, int flags); +int test_listpackIterate0toEnd(int argc, char **argv, int flags); +int test_listpackIterate1toEnd(int argc, char **argv, int flags); +int test_listpackIterate2toEnd(int argc, char **argv, int flags); +int test_listpackIterateBackToFront(int argc, char **argv, int flags); +int test_listpackIterateBackToFrontWithDelete(int argc, char **argv, int flags); +int test_listpackDeleteWhenNumIsMinusOne(int argc, char **argv, int flags); +int test_listpackDeleteWithNegativeIndex(int argc, char **argv, int flags); +int test_listpackDeleteInclusiveRange0_0(int argc, char **argv, int flags); +int test_listpackDeleteInclusiveRange0_1(int argc, char **argv, int flags); +int test_listpackDeleteInclusiveRange1_2(int argc, char **argv, int flags); +int test_listpackDeleteWitStartIndexOutOfRange(int argc, char **argv, int flags); +int test_listpackDeleteWitNumOverflow(int argc, char **argv, int flags); +int test_listpackBatchDelete(int argc, char **argv, int flags); +int test_listpackDeleteFooWhileIterating(int argc, char **argv, int flags); +int test_listpackReplaceWithSameSize(int argc, char **argv, int flags); +int test_listpackReplaceWithDifferentSize(int argc, char **argv, int flags); +int test_listpackRegressionGt255Bytes(int argc, char **argv, int flags); +int test_listpackCreateLongListAndCheckIndices(int argc, char **argv, int 
flags); +int test_listpackCompareStrsWithLpEntries(int argc, char **argv, int flags); +int test_listpackLpMergeEmptyLps(int argc, char **argv, int flags); +int test_listpackLpMergeLp1Larger(int argc, char **argv, int flags); +int test_listpackLpMergeLp2Larger(int argc, char **argv, int flags); +int test_listpackLpNextRandom(int argc, char **argv, int flags); +int test_listpackLpNextRandomCC(int argc, char **argv, int flags); +int test_listpackRandomPairWithOneElement(int argc, char **argv, int flags); +int test_listpackRandomPairWithManyElements(int argc, char **argv, int flags); +int test_listpackRandomPairsWithOneElement(int argc, char **argv, int flags); +int test_listpackRandomPairsWithManyElements(int argc, char **argv, int flags); +int test_listpackRandomPairsUniqueWithOneElement(int argc, char **argv, int flags); +int test_listpackRandomPairsUniqueWithManyElements(int argc, char **argv, int flags); +int test_listpackPushVariousEncodings(int argc, char **argv, int flags); +int test_listpackLpFind(int argc, char **argv, int flags); +int test_listpackLpValidateIntegrity(int argc, char **argv, int flags); +int test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN(int argc, char **argv, int flags); +int test_listpackStressWithRandom(int argc, char **argv, int flags); +int test_listpackSTressWithVariableSize(int argc, char **argv, int flags); +int test_listpackBenchmarkInit(int argc, char *argv[], int flags); +int test_listpackBenchmarkLpAppend(int argc, char **argv, int flags); +int test_listpackBenchmarkLpFindString(int argc, char **argv, int flags); +int test_listpackBenchmarkLpFindNumber(int argc, char **argv, int flags); +int test_listpackBenchmarkLpSeek(int argc, char **argv, int flags); +int test_listpackBenchmarkLpValidateIntegrity(int argc, char **argv, int flags); +int test_listpackBenchmarkLpCompareWithString(int argc, char **argv, int flags); +int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); +int 
test_listpackBenchmarkFree(int argc, char **argv, int flags); +int test_mutexQueueSimplePushPop(int argc, char *argv[], int flags); +int test_mutexQueueDoublePushPop(int argc, char *argv[], int flags); +int test_mutexQueuePriorityOrdering(int argc, char *argv[], int flags); +int test_mutexQueueFifoPopAll(int argc, char *argv[], int flags); +int test_mutexQueueFifoAddMultiple(int argc, char *argv[], int flags); +int test_mutexQueueSimpleThread(int argc, char *argv[], int flags); +int test_mutexQueueParallelWriters(int argc, char *argv[], int flags); +int test_mutexQueueParallelReaders(int argc, char *argv[], int flags); +int test_mutexQueueParallelReadWrite(int argc, char *argv[], int flags); +int test_writeToReplica(int argc, char **argv, int flags); +int test_postWriteToReplica(int argc, char **argv, int flags); +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags); +int test_rewriteClientCommandArgument(int argc, char **argv, int flags); +int test_addRepliesWithOffloadsToBuffer(int argc, char **argv, int flags); +int test_addRepliesWithOffloadsToList(int argc, char **argv, int flags); +int test_addBufferToReplyIOV(int argc, char **argv, int flags); +int test_object_with_key(int argc, char **argv, int flags); +int test_embedded_string_with_key(int argc, char **argv, int flags); +int test_embedded_string_with_key_and_expire(int argc, char **argv, int flags); +int test_embedded_value(int argc, char **argv, int flags); +int test_unembed_value(int argc, char **argv, int flags); +int test_quicklistCreateList(int argc, char **argv, int flags); +int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); +int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags); +int test_quicklistAddToTail5xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToHead5xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToTail500xAtCompress(int argc, char **argv, int flags); +int 
test_quicklistAddToHead500xAtCompress(int argc, char **argv, int flags); +int test_quicklistRotateEmpty(int argc, char **argv, int flags); +int test_quicklistComprassionPlainNode(int argc, char **argv, int flags); +int test_quicklistNextPlainNode(int argc, char **argv, int flags); +int test_quicklistRotatePlainNode(int argc, char **argv, int flags); +int test_quicklistRotateOneValOnce(int argc, char **argv, int flags); +int test_quicklistRotate500Val5000TimesAtCompress(int argc, char **argv, int flags); +int test_quicklistPopEmpty(int argc, char **argv, int flags); +int test_quicklistPop1StringFrom1(int argc, char **argv, int flags); +int test_quicklistPopHead1NumberFrom1(int argc, char **argv, int flags); +int test_quicklistPopHead500From500(int argc, char **argv, int flags); +int test_quicklistPopHead5000From500(int argc, char **argv, int flags); +int test_quicklistIterateForwardOver500List(int argc, char **argv, int flags); +int test_quicklistIterateReverseOver500List(int argc, char **argv, int flags); +int test_quicklistInsertAfter1Element(int argc, char **argv, int flags); +int test_quicklistInsertBefore1Element(int argc, char **argv, int flags); +int test_quicklistInsertHeadWhileHeadNodeIsFull(int argc, char **argv, int flags); +int test_quicklistInsertTailWhileTailNodeIsFull(int argc, char **argv, int flags); +int test_quicklistInsertOnceInElementsWhileIteratingAtCompress(int argc, char **argv, int flags); +int test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags); +int test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags); +int test_quicklistDuplicateEmptyList(int argc, char **argv, int flags); +int test_quicklistDuplicateListOf1Element(int argc, char **argv, int flags); +int test_quicklistDuplicateListOf500(int argc, char **argv, int flags); +int test_quicklistIndex1200From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndex12From500ListAtFill(int 
argc, char **argv, int flags); +int test_quicklistIndex100From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndexTooBig1From50ListAtFill(int argc, char **argv, int flags); +int test_quicklistDeleteRangeEmptyList(int argc, char **argv, int flags); +int test_quicklistDeleteRangeOfEntireNodeInListOfOneNode(int argc, char **argv, int flags); +int test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts(int argc, char **argv, int flags); +int test_quicklistDeleteMiddle100Of500List(int argc, char **argv, int flags); +int test_quicklistDeleteLessThanFillButAcrossNodes(int argc, char **argv, int flags); +int test_quicklistDeleteNegative1From500List(int argc, char **argv, int flags); +int test_quicklistDeleteNegative1From500ListWithOverflowCounts(int argc, char **argv, int flags); +int test_quicklistDeleteNegative100From500List(int argc, char **argv, int flags); +int test_quicklistDelete10Count5From50List(int argc, char **argv, int flags); +int test_quicklistNumbersOnlyListRead(int argc, char **argv, int flags); +int test_quicklistNumbersLargerListRead(int argc, char **argv, int flags); +int test_quicklistNumbersLargerListReadB(int argc, char **argv, int flags); +int test_quicklistLremTestAtCompress(int argc, char **argv, int flags); +int test_quicklistIterateReverseDeleteAtCompress(int argc, char **argv, int flags); +int test_quicklistIteratorAtIndexTestAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestAAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestBAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestCAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestDAtCompress(int argc, char **argv, int flags); +int test_quicklistVerifySpecificCompressionOfInteriorNodes(int argc, char **argv, int flags); +int test_quicklistBookmarkGetUpdatedToNextItem(int argc, char **argv, int flags); +int test_quicklistBookmarkLimit(int argc, char **argv, int flags); +int 
test_quicklistCompressAndDecompressQuicklistListpackNode(int argc, char **argv, int flags); +int test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX(int argc, char **argv, int flags); +int test_raxRandomWalk(int argc, char **argv, int flags); +int test_raxIteratorUnitTests(int argc, char **argv, int flags); +int test_raxTryInsertUnitTests(int argc, char **argv, int flags); +int test_raxRegressionTest1(int argc, char **argv, int flags); +int test_raxRegressionTest2(int argc, char **argv, int flags); +int test_raxRegressionTest3(int argc, char **argv, int flags); +int test_raxRegressionTest4(int argc, char **argv, int flags); +int test_raxRegressionTest5(int argc, char **argv, int flags); +int test_raxRegressionTest6(int argc, char **argv, int flags); +int test_raxBenchmark(int argc, char **argv, int flags); +int test_raxHugeKey(int argc, char **argv, int flags); +int test_raxFuzz(int argc, char **argv, int flags); +int test_raxRecompressHugeKey(int argc, char **argv, int flags); +int test_sds(int argc, char **argv, int flags); +int test_typesAndAllocSize(int argc, char **argv, int flags); +int test_sdsHeaderSizes(int argc, char **argv, int flags); +int test_sdssplitargs(int argc, char **argv, int flags); +int test_sdsnsplitargs(int argc, char **argv, int flags); +int test_sdsnsplitargsBenchmark(int argc, char **argv, int flags); +int test_sha1(int argc, char **argv, int flags); +int test_sha256_abc(int argc, char **argv, int flags); +int test_sha256_large(int argc, char **argv, int flags); +int test_sha256_million_a(int argc, char **argv, int flags); +int test_string2ll(int argc, char **argv, int flags); +int test_string2l(int argc, char **argv, int flags); +int test_ll2string(int argc, char **argv, int flags); +int test_ld2string(int argc, char **argv, int flags); +int test_fixedpoint_d2string(int argc, char **argv, int flags); +int test_version2num(int argc, char **argv, int flags); +int test_reclaimFilePageCache(int argc, char **argv, int 
flags); +int test_writePointerWithPadding(int argc, char **argv, int flags); +int test_valkey_strtod(int argc, char **argv, int flags); +int test_vector(int argc, char **argv, int flags); +int test_vset_add_and_iterate(int argc, char **argv, int flags); +int test_vset_large_batch_same_expiry(int argc, char **argv, int flags); +int test_vset_large_batch_update_entry_same_expiry(int argc, char **argv, int flags); +int test_vset_large_batch_update_entry_multiple_expiries(int argc, char **argv, int flags); +int test_vset_iterate_multiple_expiries(int argc, char **argv, int flags); +int test_vset_add_and_remove_all(int argc, char **argv, int flags); +int test_vset_remove_expire_shrink(int argc, char **argv, int flags); +int test_vset_defrag(int argc, char **argv, int flags); +int test_vset_fuzzer(int argc, char **argv, int flags); +int test_ziplistCreateIntList(int argc, char **argv, int flags); +int test_ziplistPop(int argc, char **argv, int flags); +int test_ziplistGetElementAtIndex3(int argc, char **argv, int flags); +int test_ziplistGetElementOutOfRange(int argc, char **argv, int flags); +int test_ziplistGetLastElement(int argc, char **argv, int flags); +int test_ziplistGetFirstElement(int argc, char **argv, int flags); +int test_ziplistGetElementOutOfRangeReverse(int argc, char **argv, int flags); +int test_ziplistIterateThroughFullList(int argc, char **argv, int flags); +int test_ziplistIterateThroughListFrom1ToEnd(int argc, char **argv, int flags); +int test_ziplistIterateThroughListFrom2ToEnd(int argc, char **argv, int flags); +int test_ziplistIterateThroughStartOutOfRange(int argc, char **argv, int flags); +int test_ziplistIterateBackToFront(int argc, char **argv, int flags); +int test_ziplistIterateBackToFrontDeletingAllItems(int argc, char **argv, int flags); +int test_ziplistDeleteInclusiveRange0To0(int argc, char **argv, int flags); +int test_ziplistDeleteInclusiveRange0To1(int argc, char **argv, int flags); +int test_ziplistDeleteInclusiveRange1To2(int 
argc, char **argv, int flags); +int test_ziplistDeleteWithStartIndexOutOfRange(int argc, char **argv, int flags); +int test_ziplistDeleteWithNumOverflow(int argc, char **argv, int flags); +int test_ziplistDeleteFooWhileIterating(int argc, char **argv, int flags); +int test_ziplistReplaceWithSameSize(int argc, char **argv, int flags); +int test_ziplistReplaceWithDifferentSize(int argc, char **argv, int flags); +int test_ziplistRegressionTestForOver255ByteStrings(int argc, char **argv, int flags); +int test_ziplistRegressionTestDeleteNextToLastEntries(int argc, char **argv, int flags); +int test_ziplistCreateLongListAndCheckIndices(int argc, char **argv, int flags); +int test_ziplistCompareStringWithZiplistEntries(int argc, char **argv, int flags); +int test_ziplistMergeTest(int argc, char **argv, int flags); +int test_ziplistStressWithRandomPayloadsOfDifferentEncoding(int argc, char **argv, int flags); +int test_ziplistCascadeUpdateEdgeCases(int argc, char **argv, int flags); +int test_ziplistInsertEdgeCase(int argc, char **argv, int flags); +int test_ziplistStressWithVariableSize(int argc, char **argv, int flags); +int test_BenchmarkziplistFind(int argc, char **argv, int flags); +int test_BenchmarkziplistIndex(int argc, char **argv, int flags); +int test_BenchmarkziplistValidateIntegrity(int argc, char **argv, int flags); +int test_BenchmarkziplistCompareWithString(int argc, char **argv, int flags); +int test_BenchmarkziplistCompareWithNumber(int argc, char **argv, int flags); +int test_ziplistStress__ziplistCascadeUpdate(int argc, char **argv, int flags); +int test_zipmapIterateWithLargeKey(int argc, char *argv[], int flags); +int test_zipmapIterateThroughElements(int argc, char *argv[], int flags); +int test_zmallocAllocReallocCallocAndFree(int argc, char **argv, int flags); +int test_zmallocAllocZeroByteAndFree(int argc, char **argv, int flags); + +unitTest __test_bitops_c[] = {{"test_popcount", test_popcount}, {NULL, NULL}}; +unitTest __test_crc64_c[] = 
{{"test_crc64", test_crc64}, {NULL, NULL}}; +unitTest __test_crc64combine_c[] = {{"test_crc64combine", test_crc64combine}, {NULL, NULL}}; +unitTest __test_dict_c[] = {{"test_dictCreate", test_dictCreate}, {"test_dictAdd16Keys", test_dictAdd16Keys}, {"test_dictDisableResize", test_dictDisableResize}, {"test_dictAddOneKeyTriggerResize", test_dictAddOneKeyTriggerResize}, {"test_dictDeleteKeys", test_dictDeleteKeys}, {"test_dictDeleteOneKeyTriggerResize", test_dictDeleteOneKeyTriggerResize}, {"test_dictEmptyDirAdd128Keys", test_dictEmptyDirAdd128Keys}, {"test_dictDisableResizeReduceTo3", test_dictDisableResizeReduceTo3}, {"test_dictDeleteOneKeyTriggerResizeAgain", test_dictDeleteOneKeyTriggerResizeAgain}, {"test_dictBenchmark", test_dictBenchmark}, {NULL, NULL}}; +unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, NULL}}; +unitTest __test_entry_c[] = {{"test_entryCreate", test_entryCreate}, {"test_entryUpdate", test_entryUpdate}, {"test_entryHasexpiry_entrySetExpiry", test_entryHasexpiry_entrySetExpiry}, {"test_entryIsExpired", test_entryIsExpired}, {"test_entryMemUsage_entrySetExpiry_entryUpdate", test_entryMemUsage_entrySetExpiry_entryUpdate}, {"test_entryStringRef", test_entryStringRef}, {NULL, NULL}}; +unitTest __test_fifo_c[] = {{"test_fifoEmptyPop", test_fifoEmptyPop}, {"test_fifoEmptyPeek", test_fifoEmptyPeek}, {"test_fifoSimplePushPop", test_fifoSimplePushPop}, {"test_fifoTryVariousSizes", test_fifoTryVariousSizes}, {"test_fifoPushPopTest", test_fifoPushPopTest}, {"test_fifoJoinTest", test_fifoJoinTest}, {"test_fifoComparePerformance", test_fifoComparePerformance}, {NULL, NULL}}; +unitTest __test_hashtable_c[] = {{"test_cursor", test_cursor}, {"test_set_hash_function_seed", test_set_hash_function_seed}, {"test_add_find_delete", test_add_find_delete}, {"test_add_find_delete_avoid_resize", test_add_find_delete_avoid_resize}, {"test_instant_rehashing", test_instant_rehashing}, {"test_bucket_chain_length", test_bucket_chain_length}, 
{"test_two_phase_insert_and_pop", test_two_phase_insert_and_pop}, {"test_replace_reallocated_entry", test_replace_reallocated_entry}, {"test_incremental_find", test_incremental_find}, {"test_scan", test_scan}, {"test_iterator", test_iterator}, {"test_safe_iterator", test_safe_iterator}, {"test_compact_bucket_chain", test_compact_bucket_chain}, {"test_random_entry", test_random_entry}, {"test_random_entry_with_long_chain", test_random_entry_with_long_chain}, {"test_random_entry_sparse_table", test_random_entry_sparse_table}, {"test_safe_iterator_invalidation", test_safe_iterator_invalidation}, {"test_safe_iterator_empty_no_invalidation", test_safe_iterator_empty_no_invalidation}, {"test_safe_iterator_reset_invalidation", test_safe_iterator_reset_invalidation}, {"test_safe_iterator_reset_untracking", test_safe_iterator_reset_untracking}, {"test_safe_iterator_pause_resume_tracking", test_safe_iterator_pause_resume_tracking}, {"test_null_hashtable_iterator", test_null_hashtable_iterator}, {"test_hashtable_retarget_iterator", test_hashtable_retarget_iterator}, {NULL, NULL}}; +unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; +unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable}, 
{"test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable}, {"test_kvstoreHashtableExpand", test_kvstoreHashtableExpand}, {NULL, NULL}}; +unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, 
{"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, 
{"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; +unitTest __test_mutexqueue_c[] = {{"test_mutexQueueSimplePushPop", test_mutexQueueSimplePushPop}, {"test_mutexQueueDoublePushPop", test_mutexQueueDoublePushPop}, {"test_mutexQueuePriorityOrdering", test_mutexQueuePriorityOrdering}, {"test_mutexQueueFifoPopAll", test_mutexQueueFifoPopAll}, {"test_mutexQueueFifoAddMultiple", test_mutexQueueFifoAddMultiple}, {"test_mutexQueueSimpleThread", test_mutexQueueSimpleThread}, {"test_mutexQueueParallelWriters", test_mutexQueueParallelWriters}, {"test_mutexQueueParallelReaders", test_mutexQueueParallelReaders}, {"test_mutexQueueParallelReadWrite", test_mutexQueueParallelReadWrite}, {NULL, NULL}}; +unitTest __test_networking_c[] = {{"test_writeToReplica", test_writeToReplica}, {"test_postWriteToReplica", test_postWriteToReplica}, {"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {"test_addRepliesWithOffloadsToBuffer", test_addRepliesWithOffloadsToBuffer}, {"test_addRepliesWithOffloadsToList", test_addRepliesWithOffloadsToList}, {"test_addBufferToReplyIOV", test_addBufferToReplyIOV}, {NULL, NULL}}; +unitTest __test_object_c[] = {{"test_object_with_key", test_object_with_key}, {"test_embedded_string_with_key", test_embedded_string_with_key}, {"test_embedded_string_with_key_and_expire", test_embedded_string_with_key_and_expire}, {"test_embedded_value", test_embedded_value}, {"test_unembed_value", test_unembed_value}, {NULL, NULL}}; +unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, 
{"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, {"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, {"test_quicklistInsertAfter1Element", test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, {"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, 
{"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, {"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", test_quicklistDeleteNegative1From500ListWithOverflowCounts}, {"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", 
test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; +unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {"test_raxRecompressHugeKey", test_raxRecompressHugeKey}, {NULL, NULL}}; +unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {"test_sdsnsplitargs", test_sdsnsplitargs}, {"test_sdsnsplitargsBenchmark", test_sdsnsplitargsBenchmark}, {NULL, NULL}}; +unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}}; +unitTest __test_sha256_c[] = {{"test_sha256_abc", test_sha256_abc}, {"test_sha256_large", 
test_sha256_large}, {"test_sha256_million_a", test_sha256_million_a}, {NULL, NULL}}; +unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {"test_writePointerWithPadding", test_writePointerWithPadding}, {NULL, NULL}}; +unitTest __test_valkey_strtod_c[] = {{"test_valkey_strtod", test_valkey_strtod}, {NULL, NULL}}; +unitTest __test_vector_c[] = {{"test_vector", test_vector}, {NULL, NULL}}; +unitTest __test_vset_c[] = {{"test_vset_add_and_iterate", test_vset_add_and_iterate}, {"test_vset_large_batch_same_expiry", test_vset_large_batch_same_expiry}, {"test_vset_large_batch_update_entry_same_expiry", test_vset_large_batch_update_entry_same_expiry}, {"test_vset_large_batch_update_entry_multiple_expiries", test_vset_large_batch_update_entry_multiple_expiries}, {"test_vset_iterate_multiple_expiries", test_vset_iterate_multiple_expiries}, {"test_vset_add_and_remove_all", test_vset_add_and_remove_all}, {"test_vset_remove_expire_shrink", test_vset_remove_expire_shrink}, {"test_vset_defrag", test_vset_defrag}, {"test_vset_fuzzer", test_vset_fuzzer}, {NULL, NULL}}; +unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, 
{"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", 
test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}}; +unitTest __test_zipmap_c[] = {{"test_zipmapIterateWithLargeKey", test_zipmapIterateWithLargeKey}, {"test_zipmapIterateThroughElements", test_zipmapIterateThroughElements}, {NULL, NULL}}; +unitTest __test_zmalloc_c[] = {{"test_zmallocAllocReallocCallocAndFree", test_zmallocAllocReallocCallocAndFree}, {"test_zmallocAllocZeroByteAndFree", test_zmallocAllocZeroByteAndFree}, {NULL, NULL}}; + +struct unitTestSuite { + char *filename; + unitTest *tests; +} unitTestSuite[] = { + {"test_bitops.c", __test_bitops_c}, + {"test_crc64.c", __test_crc64_c}, + {"test_crc64combine.c", __test_crc64combine_c}, + {"test_dict.c", __test_dict_c}, + {"test_endianconv.c", __test_endianconv_c}, + {"test_entry.c", __test_entry_c}, + {"test_fifo.c", __test_fifo_c}, + {"test_hashtable.c", __test_hashtable_c}, + {"test_intset.c", __test_intset_c}, + {"test_kvstore.c", __test_kvstore_c}, + {"test_listpack.c", __test_listpack_c}, + {"test_mutexqueue.c", __test_mutexqueue_c}, + {"test_networking.c", __test_networking_c}, + {"test_object.c", __test_object_c}, + {"test_quicklist.c", __test_quicklist_c}, + {"test_rax.c", __test_rax_c}, + {"test_sds.c", __test_sds_c}, + {"test_sha1.c", __test_sha1_c}, + {"test_sha256.c", __test_sha256_c}, + {"test_util.c", __test_util_c}, + {"test_valkey_strtod.c", __test_valkey_strtod_c}, + {"test_vector.c", __test_vector_c}, + {"test_vset.c", __test_vset_c}, + {"test_ziplist.c", __test_ziplist_c}, + {"test_zipmap.c", __test_zipmap_c}, + {"test_zmalloc.c", __test_zmalloc_c}, +}; diff --git a/src/unit/test_reply_blocking.cpp b/src/unit/test_reply_blocking.cpp new file mode 100644 index 00000000000..66c15d011f5 --- /dev/null +++ b/src/unit/test_reply_blocking.cpp @@ -0,0 +1,876 @@ +/* + * Copyright (c) Valkey Contributors + * All rights 
reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include "generated_wrappers.hpp" + +#include +#include +#include +#include + +extern "C" { +#include "server.h" +#include "reply_blocking.h" +#include "durability_provider.h" +#include "uncommitted_keys.h" + +/* Forward declarations used by tests */ +} + +/* ========================= Test Helpers ========================= */ + +static void initTestEnv(void) { + static char test_logfile[] = ""; + if (server.logfile == nullptr) { + server.logfile = test_logfile; + } +} + +/** + * Minimal durability initialization for tests — avoids calling initTaskTypes() + * which is forward-declared but not yet defined. + */ +static void initDurabilityForTest(void) { + uncommittedKeysInitPending(); + initTaskTypes(); + server.durability.previous_acked_offset = -1; + server.durability.curr_db_scan_idx = 0; + server.durability.clients_waiting_ack = listCreate(); + durableTaskInitLists(); + server.durability.clients_blocked = 0; + server.durability.clients_unblocked = 0; + server.durability.clients_disconnected_before_unblocking = 0; + server.durability.read_responses_blocked = 0; + server.durability.write_responses_blocked = 0; + server.durability.other_responses_blocked = 0; + server.durability.read_responses_unblocked = 0; + server.durability.write_responses_unblocked = 0; + server.durability.other_responses_unblocked = 0; + server.durability.read_responses_blocked_cumulative_time_us = 0; + server.durability.write_responses_blocked_cumulative_time_us = 0; + server.durability.other_responses_blocked_cumulative_time_us = 0; + registerBuiltinDurabilityProviders(); +} + +/** + * Minimal durability cleanup for tests. 
+ */ +static void cleanupDurabilityForTest(void) { + if (server.durability.clients_waiting_ack) { + listRelease(server.durability.clients_waiting_ack); + server.durability.clients_waiting_ack = nullptr; + } + uncommittedKeysCleanupPending(); + durableTaskCleanupLists(); + resetDurabilityProviders(); +} + +/* ========================= Test Fixtures ========================= */ + +class SyncReplicationTest : public ::testing::Test { + protected: + void SetUp() override { + initTestEnv(); + } +}; + +class DurabilityProviderTest : public ::testing::Test { + protected: + /* Saved state */ + int old_aof_state; + int old_aof_fsync; + long long old_fsynced_reploff; + list *old_replicas; + list *old_clients_pending_write; + char *old_primary_host; + durable_t old_durability; + + void SetUp() override { + initTestEnv(); + old_aof_state = server.aof_state; + old_aof_fsync = server.aof_fsync; + old_fsynced_reploff = server.fsynced_reploff; + old_replicas = server.replicas; + old_clients_pending_write = server.clients_pending_write; + old_primary_host = server.primary_host; + old_durability = server.durability; + + server.primary_host = nullptr; + server.clients_pending_write = listCreate(); + server.replicas = listCreate(); + } + + void TearDown() override { + listRelease(server.clients_pending_write); + listRelease(server.replicas); + + server.aof_state = old_aof_state; + server.aof_fsync = old_aof_fsync; + server.fsynced_reploff = old_fsynced_reploff; + server.replicas = old_replicas; + server.clients_pending_write = old_clients_pending_write; + server.primary_host = old_primary_host; + server.durability = old_durability; + } +}; + +class UncommittedKeysTest : public ::testing::Test { + protected: + serverDb **old_db; + int old_dbnum; + char *old_primary_host; + int old_cluster_enabled; + long long old_previous_acked_offset; + long long old_primary_repl_offset; + + void SetUp() override { + initTestEnv(); + old_db = server.db; + old_dbnum = server.dbnum; + old_primary_host = 
server.primary_host; + old_cluster_enabled = server.cluster_enabled; + old_previous_acked_offset = server.durability.previous_acked_offset; + old_primary_repl_offset = server.primary_repl_offset; + + server.cluster_enabled = 0; + server.primary_host = nullptr; + server.dbnum = 1; + server.db = (serverDb **)zcalloc(sizeof(serverDb *)); + server.db[0] = (serverDb *)zcalloc(sizeof(serverDb)); + durabilityInitDatabase(server.db[0]); + } + + void TearDown() override { + hashtableRelease(server.db[0]->uncommitted_keys); + zfree(server.db[0]); + zfree(server.db); + + server.db = old_db; + server.dbnum = old_dbnum; + server.primary_host = old_primary_host; + server.cluster_enabled = old_cluster_enabled; + server.durability.previous_acked_offset = old_previous_acked_offset; + server.primary_repl_offset = old_primary_repl_offset; + } +}; + +/* ========================= Durability Tests ========================= */ + +TEST_F(SyncReplicationTest, IsDurabilityEnabled) { + server.durability.enabled = 0; + ASSERT_EQ(isDurabilityEnabled(), 0); + + server.durability.enabled = 1; + ASSERT_EQ(isDurabilityEnabled(), 1); + + server.durability.enabled = 0; +} + +TEST_F(SyncReplicationTest, IsPrimaryDurabilityEnabled) { + server.durability.enabled = 1; + + /* Primary (not a replica) */ + server.primary_host = nullptr; + ASSERT_EQ(isPrimaryDurabilityEnabled(), 1); + + /* Replica */ + server.primary_host = sdsnew("127.0.0.1"); + ASSERT_EQ(isPrimaryDurabilityEnabled(), 0); + sdsfree(server.primary_host); + + /* Disabled + primary */ + server.durability.enabled = 0; + server.primary_host = nullptr; + ASSERT_EQ(isPrimaryDurabilityEnabled(), 0); +} + +TEST_F(SyncReplicationTest, ClientInitAndReset) { + client *c = (client *)zcalloc(sizeof(client)); + c->clientDurabilityInfo.blocked_responses = nullptr; + c->clientDurabilityInfo.durability_blocked = 0; + c->clientDurabilityInfo.current_command_repl_offset = 0; + + /* Disabled — should be a no-op */ + server.durability.enabled = 0; + 
durabilityClientInit(c); + ASSERT_EQ(c->clientDurabilityInfo.blocked_responses, nullptr); + + /* Enabled — should initialize */ + server.durability.enabled = 1; + durabilityClientInit(c); + ASSERT_NE(c->clientDurabilityInfo.blocked_responses, nullptr); + ASSERT_EQ(listLength(c->clientDurabilityInfo.blocked_responses), 0u); + ASSERT_FALSE(c->clientDurabilityInfo.offset.recorded); + ASSERT_EQ(c->clientDurabilityInfo.offset.reply_block, nullptr); + ASSERT_EQ(c->clientDurabilityInfo.offset.byte_offset, 0u); + ASSERT_EQ(c->clientDurabilityInfo.current_command_repl_offset, -1); + + /* Reset — should free */ + durabilityClientReset(c); + ASSERT_EQ(c->clientDurabilityInfo.blocked_responses, nullptr); + ASSERT_FALSE(c->clientDurabilityInfo.offset.recorded); + ASSERT_EQ(c->clientDurabilityInfo.current_command_repl_offset, -1); + + server.durability.enabled = 0; + zfree(c); +} + +TEST_F(SyncReplicationTest, IsClientReplyBufferLimited) { + client *c = (client *)zcalloc(sizeof(client)); + + /* No blocked_responses list */ + c->clientDurabilityInfo.blocked_responses = nullptr; + ASSERT_FALSE(isClientReplyBufferLimited(c)); + + /* Empty blocked_responses list */ + c->clientDurabilityInfo.blocked_responses = listCreate(); + ASSERT_FALSE(isClientReplyBufferLimited(c)); + + /* Non-empty blocked_responses list */ + blockedResponse *br = (blockedResponse *)zcalloc(sizeof(blockedResponse)); + br->primary_repl_offset = 100; + br->disallowed_byte_offset = 0; + br->disallowed_reply_block = nullptr; + listAddNodeTail(c->clientDurabilityInfo.blocked_responses, br); + ASSERT_TRUE(isClientReplyBufferLimited(c)); + + listSetFreeMethod(c->clientDurabilityInfo.blocked_responses, zfree); + listRelease(c->clientDurabilityInfo.blocked_responses); + zfree(c); +} + +/* ========================= DurabilityProvider Tests ========================= */ + +TEST_F(DurabilityProviderTest, BuiltinAofProviderDisabledWhenAofOff) { + initDurabilityForTest(); + + server.aof_state = AOF_OFF; + server.aof_fsync = 
AOF_FSYNC_EVERYSEC; + ASSERT_FALSE(anyDurabilityProviderEnabled()); + + cleanupDurabilityForTest(); +} + +TEST_F(DurabilityProviderTest, AofProviderEnabledWhenAofOn) { + initDurabilityForTest(); + + /* AOF provider is enabled whenever AOF is on, regardless of fsync policy */ + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + ASSERT_TRUE(anyDurabilityProviderEnabled()); + + server.aof_fsync = AOF_FSYNC_EVERYSEC; + ASSERT_TRUE(anyDurabilityProviderEnabled()); + + server.aof_fsync = AOF_FSYNC_NO; + ASSERT_TRUE(anyDurabilityProviderEnabled()); + + cleanupDurabilityForTest(); +} + +TEST_F(DurabilityProviderTest, AofProviderPassThroughWhenNotAlwaysFsync) { + initDurabilityForTest(); + + /* When fsync != always, AOF provider returns primary_repl_offset (pass-through) */ + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_EVERYSEC; + server.primary_repl_offset = 500; + ASSERT_EQ(getDurabilityConsensusOffset(), 500); + + server.aof_fsync = AOF_FSYNC_NO; + server.primary_repl_offset = 700; + ASSERT_EQ(getDurabilityConsensusOffset(), 700); + + cleanupDurabilityForTest(); +} + +TEST_F(DurabilityProviderTest, AofProviderPauseAndResume) { + initDurabilityForTest(); + + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + server.primary_repl_offset = 300; + __atomic_store_n(&server.fsynced_reploff_pending, (long long)300, __ATOMIC_RELAXED); + server.fsynced_reploff = 300; + + /* Before pause: consensus = 300 (fsynced) */ + ASSERT_EQ(getDurabilityConsensusOffset(), 300); + + /* Pause: consensus should be frozen at 300 (the offset at pause time). + * New writes that advance primary_repl_offset past 300 will block, + * but already-acknowledged data remains unblocked. 
*/ + ASSERT_TRUE(pauseDurabilityProvider("aof")); + ASSERT_EQ(getDurabilityConsensusOffset(), 300); + + /* Advance primary_repl_offset — consensus stays frozen at 300 */ + server.primary_repl_offset = 500; + ASSERT_EQ(getDurabilityConsensusOffset(), 300); + + /* Resume: consensus should catch up to actual fsynced offset */ + server.durability.enabled = 1; + server.durability.previous_acked_offset = -1; + ASSERT_TRUE(resumeDurabilityProvider("aof")); + ASSERT_EQ(getDurabilityConsensusOffset(), 300); + + /* Nonexistent provider returns false */ + ASSERT_FALSE(pauseDurabilityProvider("nonexistent")); + ASSERT_FALSE(resumeDurabilityProvider("nonexistent")); + + cleanupDurabilityForTest(); +} + +/* Custom test provider */ +static bool testProviderEnabled = true; +static long long testProviderOffset = 50; +static bool testCustomIsEnabled(void) { return testProviderEnabled; } +static long long testCustomGetAckedOffset(void) { return testProviderOffset; } + +TEST_F(DurabilityProviderTest, CustomProviderRegistrationAndConsensus) { + initDurabilityForTest(); + + durabilityProvider customProvider = { + .name = "custom-test", + .isEnabled = testCustomIsEnabled, + .getAckedOffset = testCustomGetAckedOffset, + }; + + /* Enable AOF provider */ + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + __atomic_store_n(&server.fsynced_reploff_pending, (long long)300, __ATOMIC_RELAXED); + server.fsynced_reploff = 300; + + /* Register custom provider */ + testProviderEnabled = true; + testProviderOffset = 50; + registerDurabilityProvider(&customProvider); + ASSERT_TRUE(anyDurabilityProviderEnabled()); + + /* Consensus = MIN(aof=300, custom=50) = 50 */ + server.primary_repl_offset = 300; + ASSERT_EQ(getDurabilityConsensusOffset(), 50); + + /* Unregister */ + unregisterDurabilityProvider(&customProvider); + + cleanupDurabilityForTest(); +} + +TEST_F(DurabilityProviderTest, CustomProviderDisabledIsSkipped) { + initDurabilityForTest(); + + durabilityProvider customProvider = 
{ + .name = "custom-disabled", + .isEnabled = testCustomIsEnabled, + .getAckedOffset = testCustomGetAckedOffset, + }; + + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + __atomic_store_n(&server.fsynced_reploff_pending, (long long)200, __ATOMIC_RELAXED); + server.fsynced_reploff = 200; + + testProviderEnabled = false; + testProviderOffset = 10; + registerDurabilityProvider(&customProvider); + + /* Custom disabled, only AOF enabled => consensus = 200 */ + server.primary_repl_offset = 300; + ASSERT_EQ(getDurabilityConsensusOffset(), 200); + + unregisterDurabilityProvider(&customProvider); + cleanupDurabilityForTest(); +} + +static long long negativeOffsetProvider(void) { return -1; } +static bool alwaysEnabled(void) { return true; } + +TEST_F(DurabilityProviderTest, ProviderReturningNegativeOneBlocksConsensus) { + initDurabilityForTest(); + + durabilityProvider blockingProvider = { + .name = "blocking", + .isEnabled = alwaysEnabled, + .getAckedOffset = negativeOffsetProvider, + }; + + registerDurabilityProvider(&blockingProvider); + server.primary_repl_offset = 300; + ASSERT_EQ(getDurabilityConsensusOffset(), -1); + + unregisterDurabilityProvider(&blockingProvider); + cleanupDurabilityForTest(); +} + + +/* ========================= UncommittedKeys Tests ========================= */ + +TEST_F(UncommittedKeysTest, HandleAndPurgeUncommittedKey) { + robj *key_obj = createStringObject("key", 3); + sds key = (sds)objectGetVal(key_obj); + long long offset = 10; + server.primary_repl_offset = offset; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + + /* Key should be in uncommitted set */ + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Not yet acked — should return the offset */ + server.durability.previous_acked_offset = 5; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset(key, server.db[0]), offset); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Acked — should purge and return -1 */ + 
server.durability.previous_acked_offset = 10; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset(key, server.db[0]), -1); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); + + decrRefCount(key_obj); +} + +TEST_F(UncommittedKeysTest, MultipleKeysTracked) { + robj *k1 = createStringObject("key1", 4); + robj *k2 = createStringObject("key2", 4); + + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, k1, server.db[0]); + server.primary_repl_offset = 20; + handleUncommittedKeyForClient(nullptr, k2, server.db[0]); + + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 2u); + + /* Ack up to 10 — only key1 should be purged */ + server.durability.previous_acked_offset = 10; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(k1), server.db[0]), -1); + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(k2), server.db[0]), 20); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Ack up to 20 — key2 also purged */ + server.durability.previous_acked_offset = 20; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(k2), server.db[0]), -1); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); + + decrRefCount(k1); + decrRefCount(k2); +} + +TEST_F(UncommittedKeysTest, KeyOffsetUpdatedOnRewrite) { + robj *key_obj = createStringObject("key", 3); + + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Rewrite same key at higher offset */ + server.primary_repl_offset = 50; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Old offset acked but new offset not */ + server.durability.previous_acked_offset = 10; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(key_obj), server.db[0]), 50); + + decrRefCount(key_obj); +} + 
+TEST_F(UncommittedKeysTest, NonexistentKeyReturnsNegativeOne) { + sds missing = sdsnew("nonexistent"); + server.durability.previous_acked_offset = 0; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset(missing, server.db[0]), -1); + sdsfree(missing); +} + +TEST_F(UncommittedKeysTest, HasUncommittedKeysAcrossDBs) { + /* No uncommitted keys initially */ + ASSERT_EQ(hasUncommittedKeys(), 0); + + robj *key_obj = createStringObject("key", 3); + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + ASSERT_EQ(hasUncommittedKeys(), 1); + + /* Purge it */ + server.durability.previous_acked_offset = 10; + durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(key_obj), server.db[0]); + ASSERT_EQ(hasUncommittedKeys(), 0); + + decrRefCount(key_obj); +} + +TEST_F(UncommittedKeysTest, GetNumberOfUncommittedKeys) { + ASSERT_EQ(getNumberOfUncommittedKeys(), 0u); + + robj *k1 = createStringObject("a", 1); + robj *k2 = createStringObject("b", 1); + robj *k3 = createStringObject("c", 1); + + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, k1, server.db[0]); + handleUncommittedKeyForClient(nullptr, k2, server.db[0]); + handleUncommittedKeyForClient(nullptr, k3, server.db[0]); + + ASSERT_EQ(getNumberOfUncommittedKeys(), 3u); + + decrRefCount(k1); + decrRefCount(k2); + decrRefCount(k3); +} + +TEST_F(UncommittedKeysTest, CleanupTimeLimitScalesWithKeyCount) { + /* Set the cleanup time limit config (normally set by server init) */ + server.durability.keys_cleanup_time_limit_ms = 100; + + /* 0 keys => 1ms */ + ASSERT_EQ(getUncommittedKeysCleanupTimeLimit(0), 1u); + + /* Small count => small limit */ + unsigned long long small_limit = getUncommittedKeysCleanupTimeLimit(100); + ASSERT_GE(small_limit, 1u); + + /* Larger count => larger limit (monotonically increasing) */ + unsigned long long larger = getUncommittedKeysCleanupTimeLimit(500000); + ASSERT_GE(larger, small_limit); + + /* At 1 million keys, should hit 
the configured max */ + unsigned long long at_max = getUncommittedKeysCleanupTimeLimit(1000000); + ASSERT_EQ(at_max, 100u); +} + +/* ========================= Function Store Tests ========================= */ + +TEST_F(SyncReplicationTest, FunctionStoreUncommittedTracking) { + server.durability.previous_acked_offset = 0; + + /* Not uncommitted initially */ + ASSERT_FALSE(isDurableFunctionStoreUncommitted()); + + /* Mark uncommitted */ + server.execution_nesting = 0; + server.primary_repl_offset = 100; + handleUncommittedFunctionStore(); + ASSERT_TRUE(isDurableFunctionStoreUncommitted()); + ASSERT_EQ(getFuncStoreBlockingOffset(), 100); + + /* After acking, it should no longer be uncommitted */ + server.durability.previous_acked_offset = 100; + ASSERT_FALSE(isDurableFunctionStoreUncommitted()); +} + +/* ========================= INFO String Test ========================= */ + +TEST_F(SyncReplicationTest, GenInfoStringDisabled) { + server.durability.enabled = 0; + sds info = sdsempty(); + info = genDurabilityInfoString(info); + ASSERT_NE(strstr(info, "durability_enabled:0"), nullptr); + sdsfree(info); +} + +TEST_F(SyncReplicationTest, GenInfoStringEnabled) { + server.durability.enabled = 1; + server.durability.clients_waiting_ack = listCreate(); + server.durability.read_responses_blocked = 5; + server.durability.write_responses_blocked = 3; + server.durability.previous_acked_offset = 42; + server.primary_repl_offset = 100; + + sds info = sdsempty(); + info = genDurabilityInfoString(info); + ASSERT_NE(strstr(info, "durability_enabled:1"), nullptr); + ASSERT_NE(strstr(info, "durability_read_blocked_count:5"), nullptr); + ASSERT_NE(strstr(info, "durability_write_blocked_count:3"), nullptr); + ASSERT_NE(strstr(info, "durability_previous_acked_offset:42"), nullptr); + ASSERT_NE(strstr(info, "durability_primary_repl_offset:100"), nullptr); + + sdsfree(info); + listRelease(server.durability.clients_waiting_ack); + server.durability.clients_waiting_ack = nullptr; + 
server.durability.enabled = 0; +} + +/* ========================= Migrated from C tests ========================= */ + +/** + * Fixture for tests that need full durability init (durabilityInit) + * plus database and client setup. + */ +class FullDurabilityTest : public ::testing::Test { + protected: + serverDb **old_db; + int old_dbnum; + char *old_primary_host; + int old_cluster_enabled; + long long old_primary_repl_offset; + int old_get_ack; + list *old_replicas; + list *old_clients_pending_write; + int old_aof_state; + int old_aof_fsync; + long long old_fsynced_reploff; + durable_t old_durability; + list *old_monitors; + + void SetUp() override { + initTestEnv(); + old_db = server.db; + old_dbnum = server.dbnum; + old_primary_host = server.primary_host; + old_cluster_enabled = server.cluster_enabled; + old_primary_repl_offset = server.primary_repl_offset; + old_get_ack = server.get_ack_from_replicas; + old_replicas = server.replicas; + old_clients_pending_write = server.clients_pending_write; + old_aof_state = server.aof_state; + old_aof_fsync = server.aof_fsync; + old_fsynced_reploff = server.fsynced_reploff; + old_durability = server.durability; + old_monitors = server.monitors; + + server.cluster_enabled = 0; + server.primary_host = nullptr; + server.clients_pending_write = listCreate(); + server.monitors = listCreate(); + server.dbnum = 1; + server.db = (serverDb **)zcalloc(sizeof(serverDb *)); + server.db[0] = (serverDb *)zcalloc(sizeof(serverDb)); + durabilityInitDatabase(server.db[0]); + + server.durability.enabled = 1; + durabilityInit(); + } + + void TearDown() override { + durabilityCleanup(); + listRelease(server.clients_pending_write); + listRelease(server.monitors); + hashtableRelease(server.db[0]->uncommitted_keys); + zfree(server.db[0]); + zfree(server.db); + + server.db = old_db; + server.dbnum = old_dbnum; + server.primary_host = old_primary_host; + server.cluster_enabled = old_cluster_enabled; + server.primary_repl_offset = 
old_primary_repl_offset; + server.get_ack_from_replicas = old_get_ack; + server.replicas = old_replicas; + server.clients_pending_write = old_clients_pending_write; + server.aof_state = old_aof_state; + server.aof_fsync = old_aof_fsync; + server.fsynced_reploff = old_fsynced_reploff; + server.durability = old_durability; + server.monitors = old_monitors; + } +}; + +/* Migrated from test_durableInit */ +TEST_F(SyncReplicationTest, SyncReplicationInitSetsDefaults) { + /* initDurabilityForTest() approximates durabilityInit(); verify fields */ + initDurabilityForTest(); + + ASSERT_NE(server.durability.clients_waiting_ack, nullptr); + ASSERT_EQ(listLength(server.durability.clients_waiting_ack), 0u); + ASSERT_EQ(server.durability.previous_acked_offset, -1); + ASSERT_EQ(server.durability.curr_db_scan_idx, 0); + ASSERT_EQ(server.durability.clients_blocked, 0u); + ASSERT_EQ(server.durability.clients_unblocked, 0u); + ASSERT_EQ(server.durability.clients_disconnected_before_unblocking, 0u); + ASSERT_EQ(server.durability.read_responses_blocked, 0u); + ASSERT_EQ(server.durability.write_responses_blocked, 0u); + ASSERT_EQ(server.durability.other_responses_blocked, 0u); + + cleanupDurabilityForTest(); +} + +/* Migrated from test_beforeCommandTrackReplOffset */ +TEST_F(FullDurabilityTest, BeforeCommandTrackReplOffset) { + client *c = (client *)zcalloc(sizeof(client)); + durabilityClientInit(c); + + struct serverCommand readonly_cmd = {.declared_name = "get", .flags = CMD_READONLY}; + c->cmd = &readonly_cmd; + + server.primary_repl_offset = 500; + beforeCommandTrackReplOffset(c); + + /* pre_call_replication_offset should be snapshotted */ + ASSERT_EQ(server.durability.pre_call_replication_offset, 500); + + durabilityClientReset(c); + zfree(c); +} + +/* Migrated from test_preCommandExec — Case 1: durability disabled */ +TEST_F(SyncReplicationTest, PreCommandExecDurabilityDisabled) { + struct serverCommand readonly_cmd = {.declared_name = "get", .flags = CMD_READONLY}; + + /* 
preCommandExec always accesses server.monitors via isCommandReplicatedToMonitors() */ + list *old_monitors = server.monitors; + server.monitors = listCreate(); + + client *c = (client *)zcalloc(sizeof(client)); + c->cmd = &readonly_cmd; + c->clientDurabilityInfo.current_command_repl_offset = 123; + server.durability.enabled = 0; + server.primary_repl_offset = 555; + + ASSERT_EQ(preCommandExec(c), CMD_FILTER_ALLOW); + /* preCommandExec always resets current_command_repl_offset to -1 */ + ASSERT_EQ(c->clientDurabilityInfo.current_command_repl_offset, -1); + /* pre_command_replication_offset is always snapshotted */ + ASSERT_EQ(server.durability.pre_command_replication_offset, 555); + + zfree(c); + listRelease(server.monitors); + server.monitors = old_monitors; +} + +/* Migrated from test_preCommandExec — Case 2: durability enabled on primary */ +TEST_F(FullDurabilityTest, PreCommandExecDurabilityEnabledOnPrimary) { + struct serverCommand readonly_cmd = {.declared_name = "get", .flags = CMD_READONLY}; + + client *c = (client *)zcalloc(sizeof(client)); + durabilityClientInit(c); + c->cmd = &readonly_cmd; + c->bufpos = 7; + c->clientDurabilityInfo.current_command_repl_offset = 88; + server.primary_repl_offset = 1234; + + ASSERT_EQ(preCommandExec(c), CMD_FILTER_ALLOW); + /* current_command_repl_offset should be reset to -1 */ + ASSERT_EQ(c->clientDurabilityInfo.current_command_repl_offset, -1); + /* Pre-execution position should be tracked */ + ASSERT_TRUE(c->clientDurabilityInfo.offset.recorded); + ASSERT_EQ(c->clientDurabilityInfo.offset.reply_block, nullptr); + ASSERT_EQ(c->clientDurabilityInfo.offset.byte_offset, 7u); + ASSERT_EQ(server.durability.pre_command_replication_offset, 1234); + + durabilityClientReset(c); + zfree(c); +} + +/* Migrated from test_multi_exec_defers_dirty_keys */ +TEST_F(FullDurabilityTest, MultiExecDefersDirtyKeys) { + client *c = (client *)zcalloc(sizeof(client)); + durabilityClientInit(c); + c->db = server.db[0]; + c->reply = listCreate(); + 
listSetFreeMethod(c->reply, zfree); + + /* Inside a MULTI — dirty key tracking should be deferred */ + c->flag.multi = 1; + robj *key_obj = createStringObject("multi-key", 9); + handleUncommittedKeyForClient(c, key_obj, server.db[0]); + /* Key should NOT be committed yet while inside MULTI */ + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); + + /* After EXEC completes: postCommandExec commits deferred keys */ + c->flag.multi = 0; + struct serverCommand exec_cmd = {.declared_name = "exec", .proc = execCommand, .flags = 0}; + c->cmd = &exec_cmd; + c->clientDurabilityInfo.current_command_repl_offset = -1; + server.primary_repl_offset = 100; + server.durability.pre_command_replication_offset = 100; + server.durability.previous_acked_offset = 0; + postCommandExec(c); + + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(key_obj), server.db[0]), 100); + + decrRefCount(key_obj); + listRelease(c->reply); + durabilityClientReset(c); + zfree(c); +} + +/* Note: test_exec_blocks_reply_and_tracks_dirty_keys from the C test suite + * exercised the full end-to-end blocking/unblocking flow including + * notifyDurabilityProgress with replica ack simulation. This requires + * putClientInPendingWriteQueue which needs a fully event-loop-registered client. + * The blocking path is covered by the MultiExecDefersDirtyKeys test above, + * and the full integration flow is tested by tests/durability/reply_blocking.tcl. 
*/ + +/* ========================= Additional Coverage Tests ========================= */ + +/* Test updateFuncStoreBlockingOffsetForWrite */ +TEST_F(SyncReplicationTest, UpdateFuncStoreBlockingOffsetForWrite) { + server.durability.func_store_blocking_offset = -1; + server.durability.processed_func_write_in_transaction = false; + + /* Should not update when no func write was processed in transaction */ + updateFuncStoreBlockingOffsetForWrite(200); + ASSERT_EQ(server.durability.func_store_blocking_offset, -1); + + /* Should update when processed_func_write_in_transaction is set */ + server.durability.processed_func_write_in_transaction = true; + updateFuncStoreBlockingOffsetForWrite(200); + ASSERT_EQ(server.durability.func_store_blocking_offset, 200); + ASSERT_FALSE(server.durability.processed_func_write_in_transaction); +} + +/* Test handleUncommittedFunctionStore inside vs outside a transaction */ +TEST_F(SyncReplicationTest, HandleUncommittedFunctionStoreInsideTransaction) { + server.durability.processed_func_write_in_transaction = false; + server.durability.func_store_blocking_offset = -1; + + /* Inside a transaction (execution_nesting > 0): should only set the flag */ + server.execution_nesting = 1; + server.primary_repl_offset = 300; + handleUncommittedFunctionStore(); + ASSERT_TRUE(server.durability.processed_func_write_in_transaction); + ASSERT_EQ(server.durability.func_store_blocking_offset, -1); + + /* Outside a transaction: should set the blocking offset directly */ + server.execution_nesting = 0; + server.durability.processed_func_write_in_transaction = false; + server.primary_repl_offset = 400; + handleUncommittedFunctionStore(); + ASSERT_FALSE(server.durability.processed_func_write_in_transaction); + ASSERT_EQ(server.durability.func_store_blocking_offset, 400); +} + +/* Test notifyDurabilityProgress when sync replication is disabled */ +TEST_F(SyncReplicationTest, NotifyDurabilityProgressNoOpWhenDisabled) { + server.durability.enabled = 0; + 
server.primary_host = nullptr; + long long old_offset = server.durability.previous_acked_offset; + notifyDurabilityProgress(); + /* Should be a no-op */ + ASSERT_EQ(server.durability.previous_acked_offset, old_offset); +} + +/* Test notifyDurabilityProgress when server is a replica */ +TEST_F(SyncReplicationTest, NotifyDurabilityProgressNoOpWhenReplica) { + server.durability.enabled = 1; + server.primary_host = sdsnew("127.0.0.1"); + long long old_offset = server.durability.previous_acked_offset; + notifyDurabilityProgress(); + ASSERT_EQ(server.durability.previous_acked_offset, old_offset); + sdsfree(server.primary_host); + server.primary_host = nullptr; + server.durability.enabled = 0; +} + + +/* Test durabilityClientInit is idempotent */ +TEST_F(SyncReplicationTest, ClientInitIdempotent) { + server.durability.enabled = 1; + + client *c = (client *)zcalloc(sizeof(client)); + c->clientDurabilityInfo.blocked_responses = nullptr; + + durabilityClientInit(c); + list *first_list = c->clientDurabilityInfo.blocked_responses; + ASSERT_NE(first_list, nullptr); + + /* Calling init again should be a no-op — should NOT create a new list */ + durabilityClientInit(c); + ASSERT_EQ(c->clientDurabilityInfo.blocked_responses, first_list); + + durabilityClientReset(c); + server.durability.enabled = 0; + zfree(c); +} From 85aa8e72e1287454a86cf0ec6b1efb0df8abd470 Mon Sep 17 00:00:00 2001 From: jjuleslasarte Date: Wed, 18 Mar 2026 14:32:58 -0700 Subject: [PATCH 8/8] tests: add integration tests for durability reply blocking Add Tcl-based integration tests (1,051 lines) covering end-to-end durability behavior including: - AOF-based response blocking with appendfsync=always - Provider pause/resume via DEBUG commands for deterministic testing - Uncommitted key rejection (reads return error for dirty keys) - MULTI/EXEC transaction durability semantics - Lua script and FCALL durability checks - Function store (FUNCTION LOAD/DELETE) durability blocking - Client disconnection during blocked 
state - Multiple concurrent clients with interleaved blocking/unblocking - INFO durability stats verification Signed-off-by: jjuleslasarte --- .gitignore | 1 + tests/durability/reply_blocking.tcl | 1051 +++++++++++++++++++++++++++ 2 files changed, 1052 insertions(+) create mode 100644 tests/durability/reply_blocking.tcl diff --git a/.gitignore b/.gitignore index 636e29b86ed..b6d3f2c84d7 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,4 @@ cmake-build-debug/ cmake-build-release/ __pycache__ src/unit/.flags +.DS_Store diff --git a/tests/durability/reply_blocking.tcl b/tests/durability/reply_blocking.tcl new file mode 100644 index 00000000000..c5615480cfb --- /dev/null +++ b/tests/durability/reply_blocking.tcl @@ -0,0 +1,1051 @@ +# Tests for reply blocking durability feature +# This test suite covers the synchronous replication functionality +# that blocks client responses until durability providers acknowledge writes. +# +# Tests are parameterized over provider_mode: +# replica - TODO +# aof - unblock via AOF appendfsync=always (automatic in beforeSleep) + +foreach provider_mode {aof} { + + if {$provider_mode eq "replica"} { + set server_overrides {durability yes} + } else { + # Start with appendfsync always so the AOF provider is fully active. + # We use DEBUG durability-provider-pause/resume to control blocking + # instead of toggling appendfsync, which avoids the issue where the + # provider reports as disabled when appendfsync != always. + set server_overrides {durability yes appendonly yes appendfsync always} + } + + start_server [list tags {"repl durability external:skip"} overrides $server_overrides] { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + + start_server {} { + set replica [srv 0 client] + set replica_host [srv 0 host] + set replica_port [srv 0 port] + + # Helper: put the provider into a state where writes will block. 
+ # replica mode: ensure no replica is connected (so no one acks writes) + # aof mode: pause the AOF provider so fsynced offsets are not advanced + proc pause_provider {} { + upvar provider_mode provider_mode + upvar primary primary + upvar replica replica + + if {$provider_mode eq "replica"} { + # Disconnect any existing replica so the next write has no one to ack it + $replica replicaof no one + wait_for_condition 50 100 { + [llength [$primary client list type replica]] == 0 + } else { + fail "Primary didn't notice replica disconnect" + } + } else { + # Pause the AOF provider so the next write will block + $primary DEBUG durability-provider-pause aof + } + } + + # Helper: trigger durability acknowledgement, unblocking pending replies. + # replica mode: connect replica and wait for replication ack + # aof mode: resume the AOF provider and ping to force a beforeSleep fsync + proc unblock_with_provider {} { + upvar provider_mode provider_mode + upvar primary primary + upvar primary_host primary_host + upvar primary_port primary_port + upvar replica replica + upvar replica_host replica_host + upvar replica_port replica_port + + if {$provider_mode eq "replica"} { + $replica replicaof $primary_host $primary_port + wait_replica_online $primary + wait_replica_acked_ofs $primary $replica $replica_host $replica_port + } else { + # Resume the AOF provider so it reports real fsynced offsets + $primary DEBUG durability-provider-resume aof + # Issue a PING to force a beforeSleep cycle that fsyncs the AOF + $primary ping + } + } + + # ==================== Write blocking tests ==================== + + test "($provider_mode) Sync replication blocks replies until provider acks" { + assert_equal "yes" [lindex [$primary config get durability] 1] + puts "durability blocks" + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:blocked value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + 
assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + $rd close + } + + test "($provider_mode) Sync replication blocks EXEC replies until provider acks" { + assert_equal "yes" [lindex [$primary config get durability] 1] + puts "durability blocks" + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + $rd set durable:multi value + + assert_equal "OK" [$rd read] + assert_equal "QUEUED" [$rd read] + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + assert_equal {OK} [$rd read] + $rd close + } + + test "($provider_mode) Sync replication blocks only written keys in EXEC" { + assert_equal "yes" [lindex [$primary config get durability] 1] + puts "durability only written keys in EXEC" + + # Pre-populate with durability off so the SET doesn't block + assert_equal "OK" [$primary set durable:multi-clean clean] + # Verify the pre-populated value is readable on the primary before EXEC + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + $rd set durable:multi-dirty value + $rd get durable:multi-clean + + assert_equal "OK" [$rd read] + assert_equal "QUEUED" [$rd read] + assert_equal "QUEUED" [$rd read] + assert_equal {clean} [$primary get durable:multi-clean] + + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal {OK clean} [$rd read] + $rd close + } + + test "($provider_mode) Lua script write blocks replies until provider acks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + # Pre-populate with sync-repl off so the SET doesn't block + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "OK" [$primary set durable:lua-clean clean] + assert_equal "OK" [$primary config set sync-replication 
yes] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd eval {redis.call('set', KEYS[1], ARGV[1]); return redis.call('get', KEYS[2])} 2 durable:lua-dirty durable:lua-clean value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + set reader [valkey_client -1] + assert_equal {clean} [$reader get durable:lua-clean] + + unblock_with_provider + + assert_equal {clean} [$rd read] + $rd close + } + + test "($provider_mode) Lua script error after partial write still blocks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd eval {redis.call('set', KEYS[1], 'written'); error('deliberate error')} 1 durable:lua-error-key + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + catch {$rd read} err + assert_match "*deliberate error*" $err + $rd close + } + + # ==================== Non-blocking tests ==================== + + test "($provider_mode) EVAL_RO should not block replies" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + # Pre-populate with sync-repl off so the SET doesn't block + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "OK" [$primary set durable:eval-ro-key hello] + assert_equal "OK" [$primary config set sync-replication yes] + + set rd [valkey_deferring_client -1] + $rd eval_ro {return redis.call('get', KEYS[1])} 1 durable:eval-ro-key + + assert_equal "hello" [$rd read] + $rd close + } + + test "($provider_mode) MULTI/EXEC with DISCARD does not block" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd set durable:discard-key value + assert_equal "QUEUED" [$rd read] + + $rd discard + 
assert_equal "OK" [$rd read] + + $rd get durable:discard-key + assert_equal "" [$rd read] + $rd close + } + + test "($provider_mode) MULTI/EXEC with no writes does not block" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + # Pre-populate with sync-repl off so the SET doesn't block + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "OK" [$primary set durable:nowrite-key existing] + assert_equal "OK" [$primary config set sync-replication yes] + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd get durable:nowrite-key + assert_equal "QUEUED" [$rd read] + + $rd ping + assert_equal "QUEUED" [$rd read] + + $rd exec + assert_equal {existing PONG} [$rd read] + $rd close + } + + test "($provider_mode) Admin commands are never blocked" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + set rd [valkey_deferring_client -1] + + $rd ping + assert_equal "PONG" [$rd read] + + $rd info server + set info [$rd read] + assert_match "*valkey_version*" $info + + $rd dbsize + set dbsize [$rd read] + assert {[string is integer $dbsize]} + + $rd close + } + + test "($provider_mode) Read-only commands on clean keys are not blocked" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "OK" [$primary set durable:clean-key cleanvalue] + assert_equal "OK" [$primary config set sync-replication yes] + + set rd [valkey_deferring_client -1] + $rd get durable:clean-key + assert_equal "cleanvalue" [$rd read] + $rd close + } + + test "($provider_mode) Sync replication disabled - writes return immediately (regression)" { + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "no" [lindex [$primary config get sync-replication] 1] + + set rd [valkey_deferring_client -1] + $rd set durable:norep-key value + assert_equal "OK" [$rd read] + + $rd get durable:norep-key + 
assert_equal "value" [$rd read] + + $rd multi + assert_equal "OK" [$rd read] + $rd set durable:norep-key2 value2 + assert_equal "QUEUED" [$rd read] + $rd exec + assert_equal {OK} [$rd read] + + $rd close + assert_equal "OK" [$primary config set sync-replication yes] + } + + # ==================== Multiple clients ==================== + + test "($provider_mode) Multiple concurrent writers block independently" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set wr1 [valkey_deferring_client -1] + set wr2 [valkey_deferring_client -1] + + $wr1 set durable:concurrent-1 val1 + $wr2 set durable:concurrent-2 val2 + + set fd1 [$wr1 channel] + set fd2 [$wr2 channel] + fconfigure $fd1 -blocking 0 + fconfigure $fd2 -blocking 0 + set early1 [read $fd1] + set early2 [read $fd2] + fconfigure $fd1 -blocking 1 + fconfigure $fd2 -blocking 1 + assert_equal "" $early1 + assert_equal "" $early2 + + unblock_with_provider + + assert_equal "OK" [$wr1 read] + assert_equal "OK" [$wr2 read] + + $wr1 close + $wr2 close + } + + test "($provider_mode) Write then read on same client preserves reply ordering" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:ordering-key orderval + $rd get durable:ordering-key + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + assert_equal "orderval" [$rd read] + $rd close + } + + # ==================== Database-level commands ==================== + + test "($provider_mode) FLUSHDB inside MULTI/EXEC blocks entire database" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "OK" [$primary set durable:flush-pre existing] + assert_equal "OK" [$primary config set 
sync-replication yes] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd flushdb + assert_equal "QUEUED" [$rd read] + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal {OK} [$rd read] + $rd close + } + + test "($provider_mode) FLUSHALL blocks write reply until provider acks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + assert_equal "OK" [$primary set durable:flushall-key value] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd flushall + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + $rd close + } + + test "($provider_mode) FLUSHALL inside MULTI/EXEC blocks all databases" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "OK" [$primary set durable:flushall-multi-key value] + assert_equal "OK" [$primary config set sync-replication yes] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd flushall + assert_equal "QUEUED" [$rd read] + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal {OK} [$rd read] + $rd close + } + + test "($provider_mode) COPY cross-database blocks write reply" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "OK" [$primary set durable:copy-src srcvalue] + assert_equal "OK" [$primary config set sync-replication yes] + + pause_provider + + set rd 
[valkey_deferring_client -1] + $rd copy durable:copy-src durable:copy-dst db 1 + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal 1 [$rd read] + $rd close + } + + test "($provider_mode) SWAPDB blocks write reply until provider acks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + assert_equal "OK" [$primary set durable:swap-db0 db0val] + $primary select 1 + assert_equal "OK" [$primary set durable:swap-db1 db1val] + $primary select 0 + + pause_provider + + set rd [valkey_deferring_client -1] + $rd swapdb 0 1 + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + $rd close + + # Swap back to restore state (with sync-repl off so it doesn't block) + $primary config set sync-replication no + $primary swapdb 0 1 + $primary config set sync-replication yes + } + + test "($provider_mode) MOVE blocks write reply until provider acks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + $primary select 2 + $primary del durable:move-key + $primary select 9 + assert_equal "OK" [$primary set durable:move-key moveval] + assert_equal "OK" [$primary config set sync-replication yes] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd move durable:move-key 2 + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal 1 [$rd read] + $rd close + } + + test "($provider_mode) MULTI/EXEC with SELECT writes to multiple databases blocks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd set 
durable:multidb-key0 val0 + assert_equal "QUEUED" [$rd read] + + $rd select 1 + assert_equal "QUEUED" [$rd read] + + $rd set durable:multidb-key1 val1 + assert_equal "QUEUED" [$rd read] + + $rd select 0 + assert_equal "QUEUED" [$rd read] + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal {OK OK OK OK} [$rd read] + $rd close + } + + # ==================== Function store ==================== + + test "($provider_mode) FUNCTION LOAD blocks reply until provider acks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd function load "#!lua name=durtest\nserver.register_function('durfunc', function() return 'hello' end)" + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "durtest" [$rd read] + $rd close + } + + test "($provider_mode) FUNCTION DELETE blocks reply until provider acks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd function delete durtest + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + $rd close + } + + # ==================== Dirty key reads ==================== + + test "($provider_mode) Sync replication blocks reads on dirty keys" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:blocked dirty + + set rd [valkey_deferring_client -1] + $rd get durable:blocked + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set 
early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "dirty" [$rd read] + $rd close + } + + # ==================== Client disconnect stats ==================== + + test "($provider_mode) Client disconnect while blocked updates stats" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:disconnect-test value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + assert_equal "" $early_reply + + $rd close + + after 200 + + set info [$primary info all] + assert_match "*durability_clients_waiting_ack:0*" $info + + # Resume the provider so subsequent tests aren't affected + unblock_with_provider + } + + # ==================== Toggle / config changes ==================== + + test "($provider_mode) Sync replication toggling disables reply blocking" { + assert_equal "OK" [$primary config set sync-replication no] + assert_equal "no" [lindex [$primary config get sync-replication] 1] + + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:toggle value + + set rd [valkey_deferring_client -1] + $rd get durable:toggle + assert_equal "value" [$rd read] + + $rd close + assert_equal "OK" [$primary config set sync-replication yes] + } + + test "($provider_mode) Disabling sync replication unblocks pending replies" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:toggle-blocked value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + assert_equal "" $early_reply + + assert_equal "OK" [$primary config set sync-replication no] + + set raw_reply "" + set got_reply 0 + for {set i 0} {$i < 50} {incr i} { + append raw_reply [read $fd] + if {[string match "*\r\n" $raw_reply]} { + set got_reply 1 + break + } + after 100 + } + 
if {!$got_reply} { + fail "Reply didn't unblock after disabling sync replication" + } + fconfigure $fd -blocking 1 + assert_match "+OK*" $raw_reply + + # Resume the provider so subsequent tests aren't affected + # (disabling sync-replication unblocked the client but didn't resume the provider) + $primary DEBUG durability-provider-resume aof + + assert_equal "OK" [$primary config set sync-replication yes] + } + + test "($provider_mode) INFO reports sync replication stats" { + set info [$primary info all] + assert_match "*durability_enabled:1*" $info + assert_match "*durability_primary_repl_offset:*" $info + assert_match "*durability_previous_acked_offset:*" $info + } + + # ==================== Client tracking invalidation (deferred tasks) ==================== + + test "($provider_mode) Key invalidation is deferred until provider acks - signalModifiedKey" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + puts "running key invalidation" + # Set up a RESP3 tracking client that will receive invalidation messages + set tracker [valkey_deferring_client -1] + $tracker HELLO 3 + $tracker read ;# consume HELLO reply + $tracker CLIENT TRACKING on + $tracker read ;# consume TRACKING reply + + # Populate a key and cache it via GET on the tracking client + $primary config set sync-replication no + $primary set durable:track-key original + $primary config set sync-replication yes + + $tracker GET durable:track-key + $tracker read ;# consume "original" — key is now tracked + + # Pause the provider so the next write's invalidation is deferred + pause_provider + + # Write to the tracked key from a different client (fire-and-forget) + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:track-key modified + + # Give the server a moment to process the write + after 100 + + # The tracking client should NOT have received an invalidation yet + set tracker_fd [$tracker channel] + fconfigure $tracker_fd -blocking 0 + set 
early_inval [read $tracker_fd] + fconfigure $tracker_fd -blocking 1 + # No invalidation push should appear while provider is paused + assert_equal "" $early_inval + + # Now unblock — this should trigger the deferred invalidation + unblock_with_provider + + # Read the invalidation message from the tracking client + # RESP3 push: [invalidate [key1 key2 ...]] + set inval_msg [$tracker read] + assert_match "*durable:track-key*" $inval_msg + + $tracker CLIENT TRACKING off + $tracker read ;# consume reply + $tracker close + $writer close + } + + test "($provider_mode) Flush invalidation is deferred until provider acks - signalFlushedDb" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + puts "running flush invalidation" + + # Set up a RESP3 BCAST tracking client to catch FLUSHDB invalidations + set tracker [valkey_deferring_client -1] + $tracker HELLO 3 + $tracker read ;# consume HELLO reply + $tracker CLIENT TRACKING on BCAST + $tracker read ;# consume TRACKING reply + + # Populate some keys so there's something to flush + $primary config set sync-replication no + $primary set durable:flush-track-a val_a + $primary set durable:flush-track-b val_b + $primary config set sync-replication yes + + # Drain any invalidation messages from the SETs above + # (BCAST mode sends invalidations for all writes) + after 100 + set tracker_fd [$tracker channel] + fconfigure $tracker_fd -blocking 0 + read $tracker_fd + fconfigure $tracker_fd -blocking 1 + + # Pause the provider so the FLUSHDB invalidation is deferred + pause_provider + + # Issue FLUSHDB from a fire-and-forget writer + set writer [valkey_deferring_client -1] + $writer client reply off + $writer flushdb + + # Give the server time to process the command + after 100 + + # The tracking client should NOT have received flush invalidation yet + fconfigure $tracker_fd -blocking 0 + set early_inval [read $tracker_fd] + fconfigure $tracker_fd -blocking 1 + assert_equal "" $early_inval + + # Unblock — this 
triggers the deferred flush invalidation + unblock_with_provider + + # The tracking client should now receive an invalidation + # For FLUSHDB, the invalidation message contains NULL to indicate all keys + # Use a polling read with timeout to avoid hanging if message doesn't arrive + set inval_msg "" + set got_inval 0 + fconfigure $tracker_fd -blocking 0 + for {set i 0} {$i < 50} {incr i} { + append inval_msg [read $tracker_fd] + if {[string match "*invalidate*" $inval_msg]} { + set got_inval 1 + break + } + after 100 + } + fconfigure $tracker_fd -blocking 1 + if {!$got_inval} { + fail "Flush invalidation message not received within timeout" + } + assert_match "*invalidate*" $inval_msg + + $tracker CLIENT TRACKING off + $tracker read ;# consume reply + $tracker close + $writer close + } + + # ==================== Keyspace notification deferral (deferred tasks) ==================== + + test "($provider_mode) Keyspace notification is deferred until provider acks" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + # Enable keyspace notifications for all events + $primary config set notify-keyspace-events KA + + # Subscribe to keyspace notifications + set rd1 [valkey_deferring_client -1] + assert_equal {1} [psubscribe $rd1 "__keyspace@*__:*"] + + # Pause the provider so keyspace notifications are deferred + pause_provider + + # Write to a key from a fire-and-forget writer + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:keyspace-deferred-key val + + # Give the server time to process the command + after 100 + + # The subscriber should NOT have received the notification yet + set rd1_fd [$rd1 channel] + fconfigure $rd1_fd -blocking 0 + set early_notif [read $rd1_fd] + fconfigure $rd1_fd -blocking 1 + # No keyspace notification should appear while provider is paused + assert_equal "" $early_notif + + # Now unblock — this should trigger the deferred keyspace notification + unblock_with_provider + + # Read 
the keyspace notification + set notif_msg [$rd1 read] + assert_match "*set*" $notif_msg + + $rd1 close + $writer close + $primary config set notify-keyspace-events "" + } + + test "($provider_mode) Keyspace notification fires immediately when sync replication disabled" { + # Verify that without sync replication, keyspace events are NOT deferred + $primary config set sync-replication no + $primary config set notify-keyspace-events KA + + set rd1 [valkey_deferring_client] + assert_equal {1} [psubscribe $rd1 *] + r set foo bar + assert_match "*set*" [$rd1 read] + $rd1 close + + $primary config set notify-keyspace-events "" + $primary config set sync-replication yes + } + + # ==================== Client tracking invalidation (existing) ==================== + + test "($provider_mode) Key invalidation fires immediately when sync replication disabled" { + # Verify that without sync replication, invalidations are NOT deferred + $primary config set sync-replication no + + set tracker [valkey_deferring_client -1] + $tracker HELLO 3 + $tracker read ;# consume HELLO reply + $tracker CLIENT TRACKING on + $tracker read ;# consume TRACKING reply + + $primary set durable:track-nodefer original + + $tracker GET durable:track-nodefer + $tracker read ;# consume "original" — key is now tracked + + # Write to the tracked key — invalidation should fire immediately + $primary set durable:track-nodefer changed + + # Should get the invalidation right away (no provider pause needed) + set inval_msg [$tracker read] + assert_match "*durable:track-nodefer*" $inval_msg + + $tracker CLIENT TRACKING off + $tracker read ;# consume reply + $tracker close + + $primary config set sync-replication yes + } + + # ==================== Durability provider edge cases ==================== + + test "($provider_mode) Pause unknown provider returns error" { + catch {$primary DEBUG durability-provider-pause nonexistent} err + assert_match "*No such durability provider*" $err + } + + test "($provider_mode) 
Resume unknown provider returns error" { + catch {$primary DEBUG durability-provider-resume nonexistent} err + assert_match "*No such durability provider*" $err + } + + test "($provider_mode) Double pause is idempotent - writes still block" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + # Pause twice + $primary DEBUG durability-provider-pause aof + $primary DEBUG durability-provider-pause aof + + # Write should still block + set rd [valkey_deferring_client -1] + $rd set durable:double-pause-key val + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + # Resume once should unblock + $primary DEBUG durability-provider-resume aof + $primary ping + + assert_equal "OK" [$rd read] + $rd close + } + + test "($provider_mode) Resume while not paused is harmless" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + # Resume when not paused should succeed without issue + assert_equal "OK" [$primary DEBUG durability-provider-resume aof] + + # Writes should still work normally + set rd [valkey_deferring_client -1] + $rd set durable:resume-noop-key val + assert_equal "OK" [$rd read] + $rd close + } + + test "($provider_mode) Multiple writes while paused all unblock on resume" { + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:multi-write-1 val1 + $rd set durable:multi-write-2 val2 + $rd set durable:multi-write-3 val3 + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + # All three replies should come through + assert_equal "OK" [$rd read] + assert_equal "OK" [$rd read] + assert_equal "OK" [$rd read] + $rd close + } + + # ==================== Failover tests (must be last changes roles) ==================== + + test 
"($provider_mode) Failover disconnects clients waiting for ack" { + # Ensure replica is in clean state for deterministic failover behavior. + # In replica mode, earlier tests connected the replica and replicated data; + # we flush it here so the demoted primary's dirty key tracking is preserved + # correctly after failover (not overwritten by a full sync). + $replica flushall + assert_equal "yes" [lindex [$primary config get sync-replication] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd client setname durability-waiter + $rd read + $rd set durable:failover value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + assert_equal "" $early_reply + fconfigure $fd -blocking 1 + + $primary replicaof $replica_host $replica_port + + catch {$rd read} err + assert_match {*I/O error*} $err + } + + test "($provider_mode) Demoted primary returns ERR on dirty data" { + set reader [valkey_client -1] + catch {$reader get durable:failover} err + assert_equal "ERR Accessed data unavailable to be served" $err + } + } + } +}