From 35c7ca47e8195a43b5a332be6189f0e819ed34bb Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 20 Jul 2025 19:28:37 +0900 Subject: [PATCH] verify: add --verify_type option for crash-consistent verification This patch introduces a new --verify_type option to support crash-consistent offline verification scenarios. The primary motivation is to handle cases where write operations are interrupted by device failures or unexpected power loss during the write phase, requiring verification to be limited to data that was properly persisted before the interruption. Key features: - New --verify_type=flush option filters verification candidates based on fsync completion timing - Tracks fsync completion timestamps during write phase - During verification, excludes writes that completed after the last fsync - Always includes FUA (Force Unit Access) writes regardless of timing, as they bypass cache and are immediately persistent - Works with existing verify_state_save mechanism for offline verification Use case: This is particularly useful for testing storage durability guarantees in scenarios such as: 1. Simulated power failures during write workloads 2. Device error injection testing, especially with ioengines that communicate directly with the device, as io_uring_cmd does. This enhancement enables more realistic testing of data durability in storage systems by ensuring only properly synchronized data is verified after simulated failures. Examples: 1. write phase [global] ioengine=io_uring_cmd cmd_type=nvme filename=/dev/ng0n1 rw=write bs=4k verify=pattern verify_pattern=%o iodepth=32 size=32k fsync=3 [test] verify_type=flush do_verify=0 verify_state_save=1 2.
read phase [global] ioengine=io_uring_cmd cmd_type=nvme filename=/dev/ng0n1 rw=write bs=4k verify=pattern verify_pattern=%o iodepth=32 size=32k fsync=3 [test] verify_type=flush do_verify=1 verify_only=1 verify_state_load=1 Signed-off-by: Minwoo Im --- HOWTO.rst | 27 +++++++++++++++++++ backend.c | 2 +- cconv.c | 2 ++ engines/io_uring.c | 4 +++ engines/sg.c | 4 +++ file.h | 12 ++++++++- fio.1 | 28 ++++++++++++++++++++ io_u.c | 33 ++++++++++++++++++++--- io_u.h | 1 + options.c | 22 ++++++++++++++++ t/verify-state.c | 10 ++++--- thread_options.h | 4 +++ verify-state.h | 9 ++++++- verify.c | 66 +++++++++++++++++++++++++++++++++++++++++++--- verify.h | 2 ++ 15 files changed, 214 insertions(+), 12 deletions(-) diff --git a/HOWTO.rst b/HOWTO.rst index 55ebc388cc..e627448a3f 100644 --- a/HOWTO.rst +++ b/HOWTO.rst @@ -4005,6 +4005,33 @@ Verification used to speed up the process of writing each block on a device with its offset. Default: 0 (disabled). +.. option:: verify_type=str + + Controls which write operations are included during the verification + phase. This option only affects offline verification when using + :option:`verify_state_save` to save completion state and later verify + with a separate job. The allowed values are: + + **flush** + Only verify writes that completed at or before the last + fsync operation. This mode filters out writes that + completed after the last fsync, which may not be + persistent on storage. Writes with the Force Unit + Access (FUA) flag are always included regardless of + fsync timing, as they bypass the cache and are + immediately persistent. This is useful for testing data + persistence guarantees across power failures or system + crashes. fio tracks fsync completion times and write + completion times during the write phase. During + verification, only writes that meet the fsync timing + criteria are verified. This allows testing scenarios + where only data that was properly synced before a + simulated failure should be verified. 
+ This option requires :option:`verify_state_save` to be + enabled and is only effective during offline + verification (separate verify job). Default: none + (verify all completed writes). + .. option:: verify_fatal=bool Normally fio will keep checking the entire contents before quitting on a diff --git a/backend.c b/backend.c index 0cdee86465..86efbaf09f 100644 --- a/backend.c +++ b/backend.c @@ -1259,7 +1259,7 @@ static int init_file_completion_logging(struct thread_data *td, for_each_file(td, f, i) { f->last_write_comp = scalloc(td->last_write_comp_depth, - sizeof(uint64_t)); + sizeof(struct fio_write_comp)); if (!f->last_write_comp) goto cleanup; } diff --git a/cconv.c b/cconv.c index 4e72ae16a9..aa041c52a1 100644 --- a/cconv.c +++ b/cconv.c @@ -178,6 +178,7 @@ int convert_thread_options_to_cpu(struct thread_options *o, o->sync_io = le32_to_cpu(top->sync_io); o->write_hint = le32_to_cpu(top->write_hint); o->verify = le32_to_cpu(top->verify); + o->verify_type = le32_to_cpu(top->verify_type); o->do_verify = le32_to_cpu(top->do_verify); o->experimental_verify = le32_to_cpu(top->experimental_verify); o->verify_state = le32_to_cpu(top->verify_state); @@ -443,6 +444,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->sync_io = cpu_to_le32(o->sync_io); top->write_hint = cpu_to_le32(o->write_hint); top->verify = cpu_to_le32(o->verify); + top->verify_type = cpu_to_le32(o->verify_type); top->do_verify = cpu_to_le32(o->do_verify); top->experimental_verify = cpu_to_le32(o->experimental_verify); top->verify_state = cpu_to_le32(o->verify_state); diff --git a/engines/io_uring.c b/engines/io_uring.c index 5bbcc97ab9..985c7b6794 100644 --- a/engines/io_uring.c +++ b/engines/io_uring.c @@ -562,6 +562,10 @@ static int fio_ioring_cmd_prep(struct thread_data *td, struct io_u *io_u) io_u_set(td, io_u, IO_U_F_VER_IN_DEV); } + /* Mark FUA writes for verification state tracking */ + if (io_u->ddir == DDIR_WRITE && o->writefua) + io_u_set(td, io_u, IO_U_F_FUA); 
+ return fio_nvme_uring_cmd_prep(cmd, io_u, o->nonvectored ? NULL : &ld->iovecs[io_u->index], dsm, read_opcode, ld->write_opcode, diff --git a/engines/sg.c b/engines/sg.c index 9df70bd28b..86f3104ec9 100644 --- a/engines/sg.c +++ b/engines/sg.c @@ -668,6 +668,10 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u) fio_sgio_rw_lba(hdr, lba, nr_blocks, o->write_mode == FIO_SG_WRITE_SAME_NDOB); + /* Mark FUA writes for verification state tracking */ + if (o->writefua) + io_u_set(td, io_u, IO_U_F_FUA); + } else if (io_u->ddir == DDIR_TRIM) { struct sgio_trim *st; diff --git a/file.h b/file.h index e38ed2f123..c946844d4e 100644 --- a/file.h +++ b/file.h @@ -37,6 +37,9 @@ enum fio_file_flags { FIO_FILE_smalloc = 1 << 9, /* smalloc file/file_name */ }; +/* Flags for fio_write_comp.flags */ +#define FIO_WRITE_COMP_FUA 0x1 /* Write had Force Unit Access flag */ + enum file_lock_mode { FILE_LOCK_NONE, FILE_LOCK_EXCLUSIVE, @@ -126,8 +129,15 @@ struct fio_file { * Tracks the last iodepth number of completed writes, if data * verification is enabled */ - uint64_t *last_write_comp; + struct fio_write_comp { + uint64_t offset; + uint64_t completion_time_nsec; + uint32_t flags; /* I/O flags including FUA */ + uint32_t flush_count; /* FLUSH count at completion time */ + } *last_write_comp; unsigned int last_write_idx; + uint64_t last_flush_time_nsec; /* Last FLUSH completion timestamp */ + unsigned int flush_count; /* Count of completed FLUSH operations */ /* * For use by the io engine to store offset diff --git a/fio.1 b/fio.1 index 5bcb1d46a7..239a8c3675 100644 --- a/fio.1 +++ b/fio.1 @@ -3734,6 +3734,34 @@ Recreate an instance of the \fBverify_pattern\fR every up the process of writing each block on a device with its offset. Default: 0 (disabled). .TP +.BI verify_type \fR=\fPstr +Controls which write operations are included during the verification phase. 
+This option only affects offline verification when using \fBverify_state_save\fR +to save completion state and later verify with a separate job. The allowed +values are: +.RS +.RS +.TP +.B flush +Only verify writes that completed at or before the last fsync operation. +This mode filters out writes that completed after the last fsync, which may +not be persistent on storage. Writes with the Force Unit Access (FUA) flag +are always included regardless of fsync timing, as they bypass the cache and +are immediately persistent. This is useful for testing data persistence +guarantees across power failures or system crashes. +.RE +.P +When \fBverify_type=flush\fR is used, fio tracks fsync completion times and +write completion times during the write phase. During verification, only +writes that meet the fsync timing criteria are verified. This allows testing +scenarios where only data that was properly synced before a simulated +failure should be verified. +.P +This option requires \fBverify_state_save\fR to be enabled and is only +effective during offline verification (separate verify job). Default: none +(verify all completed writes). +.RE +.TP .BI verify_fatal \fR=\fPbool Normally fio will keep checking the entire contents before quitting on a block verification failure. 
If this option is set, fio will exit the job on diff --git a/io_u.c b/io_u.c index ca97f38881..e452fc3525 100644 --- a/io_u.c +++ b/io_u.c @@ -2063,7 +2063,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u, } static void file_log_write_comp(const struct thread_data *td, struct fio_file *f, - uint64_t offset, unsigned int bytes) + uint64_t offset, unsigned int bytes, struct io_u *io_u) { int idx; @@ -2079,11 +2079,31 @@ static void file_log_write_comp(const struct thread_data *td, struct fio_file *f return; idx = f->last_write_idx++; - f->last_write_comp[idx] = offset; + f->last_write_comp[idx].offset = offset; + f->last_write_comp[idx].completion_time_nsec = ntime_since_now(&io_u->start_time); + f->last_write_comp[idx].flags = 0; + f->last_write_comp[idx].flush_count = f->flush_count; + + /* Check if this is a FUA write */ + if (io_u && (io_u->flags & IO_U_F_FUA)) + f->last_write_comp[idx].flags |= FIO_WRITE_COMP_FUA; + if (f->last_write_idx == td->last_write_comp_depth) f->last_write_idx = 0; } +static void file_log_flush_comp(struct fio_file *f, struct io_u *io_u) +{ + if (!f) + return; + + /* Track the last FLUSH completion timestamp */ + f->last_flush_time_nsec = ntime_since_now(&io_u->start_time); + + /* Increment FLUSH counter */ + f->flush_count++; +} + static bool should_account(struct thread_data *td) { return ramp_time_over(td) && (td->runstate == TD_RUNNING || @@ -2125,7 +2145,10 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, if (ddir_sync(ddir)) { if (io_u->error) goto error; + + /* Log flush completion */ if (f) { + file_log_flush_comp(f, io_u); f->first_write = -1ULL; f->last_write = -1ULL; } @@ -2164,7 +2187,7 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, } if (ddir == DDIR_WRITE) - file_log_write_comp(td, f, io_u->offset, bytes); + file_log_write_comp(td, f, io_u->offset, bytes, io_u); if (should_account(td)) account_io_completion(td, io_u, icd, ddir, bytes); 
@@ -2464,6 +2487,10 @@ int do_io_u_sync(const struct thread_data *td, struct io_u *io_u) if (ret < 0) io_u->error = errno; + else { + /* Record FLUSH completion timing for verification state */ + file_log_flush_comp(io_u->file, io_u); + } return ret; } diff --git a/io_u.h b/io_u.h index 178c12293f..4f8bbf73c8 100644 --- a/io_u.h +++ b/io_u.h @@ -24,6 +24,7 @@ enum { IO_U_F_PATTERN_DONE = 1 << 8, IO_U_F_DEVICE_ERROR = 1 << 9, IO_U_F_VER_IN_DEV = 1 << 10, /* Verify data in device */ + IO_U_F_FUA = 1 << 11, /* Force Unit Access flag */ }; /* diff --git a/options.c b/options.c index 6295a616ca..17bfcbd475 100644 --- a/options.c +++ b/options.c @@ -3256,6 +3256,28 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, + { + .name = "verify_type", + .lname = "Verify type", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, verify_type), + .help = "Verification filter type", + .def = "none", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + .posval = { + { .ival = "none", + .oval = VERIFY_NONE, + .help = "No verification filtering", + }, + { .ival = "flush", + .oval = VERIFY_TYPE_FLUSH, + .help = "Verify only writes that completed before flush", + }, + }, + }, { .name = "verifysort", .lname = "Verify sort", diff --git a/t/verify-state.c b/t/verify-state.c index f8787e9a33..465889f53d 100644 --- a/t/verify-state.c +++ b/t/verify-state.c @@ -26,14 +26,16 @@ static void show_s(struct thread_io_list *s, unsigned int no_s) printf("Max completions per file:\t\t%lu\n", (unsigned long) s->max_no_comps_per_file); printf("Number IOs:\t%llu\n", (unsigned long long) s->numberio); printf("Index:\t\t%llu\n", (unsigned long long) s->index); + printf("Last flush count:\t%u\n", s->last_flush_count); printf("Completions:\n"); if (!s->no_comps) return; for (i = s->no_comps - 1; i >= 0; i--) { - printf("\t(file=%2llu) %llu\n", + printf("\t(file=%2llu) %llu (flush_count=%u)\n", 
(unsigned long long) s->comps[i].fileno, - (unsigned long long) s->comps[i].offset); + (unsigned long long) s->comps[i].offset, + s->comps[i].flush_count); } } @@ -51,10 +53,12 @@ static void show(struct thread_io_list *s, size_t size) s->nofiles = le32_to_cpu(s->nofiles); s->numberio = le64_to_cpu(s->numberio); s->index = le64_to_cpu(s->index); + s->last_flush_count = le32_to_cpu(s->last_flush_count); for (i = 0; i < s->no_comps; i++) { s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno); s->comps[i].offset = le64_to_cpu(s->comps[i].offset); + s->comps[i].flush_count = le32_to_cpu(s->comps[i].flush_count); } show_s(s, no_s); @@ -92,7 +96,7 @@ static void show_verify_state(void *buf, size_t size) return; } - if (hdr->version == 0x04) + if (hdr->version == 0x05) show(s, size); else log_err("Unsupported version %d\n", (int) hdr->version); diff --git a/thread_options.h b/thread_options.h index 1b26ab5864..9f733ff6cb 100644 --- a/thread_options.h +++ b/thread_options.h @@ -142,6 +142,7 @@ struct thread_options { unsigned int sync_io; unsigned int write_hint; unsigned int verify; + unsigned int verify_type; unsigned int do_verify; unsigned int verify_interval; unsigned int verify_offset; @@ -189,6 +190,7 @@ struct thread_options { struct zone_split *zone_split[DDIR_RWDIR_CNT]; unsigned int zone_split_nr[DDIR_RWDIR_CNT]; + uint32_t pad2; fio_fp64_t zipf_theta; fio_fp64_t pareto_h; @@ -477,6 +479,7 @@ struct thread_options_pack { uint32_t sync_io; uint32_t write_hint; uint32_t verify; + uint32_t verify_type; uint32_t do_verify; uint32_t verify_interval; uint32_t verify_offset; @@ -521,6 +524,7 @@ struct thread_options_pack { struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX]; uint32_t zone_split_nr[DDIR_RWDIR_CNT]; + uint32_t pad2; fio_fp64_t zipf_theta; fio_fp64_t pareto_h; diff --git a/verify-state.h b/verify-state.h index 603af70d4b..c2d17d5ef4 100644 --- a/verify-state.h +++ b/verify-state.h @@ -28,6 +28,8 @@ struct thread_rand_state { struct file_comp { 
uint64_t fileno; uint64_t offset; + uint32_t flush_count; /* FLUSH count at completion time for ordering */ + uint32_t flags; /* I/O flags including FUA */ }; struct thread_io_list { @@ -37,6 +39,8 @@ struct thread_io_list { uint32_t nofiles; uint64_t numberio; uint64_t index; + uint32_t last_flush_count; /* Last FLUSH count for ordering */ + uint32_t padding; /* Padding for alignment */ struct thread_rand_state rand; uint8_t name[64]; struct file_comp comps[0]; @@ -47,7 +51,7 @@ struct all_io_list { struct thread_io_list state[0]; }; -#define VSTATE_HDR_VERSION 0x04 +#define VSTATE_HDR_VERSION 0x05 /* Incremented for FLUSH count support */ struct verify_state_hdr { uint64_t version; @@ -57,6 +61,9 @@ struct verify_state_hdr { #define IO_LIST_ALL 0xffffffff +/* Flags for file_comp.flags */ +#define FIO_COMP_FLAG_FUA 0x1 /* Write had Force Unit Access flag */ + struct io_u; extern struct all_io_list *get_all_io_list(int, size_t *); extern void __verify_save_state(struct all_io_list *, const char *); diff --git a/verify.c b/verify.c index 04718f303a..9c42512fd1 100644 --- a/verify.c +++ b/verify.c @@ -1407,6 +1407,7 @@ int get_next_verify(struct thread_data *td, struct io_u *io_u) if (io_u->file) return 0; +retry: if (!RB_EMPTY_ROOT(&td->io_hist_tree)) { struct fio_rb_node *n = rb_first(&td->io_hist_tree); @@ -1466,6 +1467,11 @@ int get_next_verify(struct thread_data *td, struct io_u *io_u) remove_trim_entry(td, ipo); free(ipo); + + /* Check if this offset was filtered out by FLUSH timing */ + if (verify_state_should_stop(td, io_u)) + goto retry; + dprint(FD_VERIFY, "get_next_verify: ret io_u %p\n", io_u); if (!td->o.verify_pattern_bytes) { @@ -1649,7 +1655,9 @@ static int __fill_file_completions(struct thread_data *td, if (j == -1) j = td->last_write_comp_depth - 1; s->comps[*index].fileno = __cpu_to_le64(f->fileno); - s->comps[*index].offset = cpu_to_le64(f->last_write_comp[j]); + s->comps[*index].offset = cpu_to_le64(f->last_write_comp[j].offset); + 
s->comps[*index].flush_count = cpu_to_le32(f->last_write_comp[j].flush_count); + s->comps[*index].flags = cpu_to_le32(f->last_write_comp[j].flags); (*index)++; j--; } @@ -1661,12 +1669,17 @@ static int fill_file_completions(struct thread_data *td, struct thread_io_list *s, unsigned int *index) { struct fio_file *f; - unsigned int i; + unsigned int i = 0; int comps = 0; + uint32_t max_flush_count = 0; - for_each_file(td, f, i) + for_each_file(td, f, i) { comps += __fill_file_completions(td, s, f, index); + if (f->flush_count > max_flush_count) + max_flush_count = f->flush_count; + } + s->last_flush_count = cpu_to_le32(max_flush_count); return comps; } @@ -1887,6 +1900,46 @@ int verify_state_hdr(struct verify_state_hdr *hdr, struct thread_io_list *s) return 0; } +/* + * Filter completion records based on FLUSH completion count rules: + * - Include writes that completed at or before the last FLUSH count + * - Include FUA writes regardless of FLUSH count + * - Exclude non-FUA writes that completed after the last FLUSH count + */ +static void filter_verify_state_by_flush_timing(struct thread_io_list *s) +{ + uint32_t last_flush_count; + int original_count; + int i, j; + + if (!s || s->no_comps == 0) + return; + + original_count = le64_to_cpu(s->no_comps); + last_flush_count = le32_to_cpu(s->last_flush_count); + + if (last_flush_count == 0) + return; + + /* Filter completion records in-place */ + for (i = 0, j = 0; i < original_count; i++) { + uint32_t write_flush_count = le32_to_cpu(s->comps[i].flush_count); + uint32_t write_flags = le32_to_cpu(s->comps[i].flags); + + /* Apply FLUSH completion count rules */ + if ((write_flags & FIO_COMP_FLAG_FUA) || + (write_flush_count < last_flush_count)) { + /* FUA writes or writes completed before FLUSH are included */ + if (i != j) + s->comps[j] = s->comps[i]; + j++; + } + } + + /* Update the completion count */ + s->no_comps = cpu_to_le64((uint64_t) j); +} + int verify_load_state(struct thread_data *td, const char *prefix) { 
struct verify_state_hdr hdr; @@ -1937,6 +1990,10 @@ int verify_load_state(struct thread_data *td, const char *prefix) close(fd); + /* Filter completion records based on FLUSH timing before assigning state */ + if (td->o.verify_type == VERIFY_TYPE_FLUSH) + filter_verify_state_by_flush_timing(s); + verify_assign_state(td, s); return 0; err: @@ -1971,6 +2028,9 @@ int verify_state_should_stop(struct thread_data *td, struct io_u *io_u) * We're in the window of having to check if this io was * completed or not. If the IO was seen as completed, then * lets verify it. + * + * Note: FLUSH completion timing filtering is now done at state load time, + * so any offset in the completion list is valid for verification. */ for (i = 0; i < s->no_comps; i++) { if (s->comps[i].fileno != f->fileno) diff --git a/verify.h b/verify.h index 539e6f6cf5..f5c7a99535 100644 --- a/verify.h +++ b/verify.h @@ -30,6 +30,8 @@ enum { VERIFY_PATTERN, /* verify specific patterns */ VERIFY_PATTERN_NO_HDR, /* verify specific patterns, no hdr */ VERIFY_NULL, /* pretend to verify */ + + VERIFY_TYPE_FLUSH, /* verify offsets based on flush completion time */ }; /*