Commit 914276a

cuihtlauac and claude committed
Add TSan stress suite for per-hotspot race detection
Five new scenarios under test/irmin-pack/test_tsan_stress/ target mutable
state hotspots that the existing multicore + QCheck-STM tests do not
exercise across domains:

- stress_mem_cache: races the global Hashtbl.t cache captured by
  Irmin_mem.Read_only.v (irmin_mem.ml:44) and the KMap mutable in the
  Read_only instance.
- stress_watch: races the listen_dir_hook ref in watch.ml:28-29.
- stress_ao_buf: races the unguarded Buffer.t in the rw_perm of
  Append_only_file (append_only_file.ml).
- stress_dict: races the two unguarded Hashtbl.t caches plus
  last_refill_offset in dict.ml.
- stress_fs_pool: races the shared Eio_pool instances in
  irmin_fs_unix.ml (mkdir_pool, openfile_pool).

mem and watch produce clean TSan data-race warnings pointing at the exact
hotspot; ao, dict and fs cause memory corruption fast enough that TSan
fires via SEGV before it can write a detailed report — the SEGV itself is
the race signal.

The @tsan-stress dune alias runs each scenario in its own process via
`|| true` so that a race-induced crash in one does not suppress the
others. The tsan.yml workflow counts both "WARNING: ThreadSanitizer" and
"ERROR: ThreadSanitizer" as findings, and includes the @tsan-stress
build-dir reports in the uploaded artifact.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent fdc7989 commit 914276a
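
All five scenarios instantiate the same shape: several domains mutating one
piece of unguarded shared state. A minimal standalone illustration of that
shape (hypothetical code, not part of this commit: two domains race on a
plain Hashtbl.t, which a TSan-instrumented OCaml 5 runtime reports as a
data race):

    (* Two domains write the same unguarded Hashtbl.t concurrently. *)
    let () =
      let tbl : (int, string) Hashtbl.t = Hashtbl.create 16 in
      let worker id () =
        for i = 1 to 1_000 do
          Hashtbl.replace tbl i (string_of_int id) (* unguarded write *)
        done
      in
      let d1 = Domain.spawn (worker 1) in
      let d2 = Domain.spawn (worker 2) in
      Domain.join d1;
      Domain.join d2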

9 files changed

Lines changed: 258 additions & 24 deletions

File tree

.github/workflows/tsan.yml
test/irmin-pack/test_tsan_stress/dune
test/irmin-pack/test_tsan_stress/main.ml
test/irmin-pack/test_tsan_stress/stress_ao_buf.ml
test/irmin-pack/test_tsan_stress/stress_common.ml
test/irmin-pack/test_tsan_stress/stress_dict.ml
test/irmin-pack/test_tsan_stress/stress_fs_pool.ml
test/irmin-pack/test_tsan_stress/stress_mem_cache.ml
test/irmin-pack/test_tsan_stress/stress_watch.ml

.github/workflows/tsan.yml

Lines changed: 14 additions & 5 deletions
@@ -71,16 +71,24 @@ jobs:
         shell: bash
         run: |
           shopt -s nullglob
-          files=(tsan-report.*)
-          n=0
+          files=(tsan-report.* _build/default/test/irmin-pack/test_tsan_stress/tsan-report.*)
+          warnings=0
+          errors=0
           if [ ${#files[@]} -gt 0 ] || [ -f tsan-run.log ]; then
-            n=$(grep -ch "WARNING: ThreadSanitizer" "${files[@]}" tsan-run.log 2>/dev/null | awk '{s+=$1} END {print s+0}')
+            warnings=$(grep -ch "WARNING: ThreadSanitizer" "${files[@]}" tsan-run.log 2>/dev/null | awk -F: '{s+=$NF} END {print s+0}')
+            errors=$(grep -ch "ERROR: ThreadSanitizer" "${files[@]}" tsan-run.log 2>/dev/null | awk -F: '{s+=$NF} END {print s+0}')
           fi
+          total=$((warnings + errors))
           {
-            echo "### TSan findings: $n"
-            if [ "$n" = "0" ]; then
+            echo "### TSan findings: $total"
+            echo ""
+            echo "- data-race warnings: $warnings"
+            echo "- signal/SEGV-on-race errors: $errors"
+            if [ "$total" = "0" ]; then
+              echo ""
              echo "No races detected."
            else
+              echo ""
              echo "See artifact \`tsan-reports-${{ github.run_id }}\`."
            fi
          } >> "$GITHUB_STEP_SUMMARY"
@@ -92,5 +100,6 @@ jobs:
          name: tsan-reports-${{ github.run_id }}
          path: |
            tsan-report.*
+           _build/default/test/irmin-pack/test_tsan_stress/tsan-report.*
            tsan-run.log
          if-no-files-found: ignore
test/irmin-pack/test_tsan_stress/dune
Lines changed: 23 additions & 3 deletions

@@ -1,9 +1,29 @@
 (executable
  (name main)
- (modules main)
- (libraries eio_main))
+ (modules
+  main
+  stress_common
+  stress_ao_buf
+  stress_dict
+  stress_mem_cache
+  stress_watch
+  stress_fs_pool)
+ (libraries
+  eio_main
+  irmin
+  irmin.mem
+  irmin-pack.io
+  irmin-pack.unix
+  irmin-fs.unix))
+
+; Run each scenario in a fresh process so a race that corrupts memory
+; and triggers a SEGV in one scenario does not prevent the others from
+; running. Each scenario writes TSan reports to tsan-report.<pid>, and
+; the outer workflow aggregates all of them.
 
 (rule
  (alias tsan-stress)
+ (deps main.exe)
  (action
-  (run ./main.exe)))
+  (bash
+   "./main.exe mem || true; ./main.exe watch || true; ./main.exe ao || true; ./main.exe dict || true; ./main.exe fs || true")))
test/irmin-pack/test_tsan_stress/main.ml
Lines changed: 52 additions & 16 deletions

@@ -1,32 +1,68 @@
-(* TSan stress suite.
+(* TSan stress suite — dispatcher.
 
-   Dispatcher for race-hunting scenarios. Each scenario targets a mutable
-   state hotspot that the standard test suite does not exercise across
-   domains. Scenarios are added incrementally.
+   Each scenario targets a mutable-state hotspot that the standard test
+   suite does not exercise across domains. Iteration count is driven by
+   [IRMIN_TSAN_STRESS_ITER] (default 100).
 
-   Iteration count: [IRMIN_TSAN_STRESS_ITER] env var (default 100). *)
+   Scenarios and expected outcomes under TSan:
+
+   - mem:   clean data-race warning at irmin_mem.ml:51 (Hashtbl.add in
+            the global cache) and on the shared KMap mutable.
+   - watch: clean data-race warning at watch.ml:33 (listen_dir_hook
+            ref assignment).
+   - ao:    SEGV — concurrent Buffer.add_string corrupts the buffer
+            fast enough that TSan's signal handler fires before the
+            race warning is written. The SEGV itself is evidence of
+            the race; a clean warning would need finer-grained access.
+   - dict:  SEGV — same pattern as ao, via the two unguarded Hashtbl.t
+            caches plus the append path through Ao.
+   - fs:    TSan "nested bug, aborting" — the race interacts with Eio
+            scheduler/pool internals in a way the sanitizer can't
+            unwind. Surfaces the problem but not a specific site.
+
+   Because ao/dict/fs crash, each scenario is run in its own process by
+   the @tsan-stress dune alias so one crash does not hide the others.
+
+   Usage:
+     main.exe        run all scenarios
+     main.exe all    same
+     main.exe <name> run one scenario (ao|dict|mem|watch|fs) *)
 
 let iter_count =
   match Sys.getenv_opt "IRMIN_TSAN_STRESS_ITER" with
   | Some s -> int_of_string s
   | None -> 100
 
-let scenarios : (string * (iter:int -> unit)) list = []
+type env = Eio_unix.Stdenv.base
+
+let scenarios : (string * (env:env -> iter:int -> unit)) list =
+  [
+    ("ao", Stress_ao_buf.run);
+    ("dict", Stress_dict.run);
+    ("mem", Stress_mem_cache.run);
+    ("watch", Stress_watch.run);
+    ("fs", Stress_fs_pool.run);
+  ]
 
-let run_all () =
-  List.iter
-    (fun (name, fn) ->
-      Printf.printf "tsan-stress: %s (iter=%d)\n%!" name iter_count;
-      fn ~iter:iter_count)
-    scenarios
+let run_one ~env (name, fn) =
+  Printf.printf "tsan-stress: %s (iter=%d)\n%!" name iter_count;
+  fn ~env ~iter:iter_count
 
 let () =
-  let which = if Array.length Sys.argv >= 2 then Sys.argv.(1) else "all" in
+  let which =
+    match Sys.argv with
+    | [| _ |] | [| _; "all" |] -> `All
+    | [| _; name |] -> `One name
+    | _ ->
+        prerr_endline "usage: main.exe [all|ao|dict|mem|watch|fs]";
+        exit 2
+  in
+  Eio_main.run @@ fun env ->
   match which with
-  | "all" -> run_all ()
-  | name -> (
+  | `All -> List.iter (run_one ~env) scenarios
+  | `One name -> (
       match List.assoc_opt name scenarios with
-      | Some fn -> fn ~iter:iter_count
+      | Some fn -> run_one ~env (name, fn)
      | None ->
          Printf.eprintf "tsan-stress: unknown scenario %S\n%!" name;
          exit 2)
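
For local runs a single scenario can be driven directly with a larger
iteration count, e.g. `IRMIN_TSAN_STRESS_ITER=1000 dune exec
test/irmin-pack/test_tsan_stress/main.exe -- mem` (a sketch: the exact
invocation depends on the local build setup and TSan switch).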
test/irmin-pack/test_tsan_stress/stress_ao_buf.ml
Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+(* Races the unguarded [Buffer.t] and atomic counter in
+   [Append_only_file.rw_perm] (src/irmin-pack/io/append_only_file.ml).
+
+   N domains call [Ao.append_exn] concurrently on the same [Ao.t].
+   The function does [Buffer.add_string] without synchronisation, then
+   an [Atomic.fetch_and_add] on [buf_length] — the TOCTOU and the
+   Buffer mutation both surface to TSan. *)
+
+module Io = Irmin_pack_unix.Io.Unix
+module Errs = Irmin_pack_io.Io_errors.Make (Io)
+module Ao = Irmin_pack_io.Append_only_file.Make (Io) (Errs)
+
+let run ~env ~iter =
+  Eio.Switch.run @@ fun sw ->
+  let dmgr = Eio.Stdenv.domain_mgr env in
+  let path = Stress_common.scratch_file ~env "stress_ao_buf.data" in
+  let ao =
+    match Ao.create_rw ~sw ~path ~overwrite:true with
+    | Ok ao -> ao
+    | Error _ -> failwith "stress_ao_buf: create_rw failed"
+  in
+  let worker () =
+    for _ = 1 to iter do
+      Ao.append_exn ao "xxxxxxxx"
+    done
+  in
+  Stress_common.domains_spawn ~dmgr ~nb:2 worker;
+  ignore (Ao.flush ao);
+  ignore (Ao.close ao)
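
The header comment compresses the hotspot; as a reduction, the racy append
path has roughly the following shape (a hypothetical simplification, not
the real append_only_file.ml code: the atomic orders only the counter,
while the Buffer mutation itself stays unguarded):

    (* Reduced shape of the racy append: an unguarded Buffer.t plus an
       atomic length counter that does not protect the buffer. *)
    let append_pattern (buf : Buffer.t) (buf_length : int Atomic.t) s =
      Buffer.add_string buf s; (* unguarded shared mutation *)
      ignore (Atomic.fetch_and_add buf_length (String.length s))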
test/irmin-pack/test_tsan_stress/stress_common.ml
Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+(* Barrier-synchronised domain spawning, mirroring the idiom in
+   test/irmin-pack/test_multicore.ml so the workers start roughly
+   together and the scheduler has to interleave them. *)
+
+(* Clean scratch directory for scenarios that need on-disk state.
+   Rooted at [/tmp/irmin-tsan-stress/<name>] to avoid clashing with
+   a possibly missing or read-only [_build] under the process cwd. *)
+let scratch_path ~env name =
+  let fs = Eio.Stdenv.fs env in
+  let base = Eio.Path.(fs / "tmp" / "irmin-tsan-stress") in
+  let p = Eio.Path.(base / name) in
+  (try Eio.Path.rmtree p with _ -> ());
+  (try Eio.Path.mkdirs ~perm:0o755 base with _ -> ());
+  (try Eio.Path.mkdir ~perm:0o755 p with _ -> ());
+  p
+
+(* Like [scratch_path] but returns a path to a file (non-existing)
+   whose parent directory has been freshly created. *)
+let scratch_file ~env name =
+  let fs = Eio.Stdenv.fs env in
+  let base = Eio.Path.(fs / "tmp" / "irmin-tsan-stress") in
+  (try Eio.Path.mkdirs ~perm:0o755 base with _ -> ());
+  let p = Eio.Path.(base / name) in
+  (try Eio.Path.unlink p with _ -> ());
+  p
+
+let domains_run ~dmgr fns =
+  let count = Atomic.make (List.length fns) in
+  let fibers =
+    List.map
+      (fun fn () ->
+        Eio.Domain_manager.run dmgr (fun () ->
+            Atomic.decr count;
+            while Atomic.get count > 0 do
+              Domain.cpu_relax ()
+            done;
+            fn ()))
+      fns
+  in
+  Eio.Fiber.all fibers
+
+let domains_spawn ~dmgr ?(nb = 4) fn =
+  domains_run ~dmgr (List.init nb (fun _ -> fn))
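
A hypothetical scenario skeleton on top of these helpers, to show the
intended shape (the ref counter is illustrative only; the real scenarios
race Irmin state instead):

    (* Four domains barrier-start via domains_spawn, then race an
       unguarded int ref; TSan flags the read-modify-write on [counter]. *)
    let _example ~env =
      let dmgr = Eio.Stdenv.domain_mgr env in
      let counter = ref 0 in
      let worker () =
        for _ = 1 to 100 do
          incr counter (* unsynchronised across domains *)
        done
      in
      Stress_common.domains_spawn ~dmgr ~nb:4 worker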
test/irmin-pack/test_tsan_stress/stress_dict.ml
Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+(* Races the two unguarded [Hashtbl.t]s and [mutable last_refill_offset]
+   in src/irmin-pack/io/dict.ml.
+
+   One writer domain loops [Dict.index] (which appends and mutates both
+   tables via [Hashtbl.add]); readers concurrently loop [Dict.find]. *)
+
+module Io = Irmin_pack_unix.Io.Unix
+module Dict = Irmin_pack_io.Dict.Make (Io)
+
+let run ~env ~iter =
+  Eio.Switch.run @@ fun sw ->
+  let dmgr = Eio.Stdenv.domain_mgr env in
+  let path = Stress_common.scratch_file ~env "stress_dict.data" in
+  let dict =
+    match Dict.create_rw ~sw ~overwrite:true ~path with
+    | Ok d -> d
+    | Error _ -> failwith "stress_dict: create_rw failed"
+  in
+  let writer () =
+    for i = 1 to iter do
+      let _ = Dict.index dict (Printf.sprintf "str-%d" i) in
+      ()
+    done
+  in
+  let reader () =
+    for i = 1 to iter do
+      let _ = Dict.find dict i in
+      ()
+    done
+  in
+  Stress_common.domains_run ~dmgr [ writer; reader ];
+  ignore (Dict.close dict)
test/irmin-pack/test_tsan_stress/stress_fs_pool.ml
Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+(* Races the shared [Eio_pool.t] instances ([mkdir_pool], [openfile_pool])
+   in src/irmin-fs/unix/irmin_fs_unix.ml:75,87.
+
+   A single FS-backed store is shared across N domains; each writes a
+   distinct key. Store.set_exn goes through [with_write_file] which uses
+   [openfile_pool]; creating the root directory uses [mkdir_pool]. The
+   pool's internal state (queue, counter) is shared across domains. *)
+
+module Store = Irmin_fs_unix.KV.Make (Irmin.Contents.String)
+
+let run ~env ~iter =
+  Eio.Switch.run @@ fun _sw ->
+  let clock = Eio.Stdenv.clock env in
+  let dmgr = Eio.Stdenv.domain_mgr env in
+  let root = Stress_common.scratch_path ~env "stress_fs_pool" in
+  let cfg = Irmin_fs_unix.config ~root ~clock in
+  let repo = Store.Repo.v cfg in
+  let main = Store.main repo in
+  let info () = Store.Info.v 0L in
+  let worker id () =
+    for i = 1 to iter do
+      Store.set_exn ~info main [ Printf.sprintf "k-%d-%d" id i ] "v"
+    done
+  in
+  let fns = List.init 2 (fun id -> worker id) in
+  Stress_common.domains_run ~dmgr fns;
+  Store.Repo.close repo
test/irmin-pack/test_tsan_stress/stress_mem_cache.ml
Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+(* Races the global [cache : (string, _) Hashtbl.t] captured by
+   [Irmin_mem.Read_only.v] in src/irmin/mem/irmin_mem.ml:44.
+
+   N domains barrier-start, then each opens and closes a Repo.v
+   repeatedly. On cold cache they all race on Hashtbl.find_opt/add;
+   after that, set_exn on the shared repo races the mutable KMap field
+   of the underlying Read_only instance. *)
+
+module Store = Irmin_mem.KV.Make (Irmin.Contents.String)
+
+let run ~env ~iter =
+  let dmgr = Eio.Stdenv.domain_mgr env in
+  let info () = Store.Info.v 0L in
+  let worker id () =
+    for i = 1 to iter do
+      let repo = Store.Repo.v (Irmin_mem.config ()) in
+      let main = Store.main repo in
+      Store.set_exn ~info main [ Printf.sprintf "k-%d-%d" id i ] "v";
+      Store.Repo.close repo
+    done
+  in
+  let fns = List.init 4 (fun id -> worker id) in
+  Stress_common.domains_run ~dmgr fns
test/irmin-pack/test_tsan_stress/stress_watch.ml
Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+(* Races the global [listen_dir_hook] / [watch_switch] refs in
+   src/irmin/watch.ml:28-29. N domains concurrently overwrite the
+   shared hook with [set_listen_dir_hook]; the assignment to the
+   [ref] is unsynchronised. *)
+
+let dummy_hook _ _ _ _ = ()
+
+let run ~env ~iter =
+  let dmgr = Eio.Stdenv.domain_mgr env in
+  let worker () =
+    for _ = 1 to iter do
+      Irmin.Backend.Watch.set_listen_dir_hook dummy_hook
+    done
+  in
+  Stress_common.domains_spawn ~dmgr ~nb:4 worker
