Skip to content

Commit efb40c0

Browse files
cuihtlauac and claude committed
Add TSan nightly CI workflow
New .github/workflows/tsan.yml runs the test suite under ThreadSanitizer on a nightly cron (plus manual and opt-in `tsan` label on PRs). Runs the default suite alongside a new @tsan-stress alias, with halt_on_error=0 so a single run surfaces every race TSan observes. Reports are uploaded as a workflow artifact. The existing multicore and QCheck-STM tests become scalable via env vars: IRMIN_STM_ITER, IRMIN_STM_PACK_ITER, IRMIN_MULTICORE_DOMAINS, IRMIN_MULTICORE_ITER. Defaults match prior behaviour, so normal `dune runtest` is unchanged. The @tsan-stress alias (test/irmin-pack/test_tsan_stress/) ships as an empty dispatcher; per-hotspot scenarios (dict refill, irmin_mem cache, watch globals, fs pool, append_only_file buffer) land in a follow-up PR. This adds detection only; no src/ changes. Expect the first nightly run to surface several known races from #2397 — that output is the baseline for follow-up fixes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 15f3f2c commit efb40c0

7 files changed

Lines changed: 192 additions & 18 deletions

File tree

.github/workflows/tsan.yml

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
name: TSan

# ThreadSanitizer nightly race-hunting workflow.
# Requires bare ubuntu-latest runners (for writable /proc/sys); cannot run
# inside a container: job, Codespaces, or ocaml-ci.

on:
  schedule:
    - cron: '0 3 * * *'
  workflow_dispatch: {}
  pull_request:
    types: [labeled, synchronize]

concurrency:
  group: tsan-${{ github.ref }}
  cancel-in-progress: true

jobs:
  tsan:
    # Scheduled and manual runs always execute; pull requests opt in through
    # the `tsan` label.
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'tsan')
    runs-on: ubuntu-latest
    timeout-minutes: 75
    env:
      EIO_BACKEND: posix
      # log_path is absolute on purpose: TSan resolves a relative log_path
      # against the working directory of each instrumented process, and test
      # binaries launched by dune do not run from the workspace root. A
      # relative path would scatter the reports under _build, out of reach of
      # the summary and upload steps below.
      TSAN_OPTIONS: "halt_on_error=0 history_size=7 second_deadlock_stack=1 exitcode=66 log_path=${{ github.workspace }}/tsan-report suppressions=${{ github.workspace }}/test/irmin-pack/tsan_suppressions.txt"
      IRMIN_STM_ITER: "5000"
      IRMIN_STM_PACK_ITER: "2000"
      IRMIN_MULTICORE_DOMAINS: "8"
      IRMIN_MULTICORE_ITER: "50"
      IRMIN_TSAN_STRESS_ITER: "500"

    steps:
      - uses: actions/checkout@v4

      - name: Reduce ASLR entropy for TSan shadow memory
        run: sudo sysctl vm.mmap_rnd_bits=28

      - name: Install libunwind
        run: sudo apt-get update -y && sudo apt-get install -y libunwind-dev

      - name: Cache opam root
        uses: actions/cache@v4
        with:
          path: ~/.opam
          key: tsan-opam-${{ runner.os }}-5.3.0-${{ hashFiles('*.opam') }}
          restore-keys: |
            tsan-opam-${{ runner.os }}-5.3.0-

      - name: Set up OCaml with TSan
        uses: ocaml/setup-ocaml@v3
        with:
          ocaml-compiler: "ocaml-variants.5.3.0+options,ocaml-option-tsan"
          dune-cache: true

      - name: Install dune
        run: opam install -y dune

      - name: Install dependencies
        run: opam install -y --deps-only --with-test .

      - name: Build
        run: opam exec -- dune build @install

      - name: Run tests and stress suite under TSan
        run: |
          # pipefail keeps dune's exit status from being masked by tee.
          set -o pipefail
          opam exec -- dune build @runtest @tsan-stress --force --no-buffer 2>&1 | tee tsan-run.log

      - name: Summarize findings
        if: always()
        shell: bash
        run: |
          shopt -s nullglob
          files=(tsan-report.*)
          n=0
          if [ ${#files[@]} -gt 0 ] || [ -f tsan-run.log ]; then
            # An explicit `shell: bash` runs with `-eo pipefail`, and grep
            # exits non-zero when nothing matches (or a file is missing).
            # Without `|| true` this step would abort precisely when the run
            # is clean, before the summary is written.
            n=$({ grep -ch "WARNING: ThreadSanitizer" "${files[@]}" tsan-run.log 2>/dev/null || true; } | awk '{s+=$1} END {print s+0}')
          fi
          {
            echo "### TSan findings: $n"
            if [ "$n" = "0" ]; then
              echo "No races detected."
            else
              echo "See artifact \`tsan-reports-${{ github.run_id }}\`."
            fi
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload TSan reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: tsan-reports-${{ github.run_id }}
          path: |
            tsan-report.*
            tsan-run.log
          if-no-files-found: ignore

test/irmin-pack/test_multicore.ml

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,14 @@ let src = Logs.Src.create "tests.multicore" ~doc:"Tests"
2222

2323
module Log = (val Logs.src_log src : Logs.LOG)
2424

25+
(** [int_env name default] reads the environment variable [name] as an
    integer and returns [default] when the variable is unset.

    Whitespace around the value is ignored.

    @raise Failure
      if the variable is set but does not hold a valid integer. The message
      names the variable and quotes the offending value, so a misconfigured
      run is diagnosable from the log alone. *)
let int_env name default =
  match Sys.getenv_opt name with
  | None -> default
  | Some raw -> (
      match int_of_string_opt (String.trim raw) with
      | Some n -> n
      | None ->
          failwith (Printf.sprintf "%s: expected an integer, got %S" name raw))
29+
30+
(* Default value of the optional [nb] argument of [domains_spawn]. Taken from
   the IRMIN_MULTICORE_DOMAINS environment variable, falling back to 2 when
   it is unset. Evaluated once, when this module is initialised. *)
let default_domains = int_env "IRMIN_MULTICORE_DOMAINS" 2
31+
(* How many times [tests] registers each test case. Taken from the
   IRMIN_MULTICORE_ITER environment variable, falling back to 1 when it is
   unset. Evaluated once, when this module is initialised. *)
let test_iter = int_env "IRMIN_MULTICORE_ITER" 1
32+
2533
module Store = struct
2634
module Maker = Irmin_pack_unix.Maker (Conf)
2735
include Maker.Make (Schema)
@@ -130,7 +138,7 @@ let domains_run ~domain_mgr fns =
130138
in
131139
Eio.Fiber.all fibers
132140

133-
let domains_spawn ~domain_mgr ?(nb = 2) fn =
141+
(** [domains_spawn ~domain_mgr ?nb fn] builds a list holding [nb] copies of
    [fn] and hands it to [domains_run]. [nb] defaults to [default_domains]. *)
let domains_spawn ~domain_mgr ?(nb = default_domains) fn =
  let workers = List.init nb (Fun.const fn) in
  domains_run ~domain_mgr workers
135143

136144
let find_all tree paths =
@@ -499,18 +507,26 @@ let test_commit_v ~fs ~domain_mgr =
499507

500508
let tests ~fs ~domain_mgr =
501509
let tc name fn = Alcotest.test_case name `Quick (fun () -> fn ~domain_mgr) in
502-
[
503-
tc "find." (test_find ~fs);
504-
tc "length." (test_length ~fs);
505-
tc "add / remove." (test_add_remove ~fs);
506-
tc "commit." (test_commit ~fs);
507-
tc "merkle." (test_merkle ~fs);
508-
tc "hash." (test_hash ~fs);
509-
tc "list-disk-no-cache." (test_list_disk ~fs ~cache:false);
510-
tc "list-disk-with-cache." (test_list_disk ~fs ~cache:true);
511-
tc "list-mem-no-cache." (test_list_mem ~fs ~cache:false);
512-
tc "list-mem-with-cache." (test_list_mem ~fs ~cache:true);
513-
tc "commit-of-hash." (test_commit_of_hash ~fs);
514-
tc "commit-parents." (test_commit_parents ~fs);
515-
tc "commit-v." (test_commit_v ~fs);
516-
]
510+
let cases =
511+
[
512+
("find.", test_find ~fs);
513+
("length.", test_length ~fs);
514+
("add / remove.", test_add_remove ~fs);
515+
("commit.", test_commit ~fs);
516+
("merkle.", test_merkle ~fs);
517+
("hash.", test_hash ~fs);
518+
("list-disk-no-cache.", test_list_disk ~fs ~cache:false);
519+
("list-disk-with-cache.", test_list_disk ~fs ~cache:true);
520+
("list-mem-no-cache.", test_list_mem ~fs ~cache:false);
521+
("list-mem-with-cache.", test_list_mem ~fs ~cache:true);
522+
("commit-of-hash.", test_commit_of_hash ~fs);
523+
("commit-parents.", test_commit_parents ~fs);
524+
("commit-v.", test_commit_v ~fs);
525+
]
526+
in
527+
if test_iter <= 1 then List.map (fun (name, fn) -> tc name fn) cases
528+
else
529+
List.concat_map
530+
(fun (name, fn) ->
531+
List.init test_iter (fun i -> tc (Printf.sprintf "%s%d" name i) fn))
532+
cases

test/irmin-pack/test_stm/test_stm.ml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,11 @@ let agree_test_eio ~count ~domain_mgr =
7070
TT.agree_test_par ~domain_mgr ~count ~name:"Irmin test parallel"
7171

7272
let () =
73-
let count = 500 in
73+
let count =
74+
match Sys.getenv_opt "IRMIN_STM_ITER" with
75+
| Some s -> int_of_string s
76+
| None -> 500
77+
in
7478
Eio_main.run @@ fun env ->
7579
let domain_mgr = Eio.Stdenv.domain_mgr env in
7680
QCheck_base_runner.run_tests_main [ agree_test_eio ~count ~domain_mgr ]

test/irmin-pack/test_stm/test_stm_irmin_pack.ml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,11 @@ let agree_test_eio ~count ~domain_mgr ~fs ~sw =
117117
TT.agree_test_par ~domain_mgr ~count ~name:"Irmin test parallel"
118118

119119
let () =
120-
let count = 100 in
120+
let count =
121+
match Sys.getenv_opt "IRMIN_STM_PACK_ITER" with
122+
| Some s -> int_of_string s
123+
| None -> 100
124+
in
121125
Eio_main.run @@ fun env ->
122126
Eio.Switch.run @@ fun sw ->
123127
let domain_mgr = Eio.Stdenv.domain_mgr env in
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
; Stress-suite dispatcher: builds main.ml into main.exe, linked against
; eio_main.
(executable
 (name main)
 (modules main)
 (libraries eio_main))

; Building the tsan-stress alias runs the dispatcher with no arguments.
; The rule is attached only to tsan-stress, so it must be requested
; explicitly (e.g. `dune build @tsan-stress`).
(rule
 (alias tsan-stress)
 (action
  (run ./main.exe)))
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
(* TSan stress suite.
2+
3+
Dispatcher for race-hunting scenarios. Each scenario targets a mutable
4+
state hotspot that the standard test suite does not exercise across
5+
domains. Scenarios are added incrementally.
6+
7+
Iteration count: [IRMIN_TSAN_STRESS_ITER] env var (default 100). *)
8+
9+
let iter_count =
10+
match Sys.getenv_opt "IRMIN_TSAN_STRESS_ITER" with
11+
| Some s -> int_of_string s
12+
| None -> 100
13+
14+
(* Registry of stress scenarios: each entry pairs the name accepted on the
   command line with a function taking the iteration count as [~iter].
   Currently empty, so running the dispatcher with no argument (or "all")
   does nothing. *)
let scenarios : (string * (iter:int -> unit)) list = []
15+
16+
(* Runs every entry of [scenarios] in list order. Before each one, prints a
   "tsan-stress: <name> (iter=<n>)" line to stdout and flushes it, then calls
   the scenario with [iter_count]. *)
let run_all () =
  let run_one (label, scenario) =
    Printf.printf "tsan-stress: %s (iter=%d)\n%!" label iter_count;
    scenario ~iter:iter_count
  in
  List.iter run_one scenarios
22+
23+
(* Entry point. The first command-line argument selects what to run:
   - absent or "all": every scenario, through [run_all];
   - a scenario name: that scenario alone, with [iter_count] iterations
     (no banner line is printed on this path);
   - anything else: an error on stderr and exit code 2.
   Arguments after the first are ignored. *)
let () =
  let selected = if Array.length Sys.argv < 2 then "all" else Sys.argv.(1) in
  if String.equal selected "all" then run_all ()
  else
    match List.assoc_opt selected scenarios with
    | Some scenario -> scenario ~iter:iter_count
    | None ->
        Printf.eprintf "tsan-stress: unknown scenario %S\n%!" selected;
        exit 2
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# ThreadSanitizer suppressions for Irmin nightly CI.
#
# Rule: suppress runtime/FFI noise only. Every finding in an Irmin module is
# a real race — do NOT add entries for src/ here.

# OCaml runtime internals (minor heap / major heap bookkeeping).
# NOTE(review): `race:` patterns are matched against the frames of the
# reported stacks, and caml_modify is presumably on the stack whenever OCaml
# code stores a boxed value into a mutable field. This entry may therefore
# also hide races that originate in Irmin code, which would contradict the
# rule above — confirm against a known race before relying on it.
race:caml_modify
race:caml_alloc_shr

# index uses mmap + its own atomics that TSan doesn't instrument.
# NOTE(review): called_from_lib matches the name of a loaded library; if
# index is linked statically into the test executables this entry presumably
# matches nothing — TODO confirm.
called_from_lib:index

0 commit comments

Comments
 (0)