Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"name": "Irmin TSan development",
"image": "ghcr.io/tarides/ocaml-devcontainer-tsan:latest",
"features": {
"ghcr.io/anthropics/devcontainer-features/claude-code:1": {}
},
"postStartCommand": "bash /home/vscode/.fix-tsan-aslr.sh || true",
"remoteUser": "vscode",
"mounts": [
"source=dune-cache,target=/home/vscode/.cache/dune,type=volume"
],
"remoteEnv": {
"DUNE_CACHE_ROOT": "/home/vscode/.cache/dune",
"EIO_BACKEND": "posix"
},
"hostRequirements": {
"cpus": 4,
"memory": "8gb",
"storage": "32gb"
}
}
105 changes: 105 additions & 0 deletions .github/workflows/tsan.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
name: TSan

# ThreadSanitizer nightly race-hunting workflow.
# Requires bare ubuntu-latest runners (for writable /proc/sys); cannot run
# inside a container: job, Codespaces, or ocaml-ci.

on:
schedule:
- cron: '0 3 * * *'
workflow_dispatch: {}
pull_request:
types: [labeled, synchronize]

concurrency:
group: tsan-${{ github.ref }}
cancel-in-progress: true

jobs:
tsan:
if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'tsan')
runs-on: ubuntu-latest
timeout-minutes: 75
env:
EIO_BACKEND: posix
TSAN_OPTIONS: "halt_on_error=0 history_size=7 second_deadlock_stack=1 exitcode=66 log_path=tsan-report suppressions=${{ github.workspace }}/test/irmin-pack/tsan_suppressions.txt"
IRMIN_STM_ITER: "5000"
IRMIN_STM_PACK_ITER: "2000"
IRMIN_MULTICORE_DOMAINS: "8"
IRMIN_MULTICORE_ITER: "50"
IRMIN_TSAN_STRESS_ITER: "500"

steps:
- uses: actions/checkout@v4

- name: Reduce ASLR entropy for TSan shadow memory
run: sudo sysctl vm.mmap_rnd_bits=28

- name: Install libunwind
run: sudo apt-get update -y && sudo apt-get install -y libunwind-dev

- name: Cache opam root
uses: actions/cache@v4
with:
path: ~/.opam
key: tsan-opam-${{ runner.os }}-5.3.0-${{ hashFiles('*.opam') }}
restore-keys: |
tsan-opam-${{ runner.os }}-5.3.0-

- name: Set up OCaml with TSan
uses: ocaml/setup-ocaml@v3
with:
ocaml-compiler: "ocaml-variants.5.3.0+options,ocaml-option-tsan"
dune-cache: true

- name: Install dune
run: opam install -y dune

- name: Install dependencies
run: opam install -y --deps-only --with-test .

- name: Build
run: opam exec -- dune build @install

- name: Run tests and stress suite under TSan
run: |
set -o pipefail
opam exec -- dune build @runtest @tsan-stress --force --no-buffer 2>&1 | tee tsan-run.log

- name: Summarize findings
if: always()
shell: bash
run: |
shopt -s nullglob
files=(tsan-report.* _build/default/test/irmin-pack/test_tsan_stress/tsan-report.*)
warnings=0
errors=0
if [ ${#files[@]} -gt 0 ] || [ -f tsan-run.log ]; then
warnings=$(grep -ch "WARNING: ThreadSanitizer" "${files[@]}" tsan-run.log 2>/dev/null | awk -F: '{s+=$NF} END {print s+0}')
errors=$(grep -ch "ERROR: ThreadSanitizer" "${files[@]}" tsan-run.log 2>/dev/null | awk -F: '{s+=$NF} END {print s+0}')
fi
total=$((warnings + errors))
{
echo "### TSan findings: $total"
echo ""
echo "- data-race warnings: $warnings"
echo "- signal/SEGV-on-race errors: $errors"
if [ "$total" = "0" ]; then
echo ""
echo "No races detected."
else
echo ""
echo "See artifact \`tsan-reports-${{ github.run_id }}\`."
fi
} >> "$GITHUB_STEP_SUMMARY"

- name: Upload TSan reports
if: always()
uses: actions/upload-artifact@v4
with:
name: tsan-reports-${{ github.run_id }}
path: |
tsan-report.*
_build/default/test/irmin-pack/test_tsan_stress/tsan-report.*
tsan-run.log
if-no-files-found: ignore
46 changes: 30 additions & 16 deletions test/irmin-pack/test_multicore.ml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ let src = Logs.Src.create "tests.multicore" ~doc:"Tests"

module Log = (val Logs.src_log src : Logs.LOG)

let int_env name default =
match Sys.getenv_opt name with Some s -> int_of_string s | None -> default

let default_domains = int_env "IRMIN_MULTICORE_DOMAINS" 2
let test_iter = int_env "IRMIN_MULTICORE_ITER" 1

module Store = struct
module Maker = Irmin_pack_unix.Maker (Conf)
include Maker.Make (Schema)
Expand Down Expand Up @@ -130,7 +136,7 @@ let domains_run ~domain_mgr fns =
in
Eio.Fiber.all fibers

let domains_spawn ~domain_mgr ?(nb = 2) fn =
let domains_spawn ~domain_mgr ?(nb = default_domains) fn =
domains_run ~domain_mgr @@ List.init nb (fun _ -> fn)

let find_all tree paths =
Expand Down Expand Up @@ -499,18 +505,26 @@ let test_commit_v ~fs ~domain_mgr =

let tests ~fs ~domain_mgr =
let tc name fn = Alcotest.test_case name `Quick (fun () -> fn ~domain_mgr) in
[
tc "find." (test_find ~fs);
tc "length." (test_length ~fs);
tc "add / remove." (test_add_remove ~fs);
tc "commit." (test_commit ~fs);
tc "merkle." (test_merkle ~fs);
tc "hash." (test_hash ~fs);
tc "list-disk-no-cache." (test_list_disk ~fs ~cache:false);
tc "list-disk-with-cache." (test_list_disk ~fs ~cache:true);
tc "list-mem-no-cache." (test_list_mem ~fs ~cache:false);
tc "list-mem-with-cache." (test_list_mem ~fs ~cache:true);
tc "commit-of-hash." (test_commit_of_hash ~fs);
tc "commit-parents." (test_commit_parents ~fs);
tc "commit-v." (test_commit_v ~fs);
]
let cases =
[
("find.", test_find ~fs);
("length.", test_length ~fs);
("add / remove.", test_add_remove ~fs);
("commit.", test_commit ~fs);
("merkle.", test_merkle ~fs);
("hash.", test_hash ~fs);
("list-disk-no-cache.", test_list_disk ~fs ~cache:false);
("list-disk-with-cache.", test_list_disk ~fs ~cache:true);
("list-mem-no-cache.", test_list_mem ~fs ~cache:false);
("list-mem-with-cache.", test_list_mem ~fs ~cache:true);
("commit-of-hash.", test_commit_of_hash ~fs);
("commit-parents.", test_commit_parents ~fs);
("commit-v.", test_commit_v ~fs);
]
in
if test_iter <= 1 then List.map (fun (name, fn) -> tc name fn) cases
else
List.concat_map
(fun (name, fn) ->
List.init test_iter (fun i -> tc (Printf.sprintf "%s%d" name i) fn))
cases
6 changes: 5 additions & 1 deletion test/irmin-pack/test_stm/test_stm.ml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ let agree_test_eio ~count ~domain_mgr =
TT.agree_test_par ~domain_mgr ~count ~name:"Irmin test parallel"

let () =
let count = 500 in
let count =
match Sys.getenv_opt "IRMIN_STM_ITER" with
| Some s -> int_of_string s
| None -> 500
in
Eio_main.run @@ fun env ->
let domain_mgr = Eio.Stdenv.domain_mgr env in
QCheck_base_runner.run_tests_main [ agree_test_eio ~count ~domain_mgr ]
6 changes: 5 additions & 1 deletion test/irmin-pack/test_stm/test_stm_irmin_pack.ml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,11 @@ let agree_test_eio ~count ~domain_mgr ~fs ~sw =
TT.agree_test_par ~domain_mgr ~count ~name:"Irmin test parallel"

let () =
let count = 100 in
let count =
match Sys.getenv_opt "IRMIN_STM_PACK_ITER" with
| Some s -> int_of_string s
| None -> 100
in
Eio_main.run @@ fun env ->
Eio.Switch.run @@ fun sw ->
let domain_mgr = Eio.Stdenv.domain_mgr env in
Expand Down
29 changes: 29 additions & 0 deletions test/irmin-pack/test_tsan_stress/dune
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
(executable
(name main)
(modules
main
stress_common
stress_ao_buf
stress_dict
stress_mem_cache
stress_watch
stress_fs_pool)
(libraries
eio_main
irmin
irmin.mem
irmin-pack.io
irmin-pack.unix
irmin-fs.unix))

; Run each scenario in a fresh process so a race that corrupts memory
; and triggers a SEGV in one scenario does not prevent the others from
; running. Each scenario writes TSan reports to tsan-report.<pid>, and
; the outer workflow aggregates all of them.

(rule
(alias tsan-stress)
(deps main.exe)
(action
(bash
"./main.exe mem || true; ./main.exe watch || true; ./main.exe ao || true; ./main.exe dict || true; ./main.exe fs || true")))
68 changes: 68 additions & 0 deletions test/irmin-pack/test_tsan_stress/main.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
(* TSan stress suite — dispatcher.

Each scenario targets a mutable-state hotspot that the standard test
suite does not exercise across domains. Iteration count is driven by
[IRMIN_TSAN_STRESS_ITER] (default 100).

Scenarios and expected outcomes under TSan:

- mem: clean data-race warning at irmin_mem.ml:51 (Hashtbl.add in
the global cache) and on the shared KMap mutable.
- watch: clean data-race warning at watch.ml:33 (listen_dir_hook
ref assignment).
- ao: SEGV — concurrent Buffer.add_string corrupts the buffer
fast enough that TSan's signal handler fires before the
race warning is written. The SEGV itself is evidence of
the race; a clean warning would need finer-grained access.
- dict: SEGV — same pattern as ao, via the two unguarded Hashtbl.t
caches plus the append path through Ao.
- fs: TSan "nested bug, aborting" — the race interacts with Eio
scheduler/pool internals in a way the sanitizer can't
unwind. Surfaces the problem but not a specific site.

Because ao/dict/fs crash, each scenario is run in its own process by
the @tsan-stress dune alias so one crash does not hide the others.

Usage:
main.exe run all scenarios
main.exe all same
main.exe <name> run one scenario (ao|dict|mem|watch|fs) *)

let iter_count =
match Sys.getenv_opt "IRMIN_TSAN_STRESS_ITER" with
| Some s -> int_of_string s
| None -> 100

type env = Eio_unix.Stdenv.base

let scenarios : (string * (env:env -> iter:int -> unit)) list =
[
("ao", Stress_ao_buf.run);
("dict", Stress_dict.run);
("mem", Stress_mem_cache.run);
("watch", Stress_watch.run);
("fs", Stress_fs_pool.run);
]

let run_one ~env (name, fn) =
Printf.printf "tsan-stress: %s (iter=%d)\n%!" name iter_count;
fn ~env ~iter:iter_count

let () =
let which =
match Sys.argv with
| [| _ |] | [| _; "all" |] -> `All
| [| _; name |] -> `One name
| _ ->
prerr_endline "usage: main.exe [all|ao|dict|mem|watch|fs]";
exit 2
in
Eio_main.run @@ fun env ->
match which with
| `All -> List.iter (run_one ~env) scenarios
| `One name -> (
match List.assoc_opt name scenarios with
| Some fn -> run_one ~env (name, fn)
| None ->
Printf.eprintf "tsan-stress: unknown scenario %S\n%!" name;
exit 2)
29 changes: 29 additions & 0 deletions test/irmin-pack/test_tsan_stress/stress_ao_buf.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
(* Races the unguarded [Buffer.t] and atomic counter in
[Append_only_file.rw_perm] (src/irmin-pack/io/append_only_file.ml).

N domains call [Ao.append_exn] concurrently on the same [Ao.t].
The function does [Buffer.add_string] without synchronisation, then
an [Atomic.fetch_and_add] on [buf_length] — the TOCTOU and the
Buffer mutation both surface to TSan. *)

module Io = Irmin_pack_unix.Io.Unix
module Errs = Irmin_pack_io.Io_errors.Make (Io)
module Ao = Irmin_pack_io.Append_only_file.Make (Io) (Errs)

let run ~env ~iter =
Eio.Switch.run @@ fun sw ->
let dmgr = Eio.Stdenv.domain_mgr env in
let path = Stress_common.scratch_file ~env "stress_ao_buf.data" in
let ao =
match Ao.create_rw ~sw ~path ~overwrite:true with
| Ok ao -> ao
| Error _ -> failwith "stress_ao_buf: create_rw failed"
in
let worker () =
for _ = 1 to iter do
Ao.append_exn ao "xxxxxxxx"
done
in
Stress_common.domains_spawn ~dmgr ~nb:2 worker;
ignore (Ao.flush ao);
ignore (Ao.close ao)
43 changes: 43 additions & 0 deletions test/irmin-pack/test_tsan_stress/stress_common.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
(* Barrier-synchronised domain spawning, mirroring the idiom in
test/irmin-pack/test_multicore.ml so the workers start roughly
together and the scheduler has to interleave them. *)

(* Clean scratch directory for scenarios that need on-disk state.
Rooted at [/tmp/irmin-tsan-stress/<name>] to avoid clashing with
a possibly missing or read-only [_build] under the process cwd. *)
let scratch_path ~env name =
let fs = Eio.Stdenv.fs env in
let base = Eio.Path.(fs / "tmp" / "irmin-tsan-stress") in
let p = Eio.Path.(base / name) in
(try Eio.Path.rmtree p with _ -> ());
(try Eio.Path.mkdirs ~perm:0o755 base with _ -> ());
(try Eio.Path.mkdir ~perm:0o755 p with _ -> ());
p

(* Like [scratch_path] but returns a path to a file (non-existing)
whose parent directory has been freshly created. *)
let scratch_file ~env name =
let fs = Eio.Stdenv.fs env in
let base = Eio.Path.(fs / "tmp" / "irmin-tsan-stress") in
(try Eio.Path.mkdirs ~perm:0o755 base with _ -> ());
let p = Eio.Path.(base / name) in
(try Eio.Path.unlink p with _ -> ());
p

let domains_run ~dmgr fns =
let count = Atomic.make (List.length fns) in
let fibers =
List.map
(fun fn () ->
Eio.Domain_manager.run dmgr (fun () ->
Atomic.decr count;
while Atomic.get count > 0 do
Domain.cpu_relax ()
done;
fn ()))
fns
in
Eio.Fiber.all fibers

let domains_spawn ~dmgr ?(nb = 4) fn =
domains_run ~dmgr (List.init nb (fun _ -> fn))
Loading
Loading