feat(turbovec): multi-bit TurboQuant FastScan ANN index (ADR-194 M1) #131

Workflow file for this run

.github/workflows/regression-guard.yml at 05793c2

	name: regression-guard

	# Guards against the six classes of regressions resolved in the
	# fix/critical-issues-may-2026 batch (issues #437, #438, #458, #462,
	# #463, #430). Each job here corresponds to one fix and exists to
	# prevent the regression from being silently re-introduced.

	# Triggers: pushes to main, every pull request, and manual dispatch.
	on:
	  push:
	    branches: [main]
	  pull_request:
	  workflow_dispatch:

	# Every job only needs to read the repository contents.
	permissions:
	  contents: read

	jobs:
	# Issue #437: parking_lot::RwLock is non-reentrant. Two .write() (or .read())
	# in the same expression deadlocks. Forbid the exact textual pattern.
	reentrant-rwlock-double-write:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Forbid reentrant parking_lot lock acquisition in a single statement
	      run: |
	        set -e
	        # parking_lot::RwLock is non-reentrant. Dangerous patterns on the SAME
	        # lock prefix:
	        #   * .write() then .write() — pure deadlock (issue #437)
	        #   * .write() then .read()  — read blocks behind write guard
	        #   * .read() then .write()  — write blocks behind read guard
	        # `.read()` then `.read()` on the same lock is allowed (multi-reader),
	        # and any combination on DIFFERENT locks is safe. We use the same
	        # captured prefix `(\S+)` to flag only same-lock cases.
	        #   Pattern 1: .write() …\1.(write|read)()
	        #   Pattern 2: .read()  …\1.write()
	        # grep -P (PCRE) supports backreferences, but GNU grep rejects more
	        # than one pattern under -P, so each pattern gets its own invocation.
	        found=0
	        for pattern in \
	          '(\S+)\.write[^;]*\1\.(write|read)' \
	          '(\S+)\.read[^;]*\1\.write'; do
	          status=0
	          grep -rnP --include='*.rs' -e "$pattern" -- crates/ || status=$?
	          case "$status" in
	            0) found=1 ;;   # at least one offending line was printed
	            1) ;;           # no match for this pattern
	            *)
	              # Exit status >1 means grep itself failed; do not let a broken
	              # scan masquerade as a clean result.
	              echo "::error::grep failed with exit status $status while scanning crates/ for reentrant lock acquisition."
	              exit "$status"
	              ;;
	          esac
	        done
	        if [ "$found" -eq 1 ]; then
	          echo "::error::Found reentrant parking_lot lock acquisition on a single statement (regression of issue #437). Bind the guard once: 'let mut g = x.write(); g.field = …;'"
	          exit 1
	        fi

	# Issue #458: Windows clones break on case-only collisions because NTFS is
	# case-insensitive. Fail CI if any two paths in the tree differ only by case.
	case-insensitive-collisions:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Detect case-only filename collisions
	      run: |
	        set -e
	        # Lower-case every tracked path; any line that then appears more
	        # than once is a pair of paths differing only by case.
	        dupes=$(git ls-files | tr '[:upper:]' '[:lower:]' | sort | uniq -d || true)
	        if [ -n "$dupes" ]; then
	          echo "::error::Case-only filename collisions found. Windows clones will silently drop one file from each pair:"
	          echo "$dupes"
	          exit 1
	        fi

	# Issue #438: AVX-512 intrinsics must be gated. ruvector-core must build on
	# stable Rust 1.77+ without the simd-avx512 feature.
	ruvector-core-no-avx512-builds-on-stable:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    # The `toolchain` input pins the exact compiler version used below.
	    - uses: dtolnay/rust-toolchain@stable
	      with:
	        toolchain: '1.89.0'
	    - name: cargo check ruvector-core without simd-avx512
	      run: |
	        # Explicit feature list that deliberately omits simd-avx512.
	        cargo check -p ruvector-core \
	          --no-default-features \
	          --features simd,storage,hnsw,api-embeddings,parallel
	    - name: cargo check ruvector-core with simd-avx512 (default)
	      run: cargo check -p ruvector-core

	# Issue #430: HNSW recall@1 must stay above 95% on the regression test that
	# historically exposed the result-heap inversion.
	hnsw-recall-at-1:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - uses: dtolnay/rust-toolchain@stable
	    - uses: Swatinem/rust-cache@v2
	    - name: ruvector-router-core unit tests (release)
	      run: |
	        # cargo test only accepts one TESTNAME filter per invocation —
	        # run each guard test separately.
	        cargo test -p ruvector-router-core --release --lib test_recall_at_1_with_biased_insertion_order
	        cargo test -p ruvector-router-core --release --lib test_k_exceeds_ef_search_default
	        cargo test -p ruvector-router-core --release --lib test_vector_db_basic_operations
	        # Issue #430 (bug C): adjacency-list pruning must keep CLOSEST m
	        # neighbours, not the most recently inserted ones.
	        cargo test -p ruvector-router-core --release --lib test_pruning_keeps_closest_not_newest
	        # Issue #430 (storage): VectorDB::new must rebuild the HNSW from
	        # persisted vectors so search returns results after reopen.
	        cargo test -p ruvector-router-core --release --lib test_index_rebuilt_from_storage_on_open

	# Issue #430 (bug B): the HNSW insert beam must use `ef_construction`, not
	# `ef_construction.min(m * 2)`. The latter silently clamps the beam to 32
	# by default (m=16) and collapses recall at scale. This guard textually
	# forbids the regression.
	hnsw-insert-beam-no-m2-clamp:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Forbid ef_construction.min(m * 2) clamp in HNSW insert beam
	      run: |
	        set -e
	        # Matches `ef_construction.min(self.config.m * 2)` with arbitrary
	        # whitespace around the dot, parentheses and `*`.
	        if grep -nE 'ef_construction\s*\.\s*min\s*\(\s*self\.config\.m\s*\*\s*2\s*\)' \
	          crates/ruvector-router-core/src/index.rs ; then
	          echo "::error::Insert beam clamped to ef_construction.min(m*2) — this silently becomes m*2 (regression of issue #430 bug B). Use self.config.ef_construction directly."
	          exit 1
	        fi

	# Issue #430 (bug C): adjacency-list pruning must be distance-based. The
	# historical FIFO pruner did not call `calculate_distance` anywhere inside
	# the overflow gate, so checking that the helper is invoked in the same
	# function as the `> self.config.m * 2` check is a cheap structural guard
	# that complements the behavioural `test_pruning_keeps_closest_not_newest`
	# test run by the `hnsw-recall-at-1` job.
	hnsw-distance-based-neighbor-pruning:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Require calculate_distance() inside HNSW overflow gate
	      run: |
	        set -e
	        index_rs=crates/ruvector-router-core/src/index.rs
	        # A distance-aware pruner has to call calculate_distance().
	        # NOTE: this is a file-level presence check only; it does not
	        # verify that the call sits inside the `> self.config.m * 2`
	        # overflow branch.
	        if ! grep -qE 'calculate_distance' "$index_rs" ; then
	          echo "::error::index.rs no longer references calculate_distance (regression of issue #430 bug C). Adjacency-list pruning must score candidates by distance."
	          exit 1
	        fi
	        # And the overflow gate itself must still exist.
	        if ! grep -qE '> self\.config\.m \* 2' "$index_rs" ; then
	          echo "::error::HNSW overflow gate '> self.config.m * 2' removed — refusing to ship without the m*2/m prune semantics (#430)."
	          exit 1
	        fi

	# Issue #430 (storage): VectorDB::new must rebuild the in-memory HNSW from
	# persisted storage. The historical bug was that a fresh empty HnswIndex
	# was created on every open, so search returned 0 results after restart.
	vector-db-rebuilds-index-on-open:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Require storage.get_all_ids() rebuild path in VectorDB::new
	      run: |
	        set -e
	        # Textual guard: the source file must still mention
	        # storage.get_all_ids; matching lines are echoed to the job log.
	        if ! grep -nE 'storage\.get_all_ids' crates/ruvector-router-core/src/vector_db.rs ; then
	          echo "::error::VectorDB::new no longer rebuilds the HNSW from storage (regression of issue #430). Reintroduce the storage.get_all_ids() + index.insert_batch() path."
	          exit 1
	        fi

	# Issue #462 / #376: published tarballs must contain dist/. Run `npm pack`
	# (which now triggers our prepack hooks) and assert the entry points exist
	# inside the resulting tarball.
	npm-publish-pipeline:
	  runs-on: ubuntu-22.04
	  strategy:
	    # Let every package report independently instead of cancelling siblings.
	    fail-fast: false
	    matrix:
	      pkg:
	        - npm/packages/pi-brain
	        - npm/packages/ruvector
	        - npm/packages/rvf-wasm
	  steps:
	    - uses: actions/checkout@v4
	    - uses: actions/setup-node@v4
	      with:
	        node-version: '20'
	    - name: copy ${{ matrix.pkg }} to isolated dir + npm install + pack
	      run: |
	        # The package lives inside an npm workspace at npm/package.json
	        # whose other workspace members declare cross-platform native
	        # binaries (router-darwin-arm64, etc.). Installing from the
	        # package dir still walks the workspace and chokes on EBADPLATFORM
	        # for the wrong-host binaries. Copy the package to a workspace-free
	        # temp dir so npm only resolves the package's own declared deps.
	        mkdir -p /tmp/pkgcopy
	        cp -r ${{ matrix.pkg }}/. /tmp/pkgcopy/
	        cd /tmp/pkgcopy
	        # Detach from the parent workspace.
	        rm -f package-lock.json
	        npm install --no-audit --no-fund --legacy-peer-deps --no-workspaces --no-optional
	        mkdir -p /tmp/pack
	        npm pack --pack-destination /tmp/pack
	        # Show the first entries of the tarball in the log for debugging.
	        tar -tzf /tmp/pack/*.tgz | head -30
	    - name: assert dist/ entry points exist in tarball
	      working-directory: ${{ matrix.pkg }}
	      run: |
	        tarball=$(ls /tmp/pack/*.tgz | head -1)
	        listing=$(tar -tzf "$tarball")
	        # Collect main/module/types plus every string leaf of `exports`
	        # from package.json, with any leading './' stripped.
	        for required in $(node -e "
	          const p = require('./package.json');
	          const files = new Set();
	          if (p.main) files.add(p.main);
	          if (p.module) files.add(p.module);
	          if (p.types) files.add(p.types);
	          if (p.exports) {
	            const walk = (n) => {
	              if (typeof n === 'string') files.add(n);
	              else if (n && typeof n === 'object') Object.values(n).forEach(walk);
	            };
	            walk(p.exports);
	          }
	          console.log([...files].map(f => f.replace(/^\\.\\//,'')).join('\\n'));
	        "); do
	          # The tarball prefixes everything with 'package/'. Compare as a
	          # fixed string against the whole line so '.' in a path is not
	          # treated as a regex wildcard.
	          if ! echo "$listing" | grep -qxF "package/${required}"; then
	            echo "::error::Required entry point missing from tarball: $required"
	            echo "Tarball contents:"
	            echo "$listing"
	            exit 1
	          fi
	        done

	# Issues #463 / #422: hooks_route_enhanced specifically must not shell out
	# via execSync('npx ruvector …'). Other handlers in mcp-server.js shell out
	# to subprocess-only commands (security-scan, git-churn, verify) and are
	# tracked separately — this guard locks the #463 regression shut.
	no-npx-execSync-in-route-enhanced:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Forbid execSync('npx ruvector …') inside hooks_route_enhanced case
	      run: |
	        set -e
	        # Extract the hooks_route_enhanced case body (case label → next case)
	        # and grep within it. awk for case-body extraction, then grep.
	        body=$(awk '
	          /case .hooks_route_enhanced.:/ { flag=1 }
	          flag && /case .[a-z_]+.:/ && !/hooks_route_enhanced/ { flag=0 }
	          flag { print }
	        ' npm/packages/ruvector/bin/mcp-server.js)
	        if echo "$body" | grep -E 'execSync\([^)]*npx[[:space:]]+ruvector'; then
	          echo "::error::hooks_route_enhanced MUST NOT shell out via 'npx ruvector' (regression of issue #463/#422). Use intel.route() in-process instead."
	          exit 1
	        fi

	# Issue #256: MCP tool handlers must sanitize user-controlled input before
	# interpolating into a shell command. The specific risky pattern is
	# `${args.X}` (the unsanitized MCP request argument); local variables
	# (filesArg, threshold, etc.) are typically pre-processed by the handler
	# and don't need to match here. To catch #256-class regressions without
	# drowning in false positives, we only flag template literals that include
	# `${args.…}` and don't wrap it in sanitizeShellArg(...).
	shell-injection-in-mcp-server:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Forbid unsanitized ${args.X} in exec/spawn calls
	      run: |
	        set -e
	        # Stage 1 finds exec/spawn calls whose argument list interpolates
	        # ${args.…}; stage 2 drops lines that mention sanitizeShellArg(.
	        # Anything left over is printed and fails the job.
	        if grep -nE '(execSync|execFile|execFileSync|exec|spawnSync|spawn)\([^)]*\$\{args\.' \
	          npm/packages/*/bin/*.js 2>/dev/null | grep -v 'sanitizeShellArg('; then
	          echo "::error::Unsanitized \${args.X} interpolation in an exec/spawn call (regression of issue #256). Wrap with sanitizeShellArg(args.X) or use the array form spawn('cmd', [args])."
	          exit 1
	        fi

	# Issue #267: crates whose names contain "wasm" compile to
	# wasm32-unknown-unknown and can't use std::time::SystemTime / Instant —
	# they panic at runtime.
	no-systemtime-in-wasm-crates:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Reject SystemTime/Instant in wasm32-targeted crates
	      run: |
	        set -e
	        fail=0
	        # Every crate directory whose name contains "wasm". A single glob
	        # is used so a crate is never scanned (and reported) twice.
	        for crate in crates/*wasm*; do
	          [ -d "$crate/src" ] || continue
	          # Whitelist crates with a time_compat shim — they explicitly
	          # provide a wasm-safe alternative.
	          [ -f "$crate/src/time_compat.rs" ] && continue
	          hits=$(grep -rnE '\b(SystemTime::now|Instant::now)\b' "$crate/src" 2>/dev/null || true)
	          [ -z "$hits" ] && continue
	          # A hit counts as gated when one of the 4 lines directly above it
	          # carries a cfg(not(target_arch … wasm32 attribute.
	          ungated=$(echo "$hits" | while IFS=: read -r f line _; do
	            pre=$(awk -v L="$line" 'NR>=L-4 && NR<L' "$f")
	            if ! echo "$pre" | grep -q 'cfg(not(target_arch.*wasm32'; then
	              echo "$f:$line"
	            fi
	          done)
	          if [ -n "$ungated" ]; then
	            echo "::error file=$crate::WASM crate uses SystemTime/Instant without cfg-gate (regression of issue #267):"
	            echo "$ungated"
	            fail=1
	          fi
	        done
	        exit $fail

	# Issue #359: hardcoded devcontainer-only paths break clones outside the
	# devcontainer. Block them in settings + workflow files. .claude/hooks and
	# .claude/intelligence are excluded because they're user-customised helpers
	# configured per-developer (not committed-by-default). Markdown docs and
	# JS example/test files are excluded — they're illustrative.
	no-hardcoded-workspaces-paths:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Forbid hardcoded devcontainer path in checked-in config
	      run: |
	        set -e
	        # Look for the literal pattern but only in load-bearing config files,
	        # not in this workflow file itself or in docs/examples/tests.
	        # The pattern is assembled with printf so the literal path never
	        # appears verbatim in this file.
	        pattern=$(printf '/workspaces/%s' 'ruvector')
	        hits=$(grep -rln "$pattern" \
	          .github/workflows/ .claude/settings.json .claude/settings.local.json \
	          scripts/publish/ \
	          --exclude='regression-guard.yml' \
	          2>/dev/null || true)
	        if [ -n "$hits" ]; then
	          echo "::error::Hardcoded devcontainer path in checked-in config (regression of issue #359). Use \$GITHUB_WORKSPACE, \$PWD, or a relative path."
	          echo "$hits"
	          exit 1
	        fi

	# Issue #464: the per-collection hydration counters added in 97c07520d are
	# the only way to diagnose silent record loss during Firestore hydration.
	# If a future refactor removes the log lines, we lose the diagnostic when
	# we need it most. Assert all four "Hydrate <collection>:" log lines stay.
	brain-hydration-counters-present:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - name: Assert hydration counter log lines exist
	      run: |
	        set -e
	        f=crates/mcp-brain-server/src/store.rs
	        # Collect every collection whose "Hydrate <collection>: considered="
	        # log line is absent, then report them all at once.
	        missing=()
	        for collection in brain_memories brain_contributors brain_page_status brain_nodes; do
	          if ! grep -q "Hydrate ${collection}: considered=" "$f"; then
	            missing+=("Hydrate ${collection}: considered=…")
	          fi
	        done
	        if [ "${#missing[@]}" -gt 0 ]; then
	          echo "::error file=$f::Per-collection hydration counter log lines are missing (regression of issue #464). The next deploy can't diagnose silent record loss without them:"
	          printf ' %s\n' "${missing[@]}"
	          exit 1
	        fi

	# Issue #411: npm wrapper packages declared optionalDependencies pinned to
	# versions of native binaries that were never published on the registry.
	# Resolve every optionalDependency declared by every package in this repo
	# against the live npm registry and fail if any are missing. Soft-skip on
	# network errors so transient registry hiccups don't false-fail.
	optional-deps-resolvable-on-npm:
	  runs-on: ubuntu-22.04
	  steps:
	    - uses: actions/checkout@v4
	    - uses: actions/setup-node@v4
	      with:
	        node-version: '20'
	    - name: Resolve every optionalDependency@version on npm
	      run: |
	        set -e
	        fail=0
	        # Collect (pkg, name, version) tuples from every package.json that
	        # ships an optionalDependencies block.
	        while IFS= read -r pkgjson; do
	          # One "name version" pair per line.
	          entries=$(node -e "
	            const p = require('${PWD}/$pkgjson');
	            const od = p.optionalDependencies || {};
	            for (const [n, v] of Object.entries(od)) {
	              console.log(n + ' ' + v);
	            }
	          ")
	          [ -z "$entries" ] && continue
	          while IFS= read -r line; do
	            [ -z "$line" ] && continue
	            name=$(echo "$line" | awk '{print $1}')
	            # Keep range operators (^, ~) intact — `npm view <pkg>@^2.3.0`
	            # resolves to the highest published 2.x.y. Stripping them turns
	            # a range into an exact pin and false-fails on common patterns.
	            ver=$(echo "$line" | awk '{print $2}' | tr -d '" ')
	            # Skip workspace: protocol and other non-semver specs.
	            case "$ver" in workspace:*|file:*|*://*) continue ;; esac
	            out=$(npm view "${name}@${ver}" version 2>&1) || true
	            if echo "$out" | grep -qE '^npm (error|ERR!)' || [ -z "$out" ]; then
	              # Distinguish "not in registry" from transient network error.
	              if echo "$out" | grep -qE 'E404|is not in this registry'; then
	                echo "::error file=$pkgjson::optionalDependency ${name}@${ver} is not published on npm (regression of issue #411)."
	                fail=1
	              else
	                echo "::warning file=$pkgjson::Could not resolve ${name}@${ver} (transient?): $out"
	              fi
	            fi
	          done <<< "$entries"
	        done < <(find npm/packages -name package.json -not -path '*/node_modules/*')
	        exit $fail

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat(turbovec): multi-bit TurboQuant FastScan ANN index (ADR-194 M1) #131

Workflow file

feat(turbovec): multi-bit TurboQuant FastScan ANN index (ADR-194 M1) #131

Uh oh!

Workflow file for this run