feat(turbovec): multi-bit TurboQuant FastScan ANN index (ADR-194 M1) #131
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: regression-guard | |
| # Guards against regressions of previously fixed issues. It started with the | |
| # fix/critical-issues-may-2026 batch (issues #437, #438, #458, #462, #463, | |
| # #430) and also carries guards for #256, #267, #359, #411 and #464. | |
| # The comment directly above each job names the issue it protects; some | |
| # issues (e.g. #430) are covered by more than one job. | |
| # Runs on pushes to main, on every pull request, and on manual dispatch. | |
| on: | |
| push: | |
| branches: [main] | |
| pull_request: | |
| workflow_dispatch: | |
| # The only permission granted to the workflow token is read access to contents. | |
| permissions: | |
| contents: read | |
| jobs: | |
| # Issue #437: parking_lot::RwLock is non-reentrant. Acquiring the same lock | |
| # twice in one expression (.write()/.write(), .write()/.read() or | |
| # .read()/.write()) deadlocks. Forbid the exact textual pattern. | |
| reentrant-rwlock-double-write: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Forbid reentrant parking_lot lock acquisition in a single statement | |
| run: | | |
| set -e | |
| # parking_lot::RwLock is non-reentrant. Dangerous patterns on the SAME | |
| # lock prefix: | |
| # * .write() then .write() — pure deadlock (issue #437) | |
| # * .write() then .read() — read blocks behind write guard | |
| # * .read() then .write() — write blocks behind read guard | |
| # `.read()` then `.read()` on the same lock is allowed (multi-reader), | |
| # and any combination on DIFFERENT locks is safe. We use the same | |
| # captured prefix `(\S+)` to flag only same-lock cases. | |
| # Pattern 1: .write() …\1.(write|read)() | |
| # Pattern 2: .read() …\1.write() | |
| # grep -P (PCRE) supports backreferences, but GNU grep rejects more than | |
| # one pattern under -P ("the -P option only supports a single pattern") | |
| # and exits 2. Inside an `if` that looks like "no match", so passing both | |
| # patterns to one grep would let the guard pass unconditionally. Run one | |
| # grep per pattern and treat any status other than 0 (match) or | |
| # 1 (no match) as a hard failure. | |
| found=0 | |
| for pattern in \ | |
| '(\S+)\.write\(\)[^;]*\1\.(write|read)\(\)' \ | |
| '(\S+)\.read\(\)[^;]*\1\.write\(\)' | |
| do | |
| status=0 | |
| grep -rnP --include='*.rs' -e "$pattern" -- crates/ || status=$? | |
| case "$status" in | |
| 0) found=1 ;; | |
| 1) ;; | |
| *) | |
| echo "::error::grep exited with status $status while scanning crates/ — the reentrant-lock guard could not run." | |
| exit "$status" | |
| ;; | |
| esac | |
| done | |
| if [ "$found" -ne 0 ]; then | |
| echo "::error::Found reentrant parking_lot lock acquisition on a single statement (regression of issue #437). Bind the guard once: 'let mut g = x.write(); g.field = …;'" | |
| exit 1 | |
| fi | |
| # Issue #458: NTFS is case-insensitive, so a Windows clone cannot hold two | |
| # paths that differ only by letter case. Fail CI if the tree contains any. | |
| case-insensitive-collisions: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Detect case-only filename collisions | |
| run: | | |
| set -e | |
| # Lower-case every path git lists; any path that then appears more than | |
| # once collides with another one on a case-insensitive filesystem. | |
| collisions=$(git ls-files | tr '[:upper:]' '[:lower:]' | sort | uniq -d || true) | |
| # Nothing duplicated: the step succeeds. | |
| [ -z "$collisions" ] && exit 0 | |
| echo "::error::Case-only filename collisions found. Windows clones will silently drop one file from each pair:" | |
| echo "$collisions" | |
| exit 1 | |
| # Issue #438: AVX-512 intrinsics must be gated. ruvector-core must build on | |
| # stable Rust 1.77+ without the simd-avx512 feature. | |
| ruvector-core-no-avx512-builds-on-stable: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| # NOTE(review): the job comment says "1.77+", but the toolchain input below | |
| # pins 1.89.0, so this job does not exercise 1.77 itself — confirm which | |
| # minimum version the guard is meant to prove. | |
| - uses: dtolnay/rust-toolchain@stable | |
| with: | |
| toolchain: '1.89.0' | |
| # First check: default features off, then an explicit feature list that | |
| # leaves out simd-avx512. | |
| - name: cargo check ruvector-core without simd-avx512 | |
| run: | | |
| cargo check -p ruvector-core \ | |
| --no-default-features \ | |
| --features simd,storage,hnsw,api-embeddings,parallel | |
| # Second check: the crate's default feature set (per the step name, this is | |
| # the configuration that includes simd-avx512). | |
| - name: cargo check ruvector-core with simd-avx512 (default) | |
| run: cargo check -p ruvector-core | |
| # Issue #430: HNSW recall@1 must stay above 95% on the regression test that | |
| # historically exposed the result-heap inversion. | |
| hnsw-recall-at-1: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: dtolnay/rust-toolchain@stable | |
| - uses: Swatinem/rust-cache@v2 | |
| - name: ruvector-router-core unit tests (release) | |
| run: | | |
| # cargo test only accepts one TESTNAME filter per invocation, so every | |
| # guard test gets its own run, in this order: | |
| # * test_recall_at_1_with_biased_insertion_order, | |
| # test_k_exceeds_ef_search_default, test_vector_db_basic_operations | |
| # * test_pruning_keeps_closest_not_newest — issue #430 (bug C): | |
| # adjacency-list pruning must keep the CLOSEST m neighbours, not the | |
| # most recently inserted ones. | |
| # * test_index_rebuilt_from_storage_on_open — issue #430 (storage): | |
| # VectorDB::new must rebuild the HNSW from persisted vectors so search | |
| # returns results after reopen. | |
| for guard_test in \ | |
| test_recall_at_1_with_biased_insertion_order \ | |
| test_k_exceeds_ef_search_default \ | |
| test_vector_db_basic_operations \ | |
| test_pruning_keeps_closest_not_newest \ | |
| test_index_rebuilt_from_storage_on_open | |
| do | |
| cargo test -p ruvector-router-core --release --lib "$guard_test" | |
| done | |
| # Issue #430 (bug B): the HNSW insert beam has to be `ef_construction` | |
| # itself, never `ef_construction.min(m * 2)`. With the default m=16 the | |
| # clamp silently caps the beam at 32 and recall collapses at scale. This job | |
| # forbids the clamp textually. | |
| hnsw-insert-beam-no-m2-clamp: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Forbid ef_construction.min(m * 2) clamp in HNSW insert beam | |
| run: | | |
| set -e | |
| # Whitespace-tolerant match for `ef_construction.min(self.config.m * 2)`. | |
| clamp_re='ef_construction\s*\.\s*min\s*\(\s*self\.config\.m\s*\*\s*2\s*\)' | |
| index_rs=crates/ruvector-router-core/src/index.rs | |
| # A match is the regression: grep prints the offending line(s), then fail. | |
| if grep -nE "$clamp_re" "$index_rs" ; then | |
| echo "::error::Insert beam clamped to ef_construction.min(m*2) — this silently becomes m*2 (regression of issue #430 bug B). Use self.config.ef_construction directly." | |
| exit 1 | |
| fi | |
| # Issue #430 (bug C): adjacency-list pruning must be distance-based. The | |
| # historical FIFO pruner did not call `calculate_distance` anywhere inside | |
| # the overflow gate. This job is a cheap structural guard: it requires that | |
| # index.rs still mentions both `calculate_distance` and the | |
| # `> self.config.m * 2` check. It complements the behavioural | |
| # `test_pruning_keeps_closest_not_newest` test run by the hnsw-recall-at-1 | |
| # job. | |
| hnsw-distance-based-neighbor-pruning: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Require calculate_distance() inside HNSW overflow gate | |
| run: | | |
| set -e | |
| # Intent: the `insert` function in index.rs should reach | |
| # calculate_distance() after the `> self.config.m * 2` overflow check | |
| # fires, which is what makes the pruner distance-aware rather than FIFO. | |
| # What is actually verified is weaker: each grep only requires its text | |
| # to appear somewhere in index.rs; neither ordering nor the enclosing | |
| # function is checked. | |
| if ! grep -nE 'calculate_distance' crates/ruvector-router-core/src/index.rs >/dev/null ; then | |
| echo "::error::index.rs no longer references calculate_distance (regression of issue #430 bug C). Adjacency-list pruning must score candidates by distance." | |
| exit 1 | |
| fi | |
| # And the overflow gate itself must still exist. The pattern is | |
| # spacing-sensitive: it matches exactly `> self.config.m * 2`. | |
| if ! grep -nE '> self\.config\.m \* 2' crates/ruvector-router-core/src/index.rs >/dev/null ; then | |
| echo "::error::HNSW overflow gate '> self.config.m * 2' removed — refusing to ship without the m*2/m prune semantics (#430)." | |
| exit 1 | |
| fi | |
| # Issue #430 (storage): VectorDB::new has to rebuild the in-memory HNSW from | |
| # persisted storage. Historically every open created a fresh, empty | |
| # HnswIndex, so search returned 0 results after a restart. | |
| vector-db-rebuilds-index-on-open: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Require storage.get_all_ids() rebuild path in VectorDB::new | |
| run: | | |
| set -e | |
| db_rs=crates/ruvector-router-core/src/vector_db.rs | |
| # Textual presence check: the rebuild path still calls | |
| # storage.get_all_ids. A match is printed and the step succeeds. | |
| grep -nE 'storage\.get_all_ids' "$db_rs" && exit 0 | |
| echo "::error::VectorDB::new no longer rebuilds the HNSW from storage (regression of issue #430). Reintroduce the storage.get_all_ids() + index.insert_batch() path." | |
| exit 1 | |
| # Issue #462 / #376: published tarballs must contain dist/. Run `npm pack` | |
| # (which now triggers our prepack hooks) and assert the entry points exist | |
| # inside the resulting tarball. | |
| npm-publish-pipeline: | |
| runs-on: ubuntu-22.04 | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| pkg: | |
| - npm/packages/pi-brain | |
| - npm/packages/ruvector | |
| - npm/packages/rvf-wasm | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: actions/setup-node@v4 | |
| with: | |
| node-version: '20' | |
| - name: copy ${{ matrix.pkg }} to isolated dir + npm install + pack | |
| run: | | |
| # The package lives inside an npm workspace at npm/package.json | |
| # whose other workspace members declare cross-platform native | |
| # binaries (router-darwin-arm64, etc.). Installing from the | |
| # package dir still walks the workspace and chokes on EBADPLATFORM | |
| # for the wrong-host binaries. Copy the package to a workspace-free | |
| # temp dir so npm only resolves the package's own declared deps. | |
| mkdir -p /tmp/pkgcopy | |
| cp -r ${{ matrix.pkg }}/. /tmp/pkgcopy/ | |
| cd /tmp/pkgcopy | |
| # Detach from the parent workspace. | |
| rm -f package-lock.json | |
| npm install --no-audit --no-fund --legacy-peer-deps --no-workspaces --no-optional | |
| mkdir -p /tmp/pack | |
| npm pack --pack-destination /tmp/pack | |
| # Show the first entries of the tarball in the log for debugging. | |
| tar -tzf /tmp/pack/*.tgz | head -30 | |
| - name: assert dist/ entry points exist in tarball | |
| working-directory: ${{ matrix.pkg }} | |
| run: | | |
| tarball=$(ls /tmp/pack/*.tgz | head -1) | |
| listing=$(tar -tzf "$tarball") | |
| # The inline Node script prints one path per line: main, module, types | |
| # and every string reachable inside `exports`, with a leading './' | |
| # stripped. | |
| for required in $(node -e " | |
| const p = require('./package.json'); | |
| const files = new Set(); | |
| if (p.main) files.add(p.main); | |
| if (p.module) files.add(p.module); | |
| if (p.types) files.add(p.types); | |
| if (p.exports) { | |
| const walk = (n) => { | |
| if (typeof n === 'string') files.add(n); | |
| else if (n && typeof n === 'object') Object.values(n).forEach(walk); | |
| }; | |
| walk(p.exports); | |
| } | |
| console.log([...files].map(f => f.replace(/^\\.\\//,'')).join('\\n')); | |
| "); do | |
| # The tarball prefixes everything with 'package/'. Compare as a fixed | |
| # string against the whole line (-x -F): the path is a file name, not a | |
| # regex, and treating it as one lets '.' match any character so a | |
| # missing entry point could be satisfied by a different file. | |
| if ! echo "$listing" | grep -qxF "package/${required}"; then | |
| echo "::error::Required entry point missing from tarball: $required" | |
| echo "Tarball contents:" | |
| echo "$listing" | |
| exit 1 | |
| fi | |
| done | |
| # Issues #463 / #422: hooks_route_enhanced specifically must not shell out | |
| # via execSync('npx ruvector …'). Other handlers in mcp-server.js shell out | |
| # to subprocess-only commands (security-scan, git-churn, verify) and are | |
| # tracked separately — this guard locks the #463 regression shut. | |
| no-npx-execSync-in-route-enhanced: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Forbid execSync('npx ruvector …') inside hooks_route_enhanced case | |
| run: | | |
| set -e | |
| # Extract the hooks_route_enhanced case body (case label → next case) | |
| # and grep within it. awk for case-body extraction, then grep. | |
| # awk rules, applied in order to every line: | |
| # 1. the hooks_route_enhanced case label switches printing on (the '.' | |
| # on either side of the name matches whatever quote character is used); | |
| # 2. any other `case <label>:` line switches printing off — only labels | |
| # made of lowercase letters and underscores are recognised here; | |
| # 3. the line is printed while printing is on. | |
| body=$(awk ' | |
| /case .hooks_route_enhanced.:/ { flag=1 } | |
| flag && /case .[a-z_]+.:/ && !/hooks_route_enhanced/ { flag=0 } | |
| flag { print } | |
| ' npm/packages/ruvector/bin/mcp-server.js) | |
| # The match is line-based: `execSync(` followed, before any ')', by | |
| # `npx ruvector` on the same line. Matching lines are printed to the log. | |
| if echo "$body" | grep -E 'execSync\([^)]*npx[[:space:]]+ruvector'; then | |
| echo "::error::hooks_route_enhanced MUST NOT shell out via 'npx ruvector' (regression of issue #463/#422). Use intel.route() in-process instead." | |
| exit 1 | |
| fi | |
| # Issue #256: MCP tool handlers must sanitize user-controlled input before | |
| # interpolating into a shell command. The specific risky pattern is | |
| # `${args.X}` (the unsanitized MCP request argument); local variables | |
| # (filesArg, threshold, etc.) are typically pre-processed by the handler | |
| # and don't need to match here. To catch #256-class regressions without | |
| # drowning in false positives, we only flag template literals that include | |
| # `${args.…}` and don't wrap it in sanitizeShellArg(...). | |
| shell-injection-in-mcp-server: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Forbid unsanitized ${args.X} in exec*/spawn* calls | |
| run: | | |
| set -e | |
| # Two-stage, line-based filter over npm/packages/*/bin/*.js: | |
| # 1. keep lines where an exec*/spawn* call is followed, before any ')', | |
| # by `${args.`; | |
| # 2. drop every such line that mentions `sanitizeShellArg(` anywhere. | |
| # Whatever survives is printed and fails the job. Known limits of this | |
| # approach: a call split across several lines is not seen by stage 1, and | |
| # a line mixing one sanitized and one raw `${args.…}` is dropped by | |
| # stage 2. grep's stderr is discarded, so unreadable or missing files do | |
| # not produce log output. | |
| if grep -nE '(execSync|execFile|execFileSync|exec|spawnSync|spawn)\([^)]*\$\{args\.' \ | |
| npm/packages/*/bin/*.js 2>/dev/null | grep -v 'sanitizeShellArg('; then | |
| echo "::error::Unsanitized \${args.X} interpolation in an exec/spawn call (regression of issue #256). Wrap with sanitizeShellArg(args.X) or use the array form spawn('cmd', [args])." | |
| exit 1 | |
| fi | |
| # Issue #267: crates whose names contain "wasm" compile to | |
| # wasm32-unknown-unknown and can't use std::time::SystemTime / Instant — | |
| # they panic at runtime. | |
| no-systemtime-in-wasm-crates: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Reject SystemTime/Instant in wasm32-targeted crates | |
| run: | | |
| set -e | |
| fail=0 | |
| # A single glob covers every crate with "wasm" in its name. Listing | |
| # crates/*-wasm as well would visit each *-wasm crate twice and report | |
| # every finding twice. | |
| for crate in crates/*wasm*; do | |
| # Skips non-crates and the unexpanded glob when nothing matches. | |
| [ -d "$crate/src" ] || continue | |
| # Whitelist crates with a time_compat shim — they explicitly | |
| # provide a wasm-safe alternative. | |
| [ -f "$crate/src/time_compat.rs" ] && continue | |
| hits=$(grep -rnE '\b(SystemTime::now|Instant::now)\b' "$crate/src" 2>/dev/null || true) | |
| [ -z "$hits" ] && continue | |
| # A hit counts as gated when one of the 4 lines directly above it | |
| # carries a cfg(not(target_arch … wasm32 …)) attribute. | |
| ungated=$(echo "$hits" | while IFS=: read -r f line _; do | |
| pre=$(awk -v L="$line" 'NR>=L-4 && NR<L' "$f") | |
| if ! echo "$pre" | grep -q 'cfg(not(target_arch.*wasm32'; then | |
| echo "$f:$line" | |
| fi | |
| done) | |
| if [ -n "$ungated" ]; then | |
| echo "::error file=$crate::WASM crate uses SystemTime/Instant without cfg-gate (regression of issue #267):" | |
| echo "$ungated" | |
| fail=1 | |
| fi | |
| done | |
| # Every crate is scanned before failing, so one run lists all offenders. | |
| exit $fail | |
| # Issue #359: a hardcoded devcontainer-only path breaks any clone made | |
| # outside the devcontainer. Block it in settings + workflow files. | |
| # .claude/hooks and .claude/intelligence are left out because they're | |
| # user-customised helpers configured per-developer (not committed-by-default). | |
| # Markdown docs and JS example/test files are left out too — they're | |
| # illustrative. | |
| no-hardcoded-workspaces-paths: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Forbid hardcoded devcontainer path in checked-in config | |
| run: | | |
| set -e | |
| # The needle is assembled with printf so the literal path never appears | |
| # in this workflow file; only load-bearing config locations are searched, | |
| # and this file is excluded by name as well. | |
| needle=$(printf '/workspaces/%s' 'ruvector') | |
| offenders=$(grep -rln "$needle" \ | |
| .github/workflows/ .claude/settings.json .claude/settings.local.json \ | |
| scripts/publish/ \ | |
| --exclude='regression-guard.yml' \ | |
| 2>/dev/null || true) | |
| # No file contains the path: the step succeeds. | |
| [ -z "$offenders" ] && exit 0 | |
| echo "::error::Hardcoded devcontainer path in checked-in config (regression of issue #359). Use \$GITHUB_WORKSPACE, \$PWD, or a relative path." | |
| echo "$offenders" | |
| exit 1 | |
| # Issue #464: the per-collection hydration counters added in 97c07520d are | |
| # the only way to diagnose silent record loss during Firestore hydration. | |
| # Should a refactor drop the log lines, the diagnostic disappears exactly | |
| # when it is needed. Assert all four "Hydrate <collection>:" log lines stay. | |
| brain-hydration-counters-present: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Assert hydration counter log lines exist | |
| run: | | |
| set -e | |
| f=crates/mcp-brain-server/src/store.rs | |
| # Collect every expected log line that store.rs no longer contains. | |
| absent=() | |
| for coll in brain_memories brain_contributors brain_page_status brain_nodes; do | |
| grep -q "Hydrate ${coll}: considered=" "$f" || absent+=("Hydrate ${coll}: considered=…") | |
| done | |
| # All four present: the step succeeds. | |
| [ "${#absent[@]}" -eq 0 ] && exit 0 | |
| echo "::error file=$f::Per-collection hydration counter log lines are missing (regression of issue #464). The next deploy can't diagnose silent record loss without them:" | |
| printf ' %s\n' "${absent[@]}" | |
| exit 1 | |
| # Issue #411: npm wrapper packages declared optionalDependencies pinned to | |
| # versions of native binaries that were never published on the registry. | |
| # Resolve every optionalDependency declared by every package.json under | |
| # npm/packages against the live npm registry and fail if any are missing. | |
| # Soft-skip on network errors so transient registry hiccups don't false-fail. | |
| optional-deps-resolvable-on-npm: | |
| runs-on: ubuntu-22.04 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: actions/setup-node@v4 | |
| with: | |
| node-version: '20' | |
| - name: Resolve every optionalDependency@version on npm | |
| run: | | |
| set -e | |
| fail=0 | |
| # Collect (pkg, name, version) tuples from every package.json that | |
| # ships an optionalDependencies block. | |
| while IFS= read -r pkgjson; do | |
| # The manifest path is handed to node as an argument instead of being | |
| # spliced into the script text, so a quote or backslash in a path | |
| # cannot break the script. | |
| entries=$(node -e ' | |
| const p = require(process.argv[1]); | |
| const od = p.optionalDependencies || {}; | |
| for (const [n, v] of Object.entries(od)) { | |
| console.log(n + " " + v); | |
| } | |
| ' "${PWD}/$pkgjson") | |
| [ -z "$entries" ] && continue | |
| while IFS= read -r line; do | |
| [ -z "$line" ] && continue | |
| name=$(echo "$line" | awk '{print $1}') | |
| # Keep range operators (^, ~) intact — `npm view <pkg>@^2.3.0` | |
| # resolves to the highest published 2.x.y. Stripping them turns | |
| # a range into an exact pin and false-fails on common patterns. | |
| ver=$(echo "$line" | awk '{print $2}' | tr -d '" ') | |
| # Skip workspace: protocol and other non-semver specs. | |
| case "$ver" in workspace:*|file:*|*://*) continue ;; esac | |
| rc=0 | |
| out=$(npm view "${name}@${ver}" version 2>&1) || rc=$? | |
| if [ -z "$out" ] && [ "$rc" -eq 0 ]; then | |
| # npm succeeded but printed no version: the package exists, yet no | |
| # published version satisfies the spec. That is the #411 failure | |
| # itself, not a transient error, so it must fail the job. | |
| echo "::error file=$pkgjson::optionalDependency ${name}@${ver} is not published on npm (regression of issue #411)." | |
| fail=1 | |
| elif echo "$out" | grep -qE '^npm (error|ERR!)' || [ -z "$out" ]; then | |
| # Distinguish "not in registry" from transient network error. | |
| if echo "$out" | grep -qE 'E404|is not in this registry'; then | |
| echo "::error file=$pkgjson::optionalDependency ${name}@${ver} is not published on npm (regression of issue #411)." | |
| fail=1 | |
| else | |
| echo "::warning file=$pkgjson::Could not resolve ${name}@${ver} (transient?): $out" | |
| fi | |
| fi | |
| done <<< "$entries" | |
| done < <(find npm/packages -name package.json -not -path '*/node_modules/*') | |
| # Every dependency is checked before failing, so one run lists them all. | |
| exit $fail | |