Skip to content

[VectorId] Remove Id conversion bounds and traits #504

[VectorId] Remove Id conversion bounds and traits

[VectorId] Remove Id conversion bounds and traits #504

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
# DiskANN Benchmarks Workflow
#
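# This workflow runs macro benchmarks comparing the current branch against a baseline.
# It runs on pull requests targeting main that touch the benchmarked crates (baseline: main),
# and can also be triggered manually with an explicit baseline reference (branch, tag, or commit).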
name: Disk Benchmarks

# Triggers:
#   - workflow_dispatch: manual run; `baseline_ref` selects the ref to compare against.
#   - pull_request: pull requests targeting main that touch any of the listed paths
#     (the benchmarked crates or this workflow file itself).
on:
  workflow_dispatch:
    inputs:
      baseline_ref:
        description: 'A branch, commit SHA, or tag name to compare the current branch with'
        required: true
        default: 'main'
        type: string
  pull_request:
    branches:
      - main
    paths:
      - 'diskann/**'
      - 'diskann-disk/**'
      - 'diskann-linalg/**'
      - 'diskann-providers/**'
      - 'diskann-quantization/**'
      - 'diskann-vector/**'
      - 'diskann-wide/**'
      - 'diskann-utils/**'
      - 'diskann-platform/**'
      - 'diskann-label-filter/**'
      - 'diskann-benchmark/**'
      - 'diskann-benchmark-runner/**'
      - '.github/workflows/disk-benchmarks.yml'
# Cancel in-progress runs when a new run is triggered.
# Runs are grouped per workflow and pull request number; when there is no
# pull request (manual dispatch) the group key falls back to the commit SHA.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true
# Environment variables shared by every job and step in this workflow.
env:
  # Have Rust print a backtrace if a benchmark binary panics.
  # Quoted so the value is an explicit string rather than a YAML integer.
  RUST_BACKTRACE: '1'
  # Directory holding the benchmark input configs and tolerances,
  # relative to the root of a repository checkout.
  PERF_INPUTS: diskann-benchmark/perf_test_inputs
# Every `run` step uses bash unless the step sets its own shell.
defaults:
  run:
    shell: bash
# Limit the workflow token to read-only access to repository contents.
permissions:
  contents: read
jobs:
  # Macro benchmark: compare current branch against baseline.
  # One job runs per matrix entry (dataset). Each job builds and runs the
  # benchmark from two checkouts (baseline and current), then checks the
  # current results against the baseline using the tolerances file.
  macro-benchmark:
    name: Macro Benchmark - ${{ matrix.dataset }}
    # Self-hosted pool; the JobId label is made unique per run, attempt and
    # matrix index.
    runs-on:
      - self-hosted
      - 1ES.Pool=diskann-github
      - ubuntu-latest
      - "JobId=macro-benchmark-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}-${{ strategy.job-index }}"
    # This job already runs on a self-hosted runner and places both checkouts
    # on NVMe storage (see the first step).
    # TODO: For production benchmarks, also consider:
    #   - CPU pinning (taskset) for reduced variance
    #   - Dedicated hardware to avoid noisy neighbor effects
    timeout-minutes: 120
    strategy:
      # Let the other dataset finish even if one dataset's job fails.
      fail-fast: false
      matrix:
        include:
          # dataset:  label used in the job name, output files and artifact name
          # config:   benchmark input file under PERF_INPUTS
          # archive:  dataset archive passed to the setup action
          # data_dir: directory under target/tmp that is copied to the baseline
          - dataset: wikipedia-100K
            config: wikipedia-100K-disk-index.json
            archive: wikipedia-100K.tar.gz
            data_dir: wikipedia_cohere
          - dataset: openai-100K
            config: openai-100K-disk-index.json
            archive: openai-100K.tar.gz
            data_dir: OpenAIArXiv
    steps:
      # Kept inline because this must run before checkout, but local action.yml
      # files are only available after checkout.
      # NOTE: mkfs.ext4 formats /dev/nvme0n1, destroying anything already on it.
      # The two symlinks make the `diskann_rust` and `baseline` checkout paths
      # used by the following steps resolve to directories on the NVMe mount.
      - name: Mount high-speed NVMe SSD
        shell: bash
        run: |
          sudo mkdir -p /mnt/nvme
          sudo lsblk
          sudo mkfs.ext4 /dev/nvme0n1
          sudo mount /dev/nvme0n1 /mnt/nvme
          sudo chmod 777 /mnt/nvme
          mkdir -p /mnt/nvme/diskann_rust /mnt/nvme/baseline
          ln -s /mnt/nvme/diskann_rust diskann_rust
          ln -s /mnt/nvme/baseline baseline

      # The ref that triggered the run, checked out with Git LFS objects.
      - name: Checkout current branch
        uses: actions/checkout@v4
        with:
          path: diskann_rust
          lfs: true

      # `inputs.baseline_ref` is only set for manual dispatch; pull request
      # runs fall back to 'main'.
      - name: Checkout baseline (${{ inputs.baseline_ref || 'main' }})
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.baseline_ref || 'main' }}
          path: baseline
          lfs: true

      # Local composite action taken from the current-branch checkout.
      - name: Setup benchmark environment
        uses: ./diskann_rust/.github/actions/setup-disk-benchmark
        with:
          dataset: ${{ matrix.dataset }}
          archive: ${{ matrix.archive }}
          extract-to: diskann_rust/target/tmp

      # Give the baseline checkout its own copy of the dataset directory.
      - name: Copy dataset to baseline
        run: |
          mkdir -p baseline/target/tmp
          cp -r diskann_rust/target/tmp/${{ matrix.data_dir }} baseline/target/tmp/

      # The baseline code is run with the input config from the CURRENT
      # checkout, so both runs use the same benchmark definition.
      - name: Run baseline benchmark
        working-directory: baseline
        run: |
          cargo run -p diskann-benchmark --features disk-index --release -- \
            run --input-file ../diskann_rust/${{ env.PERF_INPUTS }}/${{ matrix.config }} \
            --output-file target/tmp/${{ matrix.dataset }}_baseline.json

      - name: Run current branch benchmark
        working-directory: diskann_rust
        run: |
          cargo run -p diskann-benchmark --features disk-index --release -- \
            run --input-file ${{ env.PERF_INPUTS }}/${{ matrix.config }} \
            --output-file target/tmp/${{ matrix.dataset }}_target.json

      # Compare the baseline (--before) and current (--after) result files
      # using the tolerances file from the current checkout.
      - name: Validate benchmark results
        working-directory: diskann_rust
        run: |
          cargo run -p diskann-benchmark --features disk-index --release -- \
            check run \
            --tolerances ${{ env.PERF_INPUTS }}/disk-index-tolerances.json \
            --input-file ${{ env.PERF_INPUTS }}/${{ matrix.config }} \
            --before ../baseline/target/tmp/${{ matrix.dataset }}_baseline.json \
            --after target/tmp/${{ matrix.dataset }}_target.json

      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        if: always()  # Upload even if validation fails
        with:
          name: benchmark-results-${{ matrix.dataset }}
          path: |
            diskann_rust/target/tmp/${{ matrix.dataset }}_target.json
            baseline/target/tmp/${{ matrix.dataset }}_baseline.json
          retention-days: 30