Skip to content

Build HF Dataset

Build HF Dataset #26

name: Build HF Dataset
# Stub on master.

# Manual trigger only. Every input is optional; the jobs below repeat the same
# defaults with `inputs.<name> || '<default>'` fallbacks.
on:
  workflow_dispatch:
    inputs:
      # Compared against the string 'true' by the buildImage job and the worker script.
      build_vector_db_image:
        description: 'Build vector DB image from source branch and use it (false by default)'
        default: 'false'
        required: false
      container_mem_limit:
        description: 'Memory limit for the Qdrant container during phase 2'
        default: '256m'
        required: false
      region:
        description: 'Hetzner region'
        default: 'fsn1'
        required: false
      server_machine_type:
        description: 'Hetzner server machine type'
        default: 'ccx13'
        required: false
      client_machine_type:
        description: 'Hetzner client machine type'
        default: 'ccx23'
        required: false
      # One server/client pair is provisioned per worker.
      worker_pool_size:
        description: 'Number of Hetzner pairs to create'
        default: '6'
        required: false
# Every run joins the same concurrency group. Machine names are derived only
# from the worker index, so presumably this is what keeps two runs from
# colliding on the same Hetzner servers (confirm against tools/).
concurrency:
  group: hetzner-machines

# Workflow-wide environment, inherited by every job and step below.
env:
  HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
  POSTGRES_HOST: ${{ secrets.POSTGRES_HOST }}
  POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
jobs:
# Builds the job matrix: expands datasets x engines into "cells" and deals them
# round-robin onto a pool of workers. The result is a JSON array of
# {"worker_index": N, "cells": [{dataset, engine, snapshot_url}, ...]} objects.
prepareMatrix:
  name: Prepare matrix
  runs-on: ubuntu-latest
  outputs:
    matrix: ${{ steps.prepare.outputs.matrix }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - id: prepare
      shell: bash
      env:
        # Handed over through the environment instead of being spliced into
        # the script text, so the input can never be parsed as shell code.
        POOL_SIZE: ${{ inputs.worker_pool_size || '6' }}
      run: |
        DATASETS=(
          random-768-500k-on-disk-bq-keyword-narrow-filter random-768-500k-on-disk-bq-keyword-wide-filter
          random-768-500k-on-disk-bq-int-narrow-filter random-768-500k-on-disk-bq-int-wide-filter
          random-768-500k-on-disk-bq-float-narrow-filter random-768-500k-on-disk-bq-float-wide-filter
          random-768-500k-on-disk-bq-bool-wide-filter
          random-768-500k-on-disk-bq-uuid-narrow-filter
          random-768-500k-on-disk-bq-geo-narrow-filter random-768-500k-on-disk-bq-geo-wide-filter
          random-768-500k-on-disk-bq-text-narrow-filter random-768-500k-on-disk-bq-text-wide-filter
          random-768-500k-on-disk-bq-datetime-narrow-filter random-768-500k-on-disk-bq-datetime-wide-filter
        )
        STORAGE_BASE="https://storage.googleapis.com/qdrant-benchmark-snapshots/on-disk-search/snapshots"

        # Reject anything that is not a small positive integer before it reaches
        # shell arithmetic: 0 would divide by zero in the round-robin below and
        # non-numeric text would abort with an opaque arithmetic error.
        if ! [[ "$POOL_SIZE" =~ ^[0-9]{1,3}$ ]]; then
          echo "::error::worker_pool_size must be an integer between 1 and 999, got '$POOL_SIZE'"
          exit 1
        fi
        # Force base 10 so a value such as "08" is not read as invalid octal.
        POOL_SIZE=$((10#$POOL_SIZE))
        if (( POOL_SIZE < 1 )); then
          echo "::error::worker_pool_size must be at least 1"
          exit 1
        fi

        # Expand datasets × engines into flat cell list.
        # Narrow filters bypass HNSW (payload-index + plain scan), so inline_storage
        # is irrelevant — narrow only runs against inline-off. Wide runs against both.
        all_cells=()
        for dataset in "${DATASETS[@]}"; do
          case "$dataset" in
            *-narrow-filter) engines=(qdrant-on-disk-bq-inline-off) ;;
            *) engines=(qdrant-on-disk-bq-inline-on qdrant-on-disk-bq-inline-off) ;;
          esac
          for engine in "${engines[@]}"; do
            case "$engine" in
              *-inline-on) snap="$STORAGE_BASE/ondisk_global-inline-on.snapshot" ;;
              *-inline-off) snap="$STORAGE_BASE/ondisk_global-inline-off.snapshot" ;;
              *)
                # Fail loudly instead of silently reusing the previous cell's snapshot.
                echo "::error::no snapshot is defined for engine '$engine'"
                exit 1
                ;;
            esac
            all_cells+=("{\"dataset\":\"$dataset\",\"engine\":\"$engine\",\"snapshot_url\":\"$snap\"}")
          done
        done

        # A worker without cells would still get a machine pair provisioned and
        # torn down, so never create more workers than there are cells.
        if (( POOL_SIZE > ${#all_cells[@]} )); then
          echo "::notice::worker_pool_size $POOL_SIZE exceeds ${#all_cells[@]} cells; using ${#all_cells[@]} workers"
          POOL_SIZE=${#all_cells[@]}
        fi
        echo "Built ${#all_cells[@]} cells across $POOL_SIZE workers"

        # Round-robin distribute cells into workers; each entry is a
        # comma-separated list of cell objects.
        worker_cells=()
        for ((w = 0; w < POOL_SIZE; w++)); do
          worker_cells[w]=""
        done
        for ((i = 0; i < ${#all_cells[@]}; i++)); do
          w=$((i % POOL_SIZE))
          # ${var:+,} adds the separator only when the worker already holds a cell.
          worker_cells[w]+="${worker_cells[w]:+,}${all_cells[i]}"
        done

        workers=()
        for ((w = 0; w < POOL_SIZE; w++)); do
          workers+=("{\"worker_index\":$w,\"cells\":[${worker_cells[w]}]}")
        done
        matrix="[$(IFS=,; echo "${workers[*]}")]"
        echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
# Optional job: builds the benchmark image from the checked-out ref and pushes
# it to GHCR, tagged with the branch name. Runs only when the
# build_vector_db_image input is the string 'true'.
buildImage:
  name: Build Vector DB Image
  runs-on: ubuntu-latest
  permissions:
    contents: read
    # Needed to push the image to ghcr.io with GITHUB_TOKEN.
    packages: write
  if: ${{ inputs.build_vector_db_image == 'true' }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - id: prepare-tag
      shell: bash
      run: |
        # GITHUB_REF_NAME and GITHUB_REPOSITORY_OWNER are default runner
        # variables carrying the same values as github.ref_name and
        # github.repository_owner. Reading them from the environment keeps a
        # crafted branch name from being interpolated into this script.
        # Slashes are not allowed in image tags, so they become dashes.
        branch="${GITHUB_REF_NAME//\//-}"
        tag="ghcr.io/${GITHUB_REPOSITORY_OWNER}/vector-db-benchmark:${branch}"
        echo "tag=${tag}" >> "$GITHUB_OUTPUT"
    - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3.12.0
    - uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9  # v3.7.0
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6.19.2
      with:
        context: .
        push: true
        tags: ${{ steps.prepare-tag.outputs.tag }}
        provenance: false
        # Layer cache is kept in the GitHub Actions cache backend.
        cache-from: type=gha
        cache-to: type=gha,mode=max
# Provisions one Hetzner server/client pair per matrix worker.
setupMachines:
  name: setup pair ${{ matrix.worker.worker_index }}
  needs:
    - prepareMatrix
    - buildImage
  # buildImage is skipped unless the image build was requested, so a skipped
  # result must count as acceptable here.
  if: >-
    ${{ !cancelled()
    && needs.prepareMatrix.result == 'success'
    && (needs.buildImage.result == 'success' || needs.buildImage.result == 'skipped') }}
  runs-on: ubuntu-latest
  strategy:
    # One pair failing to come up must not cancel the other pairs.
    fail-fast: false
    matrix:
      worker: ${{ fromJSON(needs.prepareMatrix.outputs.matrix) }}
  env:
    # Machine names are keyed by worker index; the worker and teardown jobs
    # derive the same names.
    CLIENT_NAME: search-on-disk-client-${{ matrix.worker.worker_index }}
    SERVER_NAME: search-on-disk-server-${{ matrix.worker.worker_index }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - uses: webfactory/ssh-agent@d4b9b8ff72958532804b70bbe600ad43b36d5f2e  # v0.8.0
      with:
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
    - name: Setup CI tools
      run: bash -x tools/setup_ci.sh
    - name: Create server
      uses: ./.github/workflows/actions/create-server-with-retry
      with:
        server_name: ${{ env.SERVER_NAME }}
        server_type: ${{ inputs.server_machine_type || 'ccx13' }}
        region: ${{ inputs.region || 'fsn1' }}
        max_retries: 5
    - name: Create client
      uses: ./.github/workflows/actions/create-server-with-retry
      with:
        server_name: ${{ env.CLIENT_NAME }}
        server_type: ${{ inputs.client_machine_type || 'ccx23' }}
        region: ${{ inputs.region || 'fsn1' }}
        max_retries: 5
# Runs every cell assigned to one worker, sequentially, on that worker's
# machine pair. Each cell is benchmarked twice: once against the dev image and
# once against master.
runWorker:
  name: worker ${{ matrix.worker.worker_index }}
  needs: [prepareMatrix, setupMachines]
  if: ${{ !cancelled() && needs.setupMachines.result == 'success' }}
  runs-on: ubuntu-latest
  permissions:
    contents: read
    packages: read
  strategy:
    fail-fast: false
    matrix:
      worker: ${{ fromJSON(needs.prepareMatrix.outputs.matrix) }}
  env:
    SERVER_NAME: search-on-disk-server-${{ matrix.worker.worker_index }}
    CLIENT_NAME: search-on-disk-client-${{ matrix.worker.worker_index }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - uses: webfactory/ssh-agent@d4b9b8ff72958532804b70bbe600ad43b36d5f2e  # v0.8.0
      with:
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
    - name: Setup CI tools
      run: bash -x tools/setup_ci.sh
    - name: Run cells
      env:
        # Every workflow expression is passed through the environment rather
        # than pasted into the script text. A quote, `$(...)` or backtick in an
        # input, a branch name or the cells JSON can then neither break the
        # script nor run as shell code. The WF_ prefix keeps these helper
        # variables apart from the names the tools/ scripts read.
        CONTAINER_MEM_LIMIT: ${{ inputs.container_mem_limit || '256m' }}
        WF_BUILD_IMAGE: ${{ inputs.build_vector_db_image }}
        WF_REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        WF_WORKER_INDEX: ${{ matrix.worker.worker_index }}
        WF_WORKER_CELLS: ${{ toJSON(matrix.worker.cells) }}
      run: |
        export BENCHMARK_STRATEGY="search-on-disk"
        if [ "$WF_BUILD_IMAGE" = "true" ]; then
          # Must yield the same tag the buildImage job pushed: branch name with
          # slashes replaced by dashes. GITHUB_REF_NAME / GITHUB_REPOSITORY_OWNER
          # are default runner variables.
          branch="${GITHUB_REF_NAME//\//-}"
          export VECTOR_DB_BENCHMARK_IMAGE="ghcr.io/${GITHUB_REPOSITORY_OWNER}/vector-db-benchmark:${branch}"
          export GHCR_USERNAME="${GITHUB_REPOSITORY_OWNER}"
          export GHCR_PASSWORD="$WF_REGISTRY_TOKEN"
        fi
        # As before, the token reaches child processes only as GHCR_PASSWORD,
        # and only when the custom image is in use.
        unset WF_REGISTRY_TOKEN
        # Allow individual cells to fail without aborting the rest of this worker,
        # but track the failure count so the step exits non-zero at the end if any cell failed
        # (otherwise the matrix entry would silently report success and slackOnFailure wouldn't fire).
        set +e
        FAILED=0
        # Read cells from FD 3 (not stdin) — `ssh -tt` inside run_ci.sh would
        # otherwise drain the loop's stdin pipe after the first iteration and
        # silently truncate the worker to one cell.
        while IFS= read -r cell <&3; do
          export DATASETS=$(echo "$cell" | jq -r '.dataset')
          export ENGINE_NAME=$(echo "$cell" | jq -r '.engine')
          export SNAPSHOT_URL=$(echo "$cell" | jq -r '.snapshot_url')
          echo "===== worker $WF_WORKER_INDEX: $ENGINE_NAME / $DATASETS ====="
          # Same dev/master split as runLoadTimeBenchmark; the Hetzner pair persists
          # across both versions (only the qdrant container is recreated by the strategy).
          export QDRANT_VERSION=ghcr/dev
          export QDRANT__FEATURE_FLAGS__ALL=true
          timeout 45m bash -x tools/run_ci.sh || FAILED=$((FAILED + 1))
          export QDRANT_VERSION=docker/master
          export QDRANT__FEATURE_FLAGS__ALL=false
          timeout 45m bash -x tools/run_ci.sh || FAILED=$((FAILED + 1))
        done 3< <(printf '%s\n' "$WF_WORKER_CELLS" | jq -c '.[]')
        if [ "$FAILED" -gt 0 ]; then
          echo "::warning::worker $WF_WORKER_INDEX: $FAILED cell run(s) failed"
          exit 1
        fi
# Tears down the machine pair of every matrix worker once the run is over.
cleanupMachines:
  name: teardown pair ${{ matrix.worker.worker_index }}
  needs:
    - prepareMatrix
    - setupMachines
    - runWorker
  # always(): run even when setup or the workers failed or were cancelled.
  # Without a successful prepareMatrix there is no matrix to fan out over.
  if: ${{ always() && needs.prepareMatrix.result == 'success' }}
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      worker: ${{ fromJSON(needs.prepareMatrix.outputs.matrix) }}
  env:
    # Same naming scheme as the setup and worker jobs; presumably
    # tools/tear_down.sh reads these to locate the machines (confirm in script).
    CLIENT_NAME: search-on-disk-client-${{ matrix.worker.worker_index }}
    SERVER_NAME: search-on-disk-server-${{ matrix.worker.worker_index }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - uses: webfactory/ssh-agent@d4b9b8ff72958532804b70bbe600ad43b36d5f2e  # v0.8.0
      with:
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
    - name: Setup CI tools
      run: bash -x tools/setup_ci.sh
    - name: Teardown
      run: bash -x tools/tear_down.sh
      # Best effort: a failing teardown script does not fail the job.
      continue-on-error: true