Build HF Dataset #26
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Build HF Dataset
# Stub on master.

# Manual trigger only. No input declares a `type`, and every default is a
# quoted string, so expressions elsewhere compare against strings
# (e.g. `inputs.build_vector_db_image == 'true'`).
on:
  workflow_dispatch:
    inputs:
      build_vector_db_image:
        description: 'Build vector DB image from source branch and use it (false by default)'
        required: false
        default: 'false'
      container_mem_limit:
        description: 'Memory limit for the Qdrant container during phase 2'
        required: false
        default: '256m'
      region:
        description: 'Hetzner region'
        required: false
        default: 'fsn1'
      server_machine_type:
        description: 'Hetzner server machine type'
        required: false
        default: 'ccx13'
      client_machine_type:
        description: 'Hetzner client machine type'
        required: false
        default: 'ccx23'
      worker_pool_size:
        description: 'Number of Hetzner pairs to create'
        required: false
        default: '6'

# Every run of this workflow joins the same `hetzner-machines` group.
concurrency:
  group: hetzner-machines

# Workflow-level environment: credentials sourced from repository secrets.
env:
  HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
  POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
  POSTGRES_HOST: ${{ secrets.POSTGRES_HOST }}
| jobs: | |
# Builds the job matrix consumed by the downstream matrix jobs.
# Output `matrix` is a JSON array of
#   {"worker_index": <int>, "cells": [{"dataset", "engine", "snapshot_url"}, ...]}
# with cells distributed round-robin over the workers.
prepareMatrix:
  name: Prepare matrix
  runs-on: ubuntu-latest
  outputs:
    matrix: ${{ steps.prepare.outputs.matrix }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - id: prepare
      shell: bash
      env:
        # Handed over through the environment rather than interpolated into
        # the script text, so the input value is never parsed as shell code.
        POOL_SIZE: ${{ inputs.worker_pool_size || '6' }}
      run: |
        DATASETS=(
          random-768-500k-on-disk-bq-keyword-narrow-filter random-768-500k-on-disk-bq-keyword-wide-filter
          random-768-500k-on-disk-bq-int-narrow-filter random-768-500k-on-disk-bq-int-wide-filter
          random-768-500k-on-disk-bq-float-narrow-filter random-768-500k-on-disk-bq-float-wide-filter
          random-768-500k-on-disk-bq-bool-wide-filter
          random-768-500k-on-disk-bq-uuid-narrow-filter
          random-768-500k-on-disk-bq-geo-narrow-filter random-768-500k-on-disk-bq-geo-wide-filter
          random-768-500k-on-disk-bq-text-narrow-filter random-768-500k-on-disk-bq-text-wide-filter
          random-768-500k-on-disk-bq-datetime-narrow-filter random-768-500k-on-disk-bq-datetime-wide-filter
        )
        STORAGE_BASE="https://storage.googleapis.com/qdrant-benchmark-snapshots/on-disk-search/snapshots"

        # Validate before any arithmetic: a pool size of 0 would divide by zero
        # in the round-robin below, and a non-numeric value would break it.
        # The 10# prefix forces base 10 so "08" is not read as invalid octal.
        if ! [[ "$POOL_SIZE" =~ ^[0-9]+$ ]] || [ "$((10#$POOL_SIZE))" -lt 1 ]; then
          echo "::error::worker_pool_size must be a positive integer, got '$POOL_SIZE'"
          exit 1
        fi
        POOL_SIZE=$((10#$POOL_SIZE))

        # Expand datasets × engines into flat cell list.
        # Narrow filters bypass HNSW (payload-index + plain scan), so inline_storage
        # is irrelevant — narrow only runs against inline-off. Wide runs against both.
        all_cells=()
        for dataset in "${DATASETS[@]}"; do
          case "$dataset" in
            *-narrow-filter) engines=(qdrant-on-disk-bq-inline-off) ;;
            *) engines=(qdrant-on-disk-bq-inline-on qdrant-on-disk-bq-inline-off) ;;
          esac
          for engine in "${engines[@]}"; do
            case "$engine" in
              *-inline-on) snap="$STORAGE_BASE/ondisk_global-inline-on.snapshot" ;;
              *-inline-off) snap="$STORAGE_BASE/ondisk_global-inline-off.snapshot" ;;
            esac
            all_cells+=("{\"dataset\":\"$dataset\",\"engine\":\"$engine\",\"snapshot_url\":\"$snap\"}")
          done
        done

        # Never emit more workers than cells: a worker with an empty cell list
        # would still get a matrix entry in every downstream job.
        if [ "$POOL_SIZE" -gt "${#all_cells[@]}" ]; then
          echo "::notice::worker_pool_size $POOL_SIZE exceeds ${#all_cells[@]} cells; using ${#all_cells[@]} workers"
          POOL_SIZE=${#all_cells[@]}
        fi
        echo "Built ${#all_cells[@]} cells across $POOL_SIZE workers"

        # Round-robin distribute cells into workers. Each slot accumulates a
        # comma-separated list of JSON objects (comma only between entries).
        declare -a worker_cells
        for ((i = 0; i < ${#all_cells[@]}; i++)); do
          w=$((i % POOL_SIZE))
          worker_cells[w]+="${worker_cells[w]:+,}${all_cells[i]}"
        done

        workers=()
        for ((w = 0; w < POOL_SIZE; w++)); do
          workers+=("{\"worker_index\":$w,\"cells\":[${worker_cells[w]}]}")
        done
        matrix="[$(IFS=,; echo "${workers[*]}")]"
        echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
# Optional job: builds the benchmark image from the checked-out ref and pushes
# it to ghcr.io. Runs only when the `build_vector_db_image` input is the
# string 'true'.
buildImage:
  name: Build Vector DB Image
  runs-on: ubuntu-latest
  permissions:
    contents: read
    packages: write
  if: ${{ inputs.build_vector_db_image == 'true' }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - id: prepare-tag
      shell: bash
      env:
        # Context values go through the environment, not into the script text:
        # a branch name is attacker-influenced and must not be parsed as shell.
        REF_NAME: ${{ github.ref_name }}
        REPOSITORY_OWNER: ${{ github.repository_owner }}
      run: |
        # Slashes in the branch name are replaced with dashes to form the tag.
        branch=${REF_NAME//\//-}
        tag="ghcr.io/${REPOSITORY_OWNER}/vector-db-benchmark:${branch}"
        echo "tag=${tag}" >> "$GITHUB_OUTPUT"
    - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3.12.0
    - uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9  # v3.7.0
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6.19.2
      with:
        context: .
        push: true
        tags: ${{ steps.prepare-tag.outputs.tag }}
        provenance: false
        cache-from: type=gha
        cache-to: type=gha,mode=max
# One matrix entry per worker: creates that worker's server/client pair via
# the repository-local create-server-with-retry action.
setupMachines:
  name: setup pair ${{ matrix.worker.worker_index }}
  runs-on: ubuntu-latest
  needs:
    - prepareMatrix
    - buildImage
  # Proceed when the matrix was built and the image build either succeeded or
  # was skipped; do not start once the run has been cancelled.
  if: ${{ !cancelled() && needs.prepareMatrix.result == 'success' && (needs.buildImage.result == 'success' || needs.buildImage.result == 'skipped') }}
  strategy:
    fail-fast: false
    matrix:
      worker: ${{ fromJSON(needs.prepareMatrix.outputs.matrix) }}
  env:
    # Machine names are keyed by the worker index.
    SERVER_NAME: search-on-disk-server-${{ matrix.worker.worker_index }}
    CLIENT_NAME: search-on-disk-client-${{ matrix.worker.worker_index }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - uses: webfactory/ssh-agent@d4b9b8ff72958532804b70bbe600ad43b36d5f2e  # v0.8.0
      with:
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
    - name: Setup CI tools
      run: bash -x tools/setup_ci.sh
    - name: Create server
      uses: ./.github/workflows/actions/create-server-with-retry
      with:
        server_name: ${{ env.SERVER_NAME }}
        server_type: ${{ inputs.server_machine_type || 'ccx13' }}
        region: ${{ inputs.region || 'fsn1' }}
        max_retries: 5
    - name: Create client
      uses: ./.github/workflows/actions/create-server-with-retry
      with:
        server_name: ${{ env.CLIENT_NAME }}
        server_type: ${{ inputs.client_machine_type || 'ccx23' }}
        region: ${{ inputs.region || 'fsn1' }}
        max_retries: 5
# One matrix entry per worker: runs every cell assigned to that worker, twice
# per cell (QDRANT_VERSION=ghcr/dev, then docker/master), via tools/run_ci.sh.
runWorker:
  name: worker ${{ matrix.worker.worker_index }}
  needs: [prepareMatrix, setupMachines]
  if: ${{ !cancelled() && needs.setupMachines.result == 'success' }}
  runs-on: ubuntu-latest
  permissions:
    contents: read
    packages: read
  strategy:
    fail-fast: false
    matrix:
      worker: ${{ fromJSON(needs.prepareMatrix.outputs.matrix) }}
  env:
    SERVER_NAME: search-on-disk-server-${{ matrix.worker.worker_index }}
    CLIENT_NAME: search-on-disk-client-${{ matrix.worker.worker_index }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - uses: webfactory/ssh-agent@d4b9b8ff72958532804b70bbe600ad43b36d5f2e  # v0.8.0
      with:
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
    - name: Setup CI tools
      run: bash -x tools/setup_ci.sh
    - name: Run cells
      env:
        BENCHMARK_STRATEGY: search-on-disk
        CONTAINER_MEM_LIMIT: ${{ inputs.container_mem_limit || '256m' }}
        # Inputs, context values, the token and the cell JSON are passed as
        # environment variables instead of being pasted into the script, so
        # none of them is ever parsed as shell code (and a quote character in
        # the JSON cannot terminate a shell string). The WF_ prefix keeps them
        # apart from the variables tools/run_ci.sh is given below.
        WF_BUILD_IMAGE: ${{ inputs.build_vector_db_image }}
        WF_REF_NAME: ${{ github.ref_name }}
        WF_REPO_OWNER: ${{ github.repository_owner }}
        WF_GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        WF_WORKER_INDEX: ${{ matrix.worker.worker_index }}
        WF_CELLS: ${{ toJSON(matrix.worker.cells) }}
      run: |
        # Only point the run at the freshly built image (and export registry
        # credentials) when this run actually built one.
        if [ "$WF_BUILD_IMAGE" = "true" ]; then
          branch=${WF_REF_NAME//\//-}
          export VECTOR_DB_BENCHMARK_IMAGE="ghcr.io/${WF_REPO_OWNER}/vector-db-benchmark:${branch}"
          export GHCR_USERNAME="$WF_REPO_OWNER"
          export GHCR_PASSWORD="$WF_GHCR_TOKEN"
        fi
        # Allow individual cells to fail without aborting the rest of this worker,
        # but track the failure count so the step exits non-zero at the end if any cell failed
        # (otherwise the matrix entry would silently report success and slackOnFailure wouldn't fire).
        set +e
        FAILED=0
        # Read cells from FD 3 (not stdin) — `ssh -tt` inside run_ci.sh would
        # otherwise drain the loop's stdin pipe after the first iteration and
        # silently truncate the worker to one cell.
        while IFS= read -r cell <&3; do
          export DATASETS=$(jq -r '.dataset' <<<"$cell")
          export ENGINE_NAME=$(jq -r '.engine' <<<"$cell")
          export SNAPSHOT_URL=$(jq -r '.snapshot_url' <<<"$cell")
          echo "===== worker $WF_WORKER_INDEX: $ENGINE_NAME / $DATASETS ====="
          # Same dev/master split as runLoadTimeBenchmark; the Hetzner pair persists
          # across both versions (only the qdrant container is recreated by the strategy).
          export QDRANT_VERSION=ghcr/dev
          export QDRANT__FEATURE_FLAGS__ALL=true
          timeout 45m bash -x tools/run_ci.sh || FAILED=$((FAILED + 1))
          export QDRANT_VERSION=docker/master
          export QDRANT__FEATURE_FLAGS__ALL=false
          timeout 45m bash -x tools/run_ci.sh || FAILED=$((FAILED + 1))
        done 3< <(jq -c '.[]' <<<"$WF_CELLS")
        if [ "$FAILED" -gt 0 ]; then
          echo "::warning::worker $WF_WORKER_INDEX: $FAILED cell run(s) failed"
          exit 1
        fi
# One matrix entry per worker: runs tools/tear_down.sh with that worker's
# SERVER_NAME / CLIENT_NAME in the environment.
cleanupMachines:
  name: teardown pair ${{ matrix.worker.worker_index }}
  runs-on: ubuntu-latest
  needs:
    - prepareMatrix
    - setupMachines
    - runWorker
  # always(): evaluated regardless of how setupMachines / runWorker ended;
  # the only hard requirement is that the matrix itself was produced.
  if: ${{ always() && needs.prepareMatrix.result == 'success' }}
  strategy:
    fail-fast: false
    matrix:
      worker: ${{ fromJSON(needs.prepareMatrix.outputs.matrix) }}
  env:
    SERVER_NAME: search-on-disk-server-${{ matrix.worker.worker_index }}
    CLIENT_NAME: search-on-disk-client-${{ matrix.worker.worker_index }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
    - uses: webfactory/ssh-agent@d4b9b8ff72958532804b70bbe600ad43b36d5f2e  # v0.8.0
      with:
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
    - name: Setup CI tools
      run: bash -x tools/setup_ci.sh
    - name: Teardown
      # Best effort: a failing teardown script does not fail the job.
      continue-on-error: true
      run: bash -x tools/tear_down.sh