# Implement SOTA alignment reset and reality-check gates #305
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Full smoke test: runs benchmark-track-regen on a generated MP4 inside the
# birdlense image (#372).
# Does not duplicate the "general" CI: it runs only when the processor, the
# benchmark/report scripts, or their fixtures change.
name: benchmark-regen-integration

on:
  workflow_dispatch:
  pull_request:
    branches: [main, dev, ML]
    # Covers the processor plus every script/fixture that the steps of this
    # workflow execute, so a change to any of them re-runs the smoke.
    paths:
      - 'scripts/benchmark*.py'
      - 'scripts/compare_benchmark_reports.py'
      - 'scripts/verify_benchmark_report_schema.py'
      - 'scripts/report_track_quality_core_metrics.py'
      - 'scripts/report_track_quality_truthset_delta.py'
      - 'scripts/report_track_failure_modes.py'
      - 'scripts/parity_runner.py'
      - 'scripts/verify_parity_report.py'
      - 'scripts/freeze_truthset_splits.py'
      - 'scripts/report_active_learning_backlog.py'
      - 'scripts/build_weekly_quality_cycle_playbook.py'
      - 'scripts/compare_quality_cycle_reports.py'
      - 'scripts/report_feedback_effect.py'
      - 'scripts/verify_similarity_behavior_summary.py'
      - 'scripts/verify_runtime_slo_dashboard.py'
      - 'scripts/fetch-processor-weights.sh'
      - 'scripts/download_birder_classifier.py'
      - 'scripts/ci/gen_smoke_benchmark_clip.sh'
      - 'scripts/ci/reference_smoke_report.json'
      - 'benchmarks/golden_baseline_smoke.json'
      - 'app/processor/**'
      - '.github/workflows/benchmark-regen-integration.yml'
  # Direct push to ML: full docker smoke on changes under the same paths
  # (without an open PR). Keep this list in sync with pull_request.paths.
  push:
    branches: [ML]
    paths:
      - 'scripts/benchmark*.py'
      - 'scripts/compare_benchmark_reports.py'
      - 'scripts/verify_benchmark_report_schema.py'
      - 'scripts/report_track_quality_core_metrics.py'
      - 'scripts/report_track_quality_truthset_delta.py'
      - 'scripts/report_track_failure_modes.py'
      - 'scripts/parity_runner.py'
      - 'scripts/verify_parity_report.py'
      - 'scripts/freeze_truthset_splits.py'
      - 'scripts/report_active_learning_backlog.py'
      - 'scripts/build_weekly_quality_cycle_playbook.py'
      - 'scripts/compare_quality_cycle_reports.py'
      - 'scripts/report_feedback_effect.py'
      - 'scripts/verify_similarity_behavior_summary.py'
      - 'scripts/verify_runtime_slo_dashboard.py'
      - 'scripts/fetch-processor-weights.sh'
      - 'scripts/download_birder_classifier.py'
      - 'scripts/ci/gen_smoke_benchmark_clip.sh'
      - 'scripts/ci/reference_smoke_report.json'
      - 'benchmarks/golden_baseline_smoke.json'
      - 'app/processor/**'
      - '.github/workflows/benchmark-regen-integration.yml'

# Least privilege: the job only needs to read the repository contents.
permissions:
  contents: read

# A newer run for the same workflow + ref cancels the one still in progress.
concurrency:
  group: benchmark-regen-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  smoke:
    name: docker-benchmark-smoke
    runs-on: ubuntu-latest
    timeout-minutes: 40
    defaults:
      run:
        # `run` steps execute in app/ unless a step overrides working-directory.
        working-directory: app
    steps:
      - uses: actions/checkout@v6

      - name: Install ffmpeg (generate smoke clip)
        run: |
          sudo apt-get update -qq
          sudo apt-get install -y --no-install-recommends ffmpeg

      - name: Generate smoke MP4
        working-directory: ${{ github.workspace }}
        run: |
          mkdir -p .artifacts
          bash scripts/ci/gen_smoke_benchmark_clip.sh .artifacts/smoke_clip.mp4
          # Fail early if the generator produced no file or an empty one.
          test -s .artifacts/smoke_clip.mp4

      - name: Free disk space (Docker image)
        run: |
          # Best effort: every cleanup command is allowed to fail.
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
            /usr/local/share/powershell /opt/hostedtoolcache/CodeQL \
            /opt/hostedtoolcache/go /opt/hostedtoolcache/node \
            "$AGENT_TOOLSDIRECTORY"/Python || true
          docker system prune -af || true
          df -h

      - uses: actions/setup-node@v6
        with:
          node-version: "22"
          cache: npm
          cache-dependency-path: app/ui/package-lock.json

      - name: Pre-build UI (skip npm in Docker build)
        working-directory: app/ui
        run: npm ci && npm run build

      - uses: docker/setup-buildx-action@v4

      - name: Prepare .env and data dirs
        run: |
          test -f .env || cp .env.example .env
          # Drop any inference backend/device overrides from .env.
          for var in BIRDLENSE_INFERENCE_BACKEND BIRDLENSE_CLASSIFIER_INFERENCE_BACKEND \
            BIRDLENSE_OPENVINO_BINARY_ENABLED BIRDLENSE_INFERENCE_DEVICE BIRDLENSE_CLASSIFIER_INFERENCE_DEVICE; do
            sed -i "/^${var}=/d" .env || true
          done
          mkdir -p data/db data/recordings

      - name: Processor .pt weights
        working-directory: ${{ github.workspace }}
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          BIRDLENSE_FETCH_CLASSIFIER_OPTIONAL: "1"
        run: |
          set -euo pipefail
          # Create the alias symlinks for the fetched weights.
          link_weights() {
            # Use the runner-provided variable instead of expanding a workflow
            # expression inside the script text.
            W="$GITHUB_WORKSPACE/app/processor/models/detection/weights"
            C="$GITHUB_WORKSPACE/app/processor/models/classification/weights"
            mkdir -p "$W" "$C"
            ln -sf best.pt "$W/trapper_ai_v02_2024.pt"
            # Reuse the detection weights when no classifier best.pt exists.
            if [[ ! -s "$C/best.pt" && -s "$W/best.pt" ]]; then
              ln -sf ../../detection/weights/best.pt "$C/best.pt"
            fi
            if [[ -s "$C/best.pt" && ! -e "$C/convnext_v2_tiny_eu-common256px.pt" ]]; then
              ln -sf best.pt "$C/convnext_v2_tiny_eu-common256px.pt"
            fi
          }
          # Up to three attempts with a growing pause between them.
          for attempt in 1 2 3; do
            if ./scripts/fetch-processor-weights.sh; then
              link_weights
              exit 0
            fi
            # No point in waiting after the final attempt.
            if (( attempt < 3 )); then
              sleep $((attempt * 10))
            fi
          done
          echo "Failed to fetch processor weights" >&2
          exit 1

      - name: Ensure Birder EU classifier for benchmark smoke
        working-directory: ${{ github.workspace }}
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          set -euo pipefail
          C="app/processor/models/classification/weights"
          OV="$C/convnext_v2_tiny_eu-common256px_openvino_model"
          # Skip the download when either metadata file is already present.
          if [[ -f "$OV/class_labels.txt" || -f "$OV/birdlense_manifest.json" ]]; then
            echo "Birder EU metadata already present"
            exit 0
          fi
          pip install -q huggingface_hub birder torch
          python3 scripts/download_birder_classifier.py --variant convnext_v2_tiny_eu-common256px
          # The download must have produced at least one of the metadata files.
          if [[ ! -f "$OV/class_labels.txt" && ! -f "$OV/birdlense_manifest.json" ]]; then
            echo "Birder EU metadata missing after download: $OV" >&2
            exit 1
          fi

      - name: Reclaim disk before Docker build
        run: |
          pip cache purge || true
          rm -rf "$HOME/.cache/pip" /tmp/pip-* || true
          docker builder prune -af || true
          docker image prune -af || true
          df -h

      - name: Build birdlense image
        env:
          DOCKER_BUILDKIT: "1"
          COMPOSE_DOCKER_CLI_BUILD: "1"
        run: docker compose build birdlense

      - name: Run benchmark-track-regen on smoke clip
        env:
          WS: ${{ github.workspace }}
        run: |
          # The checkout is mounted at /workspace so the report lands in .artifacts/.
          docker compose run --rm \
            -v "${WS}:/workspace:rw" \
            birdlense \
            bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
              python3 /workspace/scripts/benchmark-track-regen.py \
                --video /workspace/.artifacts/smoke_clip.mp4 \
                --frame-step 24 \
                --max-runtime-sec 300 \
                --write-report /workspace/.artifacts/benchmark_smoke_report.json'

      - name: Run SOTA golden benchmark harness (smoke)
        env:
          WS: ${{ github.workspace }}
        run: |
          # Both clip arguments point at the same generated smoke clip.
          docker compose run --rm \
            -v "${WS}:/workspace:rw" \
            birdlense \
            bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
              python3 /workspace/scripts/benchmark_sota.py \
                --smoke \
                --clip-1816 /workspace/.artifacts/smoke_clip.mp4 \
                --clip-1819 /workspace/.artifacts/smoke_clip.mp4 \
                --frame-step 24 \
                --max-runtime-sec 300 \
                --write-report /workspace/.artifacts/benchmark_sota_smoke.json'

      - name: Run benchmark_trackers on smoke clip
        env:
          WS: ${{ github.workspace }}
        run: |
          docker compose run --rm \
            -v "${WS}:/workspace:rw" \
            birdlense \
            bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
              python3 /workspace/scripts/benchmark_trackers.py \
                --clip /workspace/.artifacts/smoke_clip.mp4 \
                --presets "bytetrack_birdlense,botsort_birdlense" \
                --frame-step 24 \
                --max-runtime-sec 300 \
                --write-report /workspace/.artifacts/benchmark_trackers_smoke.json'

      # The report steps below run on the runner (not in the container) and
      # append their Markdown summaries to the job summary.
      - name: Build tracker A/B summary report
        working-directory: ${{ github.workspace }}
        run: |
          python3 scripts/benchmark_trackers_report.py \
            --trackers-report .artifacts/benchmark_trackers_smoke.json \
            --baseline-preset bytetrack_birdlense \
            --out .artifacts/benchmark_trackers_ab_smoke.json \
            --summary-out .artifacts/benchmark_trackers_ab_smoke.md
          cat .artifacts/benchmark_trackers_ab_smoke.md >> "$GITHUB_STEP_SUMMARY"

      - name: Build track quality core metrics report (HOTA/IDF1/IDSW/fragmentation)
        working-directory: ${{ github.workspace }}
        run: |
          python3 scripts/report_track_quality_core_metrics.py \
            --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
            --baseline benchmarks/golden_baseline_smoke.json \
            --out .artifacts/track_quality_core_metrics_smoke.json \
            --summary-out .artifacts/track_quality_core_metrics_smoke.md
          cat .artifacts/track_quality_core_metrics_smoke.md >> "$GITHUB_STEP_SUMMARY"

      - name: Build truth-set before/after delta report
        working-directory: ${{ github.workspace }}
        run: |
          python3 scripts/report_track_quality_truthset_delta.py \
            --baseline benchmarks/golden_baseline_smoke.json \
            --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
            --out .artifacts/track_quality_truthset_delta_smoke.json \
            --summary-out .artifacts/track_quality_truthset_delta_smoke.md
          cat .artifacts/track_quality_truthset_delta_smoke.md >> "$GITHUB_STEP_SUMMARY"

      - name: Build failure modes + mitigation report
        working-directory: ${{ github.workspace }}
        run: |
          python3 scripts/report_track_failure_modes.py \
            --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
            --benchmark-trackers-ab-report .artifacts/benchmark_trackers_ab_smoke.json \
            --track-quality-core-report .artifacts/track_quality_core_metrics_smoke.json \
            --out .artifacts/track_failure_modes_smoke.json \
            --summary-out .artifacts/track_failure_modes_smoke.md
          cat .artifacts/track_failure_modes_smoke.md >> "$GITHUB_STEP_SUMMARY"

      - name: Build unified parity report (daily)
        working-directory: ${{ github.workspace }}
        run: |
          python3 scripts/parity_runner.py \
            --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
            --core-metrics-report .artifacts/track_quality_core_metrics_smoke.json \
            --truthset-delta-report .artifacts/track_quality_truthset_delta_smoke.json \
            --failure-modes-report .artifacts/track_failure_modes_smoke.json \
            --tracker-ab-report .artifacts/benchmark_trackers_ab_smoke.json \
            --period daily \
            --out .artifacts/parity_report_daily_smoke.json \
            --summary-out .artifacts/parity_report_daily_smoke.md
          cat .artifacts/parity_report_daily_smoke.md >> "$GITHUB_STEP_SUMMARY"

      - name: Verify parity report gate
        working-directory: ${{ github.workspace }}
        run: |
          python3 scripts/verify_parity_report.py \
            --report .artifacts/parity_report_daily_smoke.json \
            --min-precision-proxy 0.5 \
            --min-recall-proxy 0.1 \
            --max-unknown-share 0.95 \
            --out .artifacts/parity_report_gate_smoke.json

      - name: Freeze truth-set splits (smoke contract)
        working-directory: ${{ github.workspace }}
        run: |
          # Inline six-clip seed; --min-clips below matches its size.
          cat <<'EOF' > .artifacts/truthset_seed_smoke.json
          {
            "clips": [
              {"clip_id":"c1","day_night":"day","weather":"clear"},
              {"clip_id":"c2","day_night":"night","weather":"rain"},
              {"clip_id":"c3","day_night":"day","weather":"cloudy"},
              {"clip_id":"c4","day_night":"night","weather":"rain"},
              {"clip_id":"c5","day_night":"day","weather":"clear"},
              {"clip_id":"c6","day_night":"night","weather":"clear"}
            ]
          }
          EOF
          python3 scripts/freeze_truthset_splits.py \
            --input .artifacts/truthset_seed_smoke.json \
            --min-clips 6 \
            --out .artifacts/truthset_splits_smoke.json

      - name: Build active-learning backlog + weekly playbook + comparison (smoke)
        working-directory: ${{ github.workspace }}
        run: |
          # Inputs in this step are inline fixtures written by the heredocs,
          # except the truth-set delta report produced by an earlier step.
          cat <<'EOF' > .artifacts/track_quality_regression_smoke.json
          {
            "schema": "track_quality_regression_report@v1",
            "metrics": {
              "parity_mismatch_rate_24h": 0.31,
              "track_id_switch_rate_24h": 0.06
            }
          }
          EOF
          cat <<'EOF' > .artifacts/species_calibration_baseline_smoke.json
          {
            "schema": "classifier_calibration_report@v1",
            "topk_metrics": {
              "top1_before": 0.56,
              "top3_proxy_before": 0.70,
              "false_species_rate_before": 0.44
            },
            "calibration_metrics": {"ece": 0.22},
            "unknown_ood_dashboard": {
              "unknown_policy": {"unknown_share_after_policy": 0.33}
            },
            "top_confusion_pairs": [
              {"from":"Wood Mouse","to":"Great Tit","count":6}
            ]
          }
          EOF
          cat <<'EOF' > .artifacts/species_calibration_current_smoke.json
          {
            "schema": "classifier_calibration_report@v1",
            "topk_metrics": {
              "top1_before": 0.61,
              "top3_proxy_before": 0.76,
              "false_species_rate_before": 0.35
            },
            "calibration_metrics": {"ece": 0.18},
            "unknown_ood_dashboard": {
              "unknown_policy": {"unknown_share_after_policy": 0.29}
            },
            "top_confusion_pairs": [
              {"from":"Wood Mouse","to":"Great Tit","count":4}
            ]
          }
          EOF
          cat <<'EOF' > .artifacts/feedback_loop_status_smoke.json
          {
            "schema": "feedback_loop_status@v1",
            "events_total": 42
          }
          EOF
          python3 scripts/report_active_learning_backlog.py \
            --track-regression-report .artifacts/track_quality_regression_smoke.json \
            --species-calibration-report .artifacts/species_calibration_current_smoke.json \
            --truthset-delta-report .artifacts/track_quality_truthset_delta_smoke.json \
            --out .artifacts/active_learning_backlog_smoke.json \
            --summary-out .artifacts/active_learning_backlog_smoke.md
          cat .artifacts/active_learning_backlog_smoke.md >> "$GITHUB_STEP_SUMMARY"
          python3 scripts/build_weekly_quality_cycle_playbook.py \
            --backlog-report .artifacts/active_learning_backlog_smoke.json \
            --feedback-loop-status .artifacts/feedback_loop_status_smoke.json \
            --out .artifacts/weekly_quality_cycle_playbook_smoke.json \
            --summary-out .artifacts/weekly_quality_cycle_playbook_smoke.md
          cat .artifacts/weekly_quality_cycle_playbook_smoke.md >> "$GITHUB_STEP_SUMMARY"
          python3 scripts/compare_quality_cycle_reports.py \
            --baseline-report .artifacts/species_calibration_baseline_smoke.json \
            --current-report .artifacts/species_calibration_current_smoke.json \
            --min-top1-gain 0.0 \
            --min-top3-gain 0.0 \
            --max-ece-delta 0.0 \
            --out .artifacts/quality_cycle_comparison_smoke.json
          python3 scripts/report_feedback_effect.py \
            --baseline-report .artifacts/species_calibration_baseline_smoke.json \
            --current-report .artifacts/species_calibration_current_smoke.json \
            --out .artifacts/feedback_effect_smoke.json \
            --summary-out .artifacts/feedback_effect_smoke.md
          cat .artifacts/feedback_effect_smoke.md >> "$GITHUB_STEP_SUMMARY"
          cat <<'EOF' > .artifacts/similarity_behavior_summary_smoke.json
          {
            "schema": "similarity_behavior_summary@v1",
            "similarity": {
              "topk_hit_rate": 0.81,
              "p95_query_ms": 22.4
            },
            "behavior": {
              "macro_f1": 0.63
            },
            "runtime_cost": {
              "retrieval_p95_ok": true
            }
          }
          EOF
          python3 scripts/verify_similarity_behavior_summary.py \
            --report .artifacts/similarity_behavior_summary_smoke.json \
            --min-topk-hit-rate 0.6 \
            --min-behavior-macro-f1 0.4 \
            --max-retrieval-p95-ms 50.0 \
            --out .artifacts/similarity_behavior_verify_smoke.json
          cat <<'EOF' > .artifacts/runtime_slo_domain_health_smoke.json
          {
            "slo_dashboard": {
              "schema": "runtime_slo_dashboard@v1",
              "snapshot": {
                "sustained_fps_avg_24h": 8.4,
                "skipped_ratio_avg_24h": 0.018,
                "pipeline_latency_p95_ms_24h": 1100.0,
                "per_camera_warn_count_24h": 0
              },
              "status": {
                "ok": true,
                "breaches": []
              }
            }
          }
          EOF
          python3 scripts/verify_runtime_slo_dashboard.py \
            --report .artifacts/runtime_slo_domain_health_smoke.json
          echo "## Similarity/Behavior smoke gate" >> "$GITHUB_STEP_SUMMARY"
          python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
          import json
          with open(".artifacts/similarity_behavior_verify_smoke.json", "r", encoding="utf-8") as fh:
              data = json.load(fh)
          print(f"- ok: **{data.get('ok')}**")
          print(f"- errors: `{data.get('errors')}`")
          PY
          echo "## Runtime SLO smoke gate" >> "$GITHUB_STEP_SUMMARY"
          # Reached only when verify_runtime_slo_dashboard.py exited 0
          # (the default `bash -e` aborts the step on failure).
          echo "- verify_runtime_slo_dashboard.py: **PASS**" >> "$GITHUB_STEP_SUMMARY"

      - name: Verify benchmark report schema
        working-directory: ${{ github.workspace }}
        run: |
          python3 scripts/verify_benchmark_report_schema.py \
            --report .artifacts/benchmark_smoke_report.json

      - name: Compare to committed reference smoke report
        working-directory: ${{ github.workspace }}
        run: |
          python3 scripts/compare_benchmark_reports.py \
            --baseline scripts/ci/reference_smoke_report.json \
            --current .artifacts/benchmark_smoke_report.json \
            --match-by-basename \
            --tolerance 0

      - name: Upload benchmark JSON artifacts
        # Upload whatever was produced, even when an earlier step failed.
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: benchmark-smoke-reports
          path: |
            .artifacts/benchmark_smoke_report.json
            .artifacts/benchmark_sota_smoke.json
            .artifacts/benchmark_trackers_smoke.json
            .artifacts/benchmark_trackers_ab_smoke.json
            .artifacts/benchmark_trackers_ab_smoke.md
            .artifacts/track_quality_core_metrics_smoke.json
            .artifacts/track_quality_core_metrics_smoke.md
            .artifacts/track_quality_truthset_delta_smoke.json
            .artifacts/track_quality_truthset_delta_smoke.md
            .artifacts/track_failure_modes_smoke.json
            .artifacts/track_failure_modes_smoke.md
            .artifacts/parity_report_daily_smoke.json
            .artifacts/parity_report_daily_smoke.md
            .artifacts/parity_report_gate_smoke.json
            .artifacts/truthset_splits_smoke.json
            .artifacts/active_learning_backlog_smoke.json
            .artifacts/active_learning_backlog_smoke.md
            .artifacts/weekly_quality_cycle_playbook_smoke.json
            .artifacts/weekly_quality_cycle_playbook_smoke.md
            .artifacts/quality_cycle_comparison_smoke.json
            .artifacts/feedback_effect_smoke.json
            .artifacts/feedback_effect_smoke.md
            .artifacts/similarity_behavior_summary_smoke.json
            .artifacts/similarity_behavior_verify_smoke.json
            .artifacts/runtime_slo_domain_health_smoke.json
          if-no-files-found: ignore