Skip to content

Implement SOTA alignment reset and reality-check gates #305

Implement SOTA alignment reset and reality-check gates

Implement SOTA alignment reset and reality-check gates #305

# Full smoke: benchmark-track-regen on a generated mp4 inside the birdlense image (#372).
# Does not duplicate the "general" CI: runs only on changes to the processor / benchmark scripts.
name: benchmark-regen-integration
on:
  workflow_dispatch:
  pull_request:
    branches: [main, dev, ML]
    # Keep in sync with push.paths below. Besides the benchmark scripts and the
    # processor, the list includes the report/gate scripts, the weight/classifier
    # fetchers and the golden baseline that this workflow's steps invoke, so a
    # change to any of them re-runs the smoke.
    paths:
      - 'scripts/benchmark*.py'
      - 'scripts/compare_benchmark_reports.py'
      - 'scripts/verify_benchmark_report_schema.py'
      - 'scripts/ci/gen_smoke_benchmark_clip.sh'
      - 'scripts/ci/reference_smoke_report.json'
      - 'scripts/report_track_quality_core_metrics.py'
      - 'scripts/report_track_quality_truthset_delta.py'
      - 'scripts/report_track_failure_modes.py'
      - 'scripts/report_active_learning_backlog.py'
      - 'scripts/report_feedback_effect.py'
      - 'scripts/parity_runner.py'
      - 'scripts/verify_parity_report.py'
      - 'scripts/verify_similarity_behavior_summary.py'
      - 'scripts/verify_runtime_slo_dashboard.py'
      - 'scripts/freeze_truthset_splits.py'
      - 'scripts/build_weekly_quality_cycle_playbook.py'
      - 'scripts/compare_quality_cycle_reports.py'
      - 'scripts/fetch-processor-weights.sh'
      - 'scripts/download_birder_classifier.py'
      - 'benchmarks/golden_baseline_smoke.json'
      - 'app/processor/**'
      - '.github/workflows/benchmark-regen-integration.yml'
  # Direct push to ML: full docker smoke on changes to the same paths (no open PR needed).
  push:
    branches: [ML]
    paths:
      - 'scripts/benchmark*.py'
      - 'scripts/compare_benchmark_reports.py'
      - 'scripts/verify_benchmark_report_schema.py'
      - 'scripts/ci/gen_smoke_benchmark_clip.sh'
      - 'scripts/ci/reference_smoke_report.json'
      - 'scripts/report_track_quality_core_metrics.py'
      - 'scripts/report_track_quality_truthset_delta.py'
      - 'scripts/report_track_failure_modes.py'
      - 'scripts/report_active_learning_backlog.py'
      - 'scripts/report_feedback_effect.py'
      - 'scripts/parity_runner.py'
      - 'scripts/verify_parity_report.py'
      - 'scripts/verify_similarity_behavior_summary.py'
      - 'scripts/verify_runtime_slo_dashboard.py'
      - 'scripts/freeze_truthset_splits.py'
      - 'scripts/build_weekly_quality_cycle_playbook.py'
      - 'scripts/compare_quality_cycle_reports.py'
      - 'scripts/fetch-processor-weights.sh'
      - 'scripts/download_birder_classifier.py'
      - 'benchmarks/golden_baseline_smoke.json'
      - 'app/processor/**'
      - '.github/workflows/benchmark-regen-integration.yml'
# One run per workflow + ref; a newer run cancels the one still in progress.
concurrency:
  group: benchmark-regen-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  smoke:
    name: docker-benchmark-smoke
    runs-on: ubuntu-latest
    timeout-minutes: 40
    # Least privilege: the job only needs to read the repository contents.
    permissions:
      contents: read
    # `run` steps execute from app/ unless a step sets its own working-directory.
    defaults:
      run:
        working-directory: app
    steps:
      - uses: actions/checkout@v6
# Host-side ffmpeg; per the step name it is needed to generate the smoke clip.
- name: Install ffmpeg (generate smoke clip)
  run: |
    sudo apt-get update -qq
    sudo apt-get install -y --no-install-recommends ffmpeg
# Produces .artifacts/smoke_clip.mp4 at the repository root and fails the step
# if the resulting file is missing or empty (`test -s`).
- name: Generate smoke MP4
  working-directory: ${{ github.workspace }}
  run: |
    mkdir -p .artifacts
    bash scripts/ci/gen_smoke_benchmark_clip.sh .artifacts/smoke_clip.mp4
    test -s .artifacts/smoke_clip.mp4
# Best-effort cleanup of preinstalled toolchains and Docker data; every removal
# is allowed to fail (`|| true`). `df -h` records the remaining space in the log.
- name: Free disk space (Docker image)
  run: |
    sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
      /usr/local/share/powershell /opt/hostedtoolcache/CodeQL \
      /opt/hostedtoolcache/go /opt/hostedtoolcache/node \
      "$AGENT_TOOLSDIRECTORY"/Python || true
    docker system prune -af || true
    df -h
# Node 22 with the npm cache keyed on the UI lockfile.
- uses: actions/setup-node@v6
  with:
    node-version: "22"
    cache: npm
    cache-dependency-path: app/ui/package-lock.json
# Builds the UI on the runner; per the step name this is meant to let the
# Docker build skip npm.
- name: Pre-build UI (skip npm in Docker build)
  working-directory: app/ui
  run: npm ci && npm run build
# Set up Docker Buildx on the runner via the docker/setup-buildx-action action.
- uses: docker/setup-buildx-action@v4
# Runs in app/ (job default). Seeds .env from .env.example when absent, deletes
# any lines that set the listed BIRDLENSE_* inference variables, and creates the
# data directories.
- name: Prepare .env and data dirs
  run: |
    test -f .env || cp .env.example .env
    for var in BIRDLENSE_INFERENCE_BACKEND BIRDLENSE_CLASSIFIER_INFERENCE_BACKEND \
      BIRDLENSE_OPENVINO_BINARY_ENABLED BIRDLENSE_INFERENCE_DEVICE BIRDLENSE_CLASSIFIER_INFERENCE_DEVICE; do
      sed -i "/^${var}=/d" .env || true
    done
    mkdir -p data/db data/recordings
# Fetches the processor weights (up to 3 attempts with a linear back-off) and
# then creates the alias symlinks for the detection/classification weights.
- name: Processor .pt weights
  working-directory: ${{ github.workspace }}
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    BIRDLENSE_FETCH_CLASSIFIER_OPTIONAL: "1"
  run: |
    set -euo pipefail

    # Create the symlink aliases next to the fetched best.pt files.
    # Uses $GITHUB_WORKSPACE instead of interpolating an expression into the script.
    link_weights() {
      local W="$GITHUB_WORKSPACE/app/processor/models/detection/weights"
      local C="$GITHUB_WORKSPACE/app/processor/models/classification/weights"
      mkdir -p "$W" "$C"
      ln -sf best.pt "$W/trapper_ai_v02_2024.pt"
      # Fall back to the detection weights when no classifier best.pt was fetched.
      if [[ ! -s "$C/best.pt" && -s "$W/best.pt" ]]; then
        ln -sf ../../detection/weights/best.pt "$C/best.pt"
      fi
      if [[ -s "$C/best.pt" && ! -e "$C/convnext_v2_tiny_eu-common256px.pt" ]]; then
        ln -sf best.pt "$C/convnext_v2_tiny_eu-common256px.pt"
      fi
    }

    max_attempts=3
    for attempt in $(seq 1 "$max_attempts"); do
      if ./scripts/fetch-processor-weights.sh; then
        link_weights
        exit 0
      fi
      # Back off only when another attempt follows; do not sleep before failing.
      if (( attempt < max_attempts )); then
        delay=$((attempt * 10))
        echo "Fetch attempt ${attempt}/${max_attempts} failed; retrying in ${delay}s" >&2
        sleep "$delay"
      fi
    done
    echo "Failed to fetch processor weights" >&2
    exit 1
# Downloads the Birder EU classifier unless its metadata (class_labels.txt or
# birdlense_manifest.json) is already present, then verifies the download.
- name: Ensure Birder EU classifier for benchmark smoke
  working-directory: ${{ github.workspace }}
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  run: |
    set -euo pipefail
    C="app/processor/models/classification/weights"
    OV="$C/convnext_v2_tiny_eu-common256px_openvino_model"

    # True when either metadata file exists; shared by the pre- and post-check
    # so both use the same condition (replaces the obsolescent `test ... -o ...`).
    has_birder_metadata() {
      [[ -f "$OV/class_labels.txt" || -f "$OV/birdlense_manifest.json" ]]
    }

    if has_birder_metadata; then
      echo "Birder EU metadata already present"
      exit 0
    fi
    pip install -q huggingface_hub birder torch
    python3 scripts/download_birder_classifier.py --variant convnext_v2_tiny_eu-common256px
    if ! has_birder_metadata; then
      echo "Birder EU metadata missing after download: $OV" >&2
      exit 1
    fi
# Best-effort cleanup of pip caches and Docker build/image data right before the
# image build; each command may fail without failing the step.
- name: Reclaim disk before Docker build
  run: |
    pip cache purge || true
    rm -rf "$HOME/.cache/pip" /tmp/pip-* || true
    docker builder prune -af || true
    docker image prune -af || true
    df -h
# Builds the `birdlense` compose service from app/ (job default working directory).
- name: Build birdlense image
  env:
    DOCKER_BUILDKIT: 1
    COMPOSE_DOCKER_CLI_BUILD: 1
  run: docker compose build birdlense
# Runs benchmark-track-regen.py inside the birdlense container with the repo
# mounted at /workspace; the report is written back to .artifacts/ on the host.
- name: Run benchmark-track-regen on smoke clip
  env:
    WS: ${{ github.workspace }}
  run: |
    docker compose run --rm \
      -v "${WS}:/workspace:rw" \
      birdlense \
      bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
        python3 /workspace/scripts/benchmark-track-regen.py \
          --video /workspace/.artifacts/smoke_clip.mp4 \
          --frame-step 24 \
          --max-runtime-sec 300 \
          --write-report /workspace/.artifacts/benchmark_smoke_report.json'
# Runs benchmark_sota.py in --smoke mode inside the container; the same
# generated clip is passed for both --clip-1816 and --clip-1819.
- name: Run SOTA golden benchmark harness (smoke)
  env:
    WS: ${{ github.workspace }}
  run: |
    docker compose run --rm \
      -v "${WS}:/workspace:rw" \
      birdlense \
      bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
        python3 /workspace/scripts/benchmark_sota.py \
          --smoke \
          --clip-1816 /workspace/.artifacts/smoke_clip.mp4 \
          --clip-1819 /workspace/.artifacts/smoke_clip.mp4 \
          --frame-step 24 \
          --max-runtime-sec 300 \
          --write-report /workspace/.artifacts/benchmark_sota_smoke.json'
# Runs benchmark_trackers.py inside the container for the two listed presets.
- name: Run benchmark_trackers on smoke clip
  env:
    WS: ${{ github.workspace }}
  run: |
    docker compose run --rm \
      -v "${WS}:/workspace:rw" \
      birdlense \
      bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
        python3 /workspace/scripts/benchmark_trackers.py \
          --clip /workspace/.artifacts/smoke_clip.mp4 \
          --presets "bytetrack_birdlense,botsort_birdlense" \
          --frame-step 24 \
          --max-runtime-sec 300 \
          --write-report /workspace/.artifacts/benchmark_trackers_smoke.json'
# Host-side: builds the tracker A/B report with bytetrack_birdlense as the
# baseline and appends the Markdown summary to the job summary.
- name: Build tracker A/B summary report
  working-directory: ${{ github.workspace }}
  run: |
    python3 scripts/benchmark_trackers_report.py \
      --trackers-report .artifacts/benchmark_trackers_smoke.json \
      --baseline-preset bytetrack_birdlense \
      --out .artifacts/benchmark_trackers_ab_smoke.json \
      --summary-out .artifacts/benchmark_trackers_ab_smoke.md
    cat .artifacts/benchmark_trackers_ab_smoke.md >> "$GITHUB_STEP_SUMMARY"
# Host-side: builds the core metrics report from the SOTA smoke report and the
# committed golden baseline, then appends its Markdown to the job summary.
- name: Build track quality core metrics report (HOTA/IDF1/IDSW/fragmentation)
  working-directory: ${{ github.workspace }}
  run: |
    python3 scripts/report_track_quality_core_metrics.py \
      --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
      --baseline benchmarks/golden_baseline_smoke.json \
      --out .artifacts/track_quality_core_metrics_smoke.json \
      --summary-out .artifacts/track_quality_core_metrics_smoke.md
    cat .artifacts/track_quality_core_metrics_smoke.md >> "$GITHUB_STEP_SUMMARY"
# Host-side: builds the truth-set delta report from the golden baseline and the
# SOTA smoke report, then appends its Markdown to the job summary.
- name: Build truth-set before/after delta report
  working-directory: ${{ github.workspace }}
  run: |
    python3 scripts/report_track_quality_truthset_delta.py \
      --baseline benchmarks/golden_baseline_smoke.json \
      --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
      --out .artifacts/track_quality_truthset_delta_smoke.json \
      --summary-out .artifacts/track_quality_truthset_delta_smoke.md
    cat .artifacts/track_quality_truthset_delta_smoke.md >> "$GITHUB_STEP_SUMMARY"
# Host-side: combines the SOTA, tracker A/B and core-metrics reports produced by
# earlier steps into the failure-modes report and appends its Markdown summary.
- name: Build failure modes + mitigation report
  working-directory: ${{ github.workspace }}
  run: |
    python3 scripts/report_track_failure_modes.py \
      --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
      --benchmark-trackers-ab-report .artifacts/benchmark_trackers_ab_smoke.json \
      --track-quality-core-report .artifacts/track_quality_core_metrics_smoke.json \
      --out .artifacts/track_failure_modes_smoke.json \
      --summary-out .artifacts/track_failure_modes_smoke.md
    cat .artifacts/track_failure_modes_smoke.md >> "$GITHUB_STEP_SUMMARY"
# Host-side: feeds the five smoke reports produced above into parity_runner.py
# with --period daily and appends the resulting Markdown to the job summary.
- name: Build unified parity report (daily)
  working-directory: ${{ github.workspace }}
  run: |
    python3 scripts/parity_runner.py \
      --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
      --core-metrics-report .artifacts/track_quality_core_metrics_smoke.json \
      --truthset-delta-report .artifacts/track_quality_truthset_delta_smoke.json \
      --failure-modes-report .artifacts/track_failure_modes_smoke.json \
      --tracker-ab-report .artifacts/benchmark_trackers_ab_smoke.json \
      --period daily \
      --out .artifacts/parity_report_daily_smoke.json \
      --summary-out .artifacts/parity_report_daily_smoke.md
    cat .artifacts/parity_report_daily_smoke.md >> "$GITHUB_STEP_SUMMARY"
# Gate: runs verify_parity_report.py on the daily parity report with the smoke
# thresholds given below and writes the gate result to .artifacts/.
- name: Verify parity report gate
  working-directory: ${{ github.workspace }}
  run: |
    python3 scripts/verify_parity_report.py \
      --report .artifacts/parity_report_daily_smoke.json \
      --min-precision-proxy 0.5 \
      --min-recall-proxy 0.1 \
      --max-unknown-share 0.95 \
      --out .artifacts/parity_report_gate_smoke.json
# Writes an inline six-clip seed fixture and runs freeze_truthset_splits.py on
# it with --min-clips 6 (exactly the fixture size).
- name: Freeze truth-set splits (smoke contract)
  working-directory: ${{ github.workspace }}
  run: |
    cat <<'EOF' > .artifacts/truthset_seed_smoke.json
    {
      "clips": [
        {"clip_id":"c1","day_night":"day","weather":"clear"},
        {"clip_id":"c2","day_night":"night","weather":"rain"},
        {"clip_id":"c3","day_night":"day","weather":"cloudy"},
        {"clip_id":"c4","day_night":"night","weather":"rain"},
        {"clip_id":"c5","day_night":"day","weather":"clear"},
        {"clip_id":"c6","day_night":"night","weather":"clear"}
      ]
    }
    EOF
    python3 scripts/freeze_truthset_splits.py \
      --input .artifacts/truthset_seed_smoke.json \
      --min-clips 6 \
      --out .artifacts/truthset_splits_smoke.json
# Contract smoke for the quality-cycle tooling. Most inputs are inline JSON
# fixtures written by this step; only the truth-set delta report comes from an
# earlier step. The scripts run in order and any non-zero exit fails the step.
- name: Build active-learning backlog + weekly playbook + comparison (smoke)
  working-directory: ${{ github.workspace }}
  run: |
    # Fixture: track-quality regression report.
    cat <<'EOF' > .artifacts/track_quality_regression_smoke.json
    {
      "schema": "track_quality_regression_report@v1",
      "metrics": {
        "parity_mismatch_rate_24h": 0.31,
        "track_id_switch_rate_24h": 0.06
      }
    }
    EOF
    # Fixture: classifier calibration report used as the baseline.
    cat <<'EOF' > .artifacts/species_calibration_baseline_smoke.json
    {
      "schema": "classifier_calibration_report@v1",
      "topk_metrics": {
        "top1_before": 0.56,
        "top3_proxy_before": 0.70,
        "false_species_rate_before": 0.44
      },
      "calibration_metrics": {"ece": 0.22},
      "unknown_ood_dashboard": {
        "unknown_policy": {"unknown_share_after_policy": 0.33}
      },
      "top_confusion_pairs": [
        {"from":"Wood Mouse","to":"Great Tit","count":6}
      ]
    }
    EOF
    # Fixture: classifier calibration report used as the current run.
    cat <<'EOF' > .artifacts/species_calibration_current_smoke.json
    {
      "schema": "classifier_calibration_report@v1",
      "topk_metrics": {
        "top1_before": 0.61,
        "top3_proxy_before": 0.76,
        "false_species_rate_before": 0.35
      },
      "calibration_metrics": {"ece": 0.18},
      "unknown_ood_dashboard": {
        "unknown_policy": {"unknown_share_after_policy": 0.29}
      },
      "top_confusion_pairs": [
        {"from":"Wood Mouse","to":"Great Tit","count":4}
      ]
    }
    EOF
    # Fixture: feedback-loop status.
    cat <<'EOF' > .artifacts/feedback_loop_status_smoke.json
    {
      "schema": "feedback_loop_status@v1",
      "events_total": 42
    }
    EOF
    # Active-learning backlog, then the weekly playbook derived from it.
    python3 scripts/report_active_learning_backlog.py \
      --track-regression-report .artifacts/track_quality_regression_smoke.json \
      --species-calibration-report .artifacts/species_calibration_current_smoke.json \
      --truthset-delta-report .artifacts/track_quality_truthset_delta_smoke.json \
      --out .artifacts/active_learning_backlog_smoke.json \
      --summary-out .artifacts/active_learning_backlog_smoke.md
    cat .artifacts/active_learning_backlog_smoke.md >> "$GITHUB_STEP_SUMMARY"
    python3 scripts/build_weekly_quality_cycle_playbook.py \
      --backlog-report .artifacts/active_learning_backlog_smoke.json \
      --feedback-loop-status .artifacts/feedback_loop_status_smoke.json \
      --out .artifacts/weekly_quality_cycle_playbook_smoke.json \
      --summary-out .artifacts/weekly_quality_cycle_playbook_smoke.md
    cat .artifacts/weekly_quality_cycle_playbook_smoke.md >> "$GITHUB_STEP_SUMMARY"
    # Baseline vs current calibration: comparison gate and feedback-effect report.
    python3 scripts/compare_quality_cycle_reports.py \
      --baseline-report .artifacts/species_calibration_baseline_smoke.json \
      --current-report .artifacts/species_calibration_current_smoke.json \
      --min-top1-gain 0.0 \
      --min-top3-gain 0.0 \
      --max-ece-delta 0.0 \
      --out .artifacts/quality_cycle_comparison_smoke.json
    python3 scripts/report_feedback_effect.py \
      --baseline-report .artifacts/species_calibration_baseline_smoke.json \
      --current-report .artifacts/species_calibration_current_smoke.json \
      --out .artifacts/feedback_effect_smoke.json \
      --summary-out .artifacts/feedback_effect_smoke.md
    cat .artifacts/feedback_effect_smoke.md >> "$GITHUB_STEP_SUMMARY"
    # Fixture + gate: similarity/behavior summary.
    cat <<'EOF' > .artifacts/similarity_behavior_summary_smoke.json
    {
      "schema": "similarity_behavior_summary@v1",
      "similarity": {
        "topk_hit_rate": 0.81,
        "p95_query_ms": 22.4
      },
      "behavior": {
        "macro_f1": 0.63
      },
      "runtime_cost": {
        "retrieval_p95_ok": true
      }
    }
    EOF
    python3 scripts/verify_similarity_behavior_summary.py \
      --report .artifacts/similarity_behavior_summary_smoke.json \
      --min-topk-hit-rate 0.6 \
      --min-behavior-macro-f1 0.4 \
      --max-retrieval-p95-ms 50.0 \
      --out .artifacts/similarity_behavior_verify_smoke.json
    # Fixture + gate: runtime SLO dashboard.
    cat <<'EOF' > .artifacts/runtime_slo_domain_health_smoke.json
    {
      "slo_dashboard": {
        "schema": "runtime_slo_dashboard@v1",
        "snapshot": {
          "sustained_fps_avg_24h": 8.4,
          "skipped_ratio_avg_24h": 0.018,
          "pipeline_latency_p95_ms_24h": 1100.0,
          "per_camera_warn_count_24h": 0
        },
        "status": {
          "ok": true,
          "breaches": []
        }
      }
    }
    EOF
    python3 scripts/verify_runtime_slo_dashboard.py \
      --report .artifacts/runtime_slo_domain_health_smoke.json
    # Job summary: echo the similarity gate's ok/errors fields from its output file.
    echo "## Similarity/Behavior smoke gate" >> "$GITHUB_STEP_SUMMARY"
    python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
    import json
    data = json.load(open(".artifacts/similarity_behavior_verify_smoke.json", "r", encoding="utf-8"))
    print(f"- ok: **{data.get('ok')}**")
    print(f"- errors: `{data.get('errors')}`")
    PY
    # This line is only reached after verify_runtime_slo_dashboard.py was invoked above.
    echo "## Runtime SLO smoke gate" >> "$GITHUB_STEP_SUMMARY"
    echo "- verify_runtime_slo_dashboard.py: **PASS**" >> "$GITHUB_STEP_SUMMARY"
# Validates the benchmark-track-regen smoke report with the schema checker script.
- name: Verify benchmark report schema
  working-directory: ${{ github.workspace }}
  run: |
    python3 scripts/verify_benchmark_report_schema.py \
      --report .artifacts/benchmark_smoke_report.json
# Compares the fresh smoke report against the committed reference report,
# matching entries by basename with a tolerance of 0.
- name: Compare to committed reference smoke report
  working-directory: ${{ github.workspace }}
  run: |
    python3 scripts/compare_benchmark_reports.py \
      --baseline scripts/ci/reference_smoke_report.json \
      --current .artifacts/benchmark_smoke_report.json \
      --match-by-basename \
      --tolerance 0
# Runs even when an earlier step failed (`if: always()`), so whatever reports
# exist are uploaded; files that were never produced are ignored.
- name: Upload benchmark JSON artifacts
  if: always()
  uses: actions/upload-artifact@v6
  with:
    name: benchmark-smoke-reports
    if-no-files-found: ignore
    path: |
      .artifacts/benchmark_smoke_report.json
      .artifacts/benchmark_sota_smoke.json
      .artifacts/benchmark_trackers_smoke.json
      .artifacts/benchmark_trackers_ab_smoke.json
      .artifacts/benchmark_trackers_ab_smoke.md
      .artifacts/track_quality_core_metrics_smoke.json
      .artifacts/track_quality_core_metrics_smoke.md
      .artifacts/track_quality_truthset_delta_smoke.json
      .artifacts/track_quality_truthset_delta_smoke.md
      .artifacts/track_failure_modes_smoke.json
      .artifacts/track_failure_modes_smoke.md
      .artifacts/parity_report_daily_smoke.json
      .artifacts/parity_report_daily_smoke.md
      .artifacts/parity_report_gate_smoke.json
      .artifacts/truthset_splits_smoke.json
      .artifacts/active_learning_backlog_smoke.json
      .artifacts/active_learning_backlog_smoke.md
      .artifacts/weekly_quality_cycle_playbook_smoke.json
      .artifacts/weekly_quality_cycle_playbook_smoke.md
      .artifacts/quality_cycle_comparison_smoke.json
      .artifacts/feedback_effect_smoke.json
      .artifacts/feedback_effect_smoke.md
      .artifacts/similarity_behavior_summary_smoke.json
      .artifacts/similarity_behavior_verify_smoke.json
      .artifacts/runtime_slo_domain_health_smoke.json