Implement SOTA alignment reset and reality-check gates #305

Workflow file for this run

.github/workflows/benchmark-regen-integration.yml at f159c94

	# Full smoke test: benchmark-track-regen on a generated mp4 inside the birdlense image (#372).
	# Does not duplicate the "general" CI: runs only on changes to the processor / benchmark scripts.
	name: benchmark-regen-integration

	on:
	  # Manual runs from the Actions tab.
	  workflow_dispatch:
	  # The two `paths` lists below are identical; keep them in sync when editing.
	  # NOTE(review): the job also executes scripts/report_*.py, scripts/parity_runner.py,
	  # scripts/verify_parity_report.py and similar helpers that these filters do not
	  # match — confirm that leaving them out is intended.
	  pull_request:
	    branches: [main, dev, ML]
	    paths:
	      - 'scripts/benchmark*.py'
	      - 'scripts/compare_benchmark_reports.py'
	      - 'scripts/verify_benchmark_report_schema.py'
	      - 'scripts/ci/gen_smoke_benchmark_clip.sh'
	      - 'scripts/ci/reference_smoke_report.json'
	      - 'app/processor/**'
	      - '.github/workflows/benchmark-regen-integration.yml'
	  # Direct push to ML: full docker smoke on changes under the same paths (no open PR required).
	  push:
	    branches: [ML]
	    paths:
	      - 'scripts/benchmark*.py'
	      - 'scripts/compare_benchmark_reports.py'
	      - 'scripts/verify_benchmark_report_schema.py'
	      - 'scripts/ci/gen_smoke_benchmark_clip.sh'
	      - 'scripts/ci/reference_smoke_report.json'
	      - 'app/processor/**'
	      - '.github/workflows/benchmark-regen-integration.yml'

	concurrency:
	  # One active run per workflow + ref; a newer run cancels the one still in progress.
	  group: benchmark-regen-${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	jobs:
	  # Single job: generate a smoke clip, build the birdlense image, run the
	  # benchmark scripts inside it, then build and verify the derived reports.
	  smoke:
	    name: docker-benchmark-smoke
	    runs-on: ubuntu-latest
	    timeout-minutes: 40
	    defaults:
	      run:
	        # Steps that need the repository root override this per step.
	        working-directory: app
	    steps:
	      - uses: actions/checkout@v6

	      - name: Install ffmpeg (generate smoke clip)
	        run: |
	          sudo apt-get update -qq
	          sudo apt-get install -y --no-install-recommends ffmpeg

	      - name: Generate smoke MP4
	        working-directory: ${{ github.workspace }}
	        run: |
	          mkdir -p .artifacts
	          bash scripts/ci/gen_smoke_benchmark_clip.sh .artifacts/smoke_clip.mp4
	          # Fail early if the generator produced no file or an empty one.
	          test -s .artifacts/smoke_clip.mp4

	      # Best effort (every command tolerates failure): delete preinstalled
	      # toolchains and unused Docker data, then print the remaining space.
	      - name: Free disk space (Docker image)
	        run: |
	          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
	            /usr/local/share/powershell /opt/hostedtoolcache/CodeQL \
	            /opt/hostedtoolcache/go /opt/hostedtoolcache/node \
	            "$AGENT_TOOLSDIRECTORY"/Python || true
	          docker system prune -af || true
	          df -h

	      - uses: actions/setup-node@v6
	        with:
	          node-version: "22"
	          cache: npm
	          cache-dependency-path: app/ui/package-lock.json

	      - name: Pre-build UI (skip npm in Docker build)
	        working-directory: app/ui
	        run: npm ci && npm run build

	      - uses: docker/setup-buildx-action@v4

	      - name: Prepare .env and data dirs
	        run: |
	          test -f .env || cp .env.example .env
	          # Strip any assignment of these variables from .env.
	          # NOTE(review): presumably so the image defaults apply — confirm.
	          for var in BIRDLENSE_INFERENCE_BACKEND BIRDLENSE_CLASSIFIER_INFERENCE_BACKEND \
	            BIRDLENSE_OPENVINO_BINARY_ENABLED BIRDLENSE_INFERENCE_DEVICE BIRDLENSE_CLASSIFIER_INFERENCE_DEVICE; do
	            sed -i "/^${var}=/d" .env || true
	          done
	          mkdir -p data/db data/recordings

	      - name: Processor .pt weights
	        working-directory: ${{ github.workspace }}
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	          BIRDLENSE_FETCH_CLASSIFIER_OPTIONAL: "1"
	        run: |
	          set -euo pipefail
	          # Create the alias symlinks under the detection / classification
	          # weights directories once the fetch script has succeeded.
	          link_weights() {
	            # Use the runner-provided variable instead of interpolating an
	            # expression into the script text.
	            W="${GITHUB_WORKSPACE}/app/processor/models/detection/weights"
	            C="${GITHUB_WORKSPACE}/app/processor/models/classification/weights"
	            mkdir -p "$W" "$C"
	            ln -sf best.pt "$W/trapper_ai_v02_2024.pt"
	            # Fall back to the detection weights when no classifier best.pt exists.
	            if [[ ! -s "$C/best.pt" && -s "$W/best.pt" ]]; then
	              ln -sf ../../detection/weights/best.pt "$C/best.pt"
	            fi
	            if [[ -s "$C/best.pt" && ! -e "$C/convnext_v2_tiny_eu-common256px.pt" ]]; then
	              ln -sf best.pt "$C/convnext_v2_tiny_eu-common256px.pt"
	            fi
	          }
	          # Up to three attempts, pausing 10 s then 20 s between them; no
	          # pause after the last attempt, so a hard failure is reported at once.
	          for attempt in 1 2 3; do
	            if ./scripts/fetch-processor-weights.sh; then
	              link_weights
	              exit 0
	            fi
	            if (( attempt < 3 )); then
	              sleep $((attempt * 10))
	            fi
	          done
	          echo "Failed to fetch processor weights" >&2
	          exit 1

	      - name: Ensure Birder EU classifier for benchmark smoke
	        working-directory: ${{ github.workspace }}
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	        run: |
	          set -euo pipefail
	          C="app/processor/models/classification/weights"
	          OV="$C/convnext_v2_tiny_eu-common256px_openvino_model"
	          # Skip the download when either metadata file is already there.
	          if [[ -f "$OV/class_labels.txt" || -f "$OV/birdlense_manifest.json" ]]; then
	            echo "Birder EU metadata already present"
	            exit 0
	          fi
	          pip install -q huggingface_hub birder torch
	          python3 scripts/download_birder_classifier.py --variant convnext_v2_tiny_eu-common256px
	          # Same check as above; fails the step if the download left neither file.
	          [[ -f "$OV/class_labels.txt" || -f "$OV/birdlense_manifest.json" ]]

	      # Best effort: drop pip and Docker caches before the image build.
	      - name: Reclaim disk before Docker build
	        run: |
	          pip cache purge || true
	          rm -rf "$HOME/.cache/pip" /tmp/pip-* || true
	          docker builder prune -af || true
	          docker image prune -af || true
	          df -h

	      - name: Build birdlense image
	        env:
	          DOCKER_BUILDKIT: "1"
	          COMPOSE_DOCKER_CLI_BUILD: "1"
	        run: docker compose build birdlense

	      # The three benchmark steps below mount the checkout at /workspace so the
	      # container reads the scripts and the clip from it and writes reports back.
	      - name: Run benchmark-track-regen on smoke clip
	        env:
	          WS: ${{ github.workspace }}
	        run: |
	          docker compose run --rm \
	            -v "${WS}:/workspace:rw" \
	            birdlense \
	            bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
	              python3 /workspace/scripts/benchmark-track-regen.py \
	                --video /workspace/.artifacts/smoke_clip.mp4 \
	                --frame-step 24 \
	                --max-runtime-sec 300 \
	                --write-report /workspace/.artifacts/benchmark_smoke_report.json'

	      - name: Run SOTA golden benchmark harness (smoke)
	        env:
	          WS: ${{ github.workspace }}
	        run: |
	          # Both clip arguments point at the same generated smoke clip.
	          docker compose run --rm \
	            -v "${WS}:/workspace:rw" \
	            birdlense \
	            bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
	              python3 /workspace/scripts/benchmark_sota.py \
	                --smoke \
	                --clip-1816 /workspace/.artifacts/smoke_clip.mp4 \
	                --clip-1819 /workspace/.artifacts/smoke_clip.mp4 \
	                --frame-step 24 \
	                --max-runtime-sec 300 \
	                --write-report /workspace/.artifacts/benchmark_sota_smoke.json'

	      - name: Run benchmark_trackers on smoke clip
	        env:
	          WS: ${{ github.workspace }}
	        run: |
	          docker compose run --rm \
	            -v "${WS}:/workspace:rw" \
	            birdlense \
	            bash -lc 'export PYTHONPATH=/app:/app/web:/app/processor/src && \
	              python3 /workspace/scripts/benchmark_trackers.py \
	                --clip /workspace/.artifacts/smoke_clip.mp4 \
	                --presets "bytetrack_birdlense,botsort_birdlense" \
	                --frame-step 24 \
	                --max-runtime-sec 300 \
	                --write-report /workspace/.artifacts/benchmark_trackers_smoke.json'

	      # From here on the steps run on the host from the repository root; each
	      # report step appends its Markdown summary to the job summary.
	      - name: Build tracker A/B summary report
	        working-directory: ${{ github.workspace }}
	        run: |
	          python3 scripts/benchmark_trackers_report.py \
	            --trackers-report .artifacts/benchmark_trackers_smoke.json \
	            --baseline-preset bytetrack_birdlense \
	            --out .artifacts/benchmark_trackers_ab_smoke.json \
	            --summary-out .artifacts/benchmark_trackers_ab_smoke.md
	          cat .artifacts/benchmark_trackers_ab_smoke.md >> "$GITHUB_STEP_SUMMARY"

	      - name: Build track quality core metrics report (HOTA/IDF1/IDSW/fragmentation)
	        working-directory: ${{ github.workspace }}
	        run: |
	          python3 scripts/report_track_quality_core_metrics.py \
	            --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
	            --baseline benchmarks/golden_baseline_smoke.json \
	            --out .artifacts/track_quality_core_metrics_smoke.json \
	            --summary-out .artifacts/track_quality_core_metrics_smoke.md
	          cat .artifacts/track_quality_core_metrics_smoke.md >> "$GITHUB_STEP_SUMMARY"

	      - name: Build truth-set before/after delta report
	        working-directory: ${{ github.workspace }}
	        run: |
	          python3 scripts/report_track_quality_truthset_delta.py \
	            --baseline benchmarks/golden_baseline_smoke.json \
	            --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
	            --out .artifacts/track_quality_truthset_delta_smoke.json \
	            --summary-out .artifacts/track_quality_truthset_delta_smoke.md
	          cat .artifacts/track_quality_truthset_delta_smoke.md >> "$GITHUB_STEP_SUMMARY"

	      - name: Build failure modes + mitigation report
	        working-directory: ${{ github.workspace }}
	        run: |
	          python3 scripts/report_track_failure_modes.py \
	            --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
	            --benchmark-trackers-ab-report .artifacts/benchmark_trackers_ab_smoke.json \
	            --track-quality-core-report .artifacts/track_quality_core_metrics_smoke.json \
	            --out .artifacts/track_failure_modes_smoke.json \
	            --summary-out .artifacts/track_failure_modes_smoke.md
	          cat .artifacts/track_failure_modes_smoke.md >> "$GITHUB_STEP_SUMMARY"

	      # Combines the reports produced by the preceding steps.
	      - name: Build unified parity report (daily)
	        working-directory: ${{ github.workspace }}
	        run: |
	          python3 scripts/parity_runner.py \
	            --benchmark-sota-report .artifacts/benchmark_sota_smoke.json \
	            --core-metrics-report .artifacts/track_quality_core_metrics_smoke.json \
	            --truthset-delta-report .artifacts/track_quality_truthset_delta_smoke.json \
	            --failure-modes-report .artifacts/track_failure_modes_smoke.json \
	            --tracker-ab-report .artifacts/benchmark_trackers_ab_smoke.json \
	            --period daily \
	            --out .artifacts/parity_report_daily_smoke.json \
	            --summary-out .artifacts/parity_report_daily_smoke.md
	          cat .artifacts/parity_report_daily_smoke.md >> "$GITHUB_STEP_SUMMARY"

	      - name: Verify parity report gate
	        working-directory: ${{ github.workspace }}
	        run: |
	          python3 scripts/verify_parity_report.py \
	            --report .artifacts/parity_report_daily_smoke.json \
	            --min-precision-proxy 0.5 \
	            --min-recall-proxy 0.1 \
	            --max-unknown-share 0.95 \
	            --out .artifacts/parity_report_gate_smoke.json

	      # Feeds a fixed six-clip seed (inline below) to the split freezer.
	      - name: Freeze truth-set splits (smoke contract)
	        working-directory: ${{ github.workspace }}
	        run: |
	          cat <<'EOF' > .artifacts/truthset_seed_smoke.json
	          {
	            "clips": [
	              {"clip_id":"c1","day_night":"day","weather":"clear"},
	              {"clip_id":"c2","day_night":"night","weather":"rain"},
	              {"clip_id":"c3","day_night":"day","weather":"cloudy"},
	              {"clip_id":"c4","day_night":"night","weather":"rain"},
	              {"clip_id":"c5","day_night":"day","weather":"clear"},
	              {"clip_id":"c6","day_night":"night","weather":"clear"}
	            ]
	          }
	          EOF
	          python3 scripts/freeze_truthset_splits.py \
	            --input .artifacts/truthset_seed_smoke.json \
	            --min-clips 6 \
	            --out .artifacts/truthset_splits_smoke.json

	      # Runs the quality-cycle scripts against hand-written fixture JSON (the
	      # heredocs below); only the truth-set delta report comes from an earlier step.
	      - name: Build active-learning backlog + weekly playbook + comparison (smoke)
	        working-directory: ${{ github.workspace }}
	        run: |
	          # --- Fixtures: regression report, calibration baseline/current, feedback status ---
	          cat <<'EOF' > .artifacts/track_quality_regression_smoke.json
	          {
	            "schema": "track_quality_regression_report@v1",
	            "metrics": {
	              "parity_mismatch_rate_24h": 0.31,
	              "track_id_switch_rate_24h": 0.06
	            }
	          }
	          EOF
	          cat <<'EOF' > .artifacts/species_calibration_baseline_smoke.json
	          {
	            "schema": "classifier_calibration_report@v1",
	            "topk_metrics": {
	              "top1_before": 0.56,
	              "top3_proxy_before": 0.70,
	              "false_species_rate_before": 0.44
	            },
	            "calibration_metrics": {"ece": 0.22},
	            "unknown_ood_dashboard": {
	              "unknown_policy": {"unknown_share_after_policy": 0.33}
	            },
	            "top_confusion_pairs": [
	              {"from":"Wood Mouse","to":"Great Tit","count":6}
	            ]
	          }
	          EOF
	          cat <<'EOF' > .artifacts/species_calibration_current_smoke.json
	          {
	            "schema": "classifier_calibration_report@v1",
	            "topk_metrics": {
	              "top1_before": 0.61,
	              "top3_proxy_before": 0.76,
	              "false_species_rate_before": 0.35
	            },
	            "calibration_metrics": {"ece": 0.18},
	            "unknown_ood_dashboard": {
	              "unknown_policy": {"unknown_share_after_policy": 0.29}
	            },
	            "top_confusion_pairs": [
	              {"from":"Wood Mouse","to":"Great Tit","count":4}
	            ]
	          }
	          EOF
	          cat <<'EOF' > .artifacts/feedback_loop_status_smoke.json
	          {
	            "schema": "feedback_loop_status@v1",
	            "events_total": 42
	          }
	          EOF
	          # --- Backlog, playbook, comparison and feedback-effect reports ---
	          python3 scripts/report_active_learning_backlog.py \
	            --track-regression-report .artifacts/track_quality_regression_smoke.json \
	            --species-calibration-report .artifacts/species_calibration_current_smoke.json \
	            --truthset-delta-report .artifacts/track_quality_truthset_delta_smoke.json \
	            --out .artifacts/active_learning_backlog_smoke.json \
	            --summary-out .artifacts/active_learning_backlog_smoke.md
	          cat .artifacts/active_learning_backlog_smoke.md >> "$GITHUB_STEP_SUMMARY"
	          python3 scripts/build_weekly_quality_cycle_playbook.py \
	            --backlog-report .artifacts/active_learning_backlog_smoke.json \
	            --feedback-loop-status .artifacts/feedback_loop_status_smoke.json \
	            --out .artifacts/weekly_quality_cycle_playbook_smoke.json \
	            --summary-out .artifacts/weekly_quality_cycle_playbook_smoke.md
	          cat .artifacts/weekly_quality_cycle_playbook_smoke.md >> "$GITHUB_STEP_SUMMARY"
	          python3 scripts/compare_quality_cycle_reports.py \
	            --baseline-report .artifacts/species_calibration_baseline_smoke.json \
	            --current-report .artifacts/species_calibration_current_smoke.json \
	            --min-top1-gain 0.0 \
	            --min-top3-gain 0.0 \
	            --max-ece-delta 0.0 \
	            --out .artifacts/quality_cycle_comparison_smoke.json
	          python3 scripts/report_feedback_effect.py \
	            --baseline-report .artifacts/species_calibration_baseline_smoke.json \
	            --current-report .artifacts/species_calibration_current_smoke.json \
	            --out .artifacts/feedback_effect_smoke.json \
	            --summary-out .artifacts/feedback_effect_smoke.md
	          cat .artifacts/feedback_effect_smoke.md >> "$GITHUB_STEP_SUMMARY"
	          # --- Similarity/behavior gate on a fixture summary ---
	          cat <<'EOF' > .artifacts/similarity_behavior_summary_smoke.json
	          {
	            "schema": "similarity_behavior_summary@v1",
	            "similarity": {
	              "topk_hit_rate": 0.81,
	              "p95_query_ms": 22.4
	            },
	            "behavior": {
	              "macro_f1": 0.63
	            },
	            "runtime_cost": {
	              "retrieval_p95_ok": true
	            }
	          }
	          EOF
	          python3 scripts/verify_similarity_behavior_summary.py \
	            --report .artifacts/similarity_behavior_summary_smoke.json \
	            --min-topk-hit-rate 0.6 \
	            --min-behavior-macro-f1 0.4 \
	            --max-retrieval-p95-ms 50.0 \
	            --out .artifacts/similarity_behavior_verify_smoke.json
	          # --- Runtime SLO gate on a fixture dashboard ---
	          cat <<'EOF' > .artifacts/runtime_slo_domain_health_smoke.json
	          {
	            "slo_dashboard": {
	              "schema": "runtime_slo_dashboard@v1",
	              "snapshot": {
	                "sustained_fps_avg_24h": 8.4,
	                "skipped_ratio_avg_24h": 0.018,
	                "pipeline_latency_p95_ms_24h": 1100.0,
	                "per_camera_warn_count_24h": 0
	              },
	              "status": {
	                "ok": true,
	                "breaches": []
	              }
	            }
	          }
	          EOF
	          python3 scripts/verify_runtime_slo_dashboard.py \
	            --report .artifacts/runtime_slo_domain_health_smoke.json
	          # --- Job summary for the two gates ---
	          echo "## Similarity/Behavior smoke gate" >> "$GITHUB_STEP_SUMMARY"
	          python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
	          import json
	          with open(".artifacts/similarity_behavior_verify_smoke.json", "r", encoding="utf-8") as fh:
	              data = json.load(fh)
	          print(f"- ok: {data.get('ok')}")
	          print(f"- errors: `{data.get('errors')}`")
	          PY
	          echo "## Runtime SLO smoke gate" >> "$GITHUB_STEP_SUMMARY"
	          # Hard-coded PASS: this line is reached only if the verifier above
	          # exited 0, assuming the default fail-fast shell for run steps.
	          echo "- verify_runtime_slo_dashboard.py: PASS" >> "$GITHUB_STEP_SUMMARY"

	      - name: Verify benchmark report schema
	        working-directory: ${{ github.workspace }}
	        run: |
	          python3 scripts/verify_benchmark_report_schema.py \
	            --report .artifacts/benchmark_smoke_report.json

	      - name: Compare to committed reference smoke report
	        working-directory: ${{ github.workspace }}
	        run: |
	          python3 scripts/compare_benchmark_reports.py \
	            --baseline scripts/ci/reference_smoke_report.json \
	            --current .artifacts/benchmark_smoke_report.json \
	            --match-by-basename \
	            --tolerance 0

	      # Runs even after a failed step; files that were never produced are skipped.
	      - name: Upload benchmark JSON artifacts
	        if: always()
	        uses: actions/upload-artifact@v6
	        with:
	          name: benchmark-smoke-reports
	          path: |
	            .artifacts/benchmark_smoke_report.json
	            .artifacts/benchmark_sota_smoke.json
	            .artifacts/benchmark_trackers_smoke.json
	            .artifacts/benchmark_trackers_ab_smoke.json
	            .artifacts/benchmark_trackers_ab_smoke.md
	            .artifacts/track_quality_core_metrics_smoke.json
	            .artifacts/track_quality_core_metrics_smoke.md
	            .artifacts/track_quality_truthset_delta_smoke.json
	            .artifacts/track_quality_truthset_delta_smoke.md
	            .artifacts/track_failure_modes_smoke.json
	            .artifacts/track_failure_modes_smoke.md
	            .artifacts/parity_report_daily_smoke.json
	            .artifacts/parity_report_daily_smoke.md
	            .artifacts/parity_report_gate_smoke.json
	            .artifacts/truthset_splits_smoke.json
	            .artifacts/active_learning_backlog_smoke.json
	            .artifacts/active_learning_backlog_smoke.md
	            .artifacts/weekly_quality_cycle_playbook_smoke.json
	            .artifacts/weekly_quality_cycle_playbook_smoke.md
	            .artifacts/quality_cycle_comparison_smoke.json
	            .artifacts/feedback_effect_smoke.json
	            .artifacts/feedback_effect_smoke.md
	            .artifacts/similarity_behavior_summary_smoke.json
	            .artifacts/similarity_behavior_verify_smoke.json
	            .artifacts/runtime_slo_domain_health_smoke.json
	          if-no-files-found: ignore

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implement SOTA alignment reset and reality-check gates #305

Workflow file

Implement SOTA alignment reset and reality-check gates #305

Uh oh!

Workflow file for this run