-
Notifications
You must be signed in to change notification settings - Fork 121
Fix exclude_detection_period_from_training for large time buckets
#925
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
367a10f
5a339f6
74a64d9
2b7f0bb
d45820a
7432d8c
68fe10b
3dabd53
8a56eff
2535304
80a215a
a6ceb5a
02b5f56
2cc884f
0d87279
2e40357
0c19301
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -578,3 +578,112 @@ def test_col_anom_excl_detect_train(test_id: str, dbt_project: DbtProject): | |||||||||
| "Expected FAIL when exclude_detection_period_from_training=True " | ||||||||||
| "(detection data excluded from training baseline, anomaly detected)" | ||||||||||
| ) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| @pytest.mark.skip_targets(["clickhouse"]) | ||||||||||
| def test_col_excl_detect_train_monthly(test_id: str, dbt_project: DbtProject): | ||||||||||
| """ | ||||||||||
| Test exclude_detection_period_from_training with monthly time buckets for column anomalies. | ||||||||||
|
|
||||||||||
| This tests the fix where the detection period is set to the bucket size | ||||||||||
| when the bucket period exceeds backfill_days. With monthly buckets (30 days) | ||||||||||
| and default backfill_days (2), without the fix the 2-day exclusion window | ||||||||||
| cannot contain any monthly bucket_end, making exclusion ineffective. | ||||||||||
|
|
||||||||||
| detection_period is intentionally NOT set so that backfill_days stays at | ||||||||||
| its default (2), which is smaller than the monthly bucket (30 days). | ||||||||||
| Setting detection_period would override backfill_days and mask the bug. | ||||||||||
|
|
||||||||||
| Scenario: | ||||||||||
| - 12 months of normal data with low null count (~10 nulls/day, ~300/month) | ||||||||||
| - 1 month of anomalous data with high null count (25 nulls/day, ~775/month) | ||||||||||
| - time_bucket: month (30 days >> default backfill_days of 2) | ||||||||||
| - Without exclusion: anomaly absorbed into training → test passes | ||||||||||
| - With exclusion + fix: anomaly excluded from training → test fails | ||||||||||
| """ | ||||||||||
| utc_now = datetime.utcnow().date() | ||||||||||
| current_month_1st = utc_now.replace(day=1) | ||||||||||
|
|
||||||||||
| anomaly_month_start = (current_month_1st - timedelta(days=31)).replace(day=1) | ||||||||||
| normal_month_start = (anomaly_month_start - timedelta(days=365)).replace(day=1) | ||||||||||
|
||||||||||
| anomaly_month_start = (current_month_1st - timedelta(days=31)).replace(day=1) | |
| normal_month_start = (anomaly_month_start - timedelta(days=365)).replace(day=1) | |
| anomaly_month_start = (current_month_1st - timedelta(days=1)).replace(day=1) | |
| normal_month_start = (anomaly_month_start - timedelta(days=365)).replace(day=1) |
🤖 Prompt for AI Agents
In `@integration_tests/tests/test_column_anomalies.py` around lines 607 - 608,
Replace the month arithmetic that subtracts 31/365 days with the safe “subtract
one day then replace(day=1)” pattern: compute anomaly_month_start from
current_month_1st by doing current_month_1st minus one day then .replace(day=1),
and compute normal_month_start from anomaly_month_start by subtracting one day
then .replace(day=1) so neither anomaly_month_start nor normal_month_start can
skip calendar months (refer to the anomaly_month_start, normal_month_start, and
current_month_1st variables).
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -619,3 +619,88 @@ def test_exclude_detection_from_training(test_id: str, dbt_project: DbtProject): | |
| assert ( | ||
| test_result_with_exclusion["status"] == "fail" | ||
| ), "Test should fail when anomaly is excluded from training" | ||
|
|
||
|
|
||
| @pytest.mark.skip_targets(["clickhouse"]) | ||
| def test_excl_detect_train_monthly(test_id: str, dbt_project: DbtProject): | ||
| """ | ||
| Test exclude_detection_period_from_training with monthly time buckets. | ||
|
|
||
| This tests the fix where the detection period is set to the bucket size | ||
| when the bucket period exceeds backfill_days. With monthly buckets (30 days) | ||
| and default backfill_days (2), without the fix the 2-day exclusion window | ||
| cannot contain any monthly bucket_end, making exclusion ineffective. | ||
|
|
||
| detection_period is intentionally NOT set so that backfill_days stays at | ||
| its default (2), which is smaller than the monthly bucket (30 days). | ||
| Setting detection_period would override backfill_days and mask the bug. | ||
|
|
||
| Scenario: | ||
| - 12 months of normal data (~20 rows/day, ~600/month) | ||
| - 1 month of anomalous data (~40 rows/day, ~1240/month) | ||
| - time_bucket: month (30 days >> default backfill_days of 2) | ||
| - Without exclusion: anomaly absorbed into training → test passes | ||
| - With exclusion + fix: anomaly excluded from training → test fails | ||
| """ | ||
| utc_now = datetime.utcnow() | ||
| current_month_1st = utc_now.replace( | ||
| day=1, hour=0, minute=0, second=0, microsecond=0 | ||
| ) | ||
|
|
||
| anomaly_month_start = (current_month_1st - timedelta(days=31)).replace(day=1) | ||
| normal_month_start = (anomaly_month_start - timedelta(days=365)).replace(day=1) | ||
|
||
|
|
||
| normal_data = [] | ||
| day = normal_month_start | ||
| day_idx = 0 | ||
| while day < anomaly_month_start: | ||
| rows_per_day = 17 + (day_idx % 7) | ||
| normal_data.extend( | ||
| [{TIMESTAMP_COLUMN: day.strftime(DATE_FORMAT)} for _ in range(rows_per_day)] | ||
| ) | ||
| day += timedelta(days=1) | ||
| day_idx += 1 | ||
|
|
||
| anomalous_data = [] | ||
| day = anomaly_month_start | ||
| while day < utc_now: | ||
| anomalous_data.extend( | ||
| [{TIMESTAMP_COLUMN: day.strftime(DATE_FORMAT)} for _ in range(40)] | ||
| ) | ||
| day += timedelta(days=1) | ||
|
|
||
| all_data = normal_data + anomalous_data | ||
|
|
||
| test_args_without_exclusion = { | ||
| **DBT_TEST_ARGS, | ||
| "training_period": {"period": "day", "count": 365}, | ||
| "time_bucket": {"period": "month", "count": 1}, | ||
| "sensitivity": 4, | ||
| } | ||
|
|
||
| test_result_without = dbt_project.test( | ||
| test_id + "_without", | ||
| DBT_TEST_NAME, | ||
| test_args_without_exclusion, | ||
| data=all_data, | ||
| test_vars={"force_metrics_backfill": True}, | ||
| ) | ||
| assert ( | ||
| test_result_without["status"] == "pass" | ||
| ), "Test should pass when anomaly is included in training" | ||
|
|
||
| test_args_with_exclusion = { | ||
| **test_args_without_exclusion, | ||
| "exclude_detection_period_from_training": True, | ||
| } | ||
|
|
||
| test_result_with = dbt_project.test( | ||
| test_id + "_with", | ||
| DBT_TEST_NAME, | ||
| test_args_with_exclusion, | ||
| data=all_data, | ||
| test_vars={"force_metrics_backfill": True}, | ||
| ) | ||
| assert ( | ||
| test_result_with["status"] == "fail" | ||
| ), "Test should fail when anomaly is excluded from training (large bucket fix)" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Devin - please check CodeRabbit comments
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed both CodeRabbit comments in commit 2cc884f:
anomaly_month_startcalculation:(current_month_1st - timedelta(days=31)).replace(day=1)→(current_month_1st - timedelta(days=1)).replace(day=1)to avoid skipping months (e.g. March → January instead of February).normal_month_startcalculation similarly, using.replace(year=anomaly_month_start.year - 1)to avoid leap year issues withtimedelta(days=365).