Skip to content
1 change: 1 addition & 0 deletions .github/scripts/check_imports/import_exceptions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ files:
- pandas
"components/**/shared/*":
- pandas
- numpy
- component
"pipelines/**/tests/*":
- tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,15 @@ def example_pipeline(
Writes ``component_status.json`` under ``component_status`` with ``component_id`` ``autogluon_timeseries_models_training`` and training stages (``load_data``, ``model_selection``, ``refit_full``, ``evaluate_models``). Artifact metadata display name: **Timeseries Models Training Status**.

Inference notebooks are loaded from ``shared/notebook_templates/timeseries_notebook.ipynb`` at runtime (same shared package data as tabular training).

### Model insight artifacts (per refitted model)

Under each ``{model_name}_FULL/metrics/`` directory:

- **`metrics.json`**: Holdout test metrics from ``TimeSeriesPredictor.evaluate`` (finite values only). Uses AutoGluon's raw **higher-is-better** sign convention (error metrics such as MASE are negated) so the HTML leaderboard ranks models correctly.
- **`back_testing.json`**: Multi-window backtest with ``per_window_metrics`` and ``series_analysis``
(best/worst forecast timelines). Window error metrics use **natural positive** signs via
``filter_finite_metrics``. Generated on a best-effort basis after refit: if generation fails
(for example, the backtest APIs are unavailable or the series history is insufficient), a
warning is logged and the file is omitted.

The timeseries notebook template loads ``back_testing.json`` when present for model insights.
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,12 @@ def autogluon_timeseries_models_training(

logger = logging.getLogger(__name__)

from kfp_components.components.training.automl.shared.back_testing import build_back_testing_json
from kfp_components.components.training.automl.shared.component_status import ComponentStatusTracker
from kfp_components.components.training.automl.shared.run_status import shared_automl_dir
from kfp_components.components.training.automl.shared.timeseries_notebook_utils import (
build_predict_sample_artifact,
)

status = ComponentStatusTracker(component_status.path, "autogluon_timeseries_models_training")
with status:
Expand Down Expand Up @@ -327,7 +331,8 @@ def replace_placeholder_in_notebook(notebook, replacements):
excluded_model_types=["Chronos", "Chronos2", "Toto"],
)
metrics = predictor_refit.evaluate(test_ts, metrics=list(AVAILABLE_METRICS.keys()))

# Keep raw AutoGluon evaluate() signs for metrics.json (higher-is-better / negated errors)
# so leaderboard_evaluation sorting stays correct. back_testing.json normalizes separately.
metrics_dict = {}
for k, v in metrics.items():
if hasattr(v, "item"):
Expand All @@ -346,17 +351,45 @@ def replace_placeholder_in_notebook(notebook, replacements):
with (metrics_path / "metrics.json").open("w", encoding="utf-8") as f:
json.dump(metrics_dict, f, indent=2)

back_testing_available = False
try:
back_testing_payload = build_back_testing_json(
predictor_refit,
model_name=model_name,
model_name_full=model_name_full,
train_data=full_train_ts_df,
eval_metric=eval_metric,
target=target,
id_column=id_column,
timestamp_column=timestamp_column,
prediction_length=prediction_length,
metrics=list(AVAILABLE_METRICS.keys()),
)
with (metrics_path / "back_testing.json").open("w", encoding="utf-8") as f:
json.dump(back_testing_payload, f, indent=2)
back_testing_available = True
except Exception as backtest_exc:
logger.warning(
"Could not generate back_testing.json for model %r: %s. Skipping backtest artifact.",
model_name_full,
backtest_exc,
)

notebook_file = "timeseries_notebook.ipynb"
with (shared_automl_dir() / "notebook_templates" / notebook_file).open("r", encoding="utf-8") as f:
notebook = json.load(f)
predict_sample = build_predict_sample_artifact(
predictor_refit,
sample_row_list,
id_column,
timestamp_column,
known_covariates_names,
)
replacements = {
"<REPLACE_RUN_ID>": run_id,
"<REPLACE_PIPELINE_NAME>": pipeline_name_trimmed,
"<REPLACE_MODEL_NAME>": model_name_full,
"<REPLACE_SAMPLE_ROW>": str(sample_row_list),
"<REPLACE_ID_COLUMN>": id_column,
"<REPLACE_TIMESTAMP_COLUMN>": timestamp_column,
"<REPLACE_KNOWN_COVARIATES_NAMES>": str(known_covariates_names or []),
"<REPLACE_PREDICT_SAMPLE>": str(predict_sample),
}
notebook = replace_placeholder_in_notebook(notebook, replacements)

Expand All @@ -365,14 +398,19 @@ def replace_placeholder_in_notebook(notebook, replacements):
with (notebook_path / "automl_predictor_notebook.ipynb").open("w", encoding="utf-8") as f:
json.dump(notebook, f)

model_location = {
"model_directory": model_name_full,
"predictor": str(Path(model_name_full) / "predictor"),
"notebook": str(Path(model_name_full) / "notebooks" / "automl_predictor_notebook.ipynb"),
"metrics": str(Path(model_name_full) / "metrics"),
}
# Only include back_testing path if file was successfully written
if back_testing_available:
model_location["back_testing"] = str(Path(model_name_full) / "metrics" / "back_testing.json")

model_metadata = {
"name": model_name_full,
"location": {
"model_directory": model_name_full,
"predictor": str(Path(model_name_full) / "predictor"),
"notebook": str(Path(model_name_full) / "notebooks" / "automl_predictor_notebook.ipynb"),
"metrics": str(Path(model_name_full) / "metrics"),
},
"location": model_location,
"metrics": {
"test_data": metrics_dict,
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def test_basic_flow_returns_expected_outputs(
train_ts, test_ts = _mock_ts_df(), _mock_ts_df()
extra_ts = _mock_ts_df()
full_train_ts = _mock_ts_df()
mock_ts_df_cls.from_data_frame.side_effect = [train_ts, test_ts]
# from_data_frame is called for: train, test, and once per model for build_predict_sample_artifact
mock_ts_df_cls.from_data_frame.side_effect = [train_ts, test_ts, _mock_ts_df(), _mock_ts_df()]
mock_ts_df_cls.from_path.return_value = extra_ts
mock_ts_df_cls.return_value = full_train_ts
mock_concat.return_value = mock.MagicMock()
Expand Down Expand Up @@ -192,7 +193,8 @@ def test_known_covariates_propagated_to_predictor_and_model_config(
mock_refit_predictor.evaluate.return_value = {"MASE": 0.5}

mock_predictor_cls.side_effect = [mock_predictor, mock_refit_predictor]
mock_ts_df_cls.from_data_frame.side_effect = [_mock_ts_df(), _mock_ts_df()]
# from_data_frame is called for: train, test, and once per model for build_predict_sample_artifact (1 model)
mock_ts_df_cls.from_data_frame.side_effect = [_mock_ts_df(), _mock_ts_df(), _mock_ts_df()]
mock_ts_df_cls.from_path.return_value = _mock_ts_df()
mock_ts_df_cls.return_value = _mock_ts_df()
mock_concat.return_value = mock.MagicMock()
Expand Down Expand Up @@ -492,7 +494,8 @@ def test_partial_refit_failure_succeeds_with_warnings(
mock_refit_3.evaluate.return_value = {"MASE": 0.6, "MSE": 1.2}

mock_predictor_cls.side_effect = [mock_predictor, mock_refit_1, mock_refit_2, mock_refit_3]
mock_ts_df_cls.from_data_frame.side_effect = [_mock_ts_df(), _mock_ts_df()]
# from_data_frame: train, test, and successful refits (DeepAR, AutoARIMA) in build_predict_sample_artifact
mock_ts_df_cls.from_data_frame.side_effect = [_mock_ts_df(), _mock_ts_df(), _mock_ts_df(), _mock_ts_df()]
mock_ts_df_cls.from_path.return_value = _mock_ts_df()
mock_ts_df_cls.return_value = _mock_ts_df()
mock_concat.return_value = mock.MagicMock()
Expand Down Expand Up @@ -576,3 +579,125 @@ def test_all_models_fail_refit_raises(
models_artifact=models_artifact,
extra_train_data_path=extra_train_path,
)


class TestMetricsJsonSignConvention:
    """Check that metrics.json stores the raw (leaderboard-compatible) AutoGluon metric signs."""

    @mock.patch("kfp_components.components.training.automl.shared.back_testing.build_back_testing_json")
    @mock.patch("pandas.read_csv")
    @mock.patch("pandas.concat")
    @mock.patch("autogluon.timeseries.TimeSeriesDataFrame")
    @mock.patch("autogluon.timeseries.TimeSeriesPredictor")
    def test_metrics_json_preserves_autogluon_evaluate_signs(
        self,
        predictor_cls,
        ts_df_cls,
        concat_fn,
        read_csv_fn,
        back_testing_builder,
        mock_artifacts,  # noqa: F811
    ):
        """Negated error metrics returned by evaluate() are written to metrics.json unchanged."""
        models_artifact, extra_train_path = mock_artifacts

        # Backtest generation is stubbed out; only metrics.json is under test here.
        back_testing_builder.return_value = {"schema_version": 1}

        # Predictor used for model selection: a single-model leaderboard.
        selection_predictor = mock.MagicMock()
        selection_predictor.leaderboard.return_value = _mock_leaderboard(["DeepAR"])
        selection_predictor.fit_summary.return_value = {"model_hyperparams": {"DeepAR": {}}}
        # NOTE(review): the MagicMock class itself (not an instance) is returned here,
        # presumably standing in for a model type -- confirm against the component.
        selection_predictor._trainer.get_model_attribute.return_value = mock.MagicMock

        # Refit predictor reports error metrics with AutoGluon's negated sign.
        refit_predictor = mock.MagicMock()
        refit_predictor.evaluate.return_value = {"MASE": -0.42, "MSE": -1.0}

        predictor_cls.side_effect = [selection_predictor, refit_predictor]
        # from_data_frame is consumed for train, test, and once per model by
        # build_predict_sample_artifact (one model in this test).
        ts_df_cls.from_data_frame.side_effect = [_mock_ts_df(), _mock_ts_df(), _mock_ts_df()]
        ts_df_cls.from_path.return_value = _mock_ts_df()
        ts_df_cls.return_value = _mock_ts_df()
        concat_fn.return_value = mock.MagicMock()
        read_csv_fn.side_effect = [mock.MagicMock(), mock.MagicMock()]

        test_data = mock.MagicMock()
        test_data.path = "/tmp/test.csv"

        component_kwargs = {
            "target": "sales",
            "id_column": "item_id",
            "timestamp_column": "timestamp",
            "train_data_path": "/tmp/train.csv",
            "test_data": test_data,
            "top_n": 1,
            "workspace_path": "/tmp/workspace",
            "pipeline_name": "ts-pipeline-123",
            "run_id": "run-123",
            "models_artifact": models_artifact,
            "extra_train_data_path": extra_train_path,
        }
        autogluon_timeseries_models_training.python_func(**component_kwargs)

        metrics_file = Path(models_artifact.path) / "DeepAR_FULL" / "metrics" / "metrics.json"
        with metrics_file.open(encoding="utf-8") as handle:
            saved_metrics = json.load(handle)

        # Signs must match what evaluate() returned (no normalization to positive errors).
        assert saved_metrics["MASE"] == -0.42
        assert saved_metrics["MSE"] == -1.0


class TestBackTestingArtifactFailure:
    """Check that back_testing.json generation is best-effort inside the component."""

    @mock.patch("kfp_components.components.training.automl.shared.back_testing.build_back_testing_json")
    @mock.patch("pandas.read_csv")
    @mock.patch("pandas.concat")
    @mock.patch("autogluon.timeseries.TimeSeriesDataFrame")
    @mock.patch("autogluon.timeseries.TimeSeriesPredictor")
    def test_back_testing_failure_is_non_fatal(
        self,
        predictor_cls,
        ts_df_cls,
        concat_fn,
        read_csv_fn,
        back_testing_builder,
        mock_artifacts,  # noqa: F811
        caplog,
    ):
        """A failing backtest builder yields a warning, not a component failure."""
        models_artifact, extra_train_path = mock_artifacts

        # Force the backtest step to blow up so the fallback path is exercised.
        back_testing_builder.side_effect = RuntimeError("backtest unavailable")

        # Predictor used for model selection: a single-model leaderboard.
        selection_predictor = mock.MagicMock()
        selection_predictor.leaderboard.return_value = _mock_leaderboard(["DeepAR"])
        selection_predictor.fit_summary.return_value = {"model_hyperparams": {"DeepAR": {}}}
        # NOTE(review): the MagicMock class itself (not an instance) is returned here,
        # presumably standing in for a model type -- confirm against the component.
        selection_predictor._trainer.get_model_attribute.return_value = mock.MagicMock

        refit_predictor = mock.MagicMock()
        refit_predictor.evaluate.return_value = {"MASE": 0.5, "MSE": 1.0}

        predictor_cls.side_effect = [selection_predictor, refit_predictor]
        # from_data_frame is consumed for train, test, and once per model by
        # build_predict_sample_artifact (one model in this test).
        ts_df_cls.from_data_frame.side_effect = [_mock_ts_df(), _mock_ts_df(), _mock_ts_df()]
        ts_df_cls.from_path.return_value = _mock_ts_df()
        ts_df_cls.return_value = _mock_ts_df()
        concat_fn.return_value = mock.MagicMock()
        read_csv_fn.side_effect = [mock.MagicMock(), mock.MagicMock()]

        test_data = mock.MagicMock()
        test_data.path = "/tmp/test.csv"

        component_kwargs = {
            "target": "sales",
            "id_column": "item_id",
            "timestamp_column": "timestamp",
            "train_data_path": "/tmp/train.csv",
            "test_data": test_data,
            "top_n": 1,
            "workspace_path": "/tmp/workspace",
            "pipeline_name": "ts-pipeline-123",
            "run_id": "run-123",
            "models_artifact": models_artifact,
            "extra_train_data_path": extra_train_path,
        }
        with caplog.at_level("WARNING"):
            result = autogluon_timeseries_models_training.python_func(**component_kwargs)

        metrics_dir = Path(models_artifact.path) / "DeepAR_FULL" / "metrics"

        # The run still succeeds and metrics.json is written ...
        assert result.top_models == ["DeepAR"]
        assert (metrics_dir / "metrics.json").is_file()
        # ... while the backtest artifact is absent and the failure is logged.
        assert not (metrics_dir / "back_testing.json").exists()
        assert "Could not generate back_testing.json" in caplog.text
Loading
Loading