ROC and precision-recall curve generation for classification tasks

LukaszCmielowski · LukaszCmielowski · commit 8e436e58a720 · 2026-05-21T14:08:55.000+02:00
Signed-off-by: Lukasz Cmielowski <lcmielow@redhat.com>
Assisted-by: Cursor
diff --git a/components/training/automl/autogluon_models_training/README.md b/components/training/automl/autogluon_models_training/README.md
@@ -30,6 +30,7 @@ output artifact so the pipeline does not require a ParallelFor loop. Each model
 | `sampling_config` | `Optional[dict]` | `None` | Data sampling config stored in artifact metadata. |
 | `split_config` | `Optional[dict]` | `None` | Data split config stored in artifact metadata. |
 | `extra_train_data_path` | `str` | `""` | Optional path to extra training CSV passed to ``refit_full``. |
+| `positive_class` | `Optional[str]` | `None` | **Binary only.** Positive class label (``int``/``str``). Passed to ``TabularPredictor`` when set; if omitted, AutoGluon infers it as the **second unique class after sorting**. Ignored for multiclass/regression. |
 
 ## Outputs 📤
 
@@ -47,7 +48,7 @@ output artifact so the pipeline does not require a ParallelFor loop. Each model
 - **Tags**:
   - training
   - automl
-- **Last Verified**: 2026-04-21 12:00:00+00:00
+- **Last Verified**: 2026-05-20 12:00:00+00:00
 - **Owners**:
   - Approvers:
     - LukaszCmielowski
@@ -129,7 +130,8 @@ models_artifact/
     ├── metrics/
     │   ├── metrics.json           # Evaluation results on test data (metric names → values)
     │   ├── feature_importance.json
-    │   └── confusion_matrix.json  # Classification tasks only
+    │   ├── confusion_matrix.json  # Classification tasks only
+    │   └── curves.json            # Classification tasks only (ROC + precision-recall)
     └── notebooks/
         └── automl_predictor_notebook.ipynb  # Pre-filled inference notebook
 ```
diff --git a/components/training/automl/autogluon_models_training/component.py b/components/training/automl/autogluon_models_training/component.py
diff --git a/components/training/automl/autogluon_models_training/metadata.yaml b/components/training/automl/autogluon_models_training/metadata.yaml
@@ -8,4 +8,4 @@ dependencies:
 tags:
   - training
   - automl
-lastVerified: 2026-04-21T12:00:00Z
+lastVerified: 2026-05-20T12:00:00Z
diff --git a/components/training/automl/autogluon_models_training/notebook_templates/classification_notebook.ipynb b/components/training/automl/autogluon_models_training/notebook_templates/classification_notebook.ipynb
diff --git a/components/training/automl/autogluon_models_training/tests/test_component_unit.py b/components/training/automl/autogluon_models_training/tests/test_component_unit.py
diff --git a/pipelines/training/automl/autogluon_tabular_training_pipeline/README.md b/pipelines/training/automl/autogluon_tabular_training_pipeline/README.md
@@ -54,6 +54,7 @@ The pipeline leverages AutoGluon's unique ensembling strategy that combines mult
 | `label_column` | `str` | `None` | Name of the target/label column in the dataset. |
 | `task_type` | `str` | `None` | "binary", "multiclass", or "regression"; drives metrics and model types. |
 | `top_n` | `int` | `3` | Number of top models to select and refit (default: 3); positive integer from range [1, 10]. |
+| `positive_class` | `Optional[str]` | `None` | **Binary only.** Label value treated as the positive class (e.g. ``"1"``, ``"yes"``). If omitted, AutoGluon infers it at fit time as the **second unique class after sorting** (e.g. ``[0, 1]`` → ``1``; ``['abc', 'def']`` → ``'def'``). Ignored for multiclass and regression. |
 
 ## Metadata 🗂️
 
@@ -69,7 +70,7 @@ The pipeline leverages AutoGluon's unique ensembling strategy that combines mult
   - pipeline
   - automl
   - autogluon-tabular-training-pipeline
-- **Last Verified**: 2026-05-07 12:00:00+00:00
+- **Last Verified**: 2026-05-20 12:00:00+00:00
 - **Owners**:
   - Approvers:
     - LukaszCmielowski
@@ -98,7 +99,8 @@ Pipeline outputs are written to the artifact store (S3-compatible storage config
     │           │   ├── metrics/
     │           │   │   ├── metrics.json         # model evaluation metrics (eval_metric, etc.)
     │           │   │   ├── feature_importance.json
-    │           │   │   └── confusion_matrix.json  # for classification tasks only
+    │           │   │   ├── confusion_matrix.json  # for classification tasks only
+    │           │   │   └── curves.json            # for classification tasks only (ROC + PR)
     │           │   └── notebooks/
     │           │       └── automl_predictor_notebook.ipynb   # Jupyter notebook for inference & exploration
     │           └── <ModelName>_FULL/
diff --git a/pipelines/training/automl/autogluon_tabular_training_pipeline/metadata.yaml b/pipelines/training/automl/autogluon_tabular_training_pipeline/metadata.yaml
@@ -13,4 +13,4 @@ tags:
   - pipeline
   - automl
   - autogluon-tabular-training-pipeline
-lastVerified: 2026-05-07T12:00:00Z
+lastVerified: 2026-05-20T12:00:00Z
diff --git a/pipelines/training/automl/autogluon_tabular_training_pipeline/pipeline.py b/pipelines/training/automl/autogluon_tabular_training_pipeline/pipeline.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 from kfp import dsl
 from kfp_components.components.data_processing.automl.tabular_data_loader import automl_data_loader
 from kfp_components.components.training.automl.autogluon_leaderboard_evaluation import leaderboard_evaluation
@@ -37,6 +39,7 @@ def autogluon_tabular_training_pipeline(
     label_column: str,
     task_type: str,
     top_n: int = 3,
+    positive_class: Optional[str] = None,
 ):
     """AutoGluon Tabular Training Pipeline.
 
@@ -111,6 +114,10 @@ def autogluon_tabular_training_pipeline(
         label_column: Name of the target/label column in the dataset.
         task_type: "binary", "multiclass", or "regression"; drives metrics and model types.
         top_n: Number of top models to select and refit (default: 3); positive integer from range [1, 10].
+        positive_class: Optional label value for the positive class in binary classification (e.g.
+            ``"1"`` or ``"yes"``). If omitted (``None``), AutoGluon infers it at ``TabularPredictor.fit``
+            time as the **second unique class after sorting** label values (see AutoGluon
+            ``TabularPredictor`` docs). Ignored for multiclass and regression.
 
     Returns:
         HTML artifact with leaderboard of refitted models ranked by task_type metric (e.g. accuracy, r2).
@@ -165,6 +172,7 @@ def autogluon_tabular_training_pipeline(
         label_column=label_column,
         task_type=task_type,
         top_n=top_n,
+        positive_class=positive_class,
         train_data_path=data_loader_task.outputs["models_selection_train_data_path"],
         test_data=data_loader_task.outputs["sampled_test_dataset"],
         workspace_path=dsl.WORKSPACE_PATH_PLACEHOLDER,
diff --git a/pipelines/training/automl/autogluon_tabular_training_pipeline/tests/test_pipeline_integration.py b/pipelines/training/automl/autogluon_tabular_training_pipeline/tests/test_pipeline_integration.py
@@ -69,10 +69,11 @@ def _run_succeeded(detail):
 
 
 def _find_artifacts_in_s3(s3_client, bucket, prefix):
-    """List object keys under prefix; return lists of keys ending in .pkl, .ipynb, and keys containing 'leaderboard' or 'html_artifact'."""  # noqa: E501
+    """List object keys under prefix; return key lists in order: .pkl, .ipynb, leaderboard/html, .json."""
     pkl_keys = []
     ipynb_keys = []
     leaderboard_keys = []
+    json_keys = []
     try:
         paginator = s3_client.get_paginator("list_objects_v2")
         for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
@@ -82,11 +83,13 @@ def _find_artifacts_in_s3(s3_client, bucket, prefix):
                     pkl_keys.append(key)
                 elif key.endswith(".ipynb"):
                     ipynb_keys.append(key)
+                elif key.endswith(".json"):
+                    json_keys.append(key)
                 elif "leaderboard" in key.lower() or "html_artifact" in key.lower():
                     leaderboard_keys.append(key)
     except Exception:
         pass
-    return pkl_keys, ipynb_keys, leaderboard_keys
+    return pkl_keys, ipynb_keys, leaderboard_keys, json_keys
 
 
 @pytest.mark.integration
@@ -130,9 +133,34 @@ def test_autogluon_pipeline_with_config(
         if s3_client and config.get("s3_bucket_artifacts"):
             bucket = config["s3_bucket_artifacts"]
             prefix = f"{PIPELINE_DISPLAY_NAME}/{run_id}"
-            pkl_keys, ipynb_keys, leaderboard_keys = _find_artifacts_in_s3(s3_client, bucket, prefix)
+            pkl_keys, ipynb_keys, leaderboard_keys, json_keys = _find_artifacts_in_s3(s3_client, bucket, prefix)
             assert len(pkl_keys) >= 1, f"Expected at least one .pkl model artifact under {prefix}; found {pkl_keys}"
             assert len(ipynb_keys) >= 1, f"Expected at least one .ipynb notebook under {prefix}; found {ipynb_keys}"
             assert len(leaderboard_keys) >= 1, (
                 f"Expected leaderboard/html artifact under {prefix}; found {leaderboard_keys}"
             )
+
+            # Verify core metric files (all task types)
+            metrics_json = [k for k in json_keys if k.endswith("metrics/metrics.json")]
+            feature_imp_json = [k for k in json_keys if k.endswith("metrics/feature_importance.json")]
+
+            assert len(metrics_json) >= 1, (
+                f"Expected at least one metrics.json under {prefix}; found {metrics_json}"
+            )
+            assert len(feature_imp_json) >= 1, (
+                f"Expected at least one feature_importance.json under {prefix}; found {feature_imp_json}"
+            )
+
+            # Verify classification-specific metric files
+            if test_config.task_type in {"binary", "multiclass"}:
+                cm_json = [k for k in json_keys if k.endswith("metrics/confusion_matrix.json")]
+                curves_json = [k for k in json_keys if k.endswith("metrics/curves.json")]
+
+                assert len(cm_json) >= 1, (
+                    f"Expected at least one confusion_matrix.json for {test_config.task_type} task "
+                    f"under {prefix}; found {cm_json}"
+                )
+                assert len(curves_json) >= 1, (
+                    f"Expected at least one curves.json for {test_config.task_type} task "
+                    f"under {prefix}; found {curves_json}"
+                )
diff --git a/pipelines/training/automl/autogluon_tabular_training_pipeline/tests/test_pipeline_unit.py b/pipelines/training/automl/autogluon_tabular_training_pipeline/tests/test_pipeline_unit.py
@@ -52,6 +52,7 @@ def test_pipeline_signature(self):
             "label_column",
             "task_type",
             "top_n",
+            "positive_class",
         }
         inputs = autogluon_tabular_training_pipeline.component_spec.inputs
         params = set(inputs.keys())
@@ -76,6 +77,7 @@ def test_compiled_pipeline_has_expected_inputs(self):
                 "label_column",
                 "task_type",
                 "top_n",
+                "positive_class",
             ):
                 assert name in content, f"Expected pipeline input '{name}' in compiled YAML"
         except Exception as e:
@@ -112,6 +114,7 @@ def test_compiled_pipeline_wires_loader_outputs_to_training_task(self):
         assert "outputParameterKey: split_config" in content
         assert "outputParameterKey: sample_config" in content
         assert "outputParameterKey: extra_train_data_path" in content
+        assert "positive_class" in content
 
     def test_compiled_pipeline_data_loader_declares_task_type_and_label(self):
         """Tabular data loader component exposes task_type and label_column inputs."""
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,7 @@ test = [
     # Once KFP fixes this, these can be removed and tests can use
     # SubprocessRunner(use_venv=True) instead.
     "sdg-hub>=0.7.0,<1.0",
+    "scikit-learn>=1.3.0",
     # Required for pipeline validation (validate_examples, validate_base_images)
     "kfp-kubernetes",
 ]
diff --git a/uv.lock b/uv.lock