refactor: make component_status required, remove None handling

LukaszCmielowski · LukaszCmielowski · commit d2f9dc7208b6 · 2026-06-17T16:55:25.000+02:00
Address Daniel's feedback: component_status should never be None
in production KFP pipelines. Remove the unnecessary None handling to
simplify the code and enforce proper usage.

Signed-off-by: Lukasz Cmielowski <lcmielow@redhat.com>
Assisted-by: Cursor
diff --git a/components/data_processing/autorag/documents_discovery/README.md b/components/data_processing/autorag/documents_discovery/README.md
@@ -13,12 +13,12 @@ Lists available documents from S3, performs sampling if applied and writes a JSO
 | Parameter | Type | Default | Description |
 | --------- | ---- | ------- | ----------- |
 | `input_data_bucket_name` | `str` | `None` | S3 (or compatible) bucket containing input data. |
+| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
 | `input_data_path` | `str` | `""` | Path to folder with input documents within the bucket. |
 | `test_data` | `dsl.Input[dsl.Artifact]` | `None` | Optional input artifact containing test data for sampling. |
 | `sampling_enabled` | `bool` | `True` | Whether to enable sampling or not. |
 | `sampling_max_size` | `float` | `1` | Maximum size of sampled documents (in gigabytes). |
 | `discovered_documents` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing the documents descriptor JSON file. |
-| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
 | `embedded_artifact` | `dsl.EmbeddedInput[dsl.Dataset]` | `None` | Embedded ``autorag.shared`` helpers injected by KFP at runtime. |
 
 ## Usage Examples 🧪
diff --git a/components/data_processing/autorag/documents_discovery/component.py b/components/data_processing/autorag/documents_discovery/component.py
@@ -13,12 +13,12 @@
 )
 def documents_discovery(
     input_data_bucket_name: str,
+    component_status: dsl.Output[dsl.Artifact],
     input_data_path: str = "",
     test_data: dsl.Input[dsl.Artifact] = None,
     sampling_enabled: bool = True,
     sampling_max_size: float = 1,
     discovered_documents: dsl.Output[dsl.Artifact] = None,
-    component_status: dsl.Output[dsl.Artifact] = None,
     embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
 ):
     """Documents discovery component.
@@ -86,8 +86,7 @@ def get_test_data_docs_names() -> list[str]:
     _spec.loader.exec_module(_status_module)
     status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "documents_discovery")
     with status:
-        if component_status is not None:
-            component_status.metadata["display_name"] = "Documents Discovery Status"
+        component_status.metadata["display_name"] = "Documents Discovery Status"
         with status.stage("discover_documents"):
             s3_creds = {k: os.environ.get(k) for k in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_S3_ENDPOINT"]}
             for k, v in s3_creds.items():
diff --git a/components/data_processing/autorag/test_data_loader/README.md b/components/data_processing/autorag/test_data_loader/README.md
@@ -14,9 +14,9 @@ The component reads S3-compatible credentials from environment variables (inject
 | --------- | ---- | ------- | ----------- |
 | `test_data_bucket_name` | `str` | `None` | S3 (or compatible) bucket that contains the test data file. |
 | `test_data_path` | `str` | `None` | S3 object key to the JSON test data file. |
+| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
 | `benchmark_sample_size` | `int` | `25` | Maximum number of records to keep from the test data. When the dataset exceeds this limit, a reproducible random sample is drawn (seed 42). Set to 0 to disable sampling and keep all records. |
 | `test_data` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact that receives the (possibly sampled) file. |
-| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
 | `embedded_artifact` | `dsl.EmbeddedInput[dsl.Dataset]` | `None` | Embedded ``autorag.shared`` helpers injected by KFP at runtime. |
 
 ## Usage Examples 🧪
diff --git a/components/data_processing/autorag/test_data_loader/component.py b/components/data_processing/autorag/test_data_loader/component.py
@@ -15,9 +15,9 @@
 def test_data_loader(
     test_data_bucket_name: str,
     test_data_path: str,
+    component_status: dsl.Output[dsl.Artifact],
     benchmark_sample_size: int = 25,
     test_data: dsl.Output[dsl.Artifact] = None,
-    component_status: dsl.Output[dsl.Artifact] = None,
     embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
 ):
     """Download test data JSON from S3 and sample it for benchmarking.
@@ -78,8 +78,7 @@ class TestDataLoaderException(Exception):
     _spec.loader.exec_module(_status_module)
     status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "test_data_loader")
     with status:
-        if component_status is not None:
-            component_status.metadata["display_name"] = "Test Data Loader Status"
+        component_status.metadata["display_name"] = "Test Data Loader Status"
         with status.stage("load_benchmark"):
             if not test_data_bucket_name:
                 raise TypeError("test_data_bucket_name must be a non-empty string")
diff --git a/components/data_processing/autorag/text_extraction/component.py b/components/data_processing/autorag/text_extraction/component.py
@@ -15,7 +15,7 @@
 def text_extraction(
     documents_descriptor: dsl.Input[dsl.Artifact],
     extracted_text: dsl.Output[dsl.Artifact],
-    component_status: dsl.Output[dsl.Artifact] = None,
+    component_status: dsl.Output[dsl.Artifact],
     embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
     error_tolerance: Optional[float] = None,
     max_extraction_workers: Optional[int] = None,
@@ -357,8 +357,7 @@ def raise_if_threshold_exceeded(error_details: list, total_docs: int, tolerance:
     _spec.loader.exec_module(_status_module)
     status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "text_extraction")
     with status:
-        if component_status is not None:
-            component_status.metadata["display_name"] = "Text Extraction Status"
+        component_status.metadata["display_name"] = "Text Extraction Status"
         descriptor_path = Path(documents_descriptor.path) / DOCUMENTS_DESCRIPTOR_FILENAME
 
         with status.stage("extract_documents"):
diff --git a/components/training/automl/shared/component_status.py b/components/training/automl/shared/component_status.py
@@ -61,16 +61,14 @@ class ComponentStatusTracker:
     Each component independently tracks its stages and metadata.
     """
 
-    def __init__(self, artifact_path: str | None, component_id: str) -> None:
+    def __init__(self, artifact_path: str, component_id: str) -> None:
         """Initialize the status tracker.
 
         Args:
-            artifact_path: Path to the KFP artifact directory where status.json will be written.
-                When ``None``, tracking is disabled (e.g. unit tests without a mock artifact).
+            artifact_path: Path to the KFP artifact directory where component_status.json will be written.
             component_id: Unique component identifier (e.g., "autogluon_models_training").
         """
-        self._enabled = artifact_path is not None
-        self.artifact_path = Path(artifact_path) if self._enabled else Path(".")
+        self.artifact_path = Path(artifact_path)
         self.component_id = component_id
         self.stages: list[dict[str, Any]] = []
         self.started_at = utc_now_z()
@@ -131,9 +129,6 @@ def save(self) -> None:
         Creates the artifact directory if needed and writes component_status.json
         with all recorded stages and metadata.
         """
-        if not self._enabled:
-            return
-
         self.artifact_path.mkdir(parents=True, exist_ok=True)
 
         data = {
diff --git a/components/training/automl/shared/tests/test_component_status.py b/components/training/automl/shared/tests/test_component_status.py
@@ -95,13 +95,6 @@ def test_stage_context_manager_records_failed(self, tmp_path: Path) -> None:
         assert data["stages"][-1]["status"] == "failed"
         assert "bad split" in data["stages"][-1]["error"]
 
-    def test_disabled_tracker_skips_save(self, tmp_path: Path) -> None:
-        """When artifact_path is None, save() is a no-op."""
-        tracker = ComponentStatusTracker(None, "automl_data_loader")
-        tracker.record("prepare_data", "completed")
-        tracker.save()
-        assert not (tmp_path / COMPONENT_STATUS_FILENAME).exists()
-
     def test_stage_skips_auto_complete_when_completed_inside_block(self, tmp_path: Path) -> None:
         """stage() does not overwrite a completed record written inside the block."""
         tracker = ComponentStatusTracker(str(tmp_path), "autogluon_models_training")
diff --git a/components/training/autorag/leaderboard_evaluation/component.py b/components/training/autorag/leaderboard_evaluation/component.py
@@ -14,7 +14,7 @@
 def leaderboard_evaluation(
     rag_patterns: dsl.InputPath(dsl.Artifact),
     html_artifact: dsl.Output[dsl.HTML],
-    component_status: dsl.Output[dsl.Artifact] = None,
+    component_status: dsl.Output[dsl.Artifact],
     embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
     optimization_metric: str = "faithfulness",
 ):
@@ -338,8 +338,7 @@ def _build_leaderboard_html(
     _spec.loader.exec_module(_status_module)
     status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "leaderboard_evaluation")
     with status:
-        if component_status is not None:
-            component_status.metadata["display_name"] = "Leaderboard Evaluation Status"
+        component_status.metadata["display_name"] = "Leaderboard Evaluation Status"
         with status.stage("build_leaderboard"):
             if not rag_patterns_dir.is_dir():
                 raise FileNotFoundError("rag_patterns path is not a directory: %s" % rag_patterns_dir)
diff --git a/components/training/autorag/rag_templates_optimization/README.md b/components/training/autorag/rag_templates_optimization/README.md
@@ -18,8 +18,8 @@ Carries out the iterative RAG optimization process.
 | `rag_patterns` | `dsl.Output[dsl.Artifact]` | `None` | kfp-enforced argument specifying an output artifact. Provided by kfp backend automatically. |
 | `test_data_key` | `Optional[str]` | `None` | Path to the benchmark JSON file in object storage used by generated notebooks. |
 | `vector_io_provider_id` | `str` | `None` | Vector I/O provider identifier as registered in OGX. |
-| `embedded_artifact` | `dsl.EmbeddedInput[dsl.Dataset]` | `None` | Embedded ``autorag.shared`` helpers injected by KFP at runtime. |
 | `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
+| `embedded_artifact` | `dsl.EmbeddedInput[dsl.Dataset]` | `None` | Embedded ``autorag.shared`` helpers injected by KFP at runtime. |
 | `optimization_settings` | `Optional[dict]` | `None` | Additional settings customising the experiment. |
 | `input_data_key` | `Optional[str]` | `""` | A path to documents dir within a bucket used as an input to AI4RAG experiment. |
 
diff --git a/components/training/autorag/rag_templates_optimization/component.py b/components/training/autorag/rag_templates_optimization/component.py
@@ -19,8 +19,8 @@ def rag_templates_optimization(
     rag_patterns: dsl.Output[dsl.Artifact],
     test_data_key: Optional[str],
     vector_io_provider_id: str,
+    component_status: dsl.Output[dsl.Artifact],
     embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
-    component_status: dsl.Output[dsl.Artifact] = None,
     optimization_settings: Optional[dict] = None,
     input_data_key: Optional[str] = "",
 ):
@@ -551,8 +551,7 @@ def on_pattern_creation(self, payload: dict, evaluation_results: list, **kwargs)
             pass
 
     with status:
-        if component_status is not None:
-            component_status.metadata["display_name"] = "RAG Templates Optimization Status"
+        component_status.metadata["display_name"] = "RAG Templates Optimization Status"
         with status.stage("optimize_templates", steps=optimize_templates_steps):
             if not ogx_client_base_url or not ogx_client_api_key:
                 raise ValueError(
diff --git a/components/training/autorag/search_space_preparation/component.py b/components/training/autorag/search_space_preparation/component.py
@@ -17,7 +17,7 @@ def search_space_preparation(
     test_data: dsl.Input[dsl.Artifact],
     extracted_text: dsl.Input[dsl.Artifact],
     search_space_prep_report: dsl.Output[dsl.Artifact],
-    component_status: dsl.Output[dsl.Artifact] = None,
+    component_status: dsl.Output[dsl.Artifact],
     embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
     embedding_models: Optional[List] = None,
     generation_models: Optional[List] = None,
@@ -336,8 +336,7 @@ def represent_model_instance(dumper, model: BaseFoundationModel | BaseEmbeddingM
     _spec.loader.exec_module(_status_module)
     status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "search_space_preparation")
     with status:
-        if component_status is not None:
-            component_status.metadata["display_name"] = "Search Space Preparation Status"
+        component_status.metadata["display_name"] = "Search Space Preparation Status"
         with status.stage("prepare_search_space"):
             if not ogx_client_base_url or not ogx_client_api_key:
                 raise ValueError("OGX_CLIENT_BASE_URL and OGX_CLIENT_API_KEY environment variables must be set.")
diff --git a/components/training/autorag/shared/component_status.py b/components/training/autorag/shared/component_status.py
@@ -65,16 +65,14 @@ def default(self, obj: Any) -> Any:
 class ComponentStatusTracker:
     """Track stage-level progress within a single AutoRAG component."""
 
-    def __init__(self, artifact_path: str | None, component_id: str) -> None:
+    def __init__(self, artifact_path: str, component_id: str) -> None:
         """Initialize the status tracker.
 
         Args:
-            artifact_path: Path to the KFP artifact directory where status.json will be written.
-                When ``None``, tracking is disabled (e.g. direct unit-test invocations without a mock artifact).
+            artifact_path: Path to the KFP artifact directory where component_status.json will be written.
             component_id: Unique component identifier (e.g., "test_data_loader").
         """
-        self._enabled = artifact_path is not None
-        self.artifact_path = Path(artifact_path) if self._enabled else Path(".")
+        self.artifact_path = Path(artifact_path)
         self.component_id = component_id
         self.stages: list[dict[str, Any]] = []
         self.started_at = utc_now_z()
@@ -110,9 +108,6 @@ def set_metadata(self, **metadata: Any) -> None:
 
     def save(self) -> None:
         """Write the final status to the artifact."""
-        if not self._enabled:
-            return
-
         self.artifact_path.mkdir(parents=True, exist_ok=True)
 
         data = {
@@ -258,9 +253,16 @@ def bootstrap_status_tracker(
 
 
 def component_status_tracker(component_status: Any, component_id: str) -> ComponentStatusTracker:
-    """Build a tracker from an optional KFP ``component_status`` output artifact."""
-    artifact_path = component_status.path if component_status is not None else None
-    return ComponentStatusTracker(artifact_path, component_id)
+    """Build a tracker from a KFP component_status output artifact.
+
+    Args:
+        component_status: KFP output artifact for component status tracking.
+        component_id: Unique component identifier.
+
+    Returns:
+        Configured ComponentStatusTracker instance.
+    """
+    return ComponentStatusTracker(component_status.path, component_id)
 
 
 def load_component_status(artifact_path: str) -> dict[str, Any]:
diff --git a/components/training/autorag/shared/tests/test_component_status.py b/components/training/autorag/shared/tests/test_component_status.py
@@ -33,19 +33,6 @@ def test_record_and_save(self, tmp_path: Path) -> None:
         assert data["stages"][0]["status"] == "completed"
         assert data["stages"][0]["rows"] == 5
 
-    def test_disabled_tracker_skips_save(self, tmp_path: Path) -> None:
-        """When artifact_path is None, save() is a no-op."""
-        tracker = ComponentStatusTracker(None, "test_data_loader")
-        tracker.record("load_benchmark", "completed")
-        tracker.save()
-        assert not (tmp_path / COMPONENT_STATUS_FILENAME).exists()
-
-    def test_component_status_tracker_from_none(self, tmp_path: Path) -> None:
-        """component_status_tracker() accepts a missing artifact for unit tests."""
-        tracker = component_status_tracker(None, "documents_discovery")
-        tracker.record("discover_documents", "completed")
-        tracker.save()
-        assert not (tmp_path / COMPONENT_STATUS_FILENAME).exists()
 
     def test_context_manager_marks_failed_and_saves(self, tmp_path: Path) -> None:
         """Context manager marks active stage failed and persists status on exception."""