Skip to content

Commit 34ca001

Browse files
Fix for RHOAIENG-69903: make `component_status` optional so components can be called directly (e.g. from notebooks)
Signed-off-by: Lukasz Cmielowski <lcmielow@redhat.com>
Assisted-by: Cursor
1 parent 5471bfa commit 34ca001

15 files changed

Lines changed: 203 additions & 75 deletions

File tree

components/data_processing/autorag/documents_discovery/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@ Lists available documents from S3, performs sampling if applied and writes a JSO
1313
| Parameter | Type | Default | Description |
1414
| --------- | ---- | ------- | ----------- |
1515
| `input_data_bucket_name` | `str` | `None` | S3 (or compatible) bucket containing input data. |
16-
| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
1716
| `input_data_path` | `str` | `""` | Path to folder with input documents within the bucket. |
1817
| `test_data` | `dsl.Input[dsl.Artifact]` | `None` | Optional input artifact containing test data for sampling. |
1918
| `sampling_enabled` | `bool` | `True` | Whether to enable document sampling. |
2019
| `sampling_max_size` | `float` | `1` | Maximum size of sampled documents (in gigabytes). |
2120
| `discovered_documents` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing the documents descriptor JSON file. |
21+
| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
2222
| `embedded_artifact` | `dsl.EmbeddedInput[dsl.Dataset]` | `None` | Embedded ``autorag.shared`` helpers injected by KFP at runtime. |
2323

2424
## Usage Examples 🧪

components/data_processing/autorag/documents_discovery/component.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@
1313
)
1414
def documents_discovery(
1515
input_data_bucket_name: str,
16-
component_status: dsl.Output[dsl.Artifact],
1716
input_data_path: str = "",
1817
test_data: dsl.Input[dsl.Artifact] = None,
1918
sampling_enabled: bool = True,
2019
sampling_max_size: float = 1,
2120
discovered_documents: dsl.Output[dsl.Artifact] = None,
21+
component_status: dsl.Output[dsl.Artifact] = None,
2222
embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
2323
):
2424
"""Documents discovery component.
@@ -77,17 +77,25 @@ def get_test_data_docs_names() -> list[str]:
7777

7878
from botocore.exceptions import SSLError
7979

80-
_embedded_path = Path(embedded_artifact.path)
81-
_module_path = _embedded_path if _embedded_path.is_file() else _embedded_path / "component_status.py"
82-
_spec = importlib.util.spec_from_file_location("_autorag_component_status", _module_path)
83-
if _spec is None or _spec.loader is None:
84-
raise ValueError(f"Cannot load embedded module from {_module_path}")
85-
_status_module = importlib.util.module_from_spec(_spec)
86-
_spec.loader.exec_module(_status_module)
87-
status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "documents_discovery")
80+
if component_status is None:
81+
from kfp_components.components.training.autorag.shared.component_status import ( # pyright: ignore[reportMissingImports]
82+
null_component_status_tracker,
83+
)
84+
85+
status = null_component_status_tracker()
86+
else:
87+
_embedded_path = Path(embedded_artifact.path)
88+
_module_path = _embedded_path if _embedded_path.is_file() else _embedded_path / "component_status.py"
89+
_spec = importlib.util.spec_from_file_location("_autorag_component_status", _module_path)
90+
if _spec is None or _spec.loader is None:
91+
raise ValueError(f"Cannot load embedded module from {_module_path}")
92+
_status_module = importlib.util.module_from_spec(_spec)
93+
_spec.loader.exec_module(_status_module)
94+
status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "documents_discovery")
8895
with status:
89-
status.set_metadata(display_name="Documents Discovery Status")
90-
component_status.metadata["display_name"] = "Documents Discovery Status"
96+
if component_status is not None:
97+
status.set_metadata(display_name="Documents Discovery Status")
98+
component_status.metadata["display_name"] = "Documents Discovery Status"
9199
with status.stage("discover_documents"):
92100
s3_creds = {k: os.environ.get(k) for k in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_S3_ENDPOINT"]}
93101
for k, v in s3_creds.items():

components/data_processing/autorag/documents_discovery/tests/test_component_unit.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,36 @@ def test_success_writes_descriptor(self, tmp_path):
104104
assert payload["count"] == 2
105105
assert payload["total_size_bytes"] == 3000
106106

107+
@mock.patch.dict("os.environ", MOCKED_ENV_VARIABLES, clear=True)
108+
def test_notebook_style_call_without_component_status(self, tmp_path):
109+
"""Direct python_func calls without component_status work (indexing notebook path)."""
110+
mock_boto3 = mock.MagicMock()
111+
mock_s3 = mock.MagicMock()
112+
_configure_s3_paginator(mock_s3, [{"Key": "docs/a.pdf", "Size": 1000}])
113+
mock_boto3.client.return_value = mock_s3
114+
mock_botocore, mock_botocore_exceptions = _make_botocore_modules()
115+
116+
discovered = mock.MagicMock()
117+
discovered.path = str(tmp_path / "descriptor")
118+
119+
with mock.patch.dict(
120+
sys.modules,
121+
{
122+
"boto3": mock_boto3,
123+
"botocore": mock_botocore,
124+
"botocore.exceptions": mock_botocore_exceptions,
125+
},
126+
):
127+
documents_discovery.python_func(
128+
input_data_bucket_name="my-bucket",
129+
input_data_path="docs/",
130+
discovered_documents=discovered,
131+
component_status=None,
132+
sampling_enabled=False,
133+
)
134+
135+
assert (tmp_path / "descriptor" / "documents_descriptor.json").is_file()
136+
107137
@mock.patch.dict("os.environ", MOCKED_ENV_VARIABLES_NO_REGION, clear=True)
108138
def test_missing_region_is_allowed(self, tmp_path):
109139
"""Component works when AWS_DEFAULT_REGION is not present."""

components/data_processing/autorag/test_data_loader/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ The component reads S3-compatible credentials from environment variables (inject
1414
| --------- | ---- | ------- | ----------- |
1515
| `test_data_bucket_name` | `str` | `None` | S3 (or compatible) bucket that contains the test data file. |
1616
| `test_data_path` | `str` | `None` | S3 object key to the JSON test data file. |
17-
| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
1817
| `benchmark_sample_size` | `int` | `25` | Maximum number of records to keep from the test data. When the dataset exceeds this limit, a reproducible random sample is drawn (seed 42). Set to 0 to disable sampling and keep all records. |
1918
| `test_data` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact that receives the (possibly sampled) file. |
19+
| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
2020
| `embedded_artifact` | `dsl.EmbeddedInput[dsl.Dataset]` | `None` | Embedded ``autorag.shared`` helpers injected by KFP at runtime. |
2121

2222
## Usage Examples 🧪

components/data_processing/autorag/test_data_loader/component.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
def test_data_loader(
1616
test_data_bucket_name: str,
1717
test_data_path: str,
18-
component_status: dsl.Output[dsl.Artifact],
1918
benchmark_sample_size: int = 25,
2019
test_data: dsl.Output[dsl.Artifact] = None,
20+
component_status: dsl.Output[dsl.Artifact] = None,
2121
embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
2222
):
2323
"""Download test data JSON from S3 and sample it for benchmarking.
@@ -69,17 +69,25 @@ class TestDataLoaderException(Exception):
6969

7070
import importlib.util
7171

72-
_embedded_path = Path(embedded_artifact.path)
73-
_module_path = _embedded_path if _embedded_path.is_file() else _embedded_path / "component_status.py"
74-
_spec = importlib.util.spec_from_file_location("_autorag_component_status", _module_path)
75-
if _spec is None or _spec.loader is None:
76-
raise ValueError(f"Cannot load embedded module from {_module_path}")
77-
_status_module = importlib.util.module_from_spec(_spec)
78-
_spec.loader.exec_module(_status_module)
79-
status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "test_data_loader")
72+
if component_status is None:
73+
from kfp_components.components.training.autorag.shared.component_status import ( # pyright: ignore[reportMissingImports]
74+
null_component_status_tracker,
75+
)
76+
77+
status = null_component_status_tracker()
78+
else:
79+
_embedded_path = Path(embedded_artifact.path)
80+
_module_path = _embedded_path if _embedded_path.is_file() else _embedded_path / "component_status.py"
81+
_spec = importlib.util.spec_from_file_location("_autorag_component_status", _module_path)
82+
if _spec is None or _spec.loader is None:
83+
raise ValueError(f"Cannot load embedded module from {_module_path}")
84+
_status_module = importlib.util.module_from_spec(_spec)
85+
_spec.loader.exec_module(_status_module)
86+
status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "test_data_loader")
8087
with status:
81-
status.set_metadata(display_name="Test Data Loader Status")
82-
component_status.metadata["display_name"] = "Test Data Loader Status"
88+
if component_status is not None:
89+
status.set_metadata(display_name="Test Data Loader Status")
90+
component_status.metadata["display_name"] = "Test Data Loader Status"
8391
with status.stage("load_benchmark"):
8492
if not test_data_bucket_name:
8593
raise TypeError("test_data_bucket_name must be a non-empty string")

components/data_processing/autorag/text_extraction/component.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
def text_extraction(
1616
documents_descriptor: dsl.Input[dsl.Artifact],
1717
extracted_text: dsl.Output[dsl.Artifact],
18-
component_status: dsl.Output[dsl.Artifact],
18+
component_status: dsl.Output[dsl.Artifact] = None,
1919
embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
2020
error_tolerance: Optional[float] = None,
2121
max_extraction_workers: Optional[int] = None,
@@ -348,17 +348,25 @@ def raise_if_threshold_exceeded(error_details: list, total_docs: int, tolerance:
348348

349349
import importlib.util
350350

351-
_embedded_path = Path(embedded_artifact.path)
352-
_module_path = _embedded_path if _embedded_path.is_file() else _embedded_path / "component_status.py"
353-
_spec = importlib.util.spec_from_file_location("_autorag_component_status", _module_path)
354-
if _spec is None or _spec.loader is None:
355-
raise ValueError(f"Cannot load embedded module from {_module_path}")
356-
_status_module = importlib.util.module_from_spec(_spec)
357-
_spec.loader.exec_module(_status_module)
358-
status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "text_extraction")
351+
if component_status is None:
352+
from kfp_components.components.training.autorag.shared.component_status import ( # pyright: ignore[reportMissingImports]
353+
null_component_status_tracker,
354+
)
355+
356+
status = null_component_status_tracker()
357+
else:
358+
_embedded_path = Path(embedded_artifact.path)
359+
_module_path = _embedded_path if _embedded_path.is_file() else _embedded_path / "component_status.py"
360+
_spec = importlib.util.spec_from_file_location("_autorag_component_status", _module_path)
361+
if _spec is None or _spec.loader is None:
362+
raise ValueError(f"Cannot load embedded module from {_module_path}")
363+
_status_module = importlib.util.module_from_spec(_spec)
364+
_spec.loader.exec_module(_status_module)
365+
status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "text_extraction")
359366
with status:
360-
status.set_metadata(display_name="Text Extraction Status")
361-
component_status.metadata["display_name"] = "Text Extraction Status"
367+
if component_status is not None:
368+
status.set_metadata(display_name="Text Extraction Status")
369+
component_status.metadata["display_name"] = "Text Extraction Status"
362370
descriptor_path = Path(documents_descriptor.path) / DOCUMENTS_DESCRIPTOR_FILENAME
363371

364372
with status.stage("extract_documents"):

components/training/autorag/leaderboard_evaluation/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ search_mode ("hybrid" | "vector" per ai4rag), ranker_strategy, generation.model_
1515
| --------- | ---- | ------- | ----------- |
1616
| `rag_patterns` | `dsl.InputPath(dsl.Artifact)` | `None` | Path to the directory of RAG patterns; each subdir contains pattern.json (pattern_name, indexing_params, rag_params, scores, execution_time, final_score). |
1717
| `html_artifact` | `dsl.Output[dsl.HTML]` | `None` | Output HTML artifact; the leaderboard table is written to html_artifact.path (single file). |
18-
| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
1918
| `embedded_artifact` | `dsl.EmbeddedInput[dsl.Dataset]` | `None` | Embedded ``autorag.shared`` helpers injected by KFP at runtime. |
2019
| `optimization_metric` | `str` | `faithfulness` | Name of the metric used to rank patterns (e.g. faithfulness, answer_correctness, context_correctness). Shown in the leaderboard subtitle. Defaults to "faithfulness". |
20+
| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
2121

2222
## Usage Examples 🧪
2323

components/training/autorag/leaderboard_evaluation/component.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
def leaderboard_evaluation(
1515
rag_patterns: dsl.InputPath(dsl.Artifact),
1616
html_artifact: dsl.Output[dsl.HTML],
17-
component_status: dsl.Output[dsl.Artifact],
1817
embedded_artifact: dsl.EmbeddedInput[dsl.Dataset] = None,
1918
optimization_metric: str = "faithfulness",
19+
component_status: dsl.Output[dsl.Artifact] = None,
2020
):
2121
"""Build an HTML leaderboard artifact from RAG pattern evaluation results.
2222
@@ -329,17 +329,25 @@ def _build_leaderboard_html(
329329

330330
import importlib.util
331331

332-
_embedded_path = Path(embedded_artifact.path)
333-
_module_path = _embedded_path if _embedded_path.is_file() else _embedded_path / "component_status.py"
334-
_spec = importlib.util.spec_from_file_location("_autorag_component_status", _module_path)
335-
if _spec is None or _spec.loader is None:
336-
raise ValueError(f"Cannot load embedded module from {_module_path}")
337-
_status_module = importlib.util.module_from_spec(_spec)
338-
_spec.loader.exec_module(_status_module)
339-
status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "leaderboard_evaluation")
332+
if component_status is None:
333+
from kfp_components.components.training.autorag.shared.component_status import ( # pyright: ignore[reportMissingImports]
334+
null_component_status_tracker,
335+
)
336+
337+
status = null_component_status_tracker()
338+
else:
339+
_embedded_path = Path(embedded_artifact.path)
340+
_module_path = _embedded_path if _embedded_path.is_file() else _embedded_path / "component_status.py"
341+
_spec = importlib.util.spec_from_file_location("_autorag_component_status", _module_path)
342+
if _spec is None or _spec.loader is None:
343+
raise ValueError(f"Cannot load embedded module from {_module_path}")
344+
_status_module = importlib.util.module_from_spec(_spec)
345+
_spec.loader.exec_module(_status_module)
346+
status = _status_module.bootstrap_status_tracker(embedded_artifact, component_status, "leaderboard_evaluation")
340347
with status:
341-
status.set_metadata(display_name="Leaderboard Evaluation Status")
342-
component_status.metadata["display_name"] = "Leaderboard Evaluation Status"
348+
if component_status is not None:
349+
status.set_metadata(display_name="Leaderboard Evaluation Status")
350+
component_status.metadata["display_name"] = "Leaderboard Evaluation Status"
343351
with status.stage("build_leaderboard"):
344352
if not rag_patterns_dir.is_dir():
345353
raise FileNotFoundError("rag_patterns path is not a directory: %s" % rag_patterns_dir)

components/training/autorag/pytest_support.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,12 @@ def wrap_component_python_func(
2727
embed_root = embedded_path or str(shared_dir)
2828

2929
def wrapper(*args, **kwargs):
30-
if "embedded_artifact" in signature.parameters and kwargs.get("embedded_artifact") is None:
30+
bound = signature.bind_partial(*args, **kwargs)
31+
if "embedded_artifact" in signature.parameters and "embedded_artifact" not in bound.arguments:
3132
embedded = mock.MagicMock()
3233
embedded.path = embed_root
3334
kwargs["embedded_artifact"] = embedded
34-
if "component_status" in signature.parameters and kwargs.get("component_status") is None:
35+
if "component_status" in signature.parameters and "component_status" not in bound.arguments:
3536
status = mock.MagicMock()
3637
status.path = str(tmp_path / "component_status_out")
3738
status.metadata = {}

components/training/autorag/rag_templates_optimization/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ Carries out the iterative RAG optimization process.
1818
| `rag_patterns` | `dsl.Output[dsl.Artifact]` | `None` | kfp-enforced argument specifying an output artifact. Provided by kfp backend automatically. |
1919
| `test_data_key` | `Optional[str]` | `None` | Path to the benchmark JSON file in object storage used by generated notebooks. |
2020
| `vector_io_provider_id` | `str` | `None` | Vector I/O provider identifier as registered in OGX. |
21-
| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
2221
| `embedded_artifact` | `dsl.EmbeddedInput[dsl.Dataset]` | `None` | Embedded ``autorag.shared`` helpers injected by KFP at runtime. |
2322
| `optimization_settings` | `Optional[dict]` | `None` | Additional settings customising the experiment. |
2423
| `input_data_key` | `Optional[str]` | `""` | A path to documents dir within a bucket used as an input to AI4RAG experiment. |
24+
| `component_status` | `dsl.Output[dsl.Artifact]` | `None` | Output artifact containing stage-level progress tracking. |
2525

2626
## Usage Examples 🧪
2727

0 commit comments

Comments
 (0)