opendatahub-io · hbelmiro · Apr 16, 2026 · Apr 13, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.github/workflows/sync-requirements.yml b/.github/workflows/sync-requirements.yml
@@ -0,0 +1,30 @@
+---
+# Fails a pull request when the committed requirements files no longer match
+# what `make requirements` generates.
+name: Check requirements.txt
+
+on:
+  pull_request:
+    paths:
+      - pyproject.toml
+      - uv.lock
+      - requirements.txt
+      # `make requirements` also writes requirements-build.txt and takes the
+      # package index URL from the Makefile, so changes to either must re-run
+      # this check.
+      - requirements-build.txt
+      - Makefile
+
+permissions:
+  contents: read
+
+concurrency:
+  # Key on the PR number rather than the head branch name: branch names are
+  # not unique across forks, so two unrelated PRs could cancel each other.
+  group: check-requirements-${{ github.event.pull_request.number }}
+  cancel-in-progress: true
+
+jobs:
+  check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6
+
+      - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78  # v7.6.0
+
+      - name: Verify requirements files are up-to-date
+        run: |
+          make requirements
+          git diff --exit-code requirements.txt requirements-build.txt \
+            || { echo ""; echo "requirements.txt / requirements-build.txt are out of sync."; echo "Run 'make requirements' and commit the result."; exit 1; }
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,6 +8,12 @@ repos:
         language: system
         files: (^pyproject\.toml$|^uv\.lock$)
         pass_filenames: false
+      # Regenerate the pinned requirements files with `make requirements`
+      # whenever pyproject.toml or uv.lock is among the files being checked.
+      - id: sync-requirements
+        name: sync requirements
+        entry: make requirements
+        language: system
+        files: (^pyproject\.toml$|^uv\.lock$)
+        # The make target takes no file arguments, so filenames are not appended.
+        pass_filenames: false
       - id: ruff-format
         name: ruff format
         # --force-exclude respects pyproject.toml excludes when files are passed directly

diff --git a/.tekton/odh-pipelines-components-ci-on-pull-request.yaml b/.tekton/odh-pipelines-components-ci-on-pull-request.yaml
@@ -6,7 +6,7 @@ metadata:
     build.appstudio.redhat.com/commit_sha: '{{revision}}'
     build.appstudio.redhat.com/target_branch: '{{target_branch}}'
     build.appstudio.redhat.com/pull_request_number: '{{pull_request_number}}'
-    pipelinesascode.tekton.dev/cancel-in-progress: "false"
+    pipelinesascode.tekton.dev/cancel-in-progress: "true"
     pipelinesascode.tekton.dev/max-keep-runs: "3"
     pipelinesascode.tekton.dev/on-cel-expression: event == "pull_request" && target_branch
       == "main"
@@ -26,9 +26,17 @@ spec:
   - name: output-image
     value: quay.io/opendatahub/odh-pipelines-components:odh-pr
   - name: dockerfile
-    value: Dockerfile
+    value: Dockerfile.konflux.pipelines-components
   - name: path-context
     value: .
+  - name: hermetic
+    value: 'true'
+  - name: prefetch-input
+    value: >-
+      {"type": "pip", "path": ".",
+      "requirements_files": ["requirements.txt"],
+      "requirements_build_files": ["requirements-build.txt"],
+      "binary": {"arch": ":all:"}}
   - name: additional-tags
     value:
     - 'odh-pr-{{revision}}'

diff --git a/.tekton/odh-pipelines-components-ci-on-push.yaml b/.tekton/odh-pipelines-components-ci-on-push.yaml
@@ -25,13 +25,17 @@ spec:
   - name: output-image
     value: quay.io/opendatahub/odh-pipelines-components:odh-stable
   - name: dockerfile
-    value: Dockerfile
+    value: Dockerfile.konflux.pipelines-components
   - name: path-context
     value: .
-  - name: build-platforms
-    value:
-    - linux/x86_64
-    - linux/aarch64
+  - name: hermetic
+    value: 'true'
+  - name: prefetch-input
+    value: >-
+      {"type": "pip", "path": ".",
+      "requirements_files": ["requirements.txt"],
+      "requirements_build_files": ["requirements-build.txt"],
+      "binary": {"arch": ":all:"}}
   pipelineRef:
     resolver: git
     params:

diff --git a/Dockerfile b/Dockerfile
@@ -16,7 +16,7 @@ COPY utils/ utils/
 RUN chown -R 1001:1001 /app
 USER 1001
 
-RUN uv sync --no-cache --extra test
+RUN uv sync --no-cache
 
 RUN uv run python -m scripts.generate_managed_pipelines.generate_managed_pipelines
 

diff --git a/Dockerfile.konflux.pipelines-components b/Dockerfile.konflux.pipelines-components
@@ -0,0 +1,23 @@
+# Image definition referenced by the .tekton pipelines (`dockerfile` param).
+# Those pipelines build hermetically, with pip dependencies prefetched from
+# requirements.txt / requirements-build.txt.
+FROM registry.redhat.io/ubi9/python-312@sha256:ff373f4b42b662e99954adea770ca87b4ea963186cc752174ccb94aa08fa702d
+
+WORKDIR /app
+
+# Dependency installation and the chown below run as root; the image drops
+# to UID 1001 afterwards.
+USER root
+
+# Install the pinned dependencies before the sources are copied in.
+# NOTE(review): presumably so this layer stays cached when only sources change - confirm.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY pyproject.toml __init__.py ./
+COPY components/ components/
+COPY pipelines/ pipelines/
+COPY scripts/ scripts/
+COPY utils/ utils/
+
+# Hand /app to the unprivileged user that the remaining steps run as.
+RUN chown -R 1001:1001 /app
+USER 1001
+
+# Install the project itself; --no-deps because dependencies were installed
+# from requirements.txt above.
+RUN pip install --no-cache-dir --no-deps .
+
+# Run the managed-pipelines generator at build time.
+RUN python -m scripts.generate_managed_pipelines.generate_managed_pipelines
+
+CMD ["python", "-m", "scripts.init_managed_pipelines.init_managed_pipelines"]
diff --git a/Makefile b/Makefile
@@ -105,3 +105,18 @@ readme:
 
 sync-packages:
 	@$(UVRUN) python -m scripts.sync_packages.sync_packages
+
+# Package index used both as the `--index-url` header written into the
+# requirements files and as the index uv resolves against.
+AIPCC_INDEX_URL := https://console.redhat.com/api/pypi/public-rhai/rhoai/3.4/cpu-ubi9/simple
+
+# Regenerate requirements.txt (compiled from pyproject.toml, without emitting
+# kfp-components) and requirements-build.txt (setuptools + wheel), both with
+# hashes and resolved for Python 3.12.
+# Declared phony so a file or directory named `requirements` can never make
+# this target look up to date.
+.PHONY: requirements
+requirements:
+	echo "--index-url $(AIPCC_INDEX_URL)" > requirements.txt
+	echo "" >> requirements.txt
+	uv pip compile pyproject.toml --generate-hashes --no-header --no-annotate \
+		--no-emit-package kfp-components \
+		--python-version 3.12 \
+		--index-url $(AIPCC_INDEX_URL) >> requirements.txt
+	echo "--index-url $(AIPCC_INDEX_URL)" > requirements-build.txt
+	echo "" >> requirements-build.txt
+	printf 'setuptools\nwheel\n' | uv pip compile --generate-hashes --no-header --no-annotate \
+		--python-version 3.12 \
+		--index-url $(AIPCC_INDEX_URL) - >> requirements-build.txt
diff --git a/components/data_processing/automl/OWNERS b/components/data_processing/automl/OWNERS
@@ -1,5 +1,6 @@
 approvers:
   - LukaszCmielowski
+  - DorotaDR
 reviewers:
   - Mateusz-Switala
   - DorotaDR
diff --git a/components/data_processing/automl/tabular_data_loader/OWNERS b/components/data_processing/automl/tabular_data_loader/OWNERS
@@ -1,5 +1,6 @@
 approvers:
   - LukaszCmielowski
+  - DorotaDR
 reviewers:
   - Mateusz-Switala
   - DorotaDR
diff --git a/components/data_processing/automl/tabular_data_loader/README.md b/components/data_processing/automl/tabular_data_loader/README.md
@@ -16,6 +16,11 @@ The component reads data in chunks to efficiently handle large files without loa
 
 For **regression** tasks the split is random; for **binary** and **multiclass** tasks the split is **stratified** by the label column by default.
 
+Rows with a missing label (NaN / empty in ``label_column``) are dropped after load and before splitting, so regression runs do not propagate null targets into splits or the ``sample_row`` JSON. Stratified sampling already drops such rows per chunk; this step applies the same rule to the random and first-n-rows paths.
+
+After sampling, **+/- infinity** values in the frame are replaced with **NaN** (same idea as AutoAI ``loadXy``), then **full-row duplicates** are dropped before the label drop and train/test split.
+
 Authentication uses AWS-style credentials provided via environment variables (e.g. from a Kubernetes secret).
 
 ## Inputs 📥
@@ -90,6 +95,7 @@ def example_pipeline(
 - **Owners**:
   - Approvers:
     - LukaszCmielowski
+    - DorotaDR
   - Reviewers:
     - Mateusz-Switala
     - DorotaDR

diff --git a/components/data_processing/automl/tabular_data_loader/component.py b/components/data_processing/automl/tabular_data_loader/component.py
@@ -45,6 +45,15 @@ def automl_data_loader(  # noqa: D417
     For **regression** tasks the split is random; for **binary** and **multiclass**
     tasks the split is **stratified** by the label column by default.
 
+    Rows with a missing label (NaN / empty in ``label_column``) are dropped after load
+    and before splitting, so regression runs do not propagate null targets into splits
+    or the ``sample_row`` JSON (stratified sampling already dropped per chunk; this
+    applies the same rule to random and first-n-rows paths).
+
+    After sampling, **+/- infinity** values in the frame are replaced with **NaN** (same
+    idea as AutoAI ``loadXy``), then **full-row duplicates** are dropped before the
+    label drop and train/test split.
+
     Authentication uses AWS-style credentials provided via environment variables
     (e.g. from a Kubernetes secret).
 
@@ -67,6 +76,7 @@ def automl_data_loader(  # noqa: D417
     """  # noqa: E501
     import io
     import logging
+    import math
     import os
 
     import boto3
@@ -305,6 +315,43 @@ def load_data_in_batches(
         label_column=label_column,
     )
 
+    if label_column not in sampled_dataframe.columns:
+        raise ValueError(
+            f"Label column {label_column!r} not found in the dataset. "
+            f"Available columns: {list(sampled_dataframe.columns)}"
+        )
+
+    sampled_dataframe.replace([math.inf, -math.inf], float("nan"), inplace=True)
+
+    n_before_dedup = len(sampled_dataframe)
+    sampled_dataframe.drop_duplicates(inplace=True)
+    n_dup_dropped = n_before_dedup - len(sampled_dataframe)
+    if n_dup_dropped:
+        logger.info("Dropped %s full-row duplicate(s) (%s rows remaining).", n_dup_dropped, len(sampled_dataframe))
+
+    if sampled_dataframe.empty:
+        raise ValueError(
+            "No valid data rows remain after replacing infinite values and dropping duplicates. "
+            "The source CSV may contain only infinite/NaN values or duplicate rows."
+        )
+
+    n_before_drop = len(sampled_dataframe)
+    sampled_dataframe = sampled_dataframe.dropna(subset=[label_column])
+    n_dropped = n_before_drop - len(sampled_dataframe)
+    if n_dropped:
+        logger.info(
+            "Dropped %s row(s) with missing label in column %r before splitting (loaded %s rows, %s remaining).",
+            n_dropped,
+            label_column,
+            n_before_drop,
+            len(sampled_dataframe),
+        )
+    if sampled_dataframe.empty:
+        raise ValueError(
+            f"No rows remain after removing missing values in label column {label_column!r}. "
+            "Ensure the dataset has at least one row with a non-null label (e.g. empty cells in the target column)."
+        )
+
     n_samples = len(sampled_dataframe)
     logger.info("Read %d rows from s3://%s/%s (sampling_method=%s)", n_samples, bucket_name, file_key, sampling_method)
 

diff --git a/...onents/data_processing/automl/tabular_data_loader/tests/data/taxi_trip_pricing_sample.csv b/...onents/data_processing/automl/tabular_data_loader/tests/data/taxi_trip_pricing_sample.csv
@@ -0,0 +1,35 @@
+Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
+19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
+47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
+36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
+30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
+,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618000000000002
+8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.202799999999996
+3.85,Afternoon,Weekday,4.0,High,Rain,3.51,1.66,,5.05,11.2645
+43.44,Evening,Weekend,3.0,,Clear,2.97,1.87,0.23,,101.1216
+30.45,Morning,Weekday,3.0,High,Clear,2.77,1.78,0.34,110.33,
+35.7,Afternoon,Weekday,2.0,Low,Rain,3.39,1.52,0.47,,75.5657
+,Morning,Weekday,4.0,,Clear,2.4,0.58,0.43,26.34,14.892
+48.53,Night,Weekday,3.0,Low,Clear,4.78,,0.5,79.94,
+41.79,Night,Weekend,3.0,High,Clear,4.6,1.77,0.11,86.95,88.13279999999999
+11.4,Morning,Weekday,3.0,,Clear,4.12,,0.15,84.12,36.118
+9.91,Evening,Weekday,2.0,High,Clear,2.32,1.26,0.34,41.72,28.991400000000002
+9.99,Night,Weekday,4.0,High,Clear,4.33,0.85,0.43,34.0,27.441499999999998
+15.91,Morning,Weekday,4.0,Low,Clear,4.42,1.77,0.21,114.93,56.716
+26.71,Afternoon,Weekend,4.0,Low,Rain,4.3,1.59,0.2,111.18,69.0049
+22.17,Night,,4.0,Low,Clear,2.34,1.97,0.41,57.59,69.6268
+15.27,Morning,,,Low,Clear,3.93,0.73,0.12,,27.354300000000002
+30.98,Afternoon,Weekend,1.0,Low,Rain,4.5,0.84,0.25,57.02,44.7782
+7.84,Morning,Weekday,4.0,Medium,,3.73,0.82,0.3,53.8,26.298799999999996
+105.94355003672595,Night,Weekend,2.0,Low,Rain,3.94,1.69,0.32,23.03,201.86950918612797
+18.95,Night,Weekday,1.0,Low,Clear,3.38,0.78,0.39,54.04,39.2366
+23.35,Night,,3.0,Low,Rain,3.59,0.6,0.24,66.8,33.632000000000005
+39.47,Afternoon,Weekday,1.0,Low,Clear,,,0.35,7.59,83.69649999999999
+10.78,Evening,,3.0,High,Rain,3.92,0.54,0.33,56.07,28.2443
+138.09832791310237,Evening,Weekend,4.0,Medium,Rain,2.24,1.75,0.32,94.86,280.87730155406564
+30.03,,Weekday,1.0,High,Clear,3.31,1.05,0.36,83.21,64.7971
+3.28,Evening,Weekday,2.0,Medium,Clear,2.88,1.76,0.2,78.04,24.260800000000003
+30.77,Morning,Weekday,1.0,Low,Clear,3.64,1.33,0.13,109.6,58.8121
+9.36,Afternoon,Weekday,1.0,Medium,Clear,2.4,1.85,0.15,7.07,20.7765
+4.19,Morning,Weekday,1.0,Low,Clear,4.07,1.89,0.19,69.06,
+47.5,Morning,Weekend,,Low,Clear,4.39,0.51,0.3,95.55,57.28
diff --git a/components/data_processing/automl/tabular_data_loader/tests/mocked_pandas.py b/components/data_processing/automl/tabular_data_loader/tests/mocked_pandas.py
@@ -7,6 +7,7 @@
 import csv
 import io
 import json
+import math
 import random
 from collections import Counter
 
@@ -73,9 +74,82 @@ def dropna(self, subset=None):
         if not subset:
             return self
         col_indices = [self._columns.index(c) for c in subset]
-        new_rows = [row for row in self._rows if all(row[i] != "" and row[i] is not None for i in col_indices)]
+
+        def _cell_missing(val) -> bool:
+            if val is None or val == "":
+                return True
+            try:
+                return math.isnan(float(val))
+            except (TypeError, ValueError):
+                return False
+
+        new_rows = [row for row in self._rows if all(not _cell_missing(row[i]) for i in col_indices)]
         return MockedDataFrame(self._columns, new_rows)
 
+    def replace(self, to_replace, value, inplace=False):
+        """Minimal ``DataFrame.replace`` that only understands infinite targets.
+
+        Every cell whose ``float()`` conversion equals one of the infinities listed
+        in ``to_replace`` is swapped for ``value``. All other cells, and any
+        non-infinite entries of ``to_replace``, are left alone.
+
+        Args:
+            to_replace: A single value or a list/tuple of values. Only float
+                ``inf`` / ``-inf`` entries are honoured, each matching its own sign.
+            value: Replacement written into matching cells (production code
+                passes NaN).
+            inplace: When true, update this frame and return ``None``.
+
+        Returns:
+            A new ``MockedDataFrame``, or ``None`` when ``inplace`` is true.
+        """
+        candidates = to_replace if isinstance(to_replace, (list, tuple)) else [to_replace]
+        inf_targets = {x for x in candidates if isinstance(x, float) and math.isinf(x)}
+
+        if not inf_targets:
+            # Nothing this mock knows how to replace: behave as a no-op.
+            if inplace:
+                return None
+            return MockedDataFrame(self._columns, [list(r) for r in self._rows])
+
+        def _map_cell(cell):
+            try:
+                as_float = float(cell)
+            except (TypeError, ValueError):
+                return cell
+            # Set membership tells +inf from -inf and never matches NaN.
+            return value if as_float in inf_targets else cell
+
+        new_rows = [[_map_cell(c) for c in row] for row in self._rows]
+        out = MockedDataFrame(self._columns, new_rows)
+        if inplace:
+            self._columns = out._columns
+            self._rows = out._rows
+            return None
+        return out
+
+    def drop_duplicates(self, inplace=False):
+        """Remove rows that repeat an earlier row in full, keeping the first one.
+
+        Rows are compared cell by cell. Any NaN cell (a float NaN, or a value whose
+        ``float()`` conversion is NaN) compares equal to any other NaN cell, which
+        mirrors pandas duplicate detection rather than plain ``tuple`` equality.
+
+        Args:
+            inplace: When true, update this frame and return ``None``.
+
+        Returns:
+            A new ``MockedDataFrame``, or ``None`` when ``inplace`` is true.
+        """
+        nan_token = "__PANDAS_NAN__"
+
+        def _comparable(cell):
+            # Non-NaN floats are wrapped in a ("float", value) tuple.
+            if isinstance(cell, float):
+                return nan_token if math.isnan(cell) else ("float", cell)
+            try:
+                parses_as_nan = math.isnan(float(cell))
+            except (TypeError, ValueError):
+                parses_as_nan = False
+            return nan_token if parses_as_nan else cell
+
+        fingerprints = set()
+        kept_rows = []
+        for row in self._rows:
+            fingerprint = tuple(map(_comparable, row))
+            if fingerprint not in fingerprints:
+                fingerprints.add(fingerprint)
+                kept_rows.append(list(row))
+
+        deduped = MockedDataFrame(self._columns, kept_rows)
+        if not inplace:
+            return deduped
+        self._columns = deduped._columns
+        self._rows = deduped._rows
+        return None
+
+
     def _col_index(self, col):
         """Return the index of the given column name."""
+        # Delegates to ``self._columns.index``; presumably raises ValueError for an
+        # unknown column if ``_columns`` is a list - confirm.
         return self._columns.index(col)