Merge remote-tracking branch 'kubeflow/main' into merge-kubeflow-to-odh

hbelmiro · hbelmiro · commit ac1e9dc8f165 · 2026-05-21T09:15:13.000-03:00
diff --git a/.github/scripts/ci_checks/ci_checks.py b/.github/scripts/ci_checks/ci_checks.py
@@ -42,11 +42,22 @@ def get_check_runs(self, repo: str, head_sha: str) -> dict:
         return json.loads(result.stdout)
 
     def get_own_check_run_id(self, repo: str, head_sha: str, check_name: str) -> int:
-        """Return the ID of the check run matching *check_name*."""
+        """Return the ID of the check run matching *check_name*.
+
+        Prefers an in-progress instance over completed ones to avoid
+        misidentifying a stale run when multiple runs share the same name.
+        Falls back to the first matching run in any other state if none is in-progress.
+        """
         data = self.get_check_runs(repo, head_sha)
+        fallback: int | None = None
         for cr in data.get("check_runs", []):
             if cr["name"] == check_name:
-                return cr["id"]
+                if cr["status"] == "in_progress":
+                    return cr["id"]
+                if fallback is None:
+                    fallback = cr["id"]
+        if fallback is not None:
+            return fallback
         raise ChecksError(f"Check run '{check_name}' not found")
 
 
diff --git a/.github/scripts/ci_checks/tests/test_ci_checks.py b/.github/scripts/ci_checks/tests/test_ci_checks.py
@@ -72,11 +72,17 @@ def get_check_runs(self, repo: str, head_sha: str) -> dict:
         return response
 
     def get_own_check_run_id(self, repo: str, head_sha: str, check_name: str) -> int:
-        """Return a fixed check run ID by scanning check_runs_responses."""
+        """Return a fixed check run ID, preferring the in-progress instance."""
         if self._check_runs_responses:
+            fallback: int | None = None
             for cr in self._check_runs_responses[0].get("check_runs", []):
                 if cr["name"] == check_name:
-                    return cr["id"]
+                    if cr["status"] == "in_progress":
+                        return cr["id"]
+                    if fallback is None:
+                        fallback = cr["id"]
+            if fallback is not None:
+                return fallback
         raise ChecksError(f"Check run '{check_name}' not found")
 
 
@@ -389,6 +395,69 @@ def test_empty_check_runs_retries_then_fails(self):
             wait_for_checks(gh, "owner/repo", "abc123", check_run_id=999, delay=0, retries=2, interval=5)
         assert gh.get_check_runs.call_count == 2
 
+    def test_stale_completed_run_with_same_name_excluded_by_ignore(self):
+        """A stale completed check_ci_status run is excluded via ignore_checks."""
+        gh = MagicMock(spec=GhClient)
+        gh.get_check_runs.return_value = json.loads(
+            _api_response(
+                _make_check_run(800, "check_ci_status", "completed", "failure"),
+                _make_check_run(999, "check_ci_status", "in_progress"),
+                _make_check_run(100, "lint", "completed", "success"),
+            )
+        )
+        wait_for_checks(
+            gh,
+            "owner/repo",
+            "abc123",
+            check_run_id=999,
+            delay=0,
+            retries=1,
+            interval=0,
+            ignore_checks=frozenset({"check_ci_status"}),
+        )
+
+    def test_stale_failed_run_causes_false_failure_without_ignore(self):
+        """Without ignore_checks, a stale failed run with the same name causes failure."""
+        gh = MagicMock(spec=GhClient)
+        gh.get_check_runs.return_value = json.loads(
+            _api_response(
+                _make_check_run(800, "check_ci_status", "completed", "failure"),
+                _make_check_run(999, "check_ci_status", "in_progress"),
+                _make_check_run(100, "lint", "completed", "success"),
+            )
+        )
+        with pytest.raises(ChecksError, match="check_ci_status"):
+            wait_for_checks(
+                gh,
+                "owner/repo",
+                "abc123",
+                check_run_id=999,
+                delay=0,
+                retries=1,
+                interval=0,
+            )
+
+    def test_concurrent_in_progress_run_excluded_by_ignore(self):
+        """Two concurrent in-progress runs with same name don't deadlock when using ignore_checks."""
+        gh = MagicMock(spec=GhClient)
+        gh.get_check_runs.return_value = json.loads(
+            _api_response(
+                _make_check_run(998, "check_ci_status", "in_progress"),
+                _make_check_run(999, "check_ci_status", "in_progress"),
+                _make_check_run(100, "lint", "completed", "success"),
+            )
+        )
+        wait_for_checks(
+            gh,
+            "owner/repo",
+            "abc123",
+            check_run_id=999,
+            delay=0,
+            retries=1,
+            interval=0,
+            ignore_checks=frozenset({"check_ci_status"}),
+        )
+
     def test_many_check_runs_all_evaluated(self):
         """All check runs are evaluated, not just the first page worth."""
         gh = MagicMock(spec=GhClient)
@@ -446,14 +515,25 @@ def test_get_own_check_run_id_finds_matching_check(self, mock_run):
         assert client.get_own_check_run_id("owner/repo", "abc123", "check_ci_status") == 200
 
     @patch("ci_checks.ci_checks.subprocess.run")
-    def test_get_own_check_run_id_returns_first_match(self, mock_run):
-        """get_own_check_run_id returns the first matching ID when duplicates exist."""
+    def test_get_own_check_run_id_prefers_in_progress(self, mock_run):
+        """get_own_check_run_id prefers the in-progress run over a completed one."""
         response = _api_response(
             _make_check_run(200, "check_ci_status", "completed", "success"),
             _make_check_run(300, "check_ci_status", "in_progress"),
         )
         mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=0, stdout=response)
         client = GhClient()
+        assert client.get_own_check_run_id("owner/repo", "abc123", "check_ci_status") == 300
+
+    @patch("ci_checks.ci_checks.subprocess.run")
+    def test_get_own_check_run_id_falls_back_to_completed(self, mock_run):
+        """get_own_check_run_id falls back to completed run when none is in-progress."""
+        response = _api_response(
+            _make_check_run(200, "check_ci_status", "completed", "success"),
+            _make_check_run(300, "check_ci_status", "completed", "success"),
+        )
+        mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=0, stdout=response)
+        client = GhClient()
         assert client.get_own_check_run_id("owner/repo", "abc123", "check_ci_status") == 200
 
     @patch("ci_checks.ci_checks.subprocess.run")
diff --git a/.github/workflows/ci-checks.yml b/.github/workflows/ci-checks.yml
@@ -6,8 +6,13 @@ on:
   pull_request_target:
     types: [opened, synchronize, reopened, labeled]
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.label.name || 'ci' }}
+  cancel-in-progress: true
+
 jobs:
   check_ci_status:
+    if: github.event.action != 'labeled' || github.event.label.name == 'ok-to-test'
     runs-on: ubuntu-24.04
     timeout-minutes: 10
     permissions:
@@ -39,7 +44,7 @@ jobs:
             --author-login "$AUTHOR_LOGIN" \
             --head-sha "$HEAD_SHA" \
             --check-name "check_ci_status" \
-            --ignore-checks "Agent" \
+            --ignore-checks "Agent,check_ci_status" \
             --delay 5 \
             --retries 10 \
             --polling-interval 5 \
diff --git a/AGENTS.md b/AGENTS.md
@@ -9,10 +9,8 @@ See also:
 
 ## Sources of truth (keep this doc aligned)
 
-If this guide conflicts with repository enforcement or process docs, treat these as sources of truth:
-
-This guide is expected to stay current; when repository enforcement, CI, or contribution process changes (or when a
-difference is noted), update `AGENTS.md` alongside the change.
+If this guide conflicts with repository enforcement or process docs, treat the documents listed below as sources of truth.
+When repository enforcement, CI, or contribution process changes, update `AGENTS.md` alongside the change.
 
 - [`CONTRIBUTING.md`](docs/CONTRIBUTING.md) (required files, workflow, required metadata fields)
 - [`GOVERNANCE.md`](docs/GOVERNANCE.md) (roles, ownership, lifecycle)
@@ -47,30 +45,6 @@ Agents typically interact with this repository in three modes. Use the mode to d
 
 Goal: add or update an asset under `components/` or `pipelines/` that is reusable and passes repo validations.
 
-### Before you generate code
-
-#### Reuse-first: search for existing components/pipelines
-
-Before adding anything new:
-
-- Search under `components/<category>/` and `pipelines/<category>/` for similar functionality.
-- Prefer extending or composing existing assets instead of duplicating.
-
-Good places to look:
-
-- `components/` and `pipelines/` category directories for similar patterns and reusable building blocks (example:
-  `components/data_processing/yoda_data_processor`)
-- `scripts/generate_skeleton/` (canonical templates)
-- `scripts/generate_readme/` (README generation expectations)
-
-#### Establish the target location and naming
-
-- Components live under `components/<category>/<component_name>/`.
-- Components can optionally use subcategories: `components/<category>/<subcategory>/<component_name>/`.
-- Pipelines live under `pipelines/<category>/<pipeline_name>/`.
-- Pipelines can optionally use subcategories: `pipelines/<category>/<subcategory>/<pipeline_name>/`.
-- Use `snake_case` directory names (per `CONTRIBUTING.md`).
-
 ### Required files
 
 When the agent changes or adds a component/pipeline directory, follow
@@ -88,57 +62,18 @@ Process (expected for agents):
 - Open a submission issue using `.github/ISSUE_TEMPLATE/component_submission.md`.
 - Get Pipelines Working Group approval in that issue (link it from the PR).
 - Open a PR with the implementation.
-- Follow the repo’s OWNERS-based review flow described in `CONTRIBUTING.md` (`/lgtm` + `/approve`).
-
-### Example prompts (Mode 1)
-
-#### Add a new component (reuse-first, compliant)
-
-Use this prompt pattern:
-
-"Search `components/` for similar functionality and reuse if possible. If a new component is needed, create it under
-`components/<category>/<name>/` using `make component CATEGORY=<cat> NAME=<name> [NO_TESTS=true]`, then implement
-`component.py` following repository lint rules (including import guard). Create `metadata.yaml` that conforms to
-the metadata schema defined in [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#metadatayaml-schema) (required field order, fresh `lastVerified`). Generate/validate
-`README.md` using `make readme TYPE=component CATEGORY=<cat> NAME=<name>`. Add unit tests using `.python_func()` and a
-LocalRunner test using `setup_and_teardown_subprocess_runner` (you can generate tests via
-`make tests TYPE=component CATEGORY=<cat> NAME=<name>`). Reference an existing component like
-`components/data_processing/yoda_data_processor/` for patterns."
-
-#### Add a component in a subcategory
-
-Use this prompt pattern when creating related components that should share ownership or utilities:
-
-"Create a component in a subcategory using `make component CATEGORY=<cat> SUBCATEGORY=<sub> NAME=<name>`. This
-automatically creates the subcategory structure with OWNERS and README.md if it doesn't exist. For shared utilities,
-add `CREATE_SHARED=true` to create a `shared/` package. Update the subcategory OWNERS and README.md with appropriate
-maintainers and documentation. Follow the same component implementation patterns as above."
-
-#### Add a new pipeline (reuse-first, compliant)
-
-Use this prompt pattern:
+- Follow the repo's OWNERS-based review flow described in `CONTRIBUTING.md` (`/lgtm` + `/approve`).
 
-"Search `pipelines/` for similar functionality and reuse if possible. If a new pipeline is needed, create it under
-`pipelines/<category>/<name>/` using `make pipeline CATEGORY=<cat> NAME=<name> [NO_TESTS=true]`, then implement
-`pipeline.py` following repository lint rules (including import guard). Create `metadata.yaml` that conforms to the
-metadata schema defined in [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#metadatayaml-schema) (required field order, fresh
-`lastVerified`). Generate/validate `README.md` using `make readme TYPE=pipeline CATEGORY=<cat> NAME=<name>`. Add tests
-(you can generate tests via `make tests TYPE=pipeline CATEGORY=<cat> NAME=<name>`)."
+### Common tasks
 
-#### Add a pipeline in a subcategory
-
-Use this prompt pattern when creating related pipelines that should share ownership or utilities:
-
-"Create a pipeline in a subcategory using `make pipeline CATEGORY=<cat> SUBCATEGORY=<sub> NAME=<name>`. This
-automatically creates the subcategory structure with OWNERS and README.md if it doesn't exist. For shared utilities,
-add `CREATE_SHARED=true` to create a `shared/` package. Update the subcategory OWNERS and README.md with appropriate
-maintainers and documentation. Follow the same pipeline implementation patterns as above."
-
-#### Update an existing component safely
-
-"Find the existing component directory. Make the minimal change needed. Update docstrings and regenerate the README
-if the interface changed (`make readme TYPE=component CATEGORY=<cat> NAME=<name>`). Update `metadata.yaml` only if
-needed and keep `lastVerified` fresh. Add/adjust unit tests and LocalRunner tests. Ensure import guard compliance."
+| Task | Command | Reference pattern |
+|---|---|---|
+| New component | `make component CATEGORY=<cat> NAME=<name>` | `components/data_processing/yoda_data_processor/` |
+| New pipeline | `make pipeline CATEGORY=<cat> NAME=<name>` | `pipelines/training/sft/` |
+| Subcategory asset | Add `SUBCATEGORY=<sub>`; add `CREATE_SHARED=true` for shared utils | Subcategory OWNERS/README auto-created |
+| Generate tests | `make tests TYPE=component\|pipeline CATEGORY=<cat> NAME=<name>` | Unit tests + LocalRunner tests |
+| Generate README | `make readme TYPE=component\|pipeline CATEGORY=<cat> NAME=<name>` | Keeps README in sync |
+| Update existing | Minimal change, regenerate README if interface changed, keep `lastVerified` fresh | Same component dir |
 
 ## Mode 2: End user building pipelines from these components
 
@@ -171,52 +106,26 @@ Follow [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#dependency-management-uvlock) fo
 
 ### Python lint and formatting
 
-Python lint/format is enforced by CI on pull requests and runs against **changed files**:
-
-- Workflow: [`.github/workflows/python-lint.yml`](.github/workflows/python-lint.yml)
-
-This uses Ruff formatting and linting (see `pyproject.toml` for configuration).
-
-### Markdown lint
-
-Markdown is linted in CI on pull requests and runs against **changed files**:
-
-- Workflow: [`.github/workflows/markdown-lint.yml`](.github/workflows/markdown-lint.yml)
-- Config: [`.markdownlint.json`](.markdownlint.json)
-
-### YAML lint
-
-YAML is linted in CI on pull requests and runs against **changed files**:
-
-- Workflow: [`.github/workflows/yaml-lint.yml`](.github/workflows/yaml-lint.yml)
-- Config: [`.yamllint.yml`](.yamllint.yml)
-
-### Import guard (components/pipelines)
-
-Follow [`CONTRIBUTING.md` (Testing and Quality)](docs/CONTRIBUTING.md#testing-and-quality).
-Allowlisted exceptions are defined in
-[`.github/scripts/check_imports/import_exceptions.yaml`](.github/scripts/check_imports/import_exceptions.yaml).
-
-### Metadata schema validation
-
-Follow the canonical schema requirements in
-[`CONTRIBUTING.md` (metadata.yaml schema)](docs/CONTRIBUTING.md#metadatayaml-schema).
-
-CI workflow (reference): [`.github/workflows/validate-metadata-schema.yml`](.github/workflows/validate-metadata-schema.yml).
-
-### Base image validation
+Enforced by CI ([`.github/workflows/python-lint.yml`](.github/workflows/python-lint.yml)) using Ruff (see `pyproject.toml`).
 
-Follow the canonical policy in
-[`scripts/validate_base_images/README.md`](scripts/validate_base_images/README.md).
+Single-file commands:
 
-CI workflow (reference): [`.github/workflows/base-image-check.yml`](.github/workflows/base-image-check.yml).
+- `uv run ruff check <file>` — lint one file
+- `uv run ruff check --fix <file>` — lint + auto-fix one file
+- `uv run ruff format <file>` — format one file
 
-### README generation and sync
+Whole-repo: `make lint` (check) or `make format` (fix).
 
-Follow the canonical generator behavior in
-[`scripts/generate_readme/README.md`](scripts/generate_readme/README.md) and keep READMEs in sync.
+### Other validations
 
-CI workflow (reference): [`.github/workflows/readme-check.yml`](.github/workflows/readme-check.yml).
+| Validation | Config | CI workflow |
+|---|---|---|
+| Markdown lint | [`.markdownlint.json`](.markdownlint.json) | [`markdown-lint.yml`](.github/workflows/markdown-lint.yml) |
+| YAML lint | [`.yamllint.yml`](.yamllint.yml) | [`yaml-lint.yml`](.github/workflows/yaml-lint.yml) |
+| Import guard | [`import_exceptions.yaml`](.github/scripts/check_imports/import_exceptions.yaml); see [`CONTRIBUTING.md`](docs/CONTRIBUTING.md#testing-and-quality) | [`ci-checks.yml`](.github/workflows/ci-checks.yml) |
+| Metadata schema | [`CONTRIBUTING.md` (schema)](docs/CONTRIBUTING.md#metadatayaml-schema); keep required field order and fresh `lastVerified` | [`validate-metadata-schema.yml`](.github/workflows/validate-metadata-schema.yml) |
+| Base images | [`validate_base_images/README.md`](scripts/validate_base_images/README.md) | [`base-image-check.yml`](.github/workflows/base-image-check.yml) |
+| README sync | [`generate_readme/README.md`](scripts/generate_readme/README.md) | [`readme-check.yml`](.github/workflows/readme-check.yml) |
 
 ### Tests