Merge branch 'main' into alechan/upgrade-xpk-v0.13.0

aybchan · web-flow · commit fe5639afcef4 · 2025-10-22T11:21:44.000+01:00
diff --git a/.github/triage/jax_toolbox_triage/triage_tool.py b/.github/triage/jax_toolbox_triage/triage_tool.py
@@ -153,6 +153,17 @@ def _gather_histories(
                 args=self.args,
             )
             package_versions[package] = history
+            if package in self.args.cherry_pick:
+                # If explicit commits to cherry-pick were given on the commandline,
+                # make sure they are known to the local working copy. They might not be
+                # if the fix being cherry-picked is newer, or only lives in a remote
+                # that is being passed in via --override-remotes.
+                worker.check_exec(
+                    ["git", "fetch", self.args.override_remotes.get(package, "origin")]
+                    + self.args.cherry_pick[package],
+                    policy="once_per_container",
+                    workdir=self.package_dirs[package],
+                )
             for cherry_pick_range in cherry_pick_ranges:
                 if package not in self.args.cherry_pick:
                     self.args.cherry_pick[package] = []
@@ -529,22 +540,6 @@ def gather_version_info(self, passing_url: str, failing_url: str):
             passing_url, failing_url, passing_env, failing_env
         )
 
-        # We only know how to handle software packages that have versions defined at
-        # both ends of the range.
-        inconsistent_keys = passing_versions.keys() ^ failing_versions.keys()
-        if len(inconsistent_keys):
-            self.logger.warning(
-                f"Ignoring packages that only have defined versions in one endpoint: {' '.join(inconsistent_keys)}"
-            )
-            for k in inconsistent_keys:
-                for d in [passing_versions, failing_versions]:
-                    d.pop(k, None)
-
-        # Which packages have versions that are not always the same?
-        self.dynamic_packages = {
-            pkg
-            for pkg, _ in set(passing_versions.items()) ^ set(failing_versions.items())
-        }
         # Choose an environment to do the version-level bisection in; use directory names that
         # match it, and track what the initial versions of the different packages are
         if self.args.container_runtime == "local":
@@ -560,6 +555,40 @@ def gather_version_info(self, passing_url: str, failing_url: str):
             self.bisection_url = passing_url
             self.bisection_versions = original_passing_versions
             self.package_dirs = passing_package_dirs
+
+        # We only know how to handle software packages that have versions defined at
+        # both ends of the range.
+        inconsistent_keys = passing_versions.keys() ^ failing_versions.keys()
+        if len(inconsistent_keys):
+            self.logger.warning(
+                f"Ignoring packages that only have defined versions in one endpoint: {' '.join(inconsistent_keys)}"
+            )
+            for k in inconsistent_keys:
+                for d in [passing_versions, failing_versions]:
+                    d.pop(k, None)
+
+        # Not sure how to handle a package that does not have a defined version in the
+        # bisection environment but that is expected to be included in the bisection...
+        assert passing_versions.keys() == failing_versions.keys()
+        unknown_initial_packages = (
+            passing_versions.keys() - self.bisection_versions.keys()
+        )
+        assert len(unknown_initial_packages) == 0, (
+            passing_versions.keys(),
+            self.bisection_versions.keys(),
+        )
+
+        # Which packages have versions that are not always the same? There are three
+        # relevant sets of versions: the starting values in the bisection environment,
+        # the start/passing value for the bisection, and the end/failing value for the
+        # bisection.
+        static_packages = {
+            pkg
+            for pkg, _ in set(passing_versions.items())
+            & set(failing_versions.items())
+            & set(self.bisection_versions.items())
+        }
+        self.dynamic_packages = passing_versions.keys() - static_packages
         self.logger.info(f"Using {self.bisection_url} for version-level bisection...")
         assert self.package_dirs is not None
         # This is the set of versions that are already installed
diff --git a/.github/triage/tests/test_pyxis_backend.py b/.github/triage/tests/test_pyxis_backend.py
@@ -13,6 +13,13 @@
 mock_scripts_path = pathlib.Path(__file__).parent / "mock_scripts"
 
 
+pyxis_args = [
+    # Currently no way to use the pyxis backend without a cache
+    "--bazel-cache=https://example.com/does-not-exist",
+    "--container-runtime=pyxis",
+]
+
+
 def git_cmd(*args, cwd=None):
     return subprocess.run(
         ["git"] + list(args),
@@ -73,9 +80,17 @@ def failing_container(passing_container):
             git("commit", "--allow-empty", "-m", "C4")
             git("commit", "--allow-empty", "-m", "C5")
             git("commit", "--allow-empty", "-m", "C6")
+            c6 = git("rev-parse", "HEAD")
             if scenario == "non-linear":
+                # cherry-pick the feature on top of the good commit
+                git("checkout", metadata[f"{project}_good"])
+                git("cherry-pick", passing_container[f"{project}_feature_commit"])
+                metadata[f"{project}_good_with_feature"] = git("rev-parse", "HEAD")
                 git("checkout", "-b", "feature-2")
+                git("reset", "--hard", c6)
                 git("cherry-pick", passing_container[f"{project}_feature_commit"])
+            else:
+                metadata[f"{project}_good_with_feature"] = metadata[f"{project}_good"]
             metadata[f"{project}_failing_container"] = git("rev-parse", "HEAD")
         yield metadata
 
@@ -99,10 +114,7 @@ def test_mock_containers(
     # Ensure bazel, build-jax.sh, srun etc. stubs can be found.
     monkeypatch.setenv("PATH", str(mock_scripts_path), prepend=":")
     with tempfile.TemporaryDirectory() as output_prefix:
-        arg_list = [
-            # Currently no way to use the pyxis backend without a cache
-            "--bazel-cache=https://example.com/does-not-exist",
-            "--container-runtime=pyxis",
+        arg_list = pyxis_args + [
             "--output-prefix",
             output_prefix,
             "--passing-container",
@@ -144,6 +156,93 @@ def test_mock_containers(
         )
 
 
+def test_mock_containers_with_explicit_version_override(
+    monkeypatch,
+    passing_container,
+    failing_container,
+):
+    # The point of this test is that if you pass
+    #   --passing-container=<has X=a> --failing-container=<has X=c>
+    #   --passing-versions=X:b        --failing-versions=X:b
+    # then version b of package X must be checked out in the triage environment,
+    # even though X has the same version at both endpoints and is not itself triaged.
+
+    # fixed_package is forced to its good value by --{passing,failing}-versions, and
+    # the test command only passes if it has *exactly* that value -- which it doesn't
+    # initially have in either container. `_with_feature` is needed to make sure the
+    # fake build succeeds.
+    triage_package, fixed_package = compulsory_software[:2]
+    fixed_good_commit = failing_container[f"{fixed_package}_good_with_feature"]
+    # Tell the mock `srun` how to behave
+    monkeypatch.setenv("JAX_TOOLBOX_TRIAGE_MOCK_SRUN_NODES", str(1))
+    monkeypatch.setenv("JAX_TOOLBOX_TRIAGE_MOCK_SRUN_PROCS_PER_NODE", str(1))
+    # Ensure bazel, build-jax.sh, srun etc. stubs can be found.
+    monkeypatch.setenv("PATH", str(mock_scripts_path), prepend=":")
+    with tempfile.TemporaryDirectory() as output_prefix:
+        arg_list = pyxis_args + [
+            "--output-prefix",
+            output_prefix,
+            "--passing-container",
+            str(passing_container["prefix"]),
+            "--passing-versions",
+            f"{fixed_package}:{fixed_good_commit}",
+            "--failing-container",
+            str(failing_container["prefix"]),
+            "--failing-versions",
+            f"{fixed_package}:{fixed_good_commit}",
+            "--",
+            "sh",
+            "-c",
+            " && ".join(
+                [
+                    f'[ $(cd ${{JAX_TOOLBOX_TRIAGE_PREFIX}}/opt/{fixed_package} && git rev-parse HEAD) = "{fixed_good_commit}" ]',
+                    f"test-case.sh /opt/{triage_package} {failing_container[f'{triage_package}_bad']}",
+                ]
+            ),
+        ]
+        args = parse_args(arg_list)
+        logger = logging.getLogger()
+        logger.setLevel(logging.DEBUG)
+        tool = TriageTool(args, logger)
+        # Check the correct versions are extracted from the two pseudocontainers
+        passing_versions, failing_versions = tool.gather_version_info(
+            args.passing_container, args.failing_container
+        )
+        # These are not overridden, they are read from the containers
+        assert (
+            passing_versions[triage_package]
+            == passing_container[f"{triage_package}_passing_container"]
+        )
+        assert (
+            failing_versions[triage_package]
+            == failing_container[f"{triage_package}_failing_container"]
+        )
+        # These are overridden by --passing-versions and --failing-versions
+        assert passing_versions[fixed_package] == fixed_good_commit
+        assert failing_versions[fixed_package] == fixed_good_commit
+        # The starting value is not the value it is fixed to
+        assert tool.bisection_versions[fixed_package] != fixed_good_commit
+        assert tool.bisection_versions[fixed_package] in {
+            passing_container[f"{fixed_package}_passing_container"],
+            failing_container[f"{fixed_package}_failing_container"],
+        }
+        # fixed_package is dynamic, because its version needs to be changed from its starting value
+        assert fixed_package in tool.dynamic_packages
+        # triage_package is dynamic, because it is being triaged
+        assert triage_package in tool.dynamic_packages
+        # Run the bisection
+        summary_data = tool.run_version_bisection(passing_versions, failing_versions)
+        assert "result" in summary_data, summary_data
+        assert (
+            summary_data["result"][f"{triage_package}_good"]
+            == failing_container[f"{triage_package}_good"]
+        )
+        assert (
+            summary_data["result"][f"{triage_package}_bad"]
+            == failing_container[f"{triage_package}_bad"]
+        )
+
+
 @pytest.fixture
 def passing_container_with_bad_library(passing_container):
     scenario = passing_container["scenario"]
@@ -186,13 +285,9 @@ def test_triage_with_missing_installation_script_dir(
     monkeypatch.setenv("JAX_TOOLBOX_TRIAGE_MOCK_SRUN_PROCS_PER_NODE", "1")
     # Ensure the srun stub can be found
     monkeypatch.setenv("PATH", str(mock_scripts_path), prepend=":")
-    arg_list = [
-        # Currently no way to use the pyxis backend without a cache
-        "--bazel-cache=https://example.com/does-not-exist",
+    arg_list = pyxis_args + [
         "--build-scripts",
         "/path-does-not-exist",
-        "--container-runtime",
-        "pyxis",
         "--passing-container",
         str(passing_container["prefix"]),
         "--failing-container",
@@ -226,13 +321,9 @@ def test_triage_with_installation_scripts(
     # Ensure bazel, build-jax.sh, srun etc. stubs can be found.
     monkeypatch.setenv("PATH", str(mock_scripts_path), prepend=":")
     with tempfile.TemporaryDirectory() as output_prefix:
-        arg_list = [
-            # Currently no way to use the pyxis backend without a cache
-            "--bazel-cache=https://example.com/does-not-exist",
+        arg_list = pyxis_args + [
             "--build-scripts",
             "/build-scripts",
-            "--container-runtime",
-            "pyxis",
             "--output-prefix",
             output_prefix,
             "--passing-container",