Commit 402814b

Merge branch 'main' of https://github.com/jax-ml/jax

2 parents 5a5e219 + 302d803 commit 402814b

132 files changed: +3079 -1497 lines

.bazelrc

Lines changed: 5 additions & 0 deletions

@@ -96,6 +96,11 @@ build:avx_windows --copt=/arch:AVX
 
 build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=1
 
+# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
+build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
+build:mkl_aarch64_threadpool --@compute_library//:openmp=false
+build:mkl_aarch64_threadpool -c opt
+
 # Disable clang extension that rejects type definitions within offsetof.
 # This was added in clang-16 by https://reviews.llvm.org/D133574.
 # Can be removed once upb is updated, since a type definition is used within

.github/workflows/asan.yaml

Lines changed: 2 additions & 0 deletions

@@ -16,6 +16,8 @@ on:
 
 jobs:
   asan:
+    # Don't execute in fork due to runner type
+    if: github.repository == 'jax-ml/jax'
     runs-on: linux-x86-n2-64
     container:
       image: index.docker.io/library/ubuntu@sha256:b359f1067efa76f37863778f7b6d0e8d911e3ee8efa807ad01fbf5dc1ef9006b # ratchet:ubuntu:24.04

.github/workflows/ci-build.yaml

Lines changed: 2 additions & 0 deletions

@@ -42,6 +42,8 @@ jobs:
     - run: pre-commit run --show-diff-on-failure --color=always --all-files
 
   build:
+    # Don't execute in fork due to runner type
+    if: github.repository == 'jax-ml/jax'
     name: "build ${{ matrix.name-prefix }} (py ${{ matrix.python-version }} on ubuntu-20.04, x64=${{ matrix.enable-x64}})"
     runs-on: linux-x86-n2-32
     container:

.github/workflows/cloud-tpu-ci-nightly.yml

Lines changed: 3 additions & 3 deletions

@@ -13,7 +13,7 @@
 name: CI - Cloud TPU (nightly)
 on:
   schedule:
-    - cron: "0 */2 * * *" # Run every 2 hours
+    - cron: "0 2,14 * * *" # Run at 7am and 7pm PST
   workflow_dispatch: # allows triggering the workflow run manually
 # This should also be set to read-only in the project settings, but it's nice to
 # document and enforce the permissions here.
@@ -33,12 +33,12 @@ jobs:
         python-version: ["3.10"]
     name: "TPU test (jaxlib=${{ matrix.jaxlib-version }}, ${{ matrix.tpu.type }})"
     env:
-      LIBTPU_OLDEST_VERSION_DATE: 20240722
+      LIBTPU_OLDEST_VERSION_DATE: 20240922
       ENABLE_PJRT_COMPATIBILITY: ${{ matrix.jaxlib-version == 'nightly+oldest_supported_libtpu' }}
       PYTHON: python${{ matrix.python-version }}
     runs-on: ${{ matrix.tpu.runner }}
     container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
-    timeout-minutes: 120
+    timeout-minutes: 180
     defaults:
       run:
         shell: bash -ex {0}
Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
+# Cloud TPU CI (presubmit)
+#
+# This job currently runs as a non-blocking presubmit. It is experimental and is currently being
+# tested to get to a stable state before we enable it as a blocking presubmit.
+name: CI - Cloud TPU (presubmit)
+on:
+  workflow_dispatch:
+    inputs:
+      halt-for-connection:
+        description: 'Should this workflow run wait for a remote connection?'
+        type: choice
+        required: true
+        default: 'no'
+        options:
+        - 'yes'
+        - 'no'
+  pull_request:
+    branches:
+      - main
+
+# This should also be set to read-only in the project settings, but it's nice to
+# document and enforce the permissions here.
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  cloud-tpu-test:
+    if: github.event.repository.fork == false
+    strategy:
+      fail-fast: false # don't cancel all jobs on failure
+      matrix:
+        tpu: [
+          {type: "v5e-8", cores: "8", runner: "linux-x86-ct5lp-224-8tpu"}
+        ]
+        python-version: ["3.10"]
+
+    name: "TPU test (jaxlib=head, ${{ matrix.tpu.type }})"
+
+    env:
+      JAXCI_PYTHON: python${{ matrix.python-version }}
+      JAXCI_TPU_CORES: ${{ matrix.tpu.cores }}
+
+    runs-on: ${{ matrix.tpu.runner }}
+    container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
+
+    timeout-minutes: 60
+
+    defaults:
+      run:
+        shell: bash -ex {0}
+
+    steps:
+      # https://opensource.google/documentation/reference/github/services#actions
+      # mandates using a specific commit for non-Google actions. We use
+      # https://github.com/sethvargo/ratchet to pin specific versions.
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      # Checkout XLA at head, if we're building jaxlib at head.
+      - name: Checkout XLA at head
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: openxla/xla
+          path: xla
+      # We need to mark the GitHub workspace as safe as otherwise git commands will fail.
+      - name: Mark GitHub workspace as safe
+        run: |
+          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+      - name: Install JAX test requirements
+        run: |
+          $JAXCI_PYTHON -m pip install -U -r build/test-requirements.txt
+          $JAXCI_PYTHON -m pip install -U -r build/collect-profile-requirements.txt
+      - name: Build jaxlib at head with latest XLA
+        run: |
+          # Build and install jaxlib at head
+          $JAXCI_PYTHON build/build.py build --wheels=jaxlib \
+            --python_version=${{ matrix.python-version }} \
+            --bazel_options=--config=rbe_linux_x86_64 \
+            --local_xla_path="$(pwd)/xla" \
+            --verbose
+
+          # Install libtpu
+          $JAXCI_PYTHON -m pip install --pre libtpu \
+            -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+      # Halt for testing
+      - name: Wait For Connection
+        uses: google-ml-infra/actions/ci_connection@main
+        with:
+          halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      - name: Install jaxlib wheel and run tests
+        run: ./ci/run_pytest_tpu.sh

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ repos:
   - id: mypy
     files: (jax/|tests/typing_test\.py)
    exclude: jax/_src/basearray.py|jax/numpy/__init__.py # Use pyi instead
-    additional_dependencies: [types-requests==2.31.0, jaxlib, numpy~=2.1.0]
+    additional_dependencies: [types-requests==2.31.0, jaxlib, numpy>=2.2.0]
     args: [--config=pyproject.toml]
 
 - repo: https://github.com/mwouts/jupytext

CHANGELOG.md

Lines changed: 19 additions & 6 deletions

@@ -12,13 +12,26 @@ When releasing, please add the new-release-boilerplate to docs/pallas/CHANGELOG.
 
 ## jax 0.4.38
 
+* Changes:
+  * `jax.tree.flatten_with_path` and `jax.tree.map_with_path` are added
+    as shortcuts of the corresponding `tree_util` functions.
+
 * Deprecations
-  * a number of APIs in the internal `jax.core` namespace have been deprecated, including
-    `ClosedJaxpr`, `full_lower`, `Jaxpr`, `JaxprEqn`, `jaxpr_as_fun`, `lattice_join`,
-    `Literal`, `Primitive`, `raise_to_shaped`, `Token`, `Var`. Most can be replaced by
-    APIs of the same name in {mod}`jax.extend.core`; see the documentation for
-    {mod}`jax.extend` for information on the compatibility guarantees of these
-    semi-public extensions.
+  * a number of APIs in the internal `jax.core` namespace have been deprecated.
+    Most were no-ops, were little-used, or can be replaced by APIs of the same
+    name in {mod}`jax.extend.core`; see the documentation for {mod}`jax.extend`
+    for information on the compatibility guarantees of these semi-public extensions.
+  * Several previously-deprecated APIs have been removed, including:
+    * from {mod}`jax.core`: `check_eqn`, `check_type`, `check_valid_jaxtype`, and
+      `non_negative_dim`.
+    * from {mod}`jax.lib.xla_bridge`: `xla_client` and `default_backend`.
+    * from {mod}`jax.lib.xla_client`: `_xla` and `bfloat16`.
+    * from {mod}`jax.numpy`: `round_`.
+
+* New Features
+  * {func}`jax.export.export` can be used for device-polymorphic export with
+    shardings constructed with {func}`jax.sharding.AbstractMesh`.
+    See the [jax.export documentation](https://jax.readthedocs.io/en/latest/export/export.html#device-polymorphic-export).
 
 ## jax 0.4.37 (Dec 9, 2024)
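The new `jax.tree` aliases behave like their `jax.tree_util` counterparts. A minimal usage sketch (assuming jax 0.4.38; the `params` dict and the doubling rule are illustrative):

    import jax

    params = {"b": 2.0, "w": 3.0}

    # flatten_with_path returns (path, leaf) pairs plus a treedef,
    # like jax.tree_util.tree_flatten_with_path.
    path_leaf_pairs, treedef = jax.tree.flatten_with_path(params)
    for path, leaf in path_leaf_pairs:
        print(jax.tree_util.keystr(path), "->", leaf)  # ['b'] -> 2.0, ['w'] -> 3.0

    # map_with_path passes each leaf's path to the mapped function,
    # like jax.tree_util.tree_map_with_path.
    doubled_w = jax.tree.map_with_path(
        lambda path, x: 2 * x if jax.tree_util.keystr(path) == "['w']" else x,
        params)
    assert doubled_w == {"b": 2.0, "w": 6.0}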

build/build.py

Lines changed: 4 additions & 1 deletion

@@ -485,7 +485,10 @@ async def main():
 
   if not args.disable_mkl_dnn:
     logging.debug("Enabling MKL DNN")
-    wheel_build_command.append("--config=mkl_open_source_only")
+    if target_cpu == "aarch64":
+      wheel_build_command.append("--config=mkl_aarch64_threadpool")
+    else:
+      wheel_build_command.append("--config=mkl_open_source_only")
 
   if args.target_cpu_features == "release":
     if arch in ["x86_64", "AMD64"]:
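The effect of this branch, isolated as a standalone sketch (the helper name is hypothetical; the config names come from the `.bazelrc` change in this commit):

    # Hypothetical helper isolating the selection logic added above.
    def mkl_dnn_config(target_cpu: str) -> str:
        # aarch64 wheels build oneDNN with ACL on a threadpool (OpenMP disabled
        # via the new mkl_aarch64_threadpool config); other targets keep the
        # open-source oneDNN contraction kernels.
        if target_cpu == "aarch64":
            return "--config=mkl_aarch64_threadpool"
        return "--config=mkl_open_source_only"

    assert mkl_dnn_config("aarch64") == "--config=mkl_aarch64_threadpool"
    assert mkl_dnn_config("x86_64") == "--config=mkl_open_source_only"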

ci/run_pytest_tpu.sh

Lines changed: 5 additions & 4 deletions

@@ -40,20 +40,21 @@ source "ci/utilities/setup_build_environment.sh"
 strings /usr/local/lib/"$JAXCI_PYTHON"/dist-packages/libtpu/libtpu.so | grep 'Built on'
 "$JAXCI_PYTHON" -c 'import jax; print("libtpu version:",jax.lib.xla_bridge.get_backend().platform_version)'
 
-# Set up common test environment variables
+# Set up all common test environment variables
 export PY_COLORS=1
-export JAX_SKIP_SLOW_TESTS=true
 export JAX_PLATFORMS=tpu,cpu
+export JAX_SKIP_SLOW_TESTS=true
 # End of common test environment variable setup
 
 echo "Running TPU tests..."
+
 # Run single-accelerator tests in parallel
 JAX_ENABLE_TPU_XDIST=true "$JAXCI_PYTHON" -m pytest -n="$JAXCI_TPU_CORES" --tb=short \
   --deselect=tests/pallas/tpu_pallas_test.py::PallasCallPrintTest \
-  --maxfail=20 -m "not multiaccelerator" tests examples
+  --maxfail=20 -m "not multiaccelerator" tests/pallas/tpu_ops_test.py
 
 # Run Pallas printing tests, which need to run with I/O capturing disabled.
 TPU_STDERR_LOG_LEVEL=0 "$JAXCI_PYTHON" -m pytest -s tests/pallas/tpu_pallas_test.py::PallasCallPrintTest
 
 # Run multi-accelerator across all chips
-"$JAXCI_PYTHON" -m pytest --tb=short --maxfail=20 -m "multiaccelerator" tests
+"$JAXCI_PYTHON" -m pytest --tb=short --maxfail=20 -m "multiaccelerator" tests/pjit_test.py

docs/export/export.md

Lines changed: 38 additions & 13 deletions

@@ -240,7 +240,7 @@ present on the exporting machine:
 
 ```
 
-There is a safety check that will be raise an error when trying to compile
+There is a safety check that will raise an error when trying to compile
 an `Exported` object on a machine that does not have the accelerator
 for which the code was exported.
 
@@ -326,7 +326,7 @@ combinations of input shapes.
 
 See the {ref}`shape_poly` documentation.
 
-## Device polymorphic export
+## Device-polymorphic export
 
 An exported artifact may contain sharding annotations for inputs,
 outputs and for some intermediates, but these annotations do not refer
@@ -335,20 +335,28 @@ Instead, the sharding annotations refer to logical devices. This
 means that you can compile and run the exported artifacts on different
 physical devices that were used for exporting.
 
+The cleanest way to achieve a device-polymorphic export is to
+use shardings constructed with a `jax.sharding.AbstractMesh`,
+which contains only the mesh shape and axis names. But,
+you can achieve the same results if you use shardings
+constructed for a mesh with concrete devices, since the actual
+devices in the mesh are ignored for tracing and lowering:
+
 ```python
 >>> import jax
 >>> from jax import export
->>> from jax.sharding import Mesh, NamedSharding
+>>> from jax.sharding import AbstractMesh, Mesh, NamedSharding
 >>> from jax.sharding import PartitionSpec as P
+>>>
+>>> # Use an AbstractMesh for exporting
+>>> export_mesh = AbstractMesh((("a", 4),))
 
->>> # Use the first 4 devices for exporting.
->>> export_devices = jax.local_devices()[:4]
->>> export_mesh = Mesh(export_devices, ("a",))
 >>> def f(x):
 ...   return x.T
 
->>> arg = jnp.arange(8 * len(export_devices))
->>> exp = export.export(jax.jit(f, in_shardings=(NamedSharding(export_mesh, P("a")),)))(arg)
+>>> exp = export.export(jax.jit(f))(
+...     jax.ShapeDtypeStruct((32,), dtype=np.int32,
+...                          sharding=NamedSharding(export_mesh, P("a"))))
 
 >>> # `exp` knows for how many devices it was exported.
 >>> exp.nr_devices
@@ -359,8 +367,20 @@ physical devices that were used for exporting.
 >>> exp.in_shardings_hlo
 ({devices=[4]<=[4]},)
 
+>>> # You can also use a concrete set of devices for exporting
+>>> concrete_devices = jax.local_devices()[:4]
+>>> concrete_mesh = Mesh(concrete_devices, ("a",))
+>>> exp2 = export.export(jax.jit(f))(
+...     jax.ShapeDtypeStruct((32,), dtype=np.int32,
+...                          sharding=NamedSharding(concrete_mesh, P("a"))))
+
+>>> # You can expect the same results
+>>> assert exp.in_shardings_hlo == exp2.in_shardings_hlo
+
+>>> # When you call an Exported, you must use a concrete set of devices
+>>> arg = jnp.arange(8 * 4)
 >>> res1 = exp.call(jax.device_put(arg,
-...                                NamedSharding(export_mesh, P("a"))))
+...                                NamedSharding(concrete_mesh, P("a"))))
 
 >>> # Check out the first 2 shards of the result
 >>> [f"device={s.device} index={s.index}" for s in res1.addressable_shards[:2]]
@@ -397,9 +417,11 @@ of devices than it was exported for:
 >>> def f(x):
 ...   return x.T
 
->>> arg = jnp.arange(4 * len(export_devices))
->>> exp = export.export(jax.jit(f, in_shardings=(NamedSharding(export_mesh, P("a")),)))(arg)
+>>> exp = export.export(jax.jit(f))(
+...     jax.ShapeDtypeStruct((4 * len(export_devices),), dtype=np.int32,
+...                          sharding=NamedSharding(export_mesh, P("a"))))
 
+>>> arg = jnp.arange(4 * len(export_devices))
 >>> exp.call(arg) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 ValueError: Exported module f was lowered for 8 devices and is called in a context with 1 devices. This is disallowed because: the module was lowered for more than 1 device.
@@ -420,13 +442,16 @@ artifacts using a new mesh constructed at the call site:
 >>> def f(x):
 ...   return x.T
 
->>> arg = jnp.arange(4 * len(export_devices))
->>> exp = export.export(jax.jit(f, in_shardings=(NamedSharding(export_mesh, P("a")),)))(arg)
+
+>>> exp = export.export(jax.jit(f))(
+...     jax.ShapeDtypeStruct((4 * len(export_devices),), dtype=np.int32,
+...                          sharding=NamedSharding(export_mesh, P("a"))))
 
 >>> # Prepare the mesh for calling `exp`.
 >>> calling_mesh = Mesh(np.array(export_devices[::-1]), ("b",))
 
 >>> # Shard the arg according to what `exp` expects.
+>>> arg = jnp.arange(4 * len(export_devices))
 >>> sharded_arg = jax.device_put(arg, exp.in_shardings_jax(calling_mesh)[0])
 >>> res = exp.call(sharded_arg)
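For reference, a consolidated, self-contained version of the AbstractMesh flow added in these docs (a sketch assuming jax 0.4.38 with at least 4 addressable devices available at call time; `calling_mesh` is illustrative):

    import jax
    import jax.numpy as jnp
    import numpy as np
    from jax import export
    from jax.sharding import AbstractMesh, Mesh, NamedSharding
    from jax.sharding import PartitionSpec as P

    # Export against an abstract 4-device mesh: no physical devices needed yet.
    export_mesh = AbstractMesh((("a", 4),))
    exp = export.export(jax.jit(lambda x: x.T))(
        jax.ShapeDtypeStruct((32,), dtype=np.int32,
                             sharding=NamedSharding(export_mesh, P("a"))))

    # Calling requires a concrete mesh with the same number of devices.
    calling_mesh = Mesh(np.array(jax.local_devices()[:4]), ("a",))
    arg = jnp.arange(32, dtype=jnp.int32)
    sharded_arg = jax.device_put(arg, exp.in_shardings_jax(calling_mesh)[0])
    res = exp.call(sharded_arg)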
