|
303 | 303 | " # type (which corresponds to numpy's `float32` type), and it must be a\n", |
304 | 304 | " # static parameter (i.e. not a JAX array).\n", |
305 | 305 | " eps=np.float32(eps),\n", |
306 | | - " # The `vectorized` parameter controls this function's behavior under `vmap`\n", |
| 306 | + " # The `vmap_method` parameter controls this function's behavior under `vmap`\n", |
307 | 307 | " # as discussed below.\n", |
308 | | - " vectorized=True,\n", |
| 308 | + " vmap_method=\"broadcast_fullrank\",\n", |
309 | 309 | " )\n", |
310 | 310 | "\n", |
311 | 311 | "\n", |
|
325 | 325 | "Any attributes (defined using `Attr` in the C++ wrapper above) should be passed as keyword arguments to {func}`~jax.extend.ffi.ffi_call`.\n", |
326 | 326 | "Note that we explicitly cast `eps` to `np.float32` because our FFI library expects a C `float`, and we can't use `jax.numpy` here, because these parameters must be static arguments.\n", |
327 | 327 | "\n", |
328 | | - "The `vectorized` argument to {func}`~jax.extend.ffi.ffi_call` defines how this FFI call interacts with {func}`~jax.vmap` as described next.\n", |
| 328 | + "The `vmap_method` argument to {func}`~jax.extend.ffi.ffi_call` defines how this FFI call interacts with {func}`~jax.vmap` as described next.\n", |
329 | 329 | "\n", |
330 | 330 | "```{tip}\n", |
331 | 331 | "If you are familiar with the earlier \"custom call\" interface, you might be surprised that we're not passing the problem dimensions as parameters (batch size, etc.) to {func}`~jax.extend.ffi.ffi_call`.\n", |
|
336 | 336 | "(ffi-call-vmap)=\n", |
337 | 337 | "### Batching with `vmap`\n", |
338 | 338 | "\n", |
339 | | - "All uses of {func}`~jax.extend.ffi.ffi_call` support {func}`~jax.vmap` out of the box, but this implementation won't necessarily be very efficient.\n", |
340 | | - "By default, when `vmap`ped, an `ffi_call` will be rewritten as a {func}`~jax.lax.scan` with the `ffi_call` in the body.\n", |
341 | | - "This default implementation is general purpose, but it doesn't parallelize very well.\n", |
342 | | - "But, many FFI calls provide more efficient batching behavior and, in some simple cases, the `vectorized` parameter to {func}`~jax.extend.ffi.ffi_call` can be used to expose a better implementation.\n", |
| 339 | + "{func}`~jax.extend.ffi.ffi_call` supports some simple {func}`~jax.vmap` semantics out of the box using the `vmap_method` parameter.\n", |
| 340 | + "The docs for {func}`~jax.pure_callback` provide more details about the `vmap_method` parameter, and the same behavior applies to {func}`~jax.extend.ffi.ffi_call`.\n", |
343 | 341 | "\n", |
344 | | - "The specific assumption required to use the `vectorized` parameter is that all leading dimensions of the inputs should be treated as batch axes.\n", |
| 342 | + "The simplest `vmap_method` is `\"sequential\"`.\n", |
| 343 | + "In this case, when `vmap`ped, an `ffi_call` will be rewritten as a {func}`~jax.lax.scan` with the `ffi_call` in the body.\n", |
| 344 | + "This implementation is general purpose, but it doesn't parallelize very well.\n", |
| 345 | + "Many FFI calls provide more efficient batching behavior and, in some simple cases, the `\"broadcast\"` or `\"broadcast_fullrank\"` methods can be used to expose a better implementation.\n", |
| 346 | + "\n", |
| 347 | + "In this case, since we only have one input argument, `\"broadcast\"` and `\"broadcast_fullrank\"` actually have the same behavior.\n", |
| 348 | + "The specific assumption required to use these methods is that the foreign function knows how to handle batch dimensions.\n", |
345 | 349 | "Another way of saying this is that the result of calling `ffi_call` on the batched inputs is assumed to be equal to stacking the repeated application of `ffi_call` to each element in the batched input, roughly:\n", |
346 | 350 | "\n", |
347 | 351 | "```python\n", |
348 | 352 | "ffi_call(xs) == jnp.stack([ffi_call(x) for x in xs])\n", |
349 | 353 | "```\n", |
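| | + "\n", |
| | + "Since actually running this `ffi_call` requires the compiled extension library, the following is a self-contained sketch of the same contract using {func}`~jax.pure_callback`, which accepts the same `vmap_method` parameter; the NumPy `host_rms_norm` below is only an illustrative stand-in for the foreign function:\n", |
| | + "\n", |
| | + "```python\n", |
| | + "import jax\n", |
| | + "import jax.numpy as jnp\n", |
| | + "import numpy as np\n", |
| | + "\n", |
| | + "def host_rms_norm(x, eps=1e-5):\n", |
| | + "    # Stand-in for the foreign function; leading axes are batch axes.\n", |
| | + "    return x / np.sqrt(np.mean(np.square(x), axis=-1, keepdims=True) + eps)\n", |
| | + "\n", |
| | + "def rms_norm_callback(x):\n", |
| | + "    return jax.pure_callback(\n", |
| | + "        host_rms_norm, jax.ShapeDtypeStruct(x.shape, x.dtype), x,\n", |
| | + "        vmap_method=\"broadcast_fullrank\")\n", |
| | + "\n", |
| | + "xs = jnp.arange(15.0).reshape(3, 5)\n", |
| | + "assert jnp.allclose(jax.vmap(rms_norm_callback)(xs),\n", |
| | + "                    jnp.stack([rms_norm_callback(x) for x in xs]))\n", |
| | + "```\n", |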
350 | 354 | "\n", |
351 | | - "Our implementation of `rms_norm` has the appropriate semantics, and it supports `vmap` with `vectorized=True` out of the box:" |
| 355 | + "```{tip}\n", |
| 356 | + "Note that things get a bit more complicated when we have multiple input arguments.\n", |
| 357 | + "For simplicity, we will use the `\"broadcast_fullrank\"` method throughout this tutorial, which guarantees that all inputs will be broadcast to have the same batch dimensions, but it would also be possible to implement a foreign function to handle the `\"broadcast\"` method.\n", |
| 358 | + "The documentation for {func}`~jax.pure_callback` includes some examples of this, and a small sketch follows this tip.\n", |
| 359 | + "```\n", |
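| | + "\n", |
| | + "As a rough sketch of the difference (again using {func}`~jax.pure_callback` as a stand-in, with a hypothetical `host_add` function), the comments below show the shapes we expect the host function to receive under each method:\n", |
| | + "\n", |
| | + "```python\n", |
| | + "import jax\n", |
| | + "import jax.numpy as jnp\n", |
| | + "import numpy as np\n", |
| | + "\n", |
| | + "def host_add(x, y):\n", |
| | + "    print(f\"x.shape={x.shape}, y.shape={y.shape}\")\n", |
| | + "    return np.add(x, y)\n", |
| | + "\n", |
| | + "def add_with(vmap_method):\n", |
| | + "    def add(x, y):\n", |
| | + "        return jax.pure_callback(\n", |
| | + "            host_add, jax.ShapeDtypeStruct(x.shape, x.dtype), x, y,\n", |
| | + "            vmap_method=vmap_method)\n", |
| | + "    return add\n", |
| | + "\n", |
| | + "xs = jnp.arange(6.0).reshape(3, 2)\n", |
| | + "y = jnp.ones(2)\n", |
| | + "# \"broadcast_fullrank\": every argument is broadcast to the full batch\n", |
| | + "# shape, so we expect x.shape=(3, 2) and y.shape=(3, 2) on the host.\n", |
| | + "jax.vmap(add_with(\"broadcast_fullrank\"), in_axes=(0, None))(xs, y)\n", |
| | + "# \"broadcast\": unbatched arguments only gain a leading axis of size 1,\n", |
| | + "# so we expect x.shape=(3, 2) and y.shape=(1, 2) on the host.\n", |
| | + "jax.vmap(add_with(\"broadcast\"), in_axes=(0, None))(xs, y)\n", |
| | + "```\n", |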
| 360 | + "\n", |
| 361 | + "Our implementation of `rms_norm` has the appropriate semantics, and it supports `vmap` with `vmap_method=\"broadcast_fullrank\"` out of the box:" |
352 | 362 | ] |
353 | 363 | }, |
354 | 364 | { |
|
380 | 390 | "cell_type": "markdown", |
381 | 391 | "metadata": {}, |
382 | 392 | "source": [ |
383 | | - "If `vectorized` is `False` or omitted, `vmap`ping a `ffi_call` will fall back on a {func}`jax.lax.scan` with the `ffi_call` in the body:" |
| 393 | + "Using `vmap_method=\"sequential\"`, `vmap`ping an `ffi_call` will fall back on a {func}`~jax.lax.scan` with the `ffi_call` in the body:" |
384 | 394 | ] |
385 | 395 | }, |
386 | 396 | { |
|
389 | 399 | "metadata": {}, |
390 | 400 | "outputs": [], |
391 | 401 | "source": [ |
392 | | - "def rms_norm_not_vectorized(x, eps=1e-5):\n", |
| 402 | + "def rms_norm_sequential(x, eps=1e-5):\n", |
393 | 403 | " return jex.ffi.ffi_call(\n", |
394 | 404 | " \"rms_norm\",\n", |
395 | 405 | " jax.ShapeDtypeStruct(x.shape, x.dtype),\n", |
396 | 406 | " x,\n", |
397 | 407 | " eps=np.float32(eps),\n", |
398 | | - " vectorized=False, # This is the default behavior\n", |
| 408 | + " vmap_method=\"sequential\",\n", |
399 | 409 | " )\n", |
400 | 410 | "\n", |
401 | 411 | "\n", |
402 | | - "jax.make_jaxpr(jax.vmap(rms_norm_not_vectorized))(x)" |
| 412 | + "jax.make_jaxpr(jax.vmap(rms_norm_sequential))(x)" |
403 | 413 | ] |
404 | 414 | }, |
405 | 415 | { |
406 | 416 | "cell_type": "markdown", |
407 | 417 | "metadata": {}, |
408 | 418 | "source": [ |
409 | | - "If your foreign function provides an efficient batching rule that isn't supported by this simple `vectorized` parameter, it might also be possible to define more flexible custom `vmap` rules using the experimental `custom_vmap` interface, but it's worth also opening an issue describing your use case on [the JAX issue tracker](https://github.com/jax-ml/jax/issues)." |
| 419 | + "If your foreign function provides an efficient batching rule that isn't supported by this simple `vmap_method` parameter, it might also be possible to define more flexible custom `vmap` rules using the experimental `custom_vmap` interface, but it's worth also opening an issue describing your use case on [the JAX issue tracker](https://github.com/jax-ml/jax/issues).\n", |
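| | + "\n", |
| | + "To give a rough idea of that interface, here is a minimal sketch of registering a custom batching rule with `jax.custom_batching.custom_vmap` for a hypothetical unary `my_func` (an `ffi_call` could take the place of the unbatched implementation):\n", |
| | + "\n", |
| | + "```python\n", |
| | + "import jax\n", |
| | + "import jax.numpy as jnp\n", |
| | + "\n", |
| | + "@jax.custom_batching.custom_vmap\n", |
| | + "def my_func(x):\n", |
| | + "    # Unbatched implementation; an `ffi_call` could live here instead.\n", |
| | + "    return jnp.sin(x)\n", |
| | + "\n", |
| | + "@my_func.def_vmap\n", |
| | + "def my_func_vmap_rule(axis_size, in_batched, x):\n", |
| | + "    # `x` arrives with a leading batch axis when `in_batched[0]` is True;\n", |
| | + "    # return the batched result and which outputs carry a batch axis.\n", |
| | + "    x_batched, = in_batched\n", |
| | + "    return jnp.sin(x), x_batched\n", |
| | + "\n", |
| | + "jax.vmap(my_func)(jnp.arange(4.0))\n", |
| | + "```" |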
410 | 420 | ] |
411 | 421 | }, |
412 | 422 | { |
|
454 | 464 | " ),\n", |
455 | 465 | " x,\n", |
456 | 466 | " eps=np.float32(eps),\n", |
457 | | - " vectorized=True,\n", |
| 467 | + " vmap_method=\"broadcast_fullrank\",\n", |
458 | 468 | " )\n", |
459 | 469 | " return y, (res, x)\n", |
460 | 470 | "\n", |
|
471 | 481 | " res,\n", |
472 | 482 | " x,\n", |
473 | 483 | " ct,\n", |
474 | | - " vectorized=True,\n", |
| 484 | + " vmap_method=\"broadcast_fullrank\",\n", |
475 | 485 | " ),\n", |
476 | 486 | " )\n", |
477 | 487 | "\n", |
|
561 | 571 | " out_type,\n", |
562 | 572 | " x,\n", |
563 | 573 | " eps=np.float32(eps),\n", |
564 | | - " vectorized=True,\n", |
| 574 | + " vmap_method=\"broadcast_fullrank\",\n", |
565 | 575 | " )\n", |
566 | 576 | "\n", |
567 | 577 | " return jax.lax.platform_dependent(x, cpu=impl(\"rms_norm\"), cuda=impl(\"rms_norm_cuda\"))\n", |
|