Skip to content

Commit 7061b2f

Browse files
committed
Add cuDNN LSTM for JAX backend
1 parent e65cfb8 commit 7061b2f

File tree

1 file changed

+148
-4
lines changed

1 file changed

+148
-4
lines changed

keras/src/backend/jax/rnn.py

Lines changed: 148 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -211,12 +211,156 @@ def _step(states, current_input):
211211
return last_output, outputs, new_states
212212

213213

214-
def cudnn_ok(*args, **kwargs):
215-
return False
214+
def _is_gpu_available():
215+
import jax
216216

217+
return any(d.platform == "gpu" for d in jax.devices())
217218

218-
def lstm(*args, **kwargs):
219-
raise NotImplementedError
219+
220+
def cudnn_ok(
    activation,
    recurrent_activation,
    unroll,
    use_bias=True,
):
    """Whether the cuDNN LSTM kernel is usable for this configuration.

    cuDNN only implements the canonical LSTM (tanh / sigmoid, with
    biases, not unrolled) and requires a GPU device to be present.
    """
    from keras.src import activations
    from keras.src import ops

    # Guard clauses; each mirrors one term of the original conjunction.
    if unroll:
        return False
    if not use_bias:
        return False
    if activation not in (activations.tanh, jnp.tanh, ops.tanh):
        return False
    if recurrent_activation not in (activations.sigmoid, ops.sigmoid):
        return False
    return _is_gpu_available()
237+
238+
239+
def _assert_valid_mask(mask):
240+
max_seq_length = mask.shape[1]
241+
count_of_true = jnp.sum(mask.astype(jnp.int32), axis=1)
242+
indices = jnp.broadcast_to(
243+
jnp.arange(max_seq_length), mask.shape
244+
)
245+
right_padded_mask = indices < count_of_true[:, None]
246+
is_right_padded = jnp.all(mask == right_padded_mask)
247+
has_fully_masked = jnp.any(jnp.all(~mask, axis=1))
248+
249+
if not (is_right_padded & ~has_fully_masked):
250+
raise ValueError(
251+
"You are passing a RNN mask that does not correspond to "
252+
"right-padded sequences, while using cuDNN, which is not "
253+
"supported. With cuDNN, RNN masks can only be used for "
254+
"right-padding, e.g. `[[True, True, False, False]]` would "
255+
"be a valid mask, but any mask that isn't just contiguous "
256+
"`True`'s on the left and contiguous `False`'s on the right "
257+
"would be invalid. You can pass `use_cudnn=False` to your "
258+
"RNN layer to stop using cuDNN (this may be slower)."
259+
)
260+
261+
262+
def lstm(
    inputs,
    initial_state_h,
    initial_state_c,
    mask,
    kernel,
    recurrent_kernel,
    bias,
    activation,
    recurrent_activation,
    return_sequences=False,
    go_backwards=False,
    unroll=False,
):
    """Run a single-layer LSTM through the cuDNN kernel exposed by JAX.

    Args:
        inputs: `(batch, time, input_size)` sequence tensor.
        initial_state_h / initial_state_c: `(batch, hidden)` (or already
            `(1, batch, hidden)`) initial states.
        mask: optional boolean mask; must be right-padded (see
            `_assert_valid_mask`).
        kernel / recurrent_kernel / bias: Keras LSTM weights, gate order
            `[i, f, c, o]`.

    Returns:
        `(last_output, outputs, [h_n, c_n])`, matching the generic
        backend RNN contract.

    Raises:
        NotImplementedError: whenever the cuDNN path cannot be used; the
            caller is expected to fall back to the generic (lax.scan)
            implementation.
    """
    if not cudnn_ok(
        activation,
        recurrent_activation,
        unroll,
        use_bias=bias is not None,
    ):
        raise NotImplementedError

    try:
        from jax.experimental.rnn import lstm as jax_lstm
    except ImportError:
        raise NotImplementedError

    input_size = kernel.shape[0]
    hidden_size = recurrent_kernel.shape[0]
    batch_size = inputs.shape[0]

    # Derive per-row sequence lengths from the mask BEFORE any flipping:
    # `_assert_valid_mask` only accepts right-padded masks, and flipping
    # a right-padded mask turns it into a left-padded one that would be
    # (confusingly) rejected.
    if mask is not None:
        mask = jnp.asarray(mask).astype(jnp.bool_)
        if mask.ndim == 3:
            mask = mask[:, :, 0]
        _assert_valid_mask(mask)
        seq_lengths = jnp.sum(mask.astype(jnp.int32), axis=1)
    else:
        seq_lengths = jnp.full((batch_size,), inputs.shape[1], dtype=jnp.int32)

    if go_backwards:
        if mask is not None:
            # A plain flip of padded inputs would move the padding to the
            # front, which cuDNN cannot express via `seq_lengths`; defer
            # masked go_backwards to the non-cuDNN fallback.
            raise NotImplementedError
        inputs = jnp.flip(inputs, axis=1)

    # Transpose Keras kernels to cuDNN layout and flatten.
    # Gate order [i, f, c, o] matches cuDNN [i, f, g, o].
    W_ih = jnp.asarray(kernel).T
    W_hh = jnp.asarray(recurrent_kernel).T

    if bias is not None:
        b_ih = jnp.asarray(bias)
    else:
        b_ih = jnp.zeros(4 * hidden_size)
    # Keras has a single bias vector; cuDNN adds b_ih and b_hh, so the
    # recurrent bias is kept at zero.
    b_hh = jnp.zeros_like(b_ih)

    # cuDNN flat weight order: [W_ih, W_hh, b_ih, b_hh]
    weights = jnp.concatenate(
        [W_ih.ravel(), W_hh.ravel(), b_ih.ravel(), b_hh.ravel()]
    )

    # cuDNN expects (num_layers * num_directions, batch, hidden)
    h_0 = jnp.asarray(initial_state_h)
    c_0 = jnp.asarray(initial_state_c)
    if h_0.ndim == 2:
        h_0 = h_0[jnp.newaxis]
        c_0 = c_0[jnp.newaxis]

    try:
        y, h_n, c_n = jax_lstm(
            inputs,
            h_0,
            c_0,
            weights,
            seq_lengths,
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            dropout=0.0,
            bidirectional=False,
        )
    except Exception:
        # Any runtime failure in the experimental kernel triggers the
        # generic fallback rather than crashing the layer.
        raise NotImplementedError

    # y: (batch, seq_len, hidden), h_n/c_n: (1, batch, hidden)
    h_n = h_n.squeeze(0)
    c_n = c_n.squeeze(0)

    if mask is not None:
        # With seq_lengths, cuDNN's final state is taken at each row's
        # true last step, unlike y[:, -1] which may read padding.
        last_output = h_n
    else:
        last_output = y[:, -1]

    if not return_sequences:
        outputs = last_output[:, jnp.newaxis, :]
    else:
        outputs = y

    if go_backwards and return_sequences:
        # Restore the original time order of the emitted sequence.
        outputs = jnp.flip(outputs, axis=1)

    return last_output, outputs, [h_n, c_n]
220364

221365

222366
def gru(*args, **kwargs):

0 commit comments

Comments
 (0)