Refactor: Implement batch padding for JIT optimization, migrate to plan.clipped_grad, and patch batch_selection for multi-dim support

debanganghosh08 · debanganghosh08 · commit c501354cba24 · 2026-04-05T12:22:28.000+05:30
diff --git a/examples/dp_sgd_transformer_nnx.py b/examples/dp_sgd_transformer_nnx.py
@@ -38,7 +38,7 @@
 import jax
 import jax.extend.backend
 import jax.numpy as jnp
-from jax_privacy import clipped_grad
+from jax_privacy import batch_selection
 from jax_privacy.experimental import execution_plan
 import numpy as np
 import optax
@@ -56,6 +56,7 @@
 NOISE_MULTIPLIER = 1.0
 EPSILON = 10.0
 DELTA = 1e-6
+PADDING_MULTIPLE = 8
 
 
 # Data loading and preparation
@@ -295,15 +296,6 @@ def main(argv: list[str]) -> None:
 
   opt_state = optimizer.init(params)
 
-  # Configure DP Gradient Clipping
-  grad_fn = clipped_grad(
-      functools.partial(pure_loss_fn, graphdef=graphdef, other=other),
-      l2_clip_norm=CLIP_NORM,
-      batch_argnums=(1, 2),  # x and y are batched
-      prng_argnum=3,  # Explicitly vmap the PRNG key per batch example
-      return_values=True,  # Return loss values for logging
-  )
-
   # Execution Plan Configuration
   dataset_size = len(data)
   config = execution_plan.BandMFExecutionPlanConfig.default(
@@ -318,13 +310,22 @@ def main(argv: list[str]) -> None:
   privatizer = plan.noise_addition_transform
   noise_state = privatizer.init(params)
 
+  # Configure DP Gradient Clipping
+  grad_fn = plan.clipped_grad(
+      functools.partial(pure_loss_fn, graphdef=graphdef, other=other),
+      batch_argnums=(1, 2),  # x and y are batched
+      prng_argnum=3,  # Explicitly vmap the PRNG key per batch example
+      return_values=True,  # Return loss values for logging
+  )
+
   @jax.jit(donate_argnums=(0, 1, 4))
   def train_step(
       params: nnx.State,
       opt_state: optax.OptState,
       batch: Tuple[jax.Array, jax.Array],
       prng_key: jax.Array,
       noise_state: Any,
+      is_padding_example: jax.Array,
   ) -> Tuple[nnx.State, optax.OptState, Any, jax.Array]:
     """Performs a single training step with DP-SGD.
 
@@ -334,20 +335,19 @@ def train_step(
       batch: A tuple (x, y) of input and target data.
       prng_key: A pseudorandom number generator key.
       noise_state: Current state of the noise mechanism.
+      is_padding_example: Boolean mask indicating padding rows.
 
     Returns:
       Updated params, opt_state, noise_state, and the mean loss for the batch.
     """
+    print(f"DEBUG: Compiling train_step for batch size {batch[0].shape[0]}")
     x, y = batch
 
-    # Handle zero-sized batch explicitly to avoid tracing crash in optax
-    if x.shape[0] == 0:
-      grads = jax.tree_util.tree_map(jnp.zeros_like, params)
-      mean_loss = jnp.array(0.0)
-    else:
-      # Compute clipped gradients and per-example loss values
-      grads, loss = grad_fn(params, x, y, prng_key)
-      mean_loss = loss.values.mean()
+    # Compute clipped gradients and per-example loss values
+    grads, loss = grad_fn(
+        params, x, y, prng_key, is_padding_example=is_padding_example
+    )
+    mean_loss = loss.values.mean()
 
     assert all(
         g.shape == p.shape
@@ -371,22 +371,17 @@ def train_step(
   iterator = plan.batch_selection_strategy.batch_iterator(dataset_size)
   prng_key = jax.random.key(42)
   for step, batch_indices in enumerate(iterator):
-    if step >= NUM_STEPS:
-      break
-
-    # Construct batch from indices
-    if len(batch_indices) == 0:
-      x = np.zeros((0, CONTEXT_LENGTH), dtype=np.int32)
-      y = np.zeros((0, CONTEXT_LENGTH), dtype=np.int32)
-    else:
-      batch_seqs = data[batch_indices]
-      x = batch_seqs[:, :-1]
-      y = batch_seqs[:, 1:]
+    idx = batch_selection.pad_to_multiple_of(batch_indices, PADDING_MULTIPLE)
+    is_padding_example = idx == -1
+    safe_idx = np.where(idx == -1, 0, idx)
+    batch_seqs = data[safe_idx]
+    x = batch_seqs[:, :-1]
+    y = batch_seqs[:, 1:]
     batch = (x, y)
 
     prng_key, subkey = jax.random.split(prng_key)
     params, opt_state, noise_state, loss = train_step(
-        params, opt_state, batch, subkey, noise_state
+        params, opt_state, batch, subkey, noise_state, is_padding_example
     )
 
     print(f"Step {step + 1}/{NUM_STEPS}, Loss: {loss:.4f}")
diff --git a/examples/user_level_transformer_example.py b/examples/user_level_transformer_example.py
@@ -36,8 +36,8 @@
 import jax.numpy as jnp
 import numpy as np
 import optax
+from jax_privacy import batch_selection
 from jax_privacy.batch_selection import UserSelectionStrategy
-from jax_privacy.clipping import clipped_grad
 from jax_privacy.experimental import execution_plan
 
 
@@ -49,6 +49,7 @@
 LEARNING_RATE = 1e-3
 EPSILON = 10.0
 DELTA = 1e-6
+PADDING_MULTIPLE = 8
 
 
 class TransformerDecoder(nn.Module):
@@ -142,25 +143,26 @@ def main(argv: list[str]) -> None:
   # We need the grad_fn first.
 
   # 3. Training Step & Clipping
-  def loss_fn(params, batch_data, batch_labels):
-    logits = model.apply({'params': params}, batch_data, train=True)
-    one_hot_labels = jax.nn.one_hot(batch_labels, num_classes=vocab_size)
+  def loss_fn(params, x, y, prng_key=None):
+    del prng_key  # Unused
+    logits = model.apply({'params': params}, x, train=True)
+    one_hot_labels = jax.nn.one_hot(y, num_classes=vocab_size)
     return jnp.mean(
         optax.softmax_cross_entropy(logits=logits, labels=one_hot_labels)
     )
 
-  grad_fn = clipped_grad(
-      loss_fn,
-      l2_clip_norm=L2_CLIP_NORM,
-      batch_argnums=(1, 2),
-      keep_batch_dim=False,
-  )
-
   # Create Plan
   plan = config.plan
   privatizer = plan.noise_addition_transform
   noise_state = privatizer.init(params)
 
+  grad_fn = plan.clipped_grad(
+      loss_fn,
+      batch_argnums=(1, 2),
+      prng_argnum=3,
+      return_values=True,
+  )
+
   # Wrap the plan's strategy with UserSelectionStrategy
   # We assume plan.batch_selection_strategy is compatible
   # (CyclicPoissonSampling)
@@ -170,32 +172,37 @@ def loss_fn(params, batch_data, batch_labels):
   )
 
   @jax.jit(donate_argnums=(0, 1, 4))
-  def train_step(params, opt_state, batch_data, batch_labels, noise_state):
-    grads = grad_fn(params, batch_data, batch_labels)
+  def train_step(
+      params, opt_state, x, y, noise_state, prng_key, is_padding_example
+  ):
+    print(f'DEBUG: Compiling train_step for batch size {x.shape[0]}')
+    grads, loss = grad_fn(
+        params, x, y, prng_key, is_padding_example=is_padding_example
+    )
 
     # Add Privacy Noise (Using plan's privatizer)
     noisy_grads, noise_state = privatizer.update(grads, noise_state)
 
     updates, opt_state = optimizer.update(noisy_grads, opt_state, params)
     params = optax.apply_updates(params, updates)
-    return params, opt_state, noise_state
+    return params, opt_state, noise_state, loss.values.mean()
 
   # 4. Training Loop
   start_time = time.time()
   batch_iterator = user_strategy.batch_iterator(user_ids, rng=0)
+  prng_key = jax.random.key(42)
   for step, user_batch_indices in enumerate(batch_iterator):
-    if user_batch_indices.size == 0:
-      print(f'Step {step}: Skipping empty batch.')
-      continue
-
-    batch_data = data[user_batch_indices]
-    batch_labels = labels[user_batch_indices]
-
-    # Calculate and print loss
-    loss_val = loss_fn(params, batch_data, batch_labels)
-
-    params, opt_state, noise_state = train_step(
-        params, opt_state, batch_data, batch_labels, noise_state
+    idx = batch_selection.pad_to_multiple_of(
+        user_batch_indices, PADDING_MULTIPLE
+    )
+    is_padding_example = idx[:, 0] == -1
+    safe_idx = np.where(idx == -1, 0, idx)
+    x = data[safe_idx]
+    y = labels[safe_idx]
+
+    prng_key, subkey = jax.random.split(prng_key)
+    params, opt_state, noise_state, loss_val = train_step(
+        params, opt_state, x, y, noise_state, subkey, is_padding_example
     )
     print(f'Step {step}: Loss: {loss_val:.4f}')
 
diff --git a/jax_privacy/batch_selection.py b/jax_privacy/batch_selection.py
@@ -157,13 +157,22 @@ def pad_to_multiple_of(indices: np.ndarray, multiple: int) -> np.ndarray:
   Returns:
-    A new 1D array of indices padded with -1.
+    Indices padded with -1 along axis 0 (the input itself if already aligned).
   """
-  if indices.ndim > 1:
-    raise ValueError('pad_to_multiple_of currently expects 1D indices.')
   if multiple <= 0:
     raise ValueError(f'Padding multiple must be positive, got {multiple}.')
   curr_size = indices.shape[0]
   pad_size = (multiple - curr_size) % multiple
-  new_indices = np.full(curr_size + pad_size, -1, dtype=indices.dtype)
+  if pad_size == 0:
+    return indices
+
+  pad_shape = (pad_size,) + indices.shape[1:]
+  if indices.ndim == 1:
+    new_indices = np.full(curr_size + pad_size, -1, dtype=indices.dtype)
+  else:
+    new_indices = np.full(pad_shape, -1, dtype=indices.dtype)
+
+  if indices.ndim > 1:
+    return np.concatenate([indices, new_indices], axis=0)
+
   new_indices[:curr_size] = indices
   return new_indices