Skip to content

Commit fd42cb2

Browse files
nevillelyh authored and claude committed
Avoid per-item numpy conversion in JaggedArrayStore write path
TreeStore.extend and extend_with_batch were converting each item to a numpy array individually before passing to JaggedArrayStore.extend, which then concatenated them. For a batch of 16K tokenized sequences this means 16K np.asarray calls + one np.concatenate. Add PreparedBatch.from_sequences() that pre-allocates a single flat array from the cumulative lengths and copies each sequence directly into the right slice. JaggedArrayStore.extend now detects Python sequences (lists) and uses this fast path automatically. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 493c9bb commit fd42cb2

3 files changed

Lines changed: 67 additions & 10 deletions

File tree

lib/levanter/src/levanter/store/jagged_array.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,35 @@ def num_rows(self):
7575
return len(self.offsets)
7676

7777
@staticmethod
78-
def from_batch(items: Sequence[np.ndarray], item_rank: Optional[int] = None) -> "PreparedBatch":
78+
def from_batch(
79+
items: Sequence, item_rank: Optional[int] = None, dtype: Optional[np.dtype] = None
80+
) -> "PreparedBatch":
81+
if items and not hasattr(items[0], "ndim"):
82+
if (item_rank or 1) == 1:
83+
return PreparedBatch._from_sequences(items, dtype)
84+
items = [np.asarray(x) for x in items]
7985
data, offsets, shapes = _prepare_batch(items, item_rank)
8086
return PreparedBatch(data, offsets, shapes)
8187

88+
@staticmethod
89+
def _from_sequences(items: Sequence[Sequence], dtype: Optional[np.dtype]) -> "PreparedBatch":
90+
"""Build from Python sequences without per-item numpy conversion.
91+
Pre-allocates a single flat array and copies each sequence into it."""
92+
lengths = np.array([len(item) for item in items], dtype=np.int64)
93+
offsets = np.cumsum(lengths)
94+
total = int(offsets[-1]) if len(offsets) else 0
95+
if dtype is None:
96+
# Infer from first non-empty item
97+
probe = np.asarray(items[0][:1]) if items and len(items[0]) > 0 else np.asarray([0])
98+
dtype = probe.dtype
99+
data = np.empty(total, dtype=dtype)
100+
pos = 0
101+
for item, length in zip(items, lengths):
102+
end = pos + int(length)
103+
data[pos:end] = item
104+
pos = end
105+
return PreparedBatch(data, offsets, None)
106+
82107
@staticmethod
83108
def concat(batches: Sequence["PreparedBatch"]) -> "PreparedBatch":
84109
data = np.concatenate([batch.data for batch in batches])
@@ -282,11 +307,11 @@ def trim_to_size(self, size: int):
282307
self._cached_num_rows = size
283308
self._cached_data_size = new_max
284309

285-
async def extend_async(self, arrays: Sequence[np.ndarray] | PreparedBatch):
310+
async def extend_async(self, arrays: Sequence[np.ndarray] | Sequence[Sequence] | PreparedBatch):
286311
if isinstance(arrays, PreparedBatch):
287312
prepared = arrays
288313
else:
289-
prepared = PreparedBatch.from_batch(arrays, self.item_rank)
314+
prepared = PreparedBatch.from_batch(arrays, self.item_rank, dtype=np.dtype(self.data.dtype.name))
290315
data = prepared.data
291316
new_offsets = prepared.offsets
292317
shapes = prepared.shapes
@@ -313,11 +338,11 @@ async def extend_async(self, arrays: Sequence[np.ndarray] | PreparedBatch):
313338
self._cached_num_rows = num_rows + num_added
314339
self._cached_data_size = current_data_size + len(data)
315340

316-
def extend(self, arrays: Sequence[np.ndarray] | PreparedBatch):
341+
def extend(self, arrays: Sequence[np.ndarray] | Sequence[Sequence] | PreparedBatch):
317342
if isinstance(arrays, PreparedBatch):
318343
prepared = arrays
319344
else:
320-
prepared = PreparedBatch.from_batch(arrays, self.item_rank)
345+
prepared = PreparedBatch.from_batch(arrays, self.item_rank, dtype=np.dtype(self.data.dtype.name))
321346

322347
data = prepared.data
323348
new_offsets = prepared.offsets

lib/levanter/src/levanter/store/tree_store.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def extend(self, batch: Sequence[T]):
7070
Append a batch of data to the store.
7171
"""
7272
jtu.tree_map(
73-
lambda writer, *xs: writer.extend([np.asarray(x) for x in xs]),
73+
lambda writer, *xs: writer.extend(xs),
7474
self.tree,
7575
*batch,
7676
is_leaf=heuristic_is_leaf,
@@ -84,7 +84,7 @@ def extend_with_batch(self, batch: T):
8484
For instance, HF's BatchEncoding is a dict of lists of numpy arrays.
8585
"""
8686
jtu.tree_map(
87-
lambda writer, xs: writer.extend(xs if isinstance(xs, PreparedBatch) else [np.asarray(x) for x in xs]),
87+
lambda writer, xs: writer.extend(xs if isinstance(xs, PreparedBatch) else xs),
8888
self.tree,
8989
batch,
9090
is_leaf=heuristic_is_leaf_batched,
@@ -98,9 +98,7 @@ async def extend_with_batch_async(self, batch: T):
9898
For instance, HF's BatchEncoding is a dict of lists of numpy arrays.
9999
"""
100100
futures = jtu.tree_map(
101-
lambda writer, xs: writer.extend_async(
102-
xs if isinstance(xs, PreparedBatch) else [np.asarray(x) for x in xs]
103-
),
101+
lambda writer, xs: writer.extend_async(xs if isinstance(xs, PreparedBatch) else xs),
104102
self.tree,
105103
batch,
106104
is_leaf=heuristic_is_leaf_batched,

lib/levanter/tests/test_jagged_array.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,5 +396,39 @@ async def test_get_batch_empty():
396396
assert batch == []
397397

398398

399+
def test_extend_with_python_lists():
    """JaggedArrayStore.extend should accept plain Python lists directly,
    routing them through the PreparedBatch._from_sequences fast path and
    storing exactly the same rows as numpy-array input would."""
    with tempfile.TemporaryDirectory() as tmpdir:
        store = JaggedArrayStore.open(tmpdir, item_rank=1, dtype=jnp.int32)

        rows = ([1, 2, 3], [4, 5], [6, 7, 8, 9])
        store.extend([list(r) for r in rows])

        assert len(store) == 3
        for idx, expected in enumerate(rows):
            np.testing.assert_array_equal(store[idx], np.array(expected))

        # A second extend must continue from the existing offsets.
        store.extend([[10, 11]])
        assert len(store) == 4
        np.testing.assert_array_equal(store[3], np.array([10, 11]))
417+
418+
419+
def test_from_batch_with_python_lists_matches_numpy():
    """PreparedBatch.from_batch must yield identical data and offsets whether
    fed Python lists (fast path) or pre-converted numpy arrays (generic path)."""
    raw = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]

    via_lists = PreparedBatch.from_batch(raw, dtype=np.int32)
    via_arrays = PreparedBatch.from_batch([np.array(r, dtype=np.int32) for r in raw])

    np.testing.assert_array_equal(via_lists.data, via_arrays.data)
    np.testing.assert_array_equal(via_lists.offsets, via_arrays.offsets)
    assert via_lists.shapes is None
    assert via_arrays.shapes is None
431+
432+
399433
if __name__ == "__main__":
400434
pytest.main()

0 commit comments

Comments
 (0)