Skip to content

Commit b95eddb

Browse files
nevillelyh and claude committed
Avoid per-item numpy conversion in JaggedArrayStore write path
TreeStore.extend and extend_with_batch were converting each item to a numpy array individually before passing to JaggedArrayStore.extend, which then concatenated them. For a batch of 16K tokenized sequences this means 16K np.asarray calls + one np.concatenate. Add PreparedBatch.from_sequences() that pre-allocates a single flat array from the cumulative lengths and copies each sequence directly into the right slice. JaggedArrayStore.extend now detects Python sequences (lists) and uses this fast path automatically. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 493c9bb commit b95eddb

4 files changed

Lines changed: 64 additions & 56 deletions

File tree

lib/levanter/src/levanter/store/jagged_array.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,30 @@ def num_rows(self):
7575
return len(self.offsets)
7676

7777
@staticmethod
78-
def from_batch(items: Sequence[np.ndarray], item_rank: Optional[int] = None) -> "PreparedBatch":
78+
def from_batch(items: Sequence, item_rank: Optional[int] = None) -> "PreparedBatch":
79+
if items and not hasattr(items[0], "ndim"):
80+
if (item_rank or 1) == 1:
81+
return PreparedBatch._from_sequences(items)
82+
items = [np.asarray(x) for x in items]
7983
data, offsets, shapes = _prepare_batch(items, item_rank)
8084
return PreparedBatch(data, offsets, shapes)
8185

86+
@staticmethod
87+
def _from_sequences(items: Sequence[Sequence]) -> "PreparedBatch":
88+
"""Build from Python sequences without per-item numpy conversion.
89+
Pre-allocates a single flat array and copies each sequence into it."""
90+
lengths = np.array([len(item) for item in items], dtype=np.int64)
91+
offsets = np.cumsum(lengths)
92+
total = int(offsets[-1]) if len(offsets) else 0
93+
dtype = np.result_type(items[0][0]) if items and len(items[0]) > 0 else np.int64
94+
data = np.empty(total, dtype=dtype)
95+
pos = 0
96+
for item, length in zip(items, lengths):
97+
end = pos + int(length)
98+
data[pos:end] = item
99+
pos = end
100+
return PreparedBatch(data, offsets, None)
101+
82102
@staticmethod
83103
def concat(batches: Sequence["PreparedBatch"]) -> "PreparedBatch":
84104
data = np.concatenate([batch.data for batch in batches])
@@ -205,10 +225,10 @@ async def data_size_async(self):
205225
self._cached_data_size = result
206226
return result
207227

208-
async def append_async(self, data: np.ndarray):
228+
async def append_async(self, data: Sequence):
209229
await self.extend_async([data])
210230

211-
def append(self, data: np.ndarray):
231+
def append(self, data: Sequence):
212232
self.extend([data])
213233

214234
async def trim_to_size_async(self, size: int):
@@ -282,7 +302,7 @@ def trim_to_size(self, size: int):
282302
self._cached_num_rows = size
283303
self._cached_data_size = new_max
284304

285-
async def extend_async(self, arrays: Sequence[np.ndarray] | PreparedBatch):
305+
async def extend_async(self, arrays: Sequence[Sequence] | PreparedBatch):
286306
if isinstance(arrays, PreparedBatch):
287307
prepared = arrays
288308
else:
@@ -313,7 +333,7 @@ async def extend_async(self, arrays: Sequence[np.ndarray] | PreparedBatch):
313333
self._cached_num_rows = num_rows + num_added
314334
self._cached_data_size = current_data_size + len(data)
315335

316-
def extend(self, arrays: Sequence[np.ndarray] | PreparedBatch):
336+
def extend(self, arrays: Sequence[Sequence] | PreparedBatch):
317337
if isinstance(arrays, PreparedBatch):
318338
prepared = arrays
319339
else:

lib/levanter/src/levanter/store/tree_store.py

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from haliax.jax_utils import is_jax_array_like
1313
from jaxtyping import PyTree
1414

15-
from .jagged_array import JaggedArrayStore, PreparedBatch
15+
from .jagged_array import JaggedArrayStore
1616

1717
T = TypeVar("T", bound=PyTree)
1818

@@ -50,10 +50,6 @@ def __init__(self, tree, path: str, mode: str):
5050
self.mode = mode
5151
self.tree = tree
5252

53-
@property
54-
def batch_preparer(self):
55-
return TreeBatchPreparer(jtu.tree_map(lambda writer: 9, self.tree, is_leaf=heuristic_is_leaf))
56-
5753
@staticmethod
5854
def open(exemplar: T, path: str, *, mode="a", cache_metadata: bool = False) -> "TreeStore":
5955
"""
@@ -70,7 +66,7 @@ def extend(self, batch: Sequence[T]):
7066
Append a batch of data to the store.
7167
"""
7268
jtu.tree_map(
73-
lambda writer, *xs: writer.extend([np.asarray(x) for x in xs]),
69+
lambda writer, *xs: writer.extend(xs),
7470
self.tree,
7571
*batch,
7672
is_leaf=heuristic_is_leaf,
@@ -84,7 +80,7 @@ def extend_with_batch(self, batch: T):
8480
For instance, HF's BatchEncoding is a dict of lists of numpy arrays.
8581
"""
8682
jtu.tree_map(
87-
lambda writer, xs: writer.extend(xs if isinstance(xs, PreparedBatch) else [np.asarray(x) for x in xs]),
83+
lambda writer, xs: writer.extend(xs),
8884
self.tree,
8985
batch,
9086
is_leaf=heuristic_is_leaf_batched,
@@ -98,9 +94,7 @@ async def extend_with_batch_async(self, batch: T):
9894
For instance, HF's BatchEncoding is a dict of lists of numpy arrays.
9995
"""
10096
futures = jtu.tree_map(
101-
lambda writer, xs: writer.extend_async(
102-
xs if isinstance(xs, PreparedBatch) else [np.asarray(x) for x in xs]
103-
),
97+
lambda writer, xs: writer.extend_async(xs),
10498
self.tree,
10599
batch,
106100
is_leaf=heuristic_is_leaf_batched,
@@ -205,16 +199,3 @@ def _render_path_elem(x):
205199
return f"{i}"
206200
case _:
207201
return str(x)
208-
209-
210-
class TreeBatchPreparer(Generic[T]):
211-
def __init__(self, exemplar: T):
212-
self.exemplar = exemplar
213-
214-
def __call__(self, batch: List[T]) -> PyTree:
215-
return jtu.tree_map(
216-
lambda _, *xs: PreparedBatch.from_batch([np.asarray(x) for x in xs]),
217-
self.exemplar,
218-
*batch,
219-
is_leaf=heuristic_is_leaf,
220-
)

lib/levanter/tests/test_jagged_array.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,5 +396,40 @@ async def test_get_batch_empty():
396396
assert batch == []
397397

398398

399+
def test_extend_with_python_lists():
400+
"""Extending a JaggedArrayStore with Python lists should use the fast path
401+
(PreparedBatch.from_sequences) and produce identical results to numpy arrays."""
402+
with tempfile.TemporaryDirectory() as tmpdir:
403+
builder = JaggedArrayStore.open(tmpdir, item_rank=1, dtype=jnp.int64)
404+
405+
lists = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
406+
builder.extend(lists)
407+
408+
assert len(builder) == 3
409+
np.testing.assert_array_equal(builder[0], np.array([1, 2, 3]))
410+
np.testing.assert_array_equal(builder[1], np.array([4, 5]))
411+
np.testing.assert_array_equal(builder[2], np.array([6, 7, 8, 9]))
412+
413+
# Extend again to verify offsets accumulate correctly
414+
builder.extend([[10, 11]])
415+
assert len(builder) == 4
416+
np.testing.assert_array_equal(builder[3], np.array([10, 11]))
417+
418+
419+
def test_from_batch_with_python_lists_matches_numpy():
420+
"""PreparedBatch.from_batch with Python lists should produce the same result as with numpy arrays."""
421+
lists = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
422+
arrays = [np.array(lst, dtype=np.int64) for lst in lists]
423+
424+
from_lists = PreparedBatch.from_batch(lists)
425+
from_arrays = PreparedBatch.from_batch(arrays)
426+
427+
# dtype may differ (int64 inferred vs int32 explicit) but values must match
428+
np.testing.assert_array_equal(from_lists.data, from_arrays.data)
429+
np.testing.assert_array_equal(from_lists.offsets, from_arrays.offsets)
430+
assert from_lists.shapes is None
431+
assert from_arrays.shapes is None
432+
433+
399434
if __name__ == "__main__":
400435
pytest.main()

lib/levanter/tests/test_tree_store.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -257,34 +257,6 @@ def test_reading_from_written():
257257
pytest.fail("Unexpected index")
258258

259259

260-
def test_using_prepared_batches():
261-
with tempfile.TemporaryDirectory() as tmpdir:
262-
exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)}
263-
builder = TreeStore.open(exemplar, tmpdir, mode="w")
264-
preparer = builder.batch_preparer
265-
266-
batch = [
267-
{"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])},
268-
{"a": np.array([5.0, 6.0]), "b": np.array([7.0, 8.0])},
269-
]
270-
batch = preparer(batch)
271-
builder.extend_with_batch(batch)
272-
273-
del builder
274-
275-
builder2 = TreeStore.open(exemplar, tmpdir, mode="r")
276-
277-
for i, result in enumerate(builder2):
278-
if i == 0:
279-
assert np.all(result["a"] == np.array([1.0, 2.0]))
280-
assert np.all(result["b"] == np.array([3.0, 4.0]))
281-
elif i == 1:
282-
assert np.all(result["a"] == np.array([5.0, 6.0]))
283-
assert np.all(result["b"] == np.array([7.0, 8.0]))
284-
else:
285-
pytest.fail("Unexpected index")
286-
287-
288260
def test_resolve_changed_cache_size():
289261
with tempfile.TemporaryDirectory() as tmpdir:
290262
exemplar = {"a": np.array([0], dtype=np.float64), "b": np.array([0], dtype=np.float64)}

0 commit comments

Comments (0)