33import json
44import os
55from pathlib import Path
6- from typing import Any
6+ from typing import Any , Generator
77
88import numpy as np
99import pyarrow as pa
@@ -157,8 +157,21 @@ def iterate_tables(
157157 self ,
158158 columns : list [str ] | None = None ,
159159 filter : pc .Expression | None = None ,
160- ):
161- """Iterate over tables within the cache."""
160+ ) -> Generator [pa .Table , None , None ]:
161+ """
162+ Iterate over tables within the cache.
163+
164+ Parameters
165+ ----------
166+ columns : list[str], optional
167+ Optionally select columns to be returned.
168+ filter : pyarrow.compute.Expression, optional
169+ Optionally filter table before returning.
170+
171+ Yields
172+ ------
173+ pa.Table
174+ """
162175 dataset = ds .dataset (
163176 source = self ._path ,
164177 schema = self ._schema ,
@@ -167,8 +180,62 @@ def iterate_tables(
167180 for fragment in dataset .get_fragments ():
168181 yield fragment .to_table (columns = columns , filter = filter )
169182
170- def iterate_fragments (self ):
171- """Iterate over fragments within the file-based cache."""
183+ def iterate_pairs (
184+ self ,
185+ columns : list [str ] | None = None ,
186+ ) -> Generator [np .ndarray , None , None ]:
187+ """
188+ Iterate over tables within the cache, yielding each table's columns stacked into one array.
189+
190+ Parameters
191+ ----------
192+ columns : list[str], optional
193+ Optionally select columns to be returned.
194+
195+ Yields
196+ ------
197+ np.ndarray
198+ """
199+ for tbl in self .iterate_tables (columns = columns ):
200+ yield np .column_stack (
201+ [tbl .column (i ).to_numpy () for i in range (tbl .num_columns )]
202+ )
203+
204+ def iterate_pairs_with_table (
205+ self ,
206+ columns : list [str ] | None = None ,
207+ ) -> Generator [tuple [pa .Table , np .ndarray ], None , None ]:
208+ """
209+ Iterate over chunks within the cache returning both tables and arrays.
210+
211+ Parameters
212+ ----------
213+ columns : list[str], optional
214+ Columns to stack into the array (default: all); tables keep every column.
215+
216+ Yields
217+ ------
218+ tuple[pa.Table, np.ndarray]
219+ """
220+ for tbl in self .iterate_tables ():
221+ columns = columns if columns else tbl .column_names
222+ yield tbl , np .column_stack (
223+ [tbl [col ].to_numpy () for col in columns ]
224+ )
225+
226+ def iterate_fragments (self ) -> Generator [ds .Fragment , None , None ]:
227+ """
228+ Iterate over fragments within the file-based cache.
229+
230+ Parameters
231+ ----------
232+ columns : list[str], optional
233+ Optionally select columns to be returned.
234+
235+ Yields
236+ ------
237+ ds.Fragment
238+ """
172239 dataset = ds .dataset (
173240 source = self ._path ,
174241 schema = self ._schema ,
0 commit comments