Striveworks
diff --git a/src/valor_lite/common/ephemeral.py b/src/valor_lite/common/ephemeral.py
Lines changed: 43 additions & 36 deletions
diff --git a/src/valor_lite/common/persistent.py b/src/valor_lite/common/persistent.py
Lines changed: 138 additions & 78 deletions
@@ -2,24 +2,61 @@
 
 import numpy as np
 import pyarrow as pa
+import pyarrow.compute as pc
 
 
 class MemoryCache:
     def __init__(self, table: pa.Table):
         self._table = table
 
+    def count_tables(self) -> int:
+        """Count the number of tables in the cache."""
+        return 1
+
     def count_rows(self) -> int:
         """Count the number of rows in the cache."""
         return self._table.num_rows
 
 
+class MemoryCacheReader(MemoryCache):
+    def __init__(
+        self,
+        table: pa.Table,
+        batch_size: int,
+    ):
+        super().__init__(table)
+        self._schema = self._table.schema
+        self._batch_size = batch_size
+
+    @property
+    def schema(self) -> pa.Schema:
+        return self._schema
+
+    @property
+    def batch_size(self) -> int:
+        return self._batch_size
+
+    def iterate_tables(
+        self,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+    ):
+        """Iterate over tables within the cache."""
+        table = self._table
+        if filter is not None:
+            table = table.filter(filter)
+        if columns is not None:
+            table = table.select(columns)
+        yield table
+
+
 class MemoryCacheWriter(MemoryCache):
     def __init__(
         self,
         table: pa.Table,
         batch_size: int,
     ):
-        self._table = table
+        super().__init__(table)
         self._schema = table.schema
         self._batch_size = batch_size
 
@@ -47,13 +84,6 @@ def create(
             batch_size=batch_size,
         )
 
-    def delete(self):
-        """
-        Delete any existing cache data.
-        """
-        self._buffer = []
-        self._table = self._table.schema.empty_table()
-
     def write_rows(
         self,
         rows: list[dict[str, Any]],
@@ -146,31 +176,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit - ensures data is flushed."""
         self.flush()
 
-
-class MemoryCacheReader:
-    def __init__(
-        self,
-        cache: MemoryCacheWriter,
-    ):
-        self._cache = cache
-        self._schema = self._cache._schema
-
-    @classmethod
-    def load(cls, cache: MemoryCacheWriter):
-        """
-        Load cache from table.
-
-        Parameters
-        ----------
-        cache : MemoryCacheWriter
-            A cache writer containing the ephemeral cache.
-        """
-        return cls(cache=cache)
-
-    def iterate_tables(self):
-        """Iterate over tables within the cache."""
-        yield self._cache._table
-
-    def count_rows(self) -> int:
-        """Count the number of rows in the cache."""
-        return self._cache.count_rows()
+    def to_reader(self) -> MemoryCacheReader:
+        """Get cache reader."""
+        return MemoryCacheReader(
+            table=self._table, batch_size=self._batch_size
+        )
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pyarrow as pa
+import pyarrow.compute as pc
 import pyarrow.dataset as ds
 import pyarrow.parquet as pq
 
@@ -46,6 +47,18 @@ def _decode_schema(encoded_schema: str) -> pa.Schema:
         schema_bytes = base64.b64decode(encoded_schema)
         return pa.ipc.read_schema(pa.BufferReader(schema_bytes))
 
+    def count_rows(self) -> int:
+        """Count the number of rows in the cache."""
+        dataset = ds.dataset(
+            source=self._path,
+            format="parquet",
+        )
+        return dataset.count_rows()
+
+    def count_tables(self) -> int:
+        """Count the number of files in the cache."""
+        return len(self.get_dataset_files())
+
     def get_files(self) -> list[Path]:
         """
         Retrieve all files.
@@ -80,6 +93,106 @@ def get_dataset_files(self) -> list[Path]:
         ]
 
 
+class FileCacheReader(FileCache):
+    def __init__(
+        self,
+        path: str | Path,
+        schema: pa.Schema,
+        batch_size: int,
+        rows_per_file: int,
+        compression: str,
+    ):
+        super().__init__(path)
+        self._schema = schema
+        self._batch_size = batch_size
+        self._rows_per_file = rows_per_file
+        self._compression = compression
+
+    @property
+    def schema(self) -> pa.Schema:
+        return self._schema
+
+    @property
+    def batch_size(self) -> int:
+        return self._batch_size
+
+    @property
+    def rows_per_file(self) -> int:
+        return self._rows_per_file
+
+    @property
+    def compression(self) -> str:
+        return self._compression
+
+    @classmethod
+    def load(cls, path: str | Path | FileCache):
+        """
+        Load cache from disk.
+
+        Parameters
+        ----------
+        path : str | Path
+            Where the cache is stored.
+        """
+        if isinstance(path, FileCache):
+            path = path.path
+        path = Path(path)
+        if not path.exists():
+            raise FileNotFoundError(f"Directory does not exist: {path}")
+        elif not path.is_dir():
+            raise NotADirectoryError(
+                f"Path exists but is not a directory: {path}"
+            )
+
+        def _retrieve(config: dict, key: str):
+            if value := config.get(key, None):
+                return value
+            raise KeyError(
+                f"'{key}' is not defined within {cls._generate_config_path(path)}"
+            )
+
+        # read configuration file
+        cfg_path = cls._generate_config_path(path)
+        with open(cfg_path, "r") as f:
+            cfg = json.load(f)
+            batch_size = _retrieve(cfg, "batch_size")
+            rows_per_file = _retrieve(cfg, "rows_per_file")
+            compression = _retrieve(cfg, "compression")
+            schema = cls._decode_schema(_retrieve(cfg, "schema"))
+
+        return cls(
+            schema=schema,
+            path=path,
+            batch_size=batch_size,
+            rows_per_file=rows_per_file,
+            compression=compression,
+        )
+
+    def iterate_tables(
+        self,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+    ):
+        """Iterate over tables within the cache."""
+        dataset = ds.dataset(
+            source=self._path,
+            schema=self._schema,
+            format="parquet",
+        )
+        for fragment in dataset.get_fragments(filter=filter):
+            yield fragment.to_table(columns=columns)
+
+    def iterate_fragments(self):
+        """Iterate over fragments within the file-based cache."""
+        dataset = ds.dataset(
+            source=self._path,
+            schema=self._schema,
+            format="parquet",
+        )
+        for fragment in dataset.get_fragments():
+            yield fragment
+
+
 class FileCacheWriter(FileCache):
     def __init__(
         self,
@@ -89,7 +202,7 @@ def __init__(
         rows_per_file: int,
         compression: str,
     ):
-        self._path = Path(path)
+        super().__init__(path)
         self._schema = schema
         self._batch_size = batch_size
         self._rows_per_file = rows_per_file
@@ -108,6 +221,7 @@ def create(
         batch_size: int,
         rows_per_file: int,
         compression: str = "snappy",
+        delete_if_exists: bool = False,
     ):
         """
         Create a cache on disk.
@@ -124,7 +238,12 @@ def create(
             Target number of rows to store per file.
         compression : str, default="snappy"
             Compression method to use when storing on disk.
+        delete_if_exists : bool, default=False
+            Delete the cache if it already exists.
         """
+        path = Path(path)
+        if delete_if_exists and path.exists():
+            cls.delete(path)
         Path(path).mkdir(parents=True, exist_ok=False)
 
         # write configuration file
@@ -146,29 +265,33 @@ def create(
             compression=compression,
         )
 
-    def delete(self):
+    @classmethod
+    def delete(cls, path: str | Path):
         """
-        Delete the cache.
+        Delete a cache at path.
 
         Parameters
         ----------
         path : str | Path
             Where the cache is stored.
         """
-        if not self._path.exists():
+        path = Path(path)
+        if not path.exists():
             return
-        # clear buffer
-        self.flush()
-        # delete config file
-        cfg_path = self._generate_config_path(self._path)
-        if cfg_path.exists() and cfg_path.is_file():
-            cfg_path.unlink()
+
         # delete dataset files
-        for file in self.get_dataset_files():
+        reader = FileCacheReader.load(path)
+        for file in reader.get_dataset_files():
             if file.exists() and file.is_file() and file.suffix == ".parquet":
                 file.unlink()
+
+        # delete config file
+        cfg_path = cls._generate_config_path(path)
+        if cfg_path.exists() and cfg_path.is_file():
+            cfg_path.unlink()
+
         # delete empty cache directory
-        self._path.rmdir()
+        path.rmdir()
 
     def write_rows(
         self,
@@ -297,69 +420,6 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit - ensures data is flushed."""
         self.flush()
 
-
-class FileCacheReader(FileCache):
-    def __init__(
-        self,
-        path: str | Path,
-        schema: pa.Schema,
-    ):
-        self._schema = schema
-        self._path = Path(path)
-
-    @classmethod
-    def load(cls, path: str | Path | FileCache):
-        """
-        Load cache from disk.
-
-        Parameters
-        ----------
-        path : str | Path
-            Where the cache is stored.
-        """
-        if isinstance(path, FileCache):
-            path = path.path
-        path = Path(path)
-        if not path.exists():
-            raise FileNotFoundError(f"Directory does not exist: {path}")
-        elif not path.is_dir():
-            raise NotADirectoryError(
-                f"Path exists but is not a directory: {path}"
-            )
-
-        def _retrieve(config: dict, key: str):
-            if value := config.get(key, None):
-                return value
-            raise KeyError(
-                f"'{key}' is not defined within {cls._generate_config_path(path)}"
-            )
-
-        # read configuration file
-        cfg_path = cls._generate_config_path(path)
-        with open(cfg_path, "r") as f:
-            cfg = json.load(f)
-            schema = cls._decode_schema(_retrieve(cfg, "schema"))
-
-        return cls(
-            schema=schema,
-            path=path,
-        )
-
-    def count_rows(self) -> int:
-        """Count the number of rows in the cache."""
-        dataset = ds.dataset(
-            source=self._path,
-            schema=self._schema,
-            format="parquet",
-        )
-        return dataset.count_rows()
-
-    def iterate_tables(self):
-        """Iterate over tables within the cache."""
-        dataset = ds.dataset(
-            source=self._path,
-            schema=self._schema,
-            format="parquet",
-        )
-        for fragment in dataset.get_fragments():
-            yield fragment.to_table()
+    def to_reader(self) -> FileCacheReader:
+        """Get cache reader."""
+        return FileCacheReader.load(path=self.path)