checkout cache src

czaloom · czaloom · commit be71fefa7f4b · 2025-10-23T10:53:47.000-04:00
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -15,7 +15,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11"
       - name: install lite
         run: pip install -e .[dev]
         working-directory: ./src
@@ -41,7 +41,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11"
       - name: install lite
         run: pip install -e .[dev]
         working-directory: ./src
diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml
@@ -15,7 +15,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11"
       - name: Build wheel
         run: pip install build && python -m build
       - name: Publish to PyPI
diff --git a/.github/workflows/check-pre-commit.yml b/.github/workflows/check-pre-commit.yml
@@ -11,7 +11,7 @@ jobs:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
         with:
-          python-version: "3.10"
+          python-version: "3.11"
       - name: Install pre-commit
         run: pip install pre-commit && pre-commit install
       - name: Run pre-commit. This will fail if pre-commit hooks fail.
diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
@@ -18,7 +18,7 @@ jobs:
           fetch-depth: 0
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11"
       - name: install python dependencies
         run: pip install "src/[test, docs]"
       - name: deploy docs to gh
diff --git a/.github/workflows/tests-and-coverage.yml b/.github/workflows/tests-and-coverage.yml
@@ -18,7 +18,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11"
       - name: run classification tests and report coverage
         run: |
           pip install -e "./src/[dev]"
@@ -51,6 +51,7 @@ jobs:
         run: |
           pip install -e "./src/[dev]"
           COVERAGE_FILE=.coverage.segmentation python -m coverage run --include "src/valor_lite/*" -m pytest -v tests/semantic_segmentation/
+          COVERAGE_FILE=.coverage.common python -m coverage run --include "src/valor_lite/*" -m pytest -v tests/common/
           python -m coverage combine
           python -m coverage report -m
           python -m coverage json
diff --git a/.gitignore b/.gitignore
@@ -29,4 +29,5 @@ site/*
 *.pt
 *.png
 *.jpg
-*.parquet
+*.parquet
+*.valor
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -25,7 +25,7 @@ repos:
         # supported by your project here, or alternatively use
         # pre-commit's default_language_version, see
         # https://pre-commit.com/#top_level-default_language_version
-        language_version: python3.10
+        language_version: python3.11
         args: [--line-length=79]
 
   - repo: https://github.com/RobertCraigie/pyright-python
diff --git a/src/valor_lite/cache.py b/src/valor_lite/cache.py
@@ -0,0 +1,274 @@
+import glob
+import json
+import os
+from datetime import datetime
+from enum import StrEnum
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pyarrow.lib as pl
+import pyarrow.parquet as pq
+
+
+class DataType(StrEnum):
+    INTEGER = "int"
+    FLOAT = "float"
+    STRING = "string"
+    TIMESTAMP = "timestamp"
+
+    def to_py(self):
+        match self:
+            case DataType.INTEGER:
+                return int
+            case DataType.FLOAT:
+                return float
+            case DataType.STRING:
+                return str
+            case DataType.TIMESTAMP:
+                return datetime
+
+    def to_arrow(self):
+        match self:
+            case DataType.INTEGER:
+                return pa.int64()
+            case DataType.FLOAT:
+                return pa.float64()
+            case DataType.STRING:
+                return pa.string()
+            case DataType.TIMESTAMP:
+                return pa.timestamp("us")
+
+
+def convert_type_mapping_to_schema(
+    type_mapping: dict[str, DataType] | None
+) -> list[tuple[str, pl.DataType]]:
+    """
+    Convert type mapping to a pyarrow schema input.
+
+    Parameters
+    ----------
+    type_mapping : dict[str, DataType] | None
+        A map from string key to datatype. Treats input of `None` as empty mapping.
+
+    Returns
+    -------
+    list[tuple[str, pyarrow.lib.DataType]]
+        A list of field name, field type pairs that can be used as input to pyarrow.schema.
+    """
+    if not type_mapping:
+        return []
+    return [(k, DataType(v).to_arrow()) for k, v in type_mapping.items()]
+
+
+class CacheReader:
+    def __init__(self, where: str | Path):
+        self._dir = Path(where)
+        self._cfg = self._dir / ".cfg"
+
+        with open(self._cfg, "r") as f:
+            cfg = json.load(f)
+            self._batch_size = cfg.get("batch_size")
+            self._rows_per_file = cfg.get("rows_per_file")
+            self._compression = cfg.get("compression")
+
+    @property
+    def files(self) -> list[str]:
+        files = []
+        for entry in os.listdir(self._dir):
+            full_path = os.path.join(self._dir, entry)
+            if os.path.isfile(full_path):
+                files.append(full_path)
+        return files
+
+    @property
+    def num_files(self) -> int:
+        return len(self.files)
+
+    @property
+    def dataset_files(self) -> list[str]:
+        return glob.glob(f"{self._dir}/*.parquet")
+
+    @property
+    def num_dataset_files(self) -> int:
+        return len(self.dataset_files)
+
+    @property
+    def dataset(self):
+        return ds.dataset(
+            self._dir,
+            format="parquet",
+        )
+
+    @property
+    def schema(self):
+        return self.dataset.schema
+
+    @property
+    def batch_size(self) -> int:
+        return self._batch_size
+
+    @property
+    def rows_per_file(self) -> int:
+        return self._rows_per_file
+
+    @property
+    def compression(self) -> str:
+        return self._compression
+
+
+class CacheWriter(CacheReader):
+    def __init__(
+        self,
+        where: str | Path,
+        schema: pa.Schema,
+        batch_size: int = 1000,
+        rows_per_file: int = 10000,
+        compression: str = "snappy",
+        delete_if_exists: bool = True,
+    ):
+        self._dir = Path(where)
+        self._cfg = self._dir / ".cfg"
+
+        self._schema = schema
+        self._batch_size = batch_size
+        self._rows_per_file = rows_per_file
+        self._compression = compression
+
+        if delete_if_exists:
+            self.delete_files()
+        self._dir.mkdir(parents=True, exist_ok=True)
+
+        # Internal state
+        self._writer = None
+        self._buffer = []
+        self._count = 0
+
+        with open(self._cfg, "w") as f:
+            info = dict(
+                batch_size=batch_size,
+                rows_per_file=rows_per_file,
+                compression=compression,
+            )
+            json.dump(info, f, indent=2)
+
+    @property
+    def schema(self):
+        return self._schema
+
+    @property
+    def dataset(self):
+        return ds.dataset(
+            self._dir,
+            format="parquet",
+            schema=self.schema,
+        )
+
+    def delete_files(self):
+        for file in self.dataset_files:
+            Path(file).unlink()
+
+    @property
+    def next_index(self):
+        files = self.dataset_files
+        if not files:
+            return 0
+        return max([int(Path(f).stem) for f in files]) + 1
+
+    def write_rows(
+        self,
+        rows: list[dict[str, Any]],
+    ):
+        if not rows:
+            return
+        batch = pa.RecordBatch.from_pylist(rows, schema=self.schema)
+        self.write_batch(batch)
+
+    def write_batch(
+        self,
+        batch: pa.RecordBatch | dict[str, list | np.ndarray | pa.Array],
+    ):
+        if isinstance(batch, dict):
+            batch = pa.RecordBatch.from_pydict(batch)
+
+        size = batch.num_rows  # type: ignore - pyarrow typing
+        if self._buffer:
+            size += sum([b.num_rows for b in self._buffer])
+
+        # check size
+        if size < self.batch_size and self._count < self.rows_per_file:
+            self._buffer.append(batch)
+            return
+
+        if self._buffer:
+            self._buffer.append(batch)
+            combined_arrays = [
+                pa.concat_arrays([b.column(name) for b in self._buffer])
+                for name in self.schema.names
+            ]
+            batch = pa.RecordBatch.from_arrays(
+                combined_arrays, schema=self.schema
+            )
+            self._buffer = []
+
+        # write batch
+        writer = self._get_or_create_writer()
+        writer.write_batch(batch)
+
+        # check file size
+        self._count += size
+        if self._count >= self.rows_per_file:
+            self.flush()
+
+    def write_table(
+        self,
+        table: pa.Table,
+    ):
+        self.flush()
+        pq.write_table(table, where=self._next_filename())
+
+    def flush(self):
+        if self._buffer:
+            combined_arrays = [
+                pa.concat_arrays([b.column(name) for b in self._buffer])
+                for name in self.schema.names
+            ]
+            batch = pa.RecordBatch.from_arrays(
+                combined_arrays, schema=self.schema
+            )
+            self._buffer = []
+            writer = self._get_or_create_writer()
+            writer.write_batch(batch)
+        self._buffer = []
+        self._count = 0
+        self._close_writer()
+
+    def _next_filename(self) -> Path:
+        return self._dir / f"{self.next_index:06d}.parquet"
+
+    def _get_or_create_writer(self) -> pq.ParquetWriter:
+        """Open a new parquet file for writing."""
+        if self._writer is not None:
+            return self._writer
+        self._writer = pq.ParquetWriter(
+            where=self._next_filename(),
+            schema=self.schema,
+            compression=self.compression,
+        )
+        return self._writer
+
+    def _close_writer(self) -> None:
+        """Close the current parquet file."""
+        if self._writer is not None:
+            self._writer.close()
+            self._writer = None
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - ensures data is flushed."""
+        self.flush()
diff --git a/src/valor_lite/exceptions.py b/src/valor_lite/exceptions.py
@@ -5,6 +5,11 @@ def __init__(self):
         )
 
 
+class EmptyCacheError(Exception):
+    def __init__(self):
+        super().__init__("cache contains no data")
+
+
 class EmptyFilterError(Exception):
     def __init__(self, message: str):
         super().__init__(message)
diff --git a/tests/common/test_cache.py b/tests/common/test_cache.py
diff --git a/tests/common/test_exceptions.py b/tests/common/test_exceptions.py