From 77b2033b98be0d7765777cbe5707dba86738071a Mon Sep 17 00:00:00 2001 From: Donghao Ren Date: Fri, 3 Apr 2026 10:27:02 -0700 Subject: [PATCH] feat: replace basic cache with an encrypted cache to prevent leaking information --- packages/backend/embedding_atlas/cache.py | 407 ++++++++++++++++++ packages/backend/embedding_atlas/cli.py | 9 +- .../backend/embedding_atlas/data_source.py | 75 ++-- .../backend/embedding_atlas/projection.py | 254 ++++++----- packages/backend/embedding_atlas/utils.py | 51 --- packages/backend/pyproject.toml | 1 + packages/backend/tests/test_cache.py | 189 ++++++++ packages/backend/uv.lock | 61 +++ 8 files changed, 853 insertions(+), 194 deletions(-) create mode 100644 packages/backend/embedding_atlas/cache.py create mode 100644 packages/backend/tests/test_cache.py diff --git a/packages/backend/embedding_atlas/cache.py b/packages/backend/embedding_atlas/cache.py new file mode 100644 index 00000000..00fe48b5 --- /dev/null +++ b/packages/backend/embedding_atlas/cache.py @@ -0,0 +1,407 @@ +# Copyright (c) 2025 Apple Inc. Licensed under MIT License. + +""" +Encrypted file-based caching module for embedding_atlas. + +This module provides secure caching functionality with automatic encryption and +decryption of cached values. It supports caching arbitrary Python objects by +serializing them to JSON and encrypting the data using AES-GCM encryption. + +Key features: +- Automatic encryption/decryption of cached data using AES-GCM +- Support for arbitrary Python objects (strings, dicts, lists, numpy arrays, etc.) +- Secure key derivation using HMAC and HKDF +- Atomic file operations to prevent corruption +- Configurable cache directory and serialization methods +- Two-level directory structure for efficient file organization + +The cache uses a combination of HMAC-SHA256 for cache key generation and +HKDF-SHA256 for encryption key derivation, ensuring that cache keys and +encryption keys are cryptographically secure and derived from the input data. + +Example: + >>> from embedding_atlas.cache import file_cache_get, file_cache_set + >>> + >>> # Cache a value + >>> file_cache_set("my_key", {"data": [1, 2, 3]}) + >>> + >>> # Retrieve the cached value + >>> cached_value = file_cache_get("my_key") + >>> print(cached_value) # {"data": [1, 2, 3]} +""" + +import base64 +import hashlib +import hmac +import json +import logging +import secrets +import struct +from functools import lru_cache +from io import BytesIO, TextIOWrapper +from pathlib import Path +from typing import IO, Any, Callable + +import numpy as np +from cryptography.hazmat.primitives import hashes +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes +from cryptography.hazmat.primitives.kdf.hkdf import HKDF +from platformdirs import user_cache_path + +logger = logging.getLogger("embedding-atlas") + + +def file_cache_get( + key: Any, + *, + scope: str | None = None, + cache_root: str | Path | None = None, + deserializer: Callable[[IO[bytes]], Any] | None = None, +) -> Any | None: + """ + Retrieve a cached value from the encrypted file cache. + + This function attempts to load and decrypt a previously cached value using + the provided key. The key is used to derive both the cache file location + and the encryption key for decryption. + + Args: + key: The cache key used to identify and decrypt the cached value. + Can be any hashable type (str, bytes, dict, list, numpy array, etc.). + cache_root: Optional custom cache directory path. 
If None, uses the
+            default user cache directory for embedding_atlas.
+        scope: Optional namespace to isolate cache entries. Different scopes
+            produce different cache keys even for the same key value.
+        deserializer: Optional custom function to deserialize the decrypted value
+            from a binary file descriptor. If None, uses JSON deserialization.
+
+    Returns:
+        The cached value if found and successfully decrypted; None if the
+        cache file doesn't exist or decryption/deserialization fails.
+    """
+    cache_root = _resolve_cache_root(cache_root)
+    if deserializer is None:
+        deserializer = default_deserializer
+
+    cache_key, encryption_key = _derive_cache_key_and_encryption_key(
+        key, scope, cache_root
+    )
+    cache_path = cache_root / cache_key[:2] / cache_key
+
+    if not cache_path.exists():
+        return None
+
+    try:
+        with open(cache_path, "rb") as file:
+            data = _decrypt_data(file.read(), key=encryption_key)
+
+        return deserializer(BytesIO(data))
+    except Exception:
+        logger.debug("Cache read failed for key %s", cache_key, exc_info=True)
+        return None
+
+
+def file_cache_set(
+    key: Any,
+    value: Any,
+    *,
+    scope: str | None = None,
+    cache_root: str | Path | None = None,
+    serializer: Callable[[Any, IO[bytes]], None] | None = None,
+):
+    """
+    Store a value in the encrypted file cache.
+
+    This function serializes, encrypts, and stores a value in the file cache
+    using the provided key. The key is used to derive both the cache file
+    location and the encryption key. The operation is atomic: the file is
+    written to a temporary location first, then renamed to prevent corruption.
+
+    Args:
+        key: The cache key used to identify and encrypt the cached value.
+            Can be any hashable type (str, bytes, dict, list, numpy array, etc.).
+        value: The value to cache. Must be serializable by the chosen serializer.
+        scope: Optional namespace to isolate cache entries. Different scopes
+            produce different cache keys even for the same key value.
+        cache_root: Optional custom cache directory path. If None, uses the
+            default user cache directory for embedding_atlas.
+        serializer: Optional custom function to serialize the value to a binary
+            file descriptor before encryption. If None, uses JSON serialization.
+
+    Raises:
+        OSError: If there are issues creating cache directories or writing files.
+        Exception: If serialization or encryption fails.
+    """
+    cache_root = _resolve_cache_root(cache_root)
+    if serializer is None:
+        serializer = default_serializer
+
+    cache_key, encryption_key = _derive_cache_key_and_encryption_key(
+        key, scope, cache_root
+    )
+
+    cache_path = cache_root / cache_key[:2] / cache_key
+
+    # Generate a random temporary filename to avoid conflicts
+    random_suffix = secrets.token_hex(8)
+    cache_path_tmp = cache_root / cache_key[:2] / f"{cache_key}.tmp-{random_suffix}"
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+
+    buffer = BytesIO()
+    serializer(value, buffer)
+    encrypted_data = _encrypt_data(buffer.getvalue(), key=encryption_key)
+
+    with open(cache_path_tmp, "wb") as file:
+        file.write(encrypted_data)
+
+    cache_path_tmp.rename(cache_path)
+
+
+def file_cache_value(
+    key: Any,
+    value_func: Callable[[], Any],
+    *,
+    scope: str | None = None,
+    cache_root: str | Path | None = None,
+    serializer: Callable[[Any, IO[bytes]], None] | None = None,
+    deserializer: Callable[[IO[bytes]], Any] | None = None,
+    callback: Callable[[Path], None] | None = None,
+):
+    """
+    Retrieve a cached value or compute and cache it if not present.
+
+    This is a read-through cache helper: if a cached value exists for the given
+    key, it is decrypted and returned. Otherwise, ``value_func`` is called to
+    compute the value, which is then encrypted and stored before being returned.
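+
+    Example (illustrative; ``compute`` stands in for any expensive function):
+
+        >>> calls = []
+        >>> def compute():
+        ...     calls.append(1)
+        ...     return [1, 2, 3]
+        >>> file_cache_value("demo", compute)  # miss: computes, encrypts, stores
+        [1, 2, 3]
+        >>> file_cache_value("demo", compute)  # hit: decrypts the stored copy
+        [1, 2, 3]
+        >>> calls
+        [1]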
+ + Args: + key: The cache key used to locate and encrypt/decrypt the cached value. + Can be any hashable type (str, bytes, dict, list, numpy array, etc.). + value_func: A callable that takes no arguments and returns the value to + cache when a cache miss occurs. + scope: Optional namespace to isolate cache entries. Different scopes + produce different cache keys even for the same key value. + cache_root: Optional custom cache directory path. If None, uses the + default user cache directory for embedding_atlas. + serializer: Optional custom function to serialize the value to a binary + file descriptor before encryption. If None, uses JSON serialization. + deserializer: Optional custom function to deserialize the decrypted value + from a binary file descriptor. If None, uses JSON deserialization. + callback: Optional function called with the cache file path on a cache hit. + + Returns: + The cached value on a hit, or the freshly computed value from + ``value_func`` on a miss. + """ + cache_root = _resolve_cache_root(cache_root) + if serializer is None: + serializer = default_serializer + if deserializer is None: + deserializer = default_deserializer + + cache_key, encryption_key = _derive_cache_key_and_encryption_key( + key, scope, cache_root + ) + + cache_path = cache_root / cache_key[:2] / cache_key + + if cache_path.exists(): + try: + with open(cache_path, "rb") as file: + data = _decrypt_data(file.read(), key=encryption_key) + + result = deserializer(BytesIO(data)) + + if callback is not None: + callback(cache_path) + + return result + except Exception: + # If we can't read the file, move on. + logger.debug("Cache read failed for key %s", cache_key, exc_info=True) + + value = value_func() + + try: + # Generate a random temporary filename to avoid conflicts + random_suffix = secrets.token_hex(8) + cache_path_tmp = cache_root / cache_key[:2] / f"{cache_key}.tmp-{random_suffix}" + cache_path.parent.mkdir(parents=True, exist_ok=True) + + buffer = BytesIO() + serializer(value, buffer) + encrypted_data = _encrypt_data(buffer.getvalue(), key=encryption_key) + + with open(cache_path_tmp, "wb") as file: + file.write(encrypted_data) + + cache_path_tmp.rename(cache_path) + except Exception: + logger.debug("Cache write failed for key %s", cache_key, exc_info=True) + + return value + + +def _resolve_cache_root(cache_root: str | Path | None = None) -> Path: + if cache_root is None: + return (user_cache_path("embedding_atlas") / "cache").resolve() + else: + return Path(cache_root).resolve() + + +@lru_cache(maxsize=None) +def _get_constants(cache_root: Path) -> dict[str, bytes]: + cache_root.mkdir(parents=True, exist_ok=True) + constants_path = cache_root / "cache_constants.json" + + if constants_path.exists(): + with open(constants_path, "r") as f: + data = json.load(f) + return { + "HMAC_KEY": base64.b64decode(data["HMAC_KEY"]), + "HKDF_SALT": base64.b64decode(data["HKDF_SALT"]), + } + + hmac_key = secrets.token_bytes(32) + hkdf_salt = secrets.token_bytes(32) + + data = json.dumps( + { + "HMAC_KEY": base64.b64encode(hmac_key).decode("ascii"), + "HKDF_SALT": base64.b64encode(hkdf_salt).decode("ascii"), + }, + indent=2, + ).encode("utf-8") + + try: + # Use exclusive create ("xb") to avoid TOCTOU races. + # open(..., "xb") fails with FileExistsError if the file already exists, + # ensuring only one process wins when multiple start simultaneously. + with open(constants_path, "xb") as f: + f.write(data) + except FileExistsError: + # Another process created the file first — use their constants. 
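+        # Either set of constants is valid; correctness only requires that
+        # every process sharing this cache_root agrees on one set.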
+        with open(constants_path, "r") as f:
+            existing = json.load(f)
+        return {
+            "HMAC_KEY": base64.b64decode(existing["HMAC_KEY"]),
+            "HKDF_SALT": base64.b64decode(existing["HKDF_SALT"]),
+        }
+
+    return {"HMAC_KEY": hmac_key, "HKDF_SALT": hkdf_salt}
+
+
+def _derive_cache_key_and_encryption_key(
+    value: Any, scope: str | None, cache_root: Path
+) -> tuple[str, bytes]:
+    # First get sha256 of the value
+    h = hashlib.sha256()
+    _update_hash_with_value(h.update, scope, value)
+    sha256 = h.digest()
+
+    consts = _get_constants(cache_root)
+    HMAC_KEY = consts["HMAC_KEY"]
+    HKDF_SALT = consts["HKDF_SALT"]
+    HKDF_INFO = b"cache-encryption-key"
+
+    # HMAC for the cache key
+    hasher = hmac.new(HMAC_KEY, digestmod=hashlib.sha256)
+    hasher.update(sha256)
+    cache_key = hasher.hexdigest()
+
+    # HKDF for the encryption key
+    hkdf = HKDF(
+        algorithm=hashes.SHA256(),
+        length=32,
+        salt=HKDF_SALT,
+        info=HKDF_INFO,
+    )
+    encryption_key = hkdf.derive(sha256)
+
+    return (cache_key, encryption_key)
+
+
+def _encrypt_data(data: bytes, key: bytes) -> bytes:
+    # Generate a random 96-bit (12 byte) nonce for GCM
+    nonce = secrets.token_bytes(12)
+
+    # Create cipher
+    cipher = Cipher(algorithms.AES(key), modes.GCM(nonce))
+    encryptor = cipher.encryptor()
+
+    # Encrypt the data
+    ciphertext = encryptor.update(data) + encryptor.finalize()
+
+    # Return nonce + tag + ciphertext
+    return nonce + encryptor.tag + ciphertext
+
+
+def _decrypt_data(encrypted_data: bytes, key: bytes) -> bytes:
+    # Extract nonce (12 bytes), tag (16 bytes), and ciphertext
+    nonce = encrypted_data[:12]
+    tag = encrypted_data[12:28]
+    ciphertext = encrypted_data[28:]
+
+    # Create cipher
+    cipher = Cipher(algorithms.AES(key), modes.GCM(nonce, tag))
+    decryptor = cipher.decryptor()
+
+    # Decrypt the data (raises InvalidTag if the key or data is wrong)
+    return decryptor.update(ciphertext) + decryptor.finalize()
+
+
+def _update_hash_with_value(update_func: Callable[[bytes], None], *value: Any):
+    def preamble(kind: bytes, length: int):
+        update_func(kind + b":" + struct.pack("<Q", length) + b":")
+
+    def update(item: Any):
+        if isinstance(item, bytes):
+            preamble(b"bytes", len(item))
+            update_func(item)
+        elif isinstance(item, str):
+            data = item.encode("utf-8")
+            preamble(b"str", len(data))
+            update_func(data)
+        elif isinstance(item, np.ndarray):
+            data = item.tobytes()
+            preamble(b"np.ndarray", len(data))
+            update_func(data)
+        elif isinstance(item, list):
+            # Length-prefix, then hash elements recursively so nested
+            # structures are unambiguous.
+            preamble(b"list", len(item))
+            for element in item:
+                update(element)
+        elif isinstance(item, dict):
+            # Sort keys so the hash is independent of insertion order.
+            preamble(b"dict", len(item))
+            for key in sorted(item.keys()):
+                update(key)
+                update(item[key])
+        else:
+            data = json.dumps(item, sort_keys=True).encode("utf-8")
+            preamble(b"json", len(data))
+            update_func(data)
+
+    for item in value:
+        update(item)
+
+
+def sha256_hexdigest(value: Any, *, scope: str | None = None) -> str:
+    """Return a hex-encoded SHA-256 digest of an arbitrary Python value."""
+    h = hashlib.sha256()
+    _update_hash_with_value(h.update, scope, value)
+    return h.hexdigest()
+
+
+def default_serializer(value: Any, fd: IO[bytes]) -> None:
+    text_fd = TextIOWrapper(fd, encoding="utf-8")
+    json.dump(value, text_fd)
+    text_fd.detach()
+
+
+def default_deserializer(fd: IO[bytes]) -> Any:
+    text_fd = TextIOWrapper(fd, encoding="utf-8")
+    result = json.load(text_fd)
+    text_fd.detach()
+    return result
diff --git a/packages/backend/embedding_atlas/cli.py b/packages/backend/embedding_atlas/cli.py
index 19720c91..08243d80 100644
--- a/packages/backend/embedding_atlas/cli.py
+++ b/packages/backend/embedding_atlas/cli.py
@@ -16,11 +16,11 @@
 import pandas as pd
 import uvicorn
 
+from .cache import sha256_hexdigest
 from .data_source import DataSource
 from .options import make_embedding_atlas_props
 from .server import make_server
 from .utils import (
-    Hasher,
     apply_logging_config,
     load_huggingface_data,
     load_pandas_data,
@@ -488,12 +488,7 @@ def main(
         "props": props,
     }
 
-    hasher = Hasher()
-    hasher.update(__version__)
-    hasher.update(inputs)
-    hasher.update(metadata)
-    identifier = hasher.hexdigest()
-
+    identifier = sha256_hexdigest([__version__, inputs, metadata], scope="DataSource")
     dataset = DataSource(identifier, df, metadata)
 
     if static is None:
diff --git a/packages/backend/embedding_atlas/data_source.py b/packages/backend/embedding_atlas/data_source.py
index 3894bcd5..b67f20b4 100644
--- a/packages/backend/embedding_atlas/data_source.py
+++ b/packages/backend/embedding_atlas/data_source.py
@@ -6,10 +6,12 @@
 import shutil
 import zipfile
 from io import BytesIO
+from typing import Any
 
 import pandas as pd
 
-from .utils import cache_path, to_parquet_bytes
+from 
.cache import file_cache_get, file_cache_set +from .utils import to_parquet_bytes def _deep_merge(base: dict, overrides: dict) -> dict: @@ -32,20 +34,45 @@ def __init__( self.identifier = identifier self.dataset = dataset self.metadata = metadata - self.cache_path = cache_path("cache", self.identifier) + self._cache_index: set[str] = set(self._cache_index_load()) + + def _cache_index_key(self): + return [self.identifier, "__index__"] + + def _cache_index_load(self) -> list[str]: + index = file_cache_get(self._cache_index_key(), scope="DataSource") + if index is None: + return [] + return index + + def _cache_index_save(self): + file_cache_set( + self._cache_index_key(), sorted(self._cache_index), scope="DataSource" + ) + + def _cache_index_add(self, name: str): + if name not in self._cache_index: + self._cache_index.add(name) + # Re-read from disk and merge to avoid losing entries from other processes + persisted = set(self._cache_index_load()) + merged = self._cache_index | persisted + file_cache_set(self._cache_index_key(), sorted(merged), scope="DataSource") def cache_set(self, name: str, data): - path = self.cache_path / name - with open(path, "w") as f: - json.dump(data, f) + file_cache_set([self.identifier, name], data, scope="DataSource") + self._cache_index_add(name) def cache_get(self, name: str): - path = self.cache_path / name - if path.exists(): - with open(path, "r") as f: - return json.load(f) - else: - return None + return file_cache_get([self.identifier, name], scope="DataSource") + + def cache_items(self) -> dict[str, Any]: + """Return all cached entries as a dict of {name: value}.""" + result = {} + for name in self._cache_index: + value = self.cache_get(name) + if value is not None: + result[name] = value + return result def _build_metadata(self, metadata_overrides: dict | None = None) -> dict: metadata = self.metadata | { @@ -68,13 +95,11 @@ def make_archive(self, static_path: str, metadata_overrides: dict | None = None) for fn in files: p = os.path.relpath(os.path.join(root, fn), static_path) zip.write(os.path.join(root, fn), p) - for root, _, files in os.walk(self.cache_path): - for fn in files: - p = os.path.join( - "data/cache", - os.path.relpath(os.path.join(root, fn), str(self.cache_path)), - ) - zip.write(os.path.join(root, fn), p) + for name, value in self.cache_items().items(): + zip.writestr( + f"data/cache/{name}", + json.dumps(value), + ) return io.getvalue() def export_to_folder( @@ -103,11 +128,9 @@ def export_to_folder( dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) - # Copy cache files - for root, _, files in os.walk(self.cache_path): - for fn in files: - src = os.path.join(root, fn) - rel = os.path.relpath(src, str(self.cache_path)) - dst = data_dir / "cache" / rel - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dst) + # Write cache files + cache_dir = data_dir / "cache" + for name, value in self.cache_items().items(): + cache_file = cache_dir / name + cache_file.parent.mkdir(parents=True, exist_ok=True) + cache_file.write_text(json.dumps(value)) diff --git a/packages/backend/embedding_atlas/projection.py b/packages/backend/embedding_atlas/projection.py index bc358657..751f6e9a 100644 --- a/packages/backend/embedding_atlas/projection.py +++ b/packages/backend/embedding_atlas/projection.py @@ -1,13 +1,15 @@ # Copyright (c) 2025 Apple Inc. Licensed under MIT License. 
from dataclasses import dataclass +from io import BytesIO from pathlib import Path -from typing import Callable, Literal +from typing import IO, Callable, Literal import numpy as np import pandas as pd -from .utils import Hasher, cache_path, logger +from .cache import file_cache_value +from .utils import logger @dataclass @@ -26,6 +28,24 @@ def exists(path: Path): and path.with_suffix(".knn_distances.npy").exists() ) + @staticmethod + def serialize(value: "Projection", fd: IO[bytes]) -> None: + np.savez( + fd, + projection=value.projection, + knn_indices=value.knn_indices, + knn_distances=value.knn_distances, + ) + + @staticmethod + def deserialize(fd: IO[bytes]) -> "Projection": + d = np.load(fd, allow_pickle=False) + return Projection( + projection=d["projection"], + knn_indices=d["knn_indices"], + knn_distances=d["knn_distances"], + ) + @staticmethod def save(path: Path, value: "Projection"): np.save( @@ -64,8 +84,12 @@ def load(path: Path) -> "Projection": def _run_umap( hidden_vectors: np.ndarray, - umap_args: dict = {}, + *, + umap_args: dict | None = None, ) -> Projection: + if umap_args is None: + umap_args = {} + logger.info("Running UMAP for input with shape %s...", str(hidden_vectors.shape)) # type: ignore import umap @@ -173,43 +197,18 @@ async def run_async_coro() -> list: def _projection_for_texts( texts: list[str], + *, model: str | None = None, batch_size: int | None = None, text_projector_args: dict | None = None, text_projector: TextProjectorCallback | None = None, - umap_args: dict = {}, + umap_args: dict | None = None, ) -> Projection: if model is None: model = "all-MiniLM-L6-v2" if text_projector is None: text_projector = _project_text_with_sentence_transformers - # Some arguments may contain sensitive info (e.g., API keys) or do not invalidate the cache, so we exclude them - excluded_text_projector_args = {"api_key", "api_base", "sync"} - hashed_text_projector_args = { - k: v - for k, v in (text_projector_args or {}).items() - if k not in excluded_text_projector_args - } - hasher = Hasher() - hasher.update( - { - "version": 2, - "texts": texts, - "model": model, - "batch_size": batch_size, - "text_projector_args": hashed_text_projector_args, - "text_projector": text_projector.__name__, - "umap_args": umap_args, - } - ) - digest = hasher.hexdigest() - cpath = cache_path("projections") / digest - - if Projection.exists(cpath): - logger.info("Using cached projection from %s", str(cpath)) - return Projection.load(cpath) - # Set default batch size if not provided if batch_size is None: batch_size = 32 @@ -218,64 +217,56 @@ def _projection_for_texts( batch_size, ) - logger.info( - "Running embedding for %d texts with batch size %d using %s...", - len(texts), - batch_size, - text_projector.__name__, - ) - hidden_vectors = text_projector(texts, batch_size, model, text_projector_args) + def compute(): + logger.info( + "Running embedding for %d texts with batch size %d using %s...", + len(texts), + batch_size, + text_projector.__name__, + ) + hidden_vectors = text_projector(texts, batch_size, model, text_projector_args) - result = _run_umap(hidden_vectors, umap_args) - Projection.save(cpath, result) - return result + return _run_umap(hidden_vectors, umap_args=umap_args) + + # Some arguments may contain sensitive info (e.g., API keys) or do not invalidate the cache, so we exclude them + excluded_text_projector_args = {"api_key", "api_base", "sync"} + hashed_text_projector_args = { + k: v + for k, v in (text_projector_args or {}).items() + if k not in excluded_text_projector_args 
+ } + cache_key = { + "version": 2, + "texts": texts, + "model": model, + "batch_size": batch_size, + "text_projector_args": hashed_text_projector_args, + "text_projector": text_projector.__name__, + "umap_args": umap_args, + } + + return file_cache_value( + cache_key, + compute, + scope="projection_for_texts", + serializer=Projection.serialize, + deserializer=Projection.deserialize, + callback=lambda cache_path: logger.info( + "Using cached projection from " + str(cache_path) + ), + ) def _projection_for_images( images: list, + *, model: str | None = None, trust_remote_code: bool = False, batch_size: int | None = None, - umap_args: dict = {}, + umap_args: dict | None = None, ) -> Projection: if model is None: model = "google/vit-base-patch16-384" - hasher = Hasher() - hasher.update( - { - "version": 1, - "images": images, - "model": model, - "batch_size": batch_size, - "umap_args": umap_args, - } - ) - digest = hasher.hexdigest() - cpath = cache_path("projections") / (digest + ".npy") - - if Projection.exists(cpath): - logger.info("Using cached projection from %s", str(cpath)) - return Projection.load(cpath) - - # Import on demand. - from io import BytesIO - - import torch - import tqdm - from PIL import Image - from transformers import pipeline - - def load_image(value): - if isinstance(value, bytes): - return Image.open(BytesIO(value)).convert("RGB") - elif isinstance(value, dict) and "bytes" in value: - return Image.open(BytesIO(value["bytes"])).convert("RGB") - else: - raise ValueError("invalid image value") - - logger.info("Loading model %s...", model) - - pipe = pipeline("image-feature-extraction", model=model, device_map="auto") # Set default batch size if not provided if batch_size is None: @@ -285,34 +276,77 @@ def load_image(value): batch_size, ) - logger.info( - "Running embedding for %d images with batch size %d...", len(images), batch_size - ) - tensors = [] - - current_batch = [] - - @torch.no_grad() - def process_batch(): - rs: torch.Tensor = pipe(current_batch, return_tensors=True) # type: ignore - current_batch.clear() - for r in rs: - if len(r.shape) == 3: - r = r.mean(1) - assert len(r.shape) == 2 - tensors.append(r) - - for image in tqdm.tqdm(images, smoothing=0.1): - current_batch.append(load_image(image)) - if len(current_batch) >= batch_size: - process_batch() - process_batch() + cache_key = { + "version": 1, + "images": images, + "model": model, + "batch_size": batch_size, + "umap_args": umap_args, + } - hidden_vectors = torch.concat(tensors).to(torch.float32).cpu().numpy() + def compute(): + # Import on demand. 
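+        # torch/transformers are heavy dependencies; importing them lazily
+        # keeps startup fast when image projection is never used.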
+ import torch + import tqdm + from PIL import Image + from transformers import pipeline + + def load_image(value): + if isinstance(value, bytes): + return Image.open(BytesIO(value)).convert("RGB") + elif isinstance(value, dict) and "bytes" in value: + return Image.open(BytesIO(value["bytes"])).convert("RGB") + else: + raise ValueError("invalid image value") + + logger.info("Loading model %s...", model) + + pipe = pipeline( + "image-feature-extraction", + model=model, + device_map="auto", + trust_remote_code=trust_remote_code, + ) - result = _run_umap(hidden_vectors, umap_args) - Projection.save(cpath, result) - return result + logger.info( + "Running embedding for %d images with batch size %d...", + len(images), + batch_size, + ) + tensors = [] + + current_batch = [] + + @torch.no_grad() + def process_batch(): + rs: torch.Tensor = pipe(current_batch, return_tensors=True) # type: ignore + current_batch.clear() + for r in rs: + if len(r.shape) == 3: + r = r.mean(1) + assert len(r.shape) == 2 + tensors.append(r) + + for image in tqdm.tqdm(images, smoothing=0.1): + current_batch.append(load_image(image)) + if len(current_batch) >= batch_size: + process_batch() + process_batch() + + hidden_vectors = torch.concat(tensors).to(torch.float32).cpu().numpy() + result = _run_umap(hidden_vectors, umap_args=umap_args) + return result + + return file_cache_value( + cache_key, + compute, + scope="projection_for_images", + serializer=Projection.serialize, + deserializer=Projection.deserialize, + callback=lambda cache_path: logger.info( + "Using cached projection from " + str(cache_path) + ), + ) def _find_text_projector_callback(name: str) -> TextProjectorCallback: @@ -342,7 +376,7 @@ def compute_text_projection( "sentence_transformers", "litellm", ] = "sentence_transformers", - umap_args: dict = {}, + umap_args: dict | None = None, **kwargs, ): """ @@ -411,7 +445,7 @@ def compute_text_projection( data_frame[x] = proj.projection[:, 0] data_frame[y] = proj.projection[:, 1] if neighbors is not None: - data_frame[neighbors] = [ + data_frame[neighbors] = [ # type: ignore {"distances": b, "ids": a} # ID is always the same as the row index. for a, b in zip(proj.knn_indices, proj.knn_distances) ] @@ -424,7 +458,7 @@ def compute_vector_projection( x: str = "projection_x", y: str = "projection_y", neighbors: str | None = "neighbors", - umap_args: dict = {}, + umap_args: dict | None = None, ): """ Generate 2D projections from pre-existing vector embeddings using UMAP. @@ -464,13 +498,13 @@ def compute_vector_projection( hidden_vectors = np.stack(vector_list) # Run UMAP on the pre-existing vectors - proj = _run_umap(hidden_vectors, umap_args) + proj = _run_umap(hidden_vectors, umap_args=umap_args) # Add projection results to dataframe data_frame[x] = proj.projection[:, 0] data_frame[y] = proj.projection[:, 1] if neighbors is not None: - data_frame[neighbors] = [ + data_frame[neighbors] = [ # type: ignore {"distances": b, "ids": a} # ID is always the same as the row index. for a, b in zip(proj.knn_indices, proj.knn_distances) ] @@ -486,7 +520,7 @@ def compute_image_projection( model: str | None = None, trust_remote_code: bool = False, batch_size: int | None = None, - umap_args: dict = {}, + umap_args: dict | None = None, ): """ Compute image embeddings and generate 2D projections using UMAP. 
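For reference, a minimal sketch of how the read-through cache above wraps a projection computation; the key dict and the `compute` body are illustrative stand-ins for what `_projection_for_texts` actually assembles:

    import numpy as np

    from embedding_atlas.cache import file_cache_value
    from embedding_atlas.projection import Projection

    def compute() -> Projection:
        # Stand-in for the embed-then-UMAP pipeline.
        return Projection(
            projection=np.zeros((3, 2), dtype=np.float32),
            knn_indices=np.zeros((3, 2), dtype=np.int32),
            knn_distances=np.zeros((3, 2), dtype=np.float32),
        )

    proj = file_cache_value(
        {"version": 2, "texts": ["a", "b", "c"], "model": "all-MiniLM-L6-v2"},
        compute,
        scope="projection_for_texts",
        serializer=Projection.serialize,      # arrays via np.savez, no pickle
        deserializer=Projection.deserialize,  # np.load(..., allow_pickle=False)
    )

The first call computes and stores an encrypted .npz payload; later calls with the same key and scope decrypt and return it.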
@@ -524,7 +558,7 @@ def compute_image_projection( data_frame[x] = proj.projection[:, 0] data_frame[y] = proj.projection[:, 1] if neighbors is not None: - data_frame[neighbors] = [ + data_frame[neighbors] = [ # type: ignore {"distances": b, "ids": a} # ID is always the same as the row index. for a, b in zip(proj.knn_indices, proj.knn_distances) ] diff --git a/packages/backend/embedding_atlas/utils.py b/packages/backend/embedding_atlas/utils.py index 3cf7d279..f7e83c0a 100644 --- a/packages/backend/embedding_atlas/utils.py +++ b/packages/backend/embedding_atlas/utils.py @@ -1,17 +1,13 @@ # Copyright (c) 2025 Apple Inc. Licensed under MIT License. -import hashlib -import json import logging from io import BytesIO from pathlib import Path from typing import Any import inquirer -import numpy as np import pandas as pd import pyarrow as pa -from platformdirs import user_cache_path logger = logging.getLogger("embedding-atlas") @@ -97,15 +93,6 @@ def actually_close(self): return result -def cache_path(*subfolders: str, mkdir=True) -> Path: - p = user_cache_path("embedding_atlas") - for f in subfolders: - p = p / f - if mkdir: - p.mkdir(parents=True, exist_ok=True) - return p - - def apply_logging_config(): logging.basicConfig( level=logging.INFO, @@ -113,41 +100,3 @@ def apply_logging_config(): ) logging.getLogger("httpx").setLevel(logging.WARNING) - - -class Hasher: - def __init__(self): - self.hash = hashlib.sha256() - self.counter = 0 - - def _emit(self, type: bytes, data: bytes): - self.hash.update(type + b"{") - self.hash.update(data) - self.hash.update(b"}") - - def _emit_value(self, value): - if isinstance(value, bytes): - self._emit(b"bytes", value) - elif isinstance(value, str): - self._emit(b"str", value.encode("utf-8")) - elif isinstance(value, np.ndarray): - self._emit(b"np.ndarray", value.tobytes()) - elif isinstance(value, list): - self.hash.update(b"list{") - for item in value: - self._emit_value(item) - self.hash.update(b"}") - elif isinstance(value, dict): - self.hash.update(b"dict{") - for key, value in value.items(): - self._emit_value(key) - self._emit_value(value) - self.hash.update(b"}") - else: - self._emit(b"json", json.dumps(value, sort_keys=True).encode("utf-8")) - - def update(self, value): - self._emit_value(value) - - def hexdigest(self): - return self.hash.hexdigest() diff --git a/packages/backend/pyproject.toml b/packages/backend/pyproject.toml index 2d9b0a20..687dc4c9 100644 --- a/packages/backend/pyproject.toml +++ b/packages/backend/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "tqdm >= 4.60.0", "litellm >= 1.70.0, != 1.82.7, != 1.82.8", # Critical supply chain attack: https://futuresearch.ai/blog/litellm-pypi-supply-chain-attack/ "websockets >= 15.0.1", + "cryptography >= 35.0.0", ] [project.scripts] diff --git a/packages/backend/tests/test_cache.py b/packages/backend/tests/test_cache.py new file mode 100644 index 00000000..d9fc872c --- /dev/null +++ b/packages/backend/tests/test_cache.py @@ -0,0 +1,189 @@ +import numpy as np +import pytest +from embedding_atlas.cache import ( + file_cache_get, + file_cache_set, + file_cache_value, + sha256_hexdigest, +) + +# --------------------------------------------------------------------------- +# sha256_hexdigest +# --------------------------------------------------------------------------- + + +def test_sha256_hexdigest_deterministic(): + assert sha256_hexdigest("hello") == sha256_hexdigest("hello") + + +def test_sha256_hexdigest_different_values(): + assert sha256_hexdigest("a") != sha256_hexdigest("b") + + +def 
test_sha256_hexdigest_with_scope(): + h1 = sha256_hexdigest("data", scope="scope1") + h2 = sha256_hexdigest("data", scope="scope2") + h3 = sha256_hexdigest("data", scope=None) + assert h1 != h2 + assert h1 != h3 + + +def test_sha256_hexdigest_bytes(): + h = sha256_hexdigest(b"raw bytes") + assert isinstance(h, str) and len(h) == 64 + + +def test_sha256_hexdigest_numpy_array(): + arr = np.array([1.0, 2.0, 3.0]) + h1 = sha256_hexdigest(arr) + h2 = sha256_hexdigest(np.array([1.0, 2.0, 3.0])) + h3 = sha256_hexdigest(np.array([1.0, 2.0, 4.0])) + assert h1 == h2 + assert h1 != h3 + + +def test_sha256_hexdigest_list(): + assert sha256_hexdigest([1, 2, 3]) == sha256_hexdigest([1, 2, 3]) + assert sha256_hexdigest([1, 2, 3]) != sha256_hexdigest([1, 2, 4]) + + +def test_sha256_hexdigest_dict(): + # Dict hashing should be order-independent (keys are sorted) + assert sha256_hexdigest({"a": 1, "b": 2}) == sha256_hexdigest({"b": 2, "a": 1}) + + +def test_sha256_hexdigest_none(): + h = sha256_hexdigest(None) + assert isinstance(h, str) and len(h) == 64 + assert h != sha256_hexdigest("") + + +def test_sha256_hexdigest_nested(): + val = {"key": [1, "two", None, {"inner": True}]} + assert sha256_hexdigest(val) == sha256_hexdigest(val) + + +# --------------------------------------------------------------------------- +# file_cache_get / file_cache_set / file_cache_value (integration tests) +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def cache_dir(tmp_path): + """Provide a fresh temporary cache directory and clear the constants LRU cache.""" + import embedding_atlas.cache as cache_module + + cache_module._get_constants.cache_clear() + yield tmp_path / "cache" + cache_module._get_constants.cache_clear() + + +def test_file_cache_miss(cache_dir): + assert file_cache_get("nonexistent", cache_root=cache_dir) is None + + +def test_file_cache_set_and_get(cache_dir): + file_cache_set("key1", {"value": 42}, cache_root=cache_dir) + result = file_cache_get("key1", cache_root=cache_dir) + assert result == {"value": 42} + + +def test_file_cache_set_overwrite(cache_dir): + file_cache_set("key", "old", cache_root=cache_dir) + file_cache_set("key", "new", cache_root=cache_dir) + assert file_cache_get("key", cache_root=cache_dir) == "new" + + +def test_file_cache_different_keys(cache_dir): + file_cache_set("a", 1, cache_root=cache_dir) + file_cache_set("b", 2, cache_root=cache_dir) + assert file_cache_get("a", cache_root=cache_dir) == 1 + assert file_cache_get("b", cache_root=cache_dir) == 2 + + +def test_file_cache_with_scope(cache_dir): + file_cache_set("key", "val1", scope="s1", cache_root=cache_dir) + file_cache_set("key", "val2", scope="s2", cache_root=cache_dir) + assert file_cache_get("key", scope="s1", cache_root=cache_dir) == "val1" + assert file_cache_get("key", scope="s2", cache_root=cache_dir) == "val2" + # Wrong scope returns None + assert file_cache_get("key", scope="s3", cache_root=cache_dir) is None + + +def test_file_cache_value_miss(cache_dir): + calls = [] + + def compute(): + calls.append(1) + return "computed" + + result = file_cache_value("k", compute, cache_root=cache_dir) + assert result == "computed" + assert len(calls) == 1 + + +def test_file_cache_value_hit(cache_dir): + file_cache_set("k", "cached", cache_root=cache_dir) + calls = [] + + def compute(): + calls.append(1) + return "computed" + + result = file_cache_value("k", compute, cache_root=cache_dir) + assert result == "cached" + assert len(calls) == 0 + + +def 
test_file_cache_value_callback(cache_dir): + file_cache_set("k", "cached", cache_root=cache_dir) + paths = [] + result = file_cache_value( + "k", lambda: "x", cache_root=cache_dir, callback=lambda p: paths.append(p) + ) + assert result == "cached" + assert len(paths) == 1 + + +def test_file_cache_value_no_callback_on_miss(cache_dir): + paths = [] + file_cache_value( + "k", lambda: "x", cache_root=cache_dir, callback=lambda p: paths.append(p) + ) + assert len(paths) == 0 + + +def test_file_cache_custom_serializer(cache_dir): + def ser(v, fd): + fd.write(v.encode("ascii")) + + def deser(fd): + return fd.read().decode("ascii") + + file_cache_set("k", "hello", cache_root=cache_dir, serializer=ser) + result = file_cache_get("k", cache_root=cache_dir, deserializer=deser) + assert result == "hello" + + +def test_file_cache_complex_key(cache_dir): + key = {"model": "bert", "params": [1, 2], "array": np.array([1.0, 2.0])} + file_cache_set(key, "result", cache_root=cache_dir) + assert file_cache_get(key, cache_root=cache_dir) == "result" + + +def test_file_cache_value_populates_cache(cache_dir): + file_cache_value("k", lambda: [1, 2, 3], cache_root=cache_dir) + # Should be readable via file_cache_get now + assert file_cache_get("k", cache_root=cache_dir) == [1, 2, 3] + + +def test_cache_files_are_encrypted(cache_dir): + file_cache_set("key", "secret_value", cache_root=cache_dir) + # Find the cache file and verify contents are not plaintext + cache_files = list(cache_dir.rglob("*")) + data_files = [ + f for f in cache_files if f.is_file() and f.name != "cache_constants.json" + ] + assert len(data_files) == 1 + raw = data_files[0].read_bytes() + assert b"secret_value" not in raw diff --git a/packages/backend/uv.lock b/packages/backend/uv.lock index 76143510..4e0f26c1 100644 --- a/packages/backend/uv.lock +++ b/packages/backend/uv.lock @@ -674,6 +674,65 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/90/20d1747255f1ee69a412e319da51ea594c18cca195e7a4d4c713f045eff5/cramjam-2.11.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6c2eea545fef1065c7dd4eda991666fd9c783fbc1d226592ccca8d8891c02f23", size = 1714982, upload-time = "2025-07-27T21:25:05.79Z" }, ] +[[package]] +name = "cryptography" +version = "46.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/60/04/ee2a9e8542e4fa2773b81771ff8349ff19cdd56b7258a0cc442639052edb/cryptography-46.0.5.tar.gz", hash = "sha256:abace499247268e3757271b2f1e244b36b06f8515cf27c4d49468fc9eb16e93d", size = 750064, upload-time = "2026-02-10T19:18:38.255Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/81/b0bb27f2ba931a65409c6b8a8b358a7f03c0e46eceacddff55f7c84b1f3b/cryptography-46.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:351695ada9ea9618b3500b490ad54c739860883df6c1f555e088eaf25b1bbaad", size = 7176289, upload-time = "2026-02-10T19:17:08.274Z" }, + { url = "https://files.pythonhosted.org/packages/ff/9e/6b4397a3e3d15123de3b1806ef342522393d50736c13b20ec4c9ea6693a6/cryptography-46.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c18ff11e86df2e28854939acde2d003f7984f721eba450b56a200ad90eeb0e6b", size = 4275637, upload-time = "2026-02-10T19:17:10.53Z" }, + { url = 
"https://files.pythonhosted.org/packages/63/e7/471ab61099a3920b0c77852ea3f0ea611c9702f651600397ac567848b897/cryptography-46.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d7e3d356b8cd4ea5aff04f129d5f66ebdc7b6f8eae802b93739ed520c47c79b", size = 4424742, upload-time = "2026-02-10T19:17:12.388Z" }, + { url = "https://files.pythonhosted.org/packages/37/53/a18500f270342d66bf7e4d9f091114e31e5ee9e7375a5aba2e85a91e0044/cryptography-46.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:50bfb6925eff619c9c023b967d5b77a54e04256c4281b0e21336a130cd7fc263", size = 4277528, upload-time = "2026-02-10T19:17:13.853Z" }, + { url = "https://files.pythonhosted.org/packages/22/29/c2e812ebc38c57b40e7c583895e73c8c5adb4d1e4a0cc4c5a4fdab2b1acc/cryptography-46.0.5-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:803812e111e75d1aa73690d2facc295eaefd4439be1023fefc4995eaea2af90d", size = 4947993, upload-time = "2026-02-10T19:17:15.618Z" }, + { url = "https://files.pythonhosted.org/packages/6b/e7/237155ae19a9023de7e30ec64e5d99a9431a567407ac21170a046d22a5a3/cryptography-46.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ee190460e2fbe447175cda91b88b84ae8322a104fc27766ad09428754a618ed", size = 4456855, upload-time = "2026-02-10T19:17:17.221Z" }, + { url = "https://files.pythonhosted.org/packages/2d/87/fc628a7ad85b81206738abbd213b07702bcbdada1dd43f72236ef3cffbb5/cryptography-46.0.5-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:f145bba11b878005c496e93e257c1e88f154d278d2638e6450d17e0f31e558d2", size = 3984635, upload-time = "2026-02-10T19:17:18.792Z" }, + { url = "https://files.pythonhosted.org/packages/84/29/65b55622bde135aedf4565dc509d99b560ee4095e56989e815f8fd2aa910/cryptography-46.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e9251e3be159d1020c4030bd2e5f84d6a43fe54b6c19c12f51cde9542a2817b2", size = 4277038, upload-time = "2026-02-10T19:17:20.256Z" }, + { url = "https://files.pythonhosted.org/packages/bc/36/45e76c68d7311432741faf1fbf7fac8a196a0a735ca21f504c75d37e2558/cryptography-46.0.5-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:47fb8a66058b80e509c47118ef8a75d14c455e81ac369050f20ba0d23e77fee0", size = 4912181, upload-time = "2026-02-10T19:17:21.825Z" }, + { url = "https://files.pythonhosted.org/packages/6d/1a/c1ba8fead184d6e3d5afcf03d569acac5ad063f3ac9fb7258af158f7e378/cryptography-46.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:4c3341037c136030cb46e4b1e17b7418ea4cbd9dd207e4a6f3b2b24e0d4ac731", size = 4456482, upload-time = "2026-02-10T19:17:25.133Z" }, + { url = "https://files.pythonhosted.org/packages/f9/e5/3fb22e37f66827ced3b902cf895e6a6bc1d095b5b26be26bd13c441fdf19/cryptography-46.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:890bcb4abd5a2d3f852196437129eb3667d62630333aacc13dfd470fad3aaa82", size = 4405497, upload-time = "2026-02-10T19:17:26.66Z" }, + { url = "https://files.pythonhosted.org/packages/1a/df/9d58bb32b1121a8a2f27383fabae4d63080c7ca60b9b5c88be742be04ee7/cryptography-46.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:80a8d7bfdf38f87ca30a5391c0c9ce4ed2926918e017c29ddf643d0ed2778ea1", size = 4667819, upload-time = "2026-02-10T19:17:28.569Z" }, + { url = "https://files.pythonhosted.org/packages/ea/ed/325d2a490c5e94038cdb0117da9397ece1f11201f425c4e9c57fe5b9f08b/cryptography-46.0.5-cp311-abi3-win32.whl", hash = "sha256:60ee7e19e95104d4c03871d7d7dfb3d22ef8a9b9c6778c94e1c8fcc8365afd48", size = 3028230, upload-time = "2026-02-10T19:17:30.518Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/5a/ac0f49e48063ab4255d9e3b79f5def51697fce1a95ea1370f03dc9db76f6/cryptography-46.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:38946c54b16c885c72c4f59846be9743d699eee2b69b6988e0a00a01f46a61a4", size = 3480909, upload-time = "2026-02-10T19:17:32.083Z" }, + { url = "https://files.pythonhosted.org/packages/00/13/3d278bfa7a15a96b9dc22db5a12ad1e48a9eb3d40e1827ef66a5df75d0d0/cryptography-46.0.5-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:94a76daa32eb78d61339aff7952ea819b1734b46f73646a07decb40e5b3448e2", size = 7119287, upload-time = "2026-02-10T19:17:33.801Z" }, + { url = "https://files.pythonhosted.org/packages/67/c8/581a6702e14f0898a0848105cbefd20c058099e2c2d22ef4e476dfec75d7/cryptography-46.0.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5be7bf2fb40769e05739dd0046e7b26f9d4670badc7b032d6ce4db64dddc0678", size = 4265728, upload-time = "2026-02-10T19:17:35.569Z" }, + { url = "https://files.pythonhosted.org/packages/dd/4a/ba1a65ce8fc65435e5a849558379896c957870dd64fecea97b1ad5f46a37/cryptography-46.0.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe346b143ff9685e40192a4960938545c699054ba11d4f9029f94751e3f71d87", size = 4408287, upload-time = "2026-02-10T19:17:36.938Z" }, + { url = "https://files.pythonhosted.org/packages/f8/67/8ffdbf7b65ed1ac224d1c2df3943553766914a8ca718747ee3871da6107e/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c69fd885df7d089548a42d5ec05be26050ebcd2283d89b3d30676eb32ff87dee", size = 4270291, upload-time = "2026-02-10T19:17:38.748Z" }, + { url = "https://files.pythonhosted.org/packages/f8/e5/f52377ee93bc2f2bba55a41a886fd208c15276ffbd2569f2ddc89d50e2c5/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:8293f3dea7fc929ef7240796ba231413afa7b68ce38fd21da2995549f5961981", size = 4927539, upload-time = "2026-02-10T19:17:40.241Z" }, + { url = "https://files.pythonhosted.org/packages/3b/02/cfe39181b02419bbbbcf3abdd16c1c5c8541f03ca8bda240debc467d5a12/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:1abfdb89b41c3be0365328a410baa9df3ff8a9110fb75e7b52e66803ddabc9a9", size = 4442199, upload-time = "2026-02-10T19:17:41.789Z" }, + { url = "https://files.pythonhosted.org/packages/c0/96/2fcaeb4873e536cf71421a388a6c11b5bc846e986b2b069c79363dc1648e/cryptography-46.0.5-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:d66e421495fdb797610a08f43b05269e0a5ea7f5e652a89bfd5a7d3c1dee3648", size = 3960131, upload-time = "2026-02-10T19:17:43.379Z" }, + { url = "https://files.pythonhosted.org/packages/d8/d2/b27631f401ddd644e94c5cf33c9a4069f72011821cf3dc7309546b0642a0/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:4e817a8920bfbcff8940ecfd60f23d01836408242b30f1a708d93198393a80b4", size = 4270072, upload-time = "2026-02-10T19:17:45.481Z" }, + { url = "https://files.pythonhosted.org/packages/f4/a7/60d32b0370dae0b4ebe55ffa10e8599a2a59935b5ece1b9f06edb73abdeb/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:68f68d13f2e1cb95163fa3b4db4bf9a159a418f5f6e7242564fc75fcae667fd0", size = 4892170, upload-time = "2026-02-10T19:17:46.997Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b9/cf73ddf8ef1164330eb0b199a589103c363afa0cf794218c24d524a58eab/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:a3d1fae9863299076f05cb8a778c467578262fae09f9dc0ee9b12eb4268ce663", size = 4441741, upload-time = "2026-02-10T19:17:48.661Z" 
}, + { url = "https://files.pythonhosted.org/packages/5f/eb/eee00b28c84c726fe8fa0158c65afe312d9c3b78d9d01daf700f1f6e37ff/cryptography-46.0.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c4143987a42a2397f2fc3b4d7e3a7d313fbe684f67ff443999e803dd75a76826", size = 4396728, upload-time = "2026-02-10T19:17:50.058Z" }, + { url = "https://files.pythonhosted.org/packages/65/f4/6bc1a9ed5aef7145045114b75b77c2a8261b4d38717bd8dea111a63c3442/cryptography-46.0.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7d731d4b107030987fd61a7f8ab512b25b53cef8f233a97379ede116f30eb67d", size = 4652001, upload-time = "2026-02-10T19:17:51.54Z" }, + { url = "https://files.pythonhosted.org/packages/86/ef/5d00ef966ddd71ac2e6951d278884a84a40ffbd88948ef0e294b214ae9e4/cryptography-46.0.5-cp314-cp314t-win32.whl", hash = "sha256:c3bcce8521d785d510b2aad26ae2c966092b7daa8f45dd8f44734a104dc0bc1a", size = 3003637, upload-time = "2026-02-10T19:17:52.997Z" }, + { url = "https://files.pythonhosted.org/packages/b7/57/f3f4160123da6d098db78350fdfd9705057aad21de7388eacb2401dceab9/cryptography-46.0.5-cp314-cp314t-win_amd64.whl", hash = "sha256:4d8ae8659ab18c65ced284993c2265910f6c9e650189d4e3f68445ef82a810e4", size = 3469487, upload-time = "2026-02-10T19:17:54.549Z" }, + { url = "https://files.pythonhosted.org/packages/e2/fa/a66aa722105ad6a458bebd64086ca2b72cdd361fed31763d20390f6f1389/cryptography-46.0.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:4108d4c09fbbf2789d0c926eb4152ae1760d5a2d97612b92d508d96c861e4d31", size = 7170514, upload-time = "2026-02-10T19:17:56.267Z" }, + { url = "https://files.pythonhosted.org/packages/0f/04/c85bdeab78c8bc77b701bf0d9bdcf514c044e18a46dcff330df5448631b0/cryptography-46.0.5-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1f30a86d2757199cb2d56e48cce14deddf1f9c95f1ef1b64ee91ea43fe2e18", size = 4275349, upload-time = "2026-02-10T19:17:58.419Z" }, + { url = "https://files.pythonhosted.org/packages/5c/32/9b87132a2f91ee7f5223b091dc963055503e9b442c98fc0b8a5ca765fab0/cryptography-46.0.5-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:039917b0dc418bb9f6edce8a906572d69e74bd330b0b3fea4f79dab7f8ddd235", size = 4420667, upload-time = "2026-02-10T19:18:00.619Z" }, + { url = "https://files.pythonhosted.org/packages/a1/a6/a7cb7010bec4b7c5692ca6f024150371b295ee1c108bdc1c400e4c44562b/cryptography-46.0.5-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ba2a27ff02f48193fc4daeadf8ad2590516fa3d0adeeb34336b96f7fa64c1e3a", size = 4276980, upload-time = "2026-02-10T19:18:02.379Z" }, + { url = "https://files.pythonhosted.org/packages/8e/7c/c4f45e0eeff9b91e3f12dbd0e165fcf2a38847288fcfd889deea99fb7b6d/cryptography-46.0.5-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:61aa400dce22cb001a98014f647dc21cda08f7915ceb95df0c9eaf84b4b6af76", size = 4939143, upload-time = "2026-02-10T19:18:03.964Z" }, + { url = "https://files.pythonhosted.org/packages/37/19/e1b8f964a834eddb44fa1b9a9976f4e414cbb7aa62809b6760c8803d22d1/cryptography-46.0.5-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ce58ba46e1bc2aac4f7d9290223cead56743fa6ab94a5d53292ffaac6a91614", size = 4453674, upload-time = "2026-02-10T19:18:05.588Z" }, + { url = "https://files.pythonhosted.org/packages/db/ed/db15d3956f65264ca204625597c410d420e26530c4e2943e05a0d2f24d51/cryptography-46.0.5-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:420d0e909050490d04359e7fdb5ed7e667ca5c3c402b809ae2563d7e66a92229", size = 3978801, upload-time = "2026-02-10T19:18:07.167Z" }, + { url = 
"https://files.pythonhosted.org/packages/41/e2/df40a31d82df0a70a0daf69791f91dbb70e47644c58581d654879b382d11/cryptography-46.0.5-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:582f5fcd2afa31622f317f80426a027f30dc792e9c80ffee87b993200ea115f1", size = 4276755, upload-time = "2026-02-10T19:18:09.813Z" }, + { url = "https://files.pythonhosted.org/packages/33/45/726809d1176959f4a896b86907b98ff4391a8aa29c0aaaf9450a8a10630e/cryptography-46.0.5-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:bfd56bb4b37ed4f330b82402f6f435845a5f5648edf1ad497da51a8452d5d62d", size = 4901539, upload-time = "2026-02-10T19:18:11.263Z" }, + { url = "https://files.pythonhosted.org/packages/99/0f/a3076874e9c88ecb2ecc31382f6e7c21b428ede6f55aafa1aa272613e3cd/cryptography-46.0.5-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a3d507bb6a513ca96ba84443226af944b0f7f47dcc9a399d110cd6146481d24c", size = 4452794, upload-time = "2026-02-10T19:18:12.914Z" }, + { url = "https://files.pythonhosted.org/packages/02/ef/ffeb542d3683d24194a38f66ca17c0a4b8bf10631feef44a7ef64e631b1a/cryptography-46.0.5-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9f16fbdf4da055efb21c22d81b89f155f02ba420558db21288b3d0035bafd5f4", size = 4404160, upload-time = "2026-02-10T19:18:14.375Z" }, + { url = "https://files.pythonhosted.org/packages/96/93/682d2b43c1d5f1406ed048f377c0fc9fc8f7b0447a478d5c65ab3d3a66eb/cryptography-46.0.5-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ced80795227d70549a411a4ab66e8ce307899fad2220ce5ab2f296e687eacde9", size = 4667123, upload-time = "2026-02-10T19:18:15.886Z" }, + { url = "https://files.pythonhosted.org/packages/45/2d/9c5f2926cb5300a8eefc3f4f0b3f3df39db7f7ce40c8365444c49363cbda/cryptography-46.0.5-cp38-abi3-win32.whl", hash = "sha256:02f547fce831f5096c9a567fd41bc12ca8f11df260959ecc7c3202555cc47a72", size = 3010220, upload-time = "2026-02-10T19:18:17.361Z" }, + { url = "https://files.pythonhosted.org/packages/48/ef/0c2f4a8e31018a986949d34a01115dd057bf536905dca38897bacd21fac3/cryptography-46.0.5-cp38-abi3-win_amd64.whl", hash = "sha256:556e106ee01aa13484ce9b0239bca667be5004efb0aabbed28d353df86445595", size = 3467050, upload-time = "2026-02-10T19:18:18.899Z" }, + { url = "https://files.pythonhosted.org/packages/eb/dd/2d9fdb07cebdf3d51179730afb7d5e576153c6744c3ff8fded23030c204e/cryptography-46.0.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:3b4995dc971c9fb83c25aa44cf45f02ba86f71ee600d81091c2f0cbae116b06c", size = 3476964, upload-time = "2026-02-10T19:18:20.687Z" }, + { url = "https://files.pythonhosted.org/packages/e9/6f/6cc6cc9955caa6eaf83660b0da2b077c7fe8ff9950a3c5e45d605038d439/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bc84e875994c3b445871ea7181d424588171efec3e185dced958dad9e001950a", size = 4218321, upload-time = "2026-02-10T19:18:22.349Z" }, + { url = "https://files.pythonhosted.org/packages/3e/5d/c4da701939eeee699566a6c1367427ab91a8b7088cc2328c09dbee940415/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2ae6971afd6246710480e3f15824ed3029a60fc16991db250034efd0b9fb4356", size = 4381786, upload-time = "2026-02-10T19:18:24.529Z" }, + { url = "https://files.pythonhosted.org/packages/ac/97/a538654732974a94ff96c1db621fa464f455c02d4bb7d2652f4edc21d600/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:d861ee9e76ace6cf36a6a89b959ec08e7bc2493ee39d07ffe5acb23ef46d27da", size = 4217990, upload-time = "2026-02-10T19:18:25.957Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/11/7e500d2dd3ba891197b9efd2da5454b74336d64a7cc419aa7327ab74e5f6/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:2b7a67c9cd56372f3249b39699f2ad479f6991e62ea15800973b956f4b73e257", size = 4381252, upload-time = "2026-02-10T19:18:27.496Z" }, + { url = "https://files.pythonhosted.org/packages/bc/58/6b3d24e6b9bc474a2dcdee65dfd1f008867015408a271562e4b690561a4d/cryptography-46.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7", size = 3407605, upload-time = "2026-02-10T19:18:29.233Z" }, +] + [[package]] name = "cuda-bindings" version = "12.9.4" @@ -849,6 +908,7 @@ source = { editable = "." } dependencies = [ { name = "accelerate" }, { name = "click" }, + { name = "cryptography" }, { name = "duckdb" }, { name = "fastapi" }, { name = "fastparquet" }, @@ -880,6 +940,7 @@ dev = [ requires-dist = [ { name = "accelerate", specifier = ">=1.5.0" }, { name = "click", specifier = ">=7.0.0" }, + { name = "cryptography", specifier = ">=35.0.0" }, { name = "duckdb", specifier = ">=1.4.0" }, { name = "fastapi", specifier = ">=0.115.0" }, { name = "fastparquet", specifier = ">=2024.0.0" },