Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion src/zarr/abc/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@


class AccessMode(NamedTuple):
str: AccessModeLiteral
readonly: bool
overwrite: bool
create: bool
Expand All @@ -20,6 +21,7 @@ class AccessMode(NamedTuple):
def from_literal(cls, mode: AccessModeLiteral) -> Self:
if mode in ("r", "r+", "a", "w", "w-"):
return cls(
str=mode,
readonly=mode == "r",
overwrite=mode == "w",
create=mode in ("a", "w", "w-"),
Expand All @@ -42,6 +44,14 @@ async def open(cls, *args: Any, **kwargs: Any) -> Self:
await store._open()
return store

def __enter__(self) -> Self:
"""Enter a context manager that will close the store upon exiting."""
return self

def __exit__(self, *args: Any) -> None:
"""Close the store."""
self.close()

async def _open(self) -> None:
if self._is_open:
raise ValueError("store is already open")
Expand Down Expand Up @@ -143,6 +153,12 @@ async def set(self, key: str, value: Buffer) -> None:
"""
...

@property
@abstractmethod
def supports_deletes(self) -> bool:
"""Does the store support deletes?"""
...

@abstractmethod
async def delete(self, key: str) -> None:
"""Remove a key from the store
Expand Down Expand Up @@ -221,7 +237,6 @@ def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
def close(self) -> None:
"""Close the store."""
self._is_open = False
pass


@runtime_checkable
Expand Down
11 changes: 10 additions & 1 deletion src/zarr/store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,14 @@
from zarr.store.local import LocalStore
from zarr.store.memory import MemoryStore
from zarr.store.remote import RemoteStore
from zarr.store.zip import ZipStore

__all__ = ["StorePath", "StoreLike", "make_store_path", "RemoteStore", "LocalStore", "MemoryStore"]
__all__ = [
"StorePath",
"StoreLike",
"make_store_path",
"RemoteStore",
"LocalStore",
"MemoryStore",
"ZipStore",
]
1 change: 1 addition & 0 deletions src/zarr/store/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def _put(

class LocalStore(Store):
supports_writes: bool = True
supports_deletes: bool = True
supports_partial_writes: bool = True
supports_listing: bool = True

Expand Down
1 change: 1 addition & 0 deletions src/zarr/store/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
# When that is done, the `MemoryStore` will just be a store that wraps a dict.
class MemoryStore(Store):
supports_writes: bool = True
supports_deletes: bool = True
supports_partial_writes: bool = True
supports_listing: bool = True

Expand Down
1 change: 1 addition & 0 deletions src/zarr/store/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
class RemoteStore(Store):
# based on FSSpec
supports_writes: bool = True
supports_deletes: bool = True
supports_partial_writes: bool = False
supports_listing: bool = True

Expand Down
244 changes: 244 additions & 0 deletions src/zarr/store/zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
from __future__ import annotations

import os
import threading
import time
import zipfile
from collections.abc import AsyncGenerator
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

from zarr.abc.store import Store
from zarr.core.buffer import Buffer, BufferPrototype

if TYPE_CHECKING:
from typing_extensions import Self

ZipStoreAccessModeLiteral = Literal["r", "w", "a"]


class ZipStore(Store):
supports_writes: bool = True
supports_deletes: bool = False
supports_partial_writes: bool = False
supports_listing: bool = True

path: Path
compression: int
allowZip64: bool

_zf: zipfile.ZipFile
_lock: threading.RLock

def __init__(
self,
path: Path | str,
*,
mode: ZipStoreAccessModeLiteral = "r",
compression: int = zipfile.ZIP_STORED,
allowZip64: bool = True,
):
super().__init__(mode=mode)

if isinstance(path, str):
path = Path(path)
assert isinstance(path, Path)
self.path = path # root?

self._zmode = mode
self.compression = compression
self.allowZip64 = allowZip64

@classmethod
async def open(cls, *args: Any, **kwargs: Any) -> Self:
store = cls(*args, **kwargs)
store._lock = threading.RLock() # TODO: evaluate if this is the lock we want or if we want an asyncio.Lock or something like that

store._zf = zipfile.ZipFile(
store.path,
mode=store._zmode,
compression=store.compression,
allowZip64=store.allowZip64,
)
store._is_open = True
return store

def close(self) -> None:
super().close()
with self._lock:
self._zf.close()

async def clear(self) -> None:
with self._lock:
self._check_writable()
self._zf.close()
os.remove(self.path)
self._zf = zipfile.ZipFile(
self.path, mode="w", compression=self.compression, allowZip64=self.allowZip64
)

async def empty(self) -> bool:
with self._lock:
if self._zf.namelist():
return False
else:
return True

def __str__(self) -> str:
return f"zip://{self.path}"

def __repr__(self) -> str:
return f"ZipStore({str(self)!r})"

def __eq__(self, other: object) -> bool:
return isinstance(other, type(self)) and self.path == other.path

def _get(
self,
key: str,
prototype: BufferPrototype,
byte_range: tuple[int | None, int | None] | None = None,
) -> Buffer | None:
try:
with self._zf.open(key) as f: # will raise KeyError
if byte_range is None:
return prototype.buffer.from_bytes(f.read())
start, length = byte_range
if start:
if start < 0:
start = f.seek(start, os.SEEK_END) + start
else:
start = f.seek(start, os.SEEK_SET)
if length:
return prototype.buffer.from_bytes(f.read(length))
else:
return prototype.buffer.from_bytes(f.read())
except KeyError:
return None

async def get(
self,
key: str,
prototype: BufferPrototype,
byte_range: tuple[int | None, int | None] | None = None,
) -> Buffer | None:
assert isinstance(key, str)

with self._lock:
return self._get(key, prototype=prototype, byte_range=byte_range)

async def get_partial_values(
self,
prototype: BufferPrototype,
key_ranges: list[tuple[str, tuple[int | None, int | None]]],
) -> list[Buffer | None]:
"""
Read byte ranges from multiple keys.
Parameters
----------
key_ranges: List[Tuple[str, Tuple[int, int]]]
A list of (key, (start, length)) tuples. The first element of the tuple is the name of
the key in storage to fetch bytes from. The second element the tuple defines the byte
range to retrieve. These values are arguments to `get`, as this method wraps
concurrent invocation of `get`.
"""
out = []
with self._lock:
for key, byte_range in key_ranges:
out.append(self._get(key, prototype=prototype, byte_range=byte_range))
return out

def _set(self, key: str, value: Buffer) -> None:
# generally, this should be called inside a lock
keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime(time.time())[:6])
keyinfo.compress_type = self.compression
if keyinfo.filename[-1] == os.sep:
keyinfo.external_attr = 0o40775 << 16 # drwxrwxr-x
keyinfo.external_attr |= 0x10 # MS-DOS directory flag
else:
keyinfo.external_attr = 0o644 << 16 # ?rw-r--r--
self._zf.writestr(keyinfo, value.to_bytes())

async def set(self, key: str, value: Buffer) -> None:
self._check_writable()
assert isinstance(key, str)
if not isinstance(value, Buffer):
raise TypeError("ZipStore.set(): `value` must a Buffer instance")
with self._lock:
self._set(key, value)

async def set_partial_values(self, key_start_values: list[tuple[str, int, bytes]]) -> None:
raise NotImplementedError

async def delete(self, key: str) -> None:
raise NotImplementedError
Comment on lines +183 to +184
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Today I was reminded that you can't delete anything from inside a ZipFile 😢. This behavior also existed in 2.18.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be nice to have an In-memory version of a Zip store where all the zip data is read in memory. This way it can support deleting and updating entries. Thereafter, a user can persist the data using a method like write_to_file. I think it would be very efficient for data sets whose compressed size is small enough to fit entirely in memory.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed that would be nice @zoj613. I would like to save that until after the 3.0 release though as the minimal zip store is a release blocker at this point.


async def exists(self, key: str) -> bool:
with self._lock:
try:
self._zf.getinfo(key)
except KeyError:
return False
else:
return True

async def list(self) -> AsyncGenerator[str, None]:
"""Retrieve all keys in the store.
Returns
-------
AsyncGenerator[str, None]
"""
with self._lock:
for key in self._zf.namelist():
yield key

async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
"""Retrieve all keys in the store with a given prefix.
Parameters
----------
prefix : str
Returns
-------
AsyncGenerator[str, None]
"""

async for key in self.list():
if key.startswith(prefix):
yield key

async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
"""
Retrieve all keys and prefixes with a given prefix and which do not contain the character
“/” after the given prefix.
Parameters
----------
prefix : str
Returns
-------
AsyncGenerator[str, None]
"""

if prefix.endswith("/"):
prefix = prefix[:-1]

keys = self._zf.namelist()
seen = set()
if prefix == "":
keys_unique = set(k.split("/")[0] for k in keys)
for key in keys_unique:
if key not in seen:
seen.add(key)
yield key
else:
for key in keys:
if key.startswith(prefix + "/") and key != prefix:
k = key.removeprefix(prefix + "/").split("/")[0]
if k not in seen:
seen.add(k)
yield k
13 changes: 10 additions & 3 deletions tests/v3/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,21 @@
import pytest
from hypothesis import HealthCheck, Verbosity, settings

from zarr.store import LocalStore, MemoryStore, StorePath
from zarr.store import LocalStore, MemoryStore, StorePath, ZipStore
from zarr.store.remote import RemoteStore


async def parse_store(
store: Literal["local", "memory", "remote"], path: str
store: Literal["local", "memory", "remote", "zip"], path: str
) -> LocalStore | MemoryStore | RemoteStore:
if store == "local":
return await LocalStore.open(path, mode="w")
if store == "memory":
return await MemoryStore.open(mode="w")
if store == "remote":
return await RemoteStore.open(url=path, mode="w")
if store == "zip":
return await ZipStore.open(path + "/zarr.zip", mode="w")
raise AssertionError


Expand Down Expand Up @@ -64,6 +66,11 @@ async def memory_store() -> MemoryStore:
return await MemoryStore.open(mode="w")


@pytest.fixture(scope="function")
async def zip_store(tmpdir: LEGACY_PATH) -> ZipStore:
return await ZipStore.open(str(tmpdir / "zarr.zip"), mode="w")


@pytest.fixture(scope="function")
async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store:
param = request.param
Expand All @@ -73,7 +80,7 @@ async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store:
@dataclass
class AsyncGroupRequest:
zarr_format: ZarrFormat
store: Literal["local", "remote", "memory"]
store: Literal["local", "remote", "memory", "zip"]
attributes: dict[str, Any] = field(default_factory=dict)


Expand Down
Loading