Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion src/zarr/store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,14 @@
from zarr.store.local import LocalStore
from zarr.store.memory import MemoryStore
from zarr.store.remote import RemoteStore
from zarr.store.zip import ZipStore

# Public API of zarr.store; ZipStore is newly exported alongside the
# existing store implementations.
__all__ = [
    "StorePath",
    "StoreLike",
    "make_store_path",
    "RemoteStore",
    "LocalStore",
    "MemoryStore",
    "ZipStore",
]
221 changes: 221 additions & 0 deletions src/zarr/store/zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
from __future__ import annotations

import os
import threading
import time
import zipfile
from collections.abc import AsyncGenerator
from pathlib import Path
from typing import Literal

from zarr.abc.store import Store
from zarr.buffer import Buffer, BufferPrototype

# Access modes accepted by ZipStore; the value is forwarded unchanged to
# zipfile.ZipFile in ZipStore.__init__.
ZipStoreAccessModeLiteral = Literal["r", "w", "a"]


class ZipStore(Store):
    """Store implementation backed by a single zip archive on disk."""

    # Capability flags read by callers to decide which operations to attempt.
    supports_writes: bool = True
    # zipfile can only append whole members, so ranged writes are impossible.
    supports_partial_writes: bool = False
    supports_listing: bool = True
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do we think about adding supports_deletes: bool = False as a class attribute?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that would be a solid extrapolation of the design we are currently using. one downside of that design is that, even if a class has supports_x set to False, the class will still need an implementation of x. Another solution would be to express supports_x by having the class inherit from a DoesX mixin. But that's maybe out of scope for this PR.

Copy link
Member Author

@jhamman jhamman Aug 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've gone the supports_deletes route for now.


# Configuration captured in __init__ and reused by clear() to rebuild the
# archive with identical settings.
path: Path  # location of the backing .zip file
compression: int  # zipfile compression constant (e.g. zipfile.ZIP_STORED)
allowZip64: bool  # forwarded to zipfile.ZipFile

def __init__(
    self,
    path: Path | str,
    *,
    mode: ZipStoreAccessModeLiteral = "r",
    compression: int = zipfile.ZIP_STORED,
    allowZip64: bool = True,
):
    """Open (or create) a zip-archive-backed store.

    Parameters
    ----------
    path : Path | str
        Location of the zip archive on disk.
    mode : {"r", "w", "a"}
        Forwarded to both the Store base class and zipfile.ZipFile.
    compression : int
        zipfile compression constant used for new members.
    allowZip64 : bool
        Forwarded to zipfile.ZipFile.
    """
    super().__init__(mode=mode)

    # Path() accepts str, Path, and any os.PathLike; the previous
    # isinstance/assert combination both rejected PathLike objects and
    # was silently stripped under `python -O`.
    self.path = Path(path)

    self.compression = compression
    self.allowZip64 = allowZip64

    # TODO: evaluate if this is the lock we want or if we want an
    # asyncio.Lock or something like that
    self._lock = threading.RLock()

    self._zf = zipfile.ZipFile(
        self.path, mode=mode, compression=compression, allowZip64=allowZip64
    )

def close(self) -> None:
    """Close the backing ZipFile; subsequent reads/writes will fail."""
    # Flip the open flag before taking the lock so concurrent callers
    # observe the store as closed even while a final operation drains.
    self._is_open = False
    with self._lock:
        self._zf.close()

async def clear(self) -> None:
    """Remove every key by deleting and recreating the archive.

    The new archive reuses the compression/allowZip64 settings given at
    construction time.
    """
    with self._lock:
        self._check_writable()
        self._zf.close()
        self.path.unlink()
        self._zf = zipfile.ZipFile(
            self.path, mode="w", compression=self.compression, allowZip64=self.allowZip64
        )

async def empty(self) -> bool:
    """Return True when the archive contains no keys at all."""
    found_any = False
    async for _key in self.list():
        found_any = True
        break  # one key is enough to decide
    return not found_any

def __str__(self) -> str:
    """Render the store as a zip:// pseudo-URL."""
    return "zip://" + str(self.path)

def __repr__(self) -> str:
    """Unambiguous representation embedding the zip:// path string."""
    return "ZipStore(" + repr(str(self)) + ")"

def __eq__(self, other: object) -> bool:
return isinstance(other, type(self)) and self.path == other.path

def _get(
    self,
    key: str,
    prototype: BufferPrototype,
    byte_range: tuple[int | None, int | None] | None = None,
) -> Buffer | None:
    """Read *key* from the archive, optionally restricted to a slice.

    byte_range is a (start, length) pair; either element may be None.
    Returns None when the key is absent. Callers are expected to hold
    self._lock.
    """
    try:
        with self._zf.open(key) as member:  # raises KeyError for missing keys
            if byte_range is None:
                return prototype.buffer.from_bytes(member.read())
            start, length = byte_range
            if start:
                member.seek(start)
            data = member.read(length) if length else member.read()
            return prototype.buffer.from_bytes(data)
    except KeyError:
        return None

async def get(
    self,
    key: str,
    prototype: BufferPrototype,
    byte_range: tuple[int | None, int | None] | None = None,
) -> Buffer | None:
    """Read *key* (optionally a (start, length) slice) from the archive.

    Returns None when the key is absent. This is the lock-taking wrapper
    around _get.
    """
    assert isinstance(key, str)

    # Serialize access to the shared ZipFile handle.
    with self._lock:
        return self._get(key, prototype=prototype, byte_range=byte_range)

async def get_partial_values(
    self,
    prototype: BufferPrototype,
    key_ranges: list[tuple[str, tuple[int | None, int | None]]],
) -> list[Buffer | None]:
    """
    Read byte ranges from multiple keys.

    Parameters
    ----------
    key_ranges: List[Tuple[str, Tuple[int, int]]]
        A list of (key, (start, length)) tuples. The first element of the
        tuple is the name of the key in storage to fetch bytes from. The
        second element of the tuple defines the byte range to retrieve.
        These values are arguments to `get`, as this method wraps
        concurrent invocation of `get`.
    """
    # One lock acquisition for the whole batch, matching the original
    # loop's locking scope.
    with self._lock:
        return [
            self._get(key, prototype=prototype, byte_range=byte_range)
            for key, byte_range in key_ranges
        ]

def _set(self, key: str, value: Buffer) -> None:
    """Append *value* under *key*. Callers are expected to hold self._lock.

    Note: the zip format cannot replace members, so writing an existing
    key appends a second entry with the same name.
    """
    keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime(time.time())[:6])
    keyinfo.compress_type = self.compression
    # Zip member names always use "/" as the separator regardless of
    # platform; the previous os.sep comparison was "\\" on Windows and
    # never matched. endswith() also tolerates an empty name, where the
    # old filename[-1] indexing raised IndexError.
    if keyinfo.filename.endswith("/"):
        keyinfo.external_attr = 0o40775 << 16  # drwxrwxr-x
        keyinfo.external_attr |= 0x10  # MS-DOS directory flag
    else:
        keyinfo.external_attr = 0o644 << 16  # ?rw-r--r--
    self._zf.writestr(keyinfo, value.to_bytes())

async def set(self, key: str, value: Buffer) -> None:
    """Store *value* at *key*, taking the lock around the actual write.

    Raises
    ------
    TypeError
        If *value* is not a Buffer instance.
    """
    self._check_writable()
    assert isinstance(key, str)
    if not isinstance(value, Buffer):
        # Fixed grammar of the original message ("must a Buffer").
        raise TypeError("ZipStore.set(): `value` must be a Buffer instance")
    with self._lock:
        self._set(key, value)

async def set_partial_values(self, key_start_values: list[tuple[str, int, bytes]]) -> None:
    # Intentionally unsupported (supports_partial_writes is False):
    # zipfile can only append whole members, never patch byte ranges.
    raise NotImplementedError

async def delete(self, key: str) -> None:
    # The zip format cannot remove or overwrite existing members, so
    # deletion is fundamentally unsupported for this store.
    raise NotImplementedError
Comment on lines +183 to +184
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Today I was reminded that you can't delete anything from inside a ZipFile 😢. This behavior also existed in 2.18.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be nice to have an In-memory version of a Zip store where all the zip data is read in memory. This way it can support deleting and updating entries. Thereafter, a user can persist the data using a method like write_to_file. I think it would be very efficient for data sets whose compressed size is small enough to fit entirely in memory.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed that would be nice @zoj613. I would like to save that until after the 3.0 release though as the minimal zip store is a release blocker at this point.


async def exists(self, key: str) -> bool:
    """Return True when *key* names a member of the archive."""
    with self._lock:
        try:
            self._zf.getinfo(key)
            return True
        except KeyError:
            return False

async def list(self) -> AsyncGenerator[str, None]:
    """Retrieve all keys in the store.

    Returns
    -------
    AsyncGenerator[str, None]
    """
    with self._lock:
        # namelist() returns a fresh list, so this iterates a snapshot.
        for name in self._zf.namelist():
            yield name

async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
    """Retrieve all keys in the store with a given prefix.

    Parameters
    ----------
    prefix : str

    Returns
    -------
    AsyncGenerator[str, None]
    """

    async for candidate in self.list():
        if candidate.startswith(prefix):
            yield candidate

async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
    """
    Retrieve all keys and prefixes with a given prefix and which do not
    contain the character "/" after the given prefix.

    Parameters
    ----------
    prefix : str

    Returns
    -------
    AsyncGenerator[str, None]
    """

    # Normalize a single trailing "/" away, as the original did.
    if prefix.endswith("/"):
        prefix = prefix[:-1]

    # Snapshot member names under the lock, consistent with every other
    # read path (get/exists/list); the original read self._zf unlocked.
    with self._lock:
        keys = self._zf.namelist()

    seen: set[str] = set()
    if prefix == "":
        # Top level: yield the first path segment of every member,
        # de-duplicated, in first-occurrence order (the original built a
        # set and yielded in arbitrary set order; the redundant second
        # `seen` check over an already-unique set is gone).
        for key in keys:
            first = key.split("/")[0]
            if first not in seen:
                seen.add(first)
                yield first
    else:
        for key in keys:
            if key.startswith(prefix + "/") and key != prefix:
                child = key.removeprefix(prefix + "/").split("/")[0]
                if child not in seen:
                    seen.add(child)
                    yield child
13 changes: 10 additions & 3 deletions tests/v3/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,21 @@
import pytest
from hypothesis import HealthCheck, Verbosity, settings

from zarr.store import LocalStore, MemoryStore, StorePath
from zarr.store import LocalStore, MemoryStore, StorePath, ZipStore
from zarr.store.remote import RemoteStore


async def parse_store(
    store: Literal["local", "memory", "remote", "zip"], path: str
) -> LocalStore | MemoryStore | RemoteStore | ZipStore:
    """Construct a writable store of the requested flavor rooted at *path*.

    The return annotation now includes ZipStore, which the original
    signature omitted after the "zip" branch was added.
    """
    if store == "local":
        return await LocalStore.open(path, mode="w")
    if store == "memory":
        return await MemoryStore.open(mode="w")
    if store == "remote":
        return await RemoteStore.open(url=path, mode="w")
    if store == "zip":
        return await ZipStore.open(path + "/zarr.zip", mode="w")
    raise AssertionError


Expand Down Expand Up @@ -64,6 +66,11 @@ async def memory_store() -> MemoryStore:
return await MemoryStore.open(mode="w")


@pytest.fixture(scope="function")
async def zip_store(tmpdir: LEGACY_PATH) -> ZipStore:
    # Fresh write-mode ZipStore backed by a per-test temporary file.
    return await ZipStore.open(str(tmpdir / "zarr.zip"), mode="w")


@pytest.fixture(scope="function")
async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store:
param = request.param
Expand All @@ -73,7 +80,7 @@ async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store:
@dataclass
class AsyncGroupRequest:
zarr_format: ZarrFormat
store: Literal["local", "remote", "memory"]
store: Literal["local", "remote", "memory", "zip"]
attributes: dict[str, Any] = field(default_factory=dict)


Expand Down
6 changes: 3 additions & 3 deletions tests/v3/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from zarr.store.core import StorePath


@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
@pytest.mark.parametrize("zarr_format", (2, 3))
@pytest.mark.parametrize("exists_ok", [True, False])
@pytest.mark.parametrize("extant_node", ["array", "group"])
Expand Down Expand Up @@ -60,7 +60,7 @@ def test_array_creation_existing_node(
)


@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
@pytest.mark.parametrize("zarr_format", (2, 3))
def test_array_name_properties_no_group(
store: LocalStore | MemoryStore, zarr_format: ZarrFormat
Expand All @@ -71,7 +71,7 @@ def test_array_name_properties_no_group(
assert arr.basename is None


@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
@pytest.mark.parametrize("zarr_format", (2, 3))
def test_array_name_properties_with_group(
store: LocalStore | MemoryStore, zarr_format: ZarrFormat
Expand Down
16 changes: 8 additions & 8 deletions tests/v3/test_codecs/test_sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from .test_codecs import _AsyncArrayProxy, order_from_dim


@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
@pytest.mark.parametrize("index_location", ["start", "end"])
@pytest.mark.parametrize(
"array_fixture",
Expand Down Expand Up @@ -71,7 +71,7 @@ def test_sharding(


@pytest.mark.parametrize("index_location", ["start", "end"])
@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
@pytest.mark.parametrize(
"array_fixture",
[
Expand Down Expand Up @@ -121,7 +121,7 @@ def test_sharding_partial(
indirect=["array_fixture"],
)
@pytest.mark.parametrize("index_location", ["start", "end"])
@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
def test_sharding_partial_read(
store: Store, array_fixture: np.ndarray, index_location: ShardingCodecIndexLocation
) -> None:
Expand Down Expand Up @@ -158,7 +158,7 @@ def test_sharding_partial_read(
indirect=["array_fixture"],
)
@pytest.mark.parametrize("index_location", ["start", "end"])
@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
def test_sharding_partial_overwrite(
store: Store, array_fixture: np.ndarray, index_location: ShardingCodecIndexLocation
) -> None:
Expand Down Expand Up @@ -209,7 +209,7 @@ def test_sharding_partial_overwrite(
"inner_index_location",
["start", "end"],
)
@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
def test_nested_sharding(
store: Store,
array_fixture: np.ndarray,
Expand Down Expand Up @@ -242,7 +242,7 @@ def test_nested_sharding(
assert np.array_equal(data, read_data)


@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
def test_open_sharding(store: Store) -> None:
path = "open_sharding"
spath = StorePath(store, path)
Expand All @@ -267,7 +267,7 @@ def test_open_sharding(store: Store) -> None:
assert a.metadata == b.metadata


@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
def test_write_partial_sharded_chunks(store: Store) -> None:
data = np.arange(0, 16 * 16, dtype="uint16").reshape((16, 16))
spath = StorePath(store)
Expand All @@ -291,7 +291,7 @@ def test_write_partial_sharded_chunks(store: Store) -> None:
assert np.array_equal(a[0:16, 0:16], data)


@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
async def test_delete_empty_shards(store: Store) -> None:
path = "delete_empty_shards"
spath = StorePath(store, path)
Expand Down
Loading