Skip to content

Commit f74e53a

Browse files
TomAugspurger and d-v-b authored
Added Store.getsize (#2426)
* Added Store.getsize Closes #2420 * fixups * lint * wip * Use prefix * fixup * Maybe fixup * lint * revert buffer chnages * fixup * fixup * Remove AsyncIterable support * fixup --------- Co-authored-by: Davis Bennett <[email protected]>
1 parent 7ba5296 commit f74e53a

File tree

11 files changed

+163
-3
lines changed

11 files changed

+163
-3
lines changed

Diff for: src/zarr/abc/store.py

+68
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
from itertools import starmap
66
from typing import TYPE_CHECKING, Protocol, runtime_checkable
77

8+
from zarr.core.buffer.core import default_buffer_prototype
9+
from zarr.core.common import concurrent_map
10+
from zarr.core.config import config
11+
812
if TYPE_CHECKING:
913
from collections.abc import AsyncGenerator, AsyncIterator, Iterable
1014
from types import TracebackType
@@ -344,6 +348,70 @@ async def _get_many(
344348
for req in requests:
345349
yield (req[0], await self.get(*req))
346350

351+
async def getsize(self, key: str) -> int:
    """
    Return the size, in bytes, of a value in a Store.

    Parameters
    ----------
    key : str
        The key of the value to measure.

    Returns
    -------
    nbytes : int
        The size of the value (in bytes).

    Raises
    ------
    FileNotFoundError
        When the given key does not exist in the store.
    """
    # Note to implementers: this default implementation is very inefficient since
    # it requires reading the entire object. Many systems will have ways to get the
    # size of an object without reading it.
    buffer = await self.get(key, prototype=default_buffer_prototype())
    if buffer is not None:
        return len(buffer)
    # ``get`` signals a missing key with ``None``; surface that as an error.
    raise FileNotFoundError(key)
376+
377+
async def getsize_prefix(self, prefix: str) -> int:
    """
    Return the size, in bytes, of all values under a prefix.

    Parameters
    ----------
    prefix : str
        The prefix of the directory to measure.

    Returns
    -------
    nbytes : int
        The sum of the sizes of the values in the directory (in bytes).

    See Also
    --------
    zarr.Array.nbytes_stored
    Store.getsize

    Notes
    -----
    ``getsize_prefix`` is just provided as a potentially faster alternative to
    listing all the keys under a prefix and calling :meth:`Store.getsize` on each.

    In general, ``prefix`` should be the path of an Array or Group in the Store.
    Implementations may differ on the behavior when some other ``prefix``
    is provided.
    """
    # TODO: Overlap listing keys with getsize calls.
    # Currently, we load the list of keys into memory and only then move
    # on to getting sizes. Ideally we would overlap those two, which should
    # improve tail latency and might reduce memory pressure (since not all keys
    # would be in memory at once).
    key_args = []
    async for key in self.list_prefix(prefix):
        # Each key is wrapped in a 1-tuple because ``concurrent_map`` unpacks
        # every item as the positional arguments of the mapped function.
        key_args.append((key,))
    concurrency = config.get("async.concurrency")
    nbytes_per_key = await concurrent_map(key_args, self.getsize, limit=concurrency)
    return sum(nbytes_per_key)
414+
347415

348416
@runtime_checkable
349417
class ByteGetter(Protocol):

Diff for: src/zarr/core/array.py

+13
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,9 @@ async def nchunks_initialized(self) -> int:
889889
"""
890890
return len(await chunks_initialized(self))
891891

892+
async def nbytes_stored(self) -> int:
    """
    Determine the size, in bytes, of the array actually written to the store.

    The value is whatever the underlying store's ``getsize_prefix`` reports for
    this array's path, i.e. the sum of the sizes of the values stored under
    that prefix.

    Returns
    -------
    size : int
    """
    location = self.store_path
    return await location.store.getsize_prefix(location.path)
894+
892895
def _iter_chunk_coords(
893896
self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None
894897
) -> Iterator[ChunkCoords]:
@@ -1727,6 +1730,16 @@ def nchunks_initialized(self) -> int:
17271730
"""
17281731
return sync(self._async_array.nchunks_initialized())
17291732

1733+
def nbytes_stored(self) -> int:
    """
    Determine the size, in bytes, of the array actually written to the store.

    This is the blocking counterpart of the wrapped async array's
    ``nbytes_stored`` coroutine.

    Returns
    -------
    size : int
    """
    pending = self._async_array.nbytes_stored()
    return sync(pending)
1742+
17301743
def _iter_chunk_keys(
17311744
self, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None
17321745
) -> Iterator[str]:

Diff for: src/zarr/core/common.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,9 @@ def product(tup: ChunkCoords) -> int:
5050

5151

5252
async def concurrent_map(
53-
items: Iterable[T], func: Callable[..., Awaitable[V]], limit: int | None = None
53+
items: Iterable[T],
54+
func: Callable[..., Awaitable[V]],
55+
limit: int | None = None,
5456
) -> list[V]:
5557
if limit is None:
5658
return await asyncio.gather(*list(starmap(func, items)))

Diff for: src/zarr/storage/local.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22

33
import asyncio
44
import io
5+
import os
56
import shutil
67
from pathlib import Path
78
from typing import TYPE_CHECKING
89

910
from zarr.abc.store import ByteRangeRequest, Store
1011
from zarr.core.buffer import Buffer
12+
from zarr.core.buffer.core import default_buffer_prototype
1113
from zarr.core.common import concurrent_map
1214

1315
if TYPE_CHECKING:
@@ -124,10 +126,12 @@ def __eq__(self, other: object) -> bool:
124126
async def get(
125127
self,
126128
key: str,
127-
prototype: BufferPrototype,
129+
prototype: BufferPrototype | None = None,
128130
byte_range: tuple[int | None, int | None] | None = None,
129131
) -> Buffer | None:
130132
# docstring inherited
133+
if prototype is None:
134+
prototype = default_buffer_prototype()
131135
if not self._is_open:
132136
await self._open()
133137
assert isinstance(key, str)
@@ -222,3 +226,6 @@ async def list_dir(self, prefix: str) -> AsyncIterator[str]:
222226
yield key.relative_to(base).as_posix()
223227
except (FileNotFoundError, NotADirectoryError):
224228
pass
229+
230+
async def getsize(self, key: str) -> int:
    # docstring inherited
    # ``os.path.getsize`` is a blocking filesystem call. Running it in a worker
    # thread keeps the event loop free, so the many ``getsize`` coroutines that
    # ``getsize_prefix`` schedules can actually make progress concurrently.
    # A missing file still raises ``FileNotFoundError``, as the base class
    # contract requires.
    return await asyncio.to_thread(os.path.getsize, self.root / key)

Diff for: src/zarr/storage/logging.py

+8
Original file line numberDiff line numberDiff line change
@@ -225,3 +225,11 @@ async def delete_dir(self, prefix: str) -> None:
225225
# docstring inherited
226226
with self.log(prefix):
227227
await self._store.delete_dir(prefix=prefix)
228+
229+
async def getsize(self, key: str) -> int:
    # docstring inherited
    # Log the call (keyed by ``key``) and delegate to the wrapped store.
    with self.log(key):
        return await self._store.getsize(key)
232+
233+
async def getsize_prefix(self, prefix: str) -> int:
    # docstring inherited
    # Log the call (keyed by ``prefix``) and delegate to the wrapped store so
    # that any specialised ``getsize_prefix`` it provides is used.
    with self.log(prefix):
        return await self._store.getsize_prefix(prefix)

Diff for: src/zarr/storage/remote.py

+13
Original file line numberDiff line numberDiff line change
@@ -325,3 +325,16 @@ async def list_prefix(self, prefix: str) -> AsyncIterator[str]:
325325
f"{self.path}/{prefix}", detail=False, maxdepth=None, withdirs=False
326326
):
327327
yield onefile.removeprefix(f"{self.path}/")
328+
329+
async def getsize(self, key: str) -> int:
    # Ask the filesystem for the object's metadata rather than reading it.
    full_path = _dereference_path(self.path, key)
    metadata = await self.fs._info(full_path)

    nbytes = metadata.get("size")

    if nbytes is not None:
        # fsspec doesn't have typing. We'll need to assume or verify this is true
        return int(nbytes)

    # Not all filesystems support size. Fall back to reading the entire object
    return await super().getsize(key)

Diff for: src/zarr/testing/store.py

+23
Original file line numberDiff line numberDiff line change
@@ -318,3 +318,26 @@ async def test_set_if_not_exists(self, store: S) -> None:
318318

319319
result = await store.get("k2", default_buffer_prototype())
320320
assert result == new
321+
322+
async def test_getsize(self, store: S) -> None:
    """Write a 10-byte value and check ``getsize`` reports a positive int."""
    key = "k"
    payload = self.buffer_cls.from_bytes(b"0" * 10)
    await self.set(store, key, payload)

    size = await store.getsize(key)
    assert isinstance(size, int)
    assert size > 0
330+
331+
async def test_getsize_raises(self, store: S) -> None:
    """``getsize`` on a key that was never written must raise FileNotFoundError."""
    missing_key = "not-a-real-key"
    with pytest.raises(FileNotFoundError):
        await store.getsize(missing_key)
334+
335+
async def test_getsize_prefix(self, store: S) -> None:
    """Write ten 10-byte values under a prefix and check the summed size."""
    prefix = "array/c/"
    for i in range(10):
        data = self.buffer_cls.from_bytes(b"0" * 10)
        # ``prefix`` already ends with "/", so the index is appended directly;
        # inserting another "/" would create keys such as "array/c//0".
        await self.set(store, f"{prefix}{i}", data)

    result = await store.getsize_prefix(prefix)
    assert isinstance(result, int)
    assert result > 0

Diff for: tests/test_array.py

+24
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,30 @@ async def test_chunks_initialized() -> None:
372372
assert observed == expected
373373

374374

375+
def test_nbytes_stored() -> None:
    """Check ``Array.nbytes_stored`` before and after chunks are written."""
    arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4")
    # the size of the metadata document. This is a fragile test.
    assert arr.nbytes_stored() == 366
    arr[:50] = 1
    # the size with 5 chunks filled.
    assert arr.nbytes_stored() == 566
    arr[50:] = 2
    # the size with all chunks filled.
    assert arr.nbytes_stored() == 766
385+
386+
387+
async def test_nbytes_stored_async() -> None:
    """Check ``AsyncArray.nbytes_stored`` before and after chunks are written."""
    arr = await zarr.api.asynchronous.create(shape=(100,), chunks=(10,), dtype="i4")
    # the size of the metadata document. This is a fragile test.
    assert await arr.nbytes_stored() == 366
    await arr.setitem(slice(50), 1)
    # the size with 5 chunks filled.
    assert await arr.nbytes_stored() == 566
    await arr.setitem(slice(50, 100), 2)
    # the size with all chunks filled.
    assert await arr.nbytes_stored() == 766
397+
398+
375399
def test_default_fill_values() -> None:
376400
a = Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype="<U4")
377401
assert a.fill_value == ""

Diff for: tests/test_indexing.py

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
if TYPE_CHECKING:
3333
from collections.abc import AsyncGenerator
3434

35+
from zarr.core.buffer import BufferPrototype
3536
from zarr.core.buffer.core import Buffer
3637
from zarr.core.common import ChunkCoords
3738

Diff for: tests/test_metadata/test_consolidated.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
open,
1717
open_consolidated,
1818
)
19-
from zarr.core.buffer.core import default_buffer_prototype
19+
from zarr.core.buffer import default_buffer_prototype
2020
from zarr.core.group import ConsolidatedMetadata, GroupMetadata
2121
from zarr.core.metadata import ArrayV3Metadata
2222
from zarr.core.metadata.v2 import ArrayV2Metadata

Diff for: tests/test_v2.py

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from numcodecs.blosc import Blosc
1010

1111
import zarr
12+
import zarr.core.buffer
1213
import zarr.storage
1314
from zarr import Array
1415
from zarr.storage import MemoryStore, StorePath

0 commit comments

Comments
 (0)