diff --git a/Project.toml b/Project.toml index fe43535..cfe36f7 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.9.6" [deps] Blosc = "a74b3585-a348-5f62-a45c-50e91977d574" +CRC32c = "8bf52ea8-c179-5cab-976a-9e18b702a9bc" ChunkCodecCore = "0b6fb165-00bc-4d37-ab8b-79f91016dbe1" ChunkCodecLibZlib = "4c0bbee4-addc-4d73-81a0-b6caacae83c8" ChunkCodecLibZstd = "55437552-ac27-4d47-9aa3-63184e8fd398" @@ -32,6 +33,7 @@ Blosc = "0.5, 0.6, 0.7" ChunkCodecCore = "1" ChunkCodecLibZlib = "1" ChunkCodecLibZstd = "1" +CRC32c = "1.10, 1.11" DataStructures = "0.17, 0.18, 0.19" DateTimes64 = "1" DiskArrays = "0.4.2" diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index 6b9f6b3..b42c202 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -197,7 +197,7 @@ Order : C Read-Only : false Compressor : Zarr.BloscCompressor(0, 3, "zstd", 1) Filters : nothing -Store type : Dictionary Storage +Store type : Zarr.FormattedStore{2, '.', Zarr.DictStore}(Dictionary Storage) No. bytes : 400000000 No. bytes stored : 2412289 Storage ratio : 165.81761140559857 diff --git a/src/Codecs/Codecs.jl b/src/Codecs/Codecs.jl new file mode 100644 index 0000000..ec6e620 --- /dev/null +++ b/src/Codecs/Codecs.jl @@ -0,0 +1,49 @@ +module Codecs + +using JSON: JSON + +""" + abstract type Codec + +The abstract supertype for all Zarr codecs + +## Interface + +All subtypes of `Codec` SHALL implement the following methods: + +- `zencode(a, c::Codec)`: compress the array `a` using the codec `c`. +- `zdecode(a, c::Codec, T)`: decode the array `a` using the codec `c` + and return an array of type `T`. +- `JSON.lower(c::Codec)`: return a JSON representation of the codec `c`, which + follows the Zarr specification for that codec. +- `getCodec(::Type{<:Codec}, d::Dict)`: return a codec object from a given + dictionary `d` which contains the codec's parameters according to the Zarr spec. 
+ +Subtypes of `Codec` MAY also implement the following methods: + +- `zencode!(encoded, data, c::Codec)`: encode the array `data` using the + codec `c` and store the result in the array `encoded`. +- `zdecode!(data, encoded, c::Codec)`: decode the array `encoded` + using the codec `c` and store the result in the array `data`. + +Finally, an entry MUST be added to the `VN.codectypes` dictionary for each codec type where N is the +Zarr format version. +This must also follow the Zarr specification's name for that compressor. The name of the compressor +is the key, and the value is the compressor type (e.g. `BloscCodec` or `NoCodec`). + +For example, the Blosc codec is named "blosc" in the Zarr spec, so the entry for [`BloscCodec`](@ref) +must be added to `codectypes` as `codectypes["blosc"] = BloscCodec`. +""" + +abstract type Codec end + +zencode(a, c::Codec) = error("Unimplemented") +zencode!(encoded, data, c::Codec) = error("Unimplemented") +zdecode(a, c::Codec, T::Type) = error("Unimplemented") +zdecode!(data, encoded, c::Codec) = error("Unimplemented") +JSON.lower(c::Codec) = error("Unimplemented") +getCodec(::Type{<:Codec}, d::Dict) = error("Unimplemented") + +include("V3/V3.jl") + +end diff --git a/src/Codecs/V3/V3.jl b/src/Codecs/V3/V3.jl new file mode 100644 index 0000000..2cf3c58 --- /dev/null +++ b/src/Codecs/V3/V3.jl @@ -0,0 +1,531 @@ +module V3Codecs + +import ..Codecs: zencode, zdecode, zencode!, zdecode! 
+using CRC32c: CRC32c +using JSON: JSON + +abstract type V3Codec{In,Out} end +const codectypes = Dict{String, Type{<:V3Codec}}() + +@enum BloscCompressor begin + lz4 + lz4hc + blosclz + zstd + snappy + zlib +end + +@enum BloscShuffle begin + noshuffle + shuffle + bitshuffle +end + +struct BloscCodec <: V3Codec{:bytes, :bytes} + cname::BloscCompressor + clevel::Int64 + shuffle::BloscShuffle + typesize::UInt8 + blocksize::UInt +end +name(::BloscCodec) = "blosc" + +struct BytesCodec <: V3Codec{:array, :bytes} +end +name(::BytesCodec) = "bytes" + +struct CRC32cCodec <: V3Codec{:bytes, :bytes} +end +name(::CRC32cCodec) = "crc32c" + +struct GzipCodec <: V3Codec{:bytes, :bytes} +end +name(::GzipCodec) = "gzip" + + +#= +zencode(a, c::Codec) = error("Unimplemented") +zencode!(encoded, data, c::Codec) = error("Unimplemented") +zdecode(a, c::Codec, T::Type) = error("Unimplemented") +zdecode!(data, encoded, c::Codec) = error("Unimplemented") +=# + +function crc32c_stream!(output::IO, input::IO; buffer = Vector{UInt8}(undef, 1024*32)) + hash::UInt32 = 0x00000000 + while(bytesavailable(input) > 0) + sized_buffer = @view(buffer[1:min(length(buffer), bytesavailable(input))]) + read!(input, sized_buffer) + write(output, sized_buffer) + hash = CRC32c.crc32c(sized_buffer, hash) + end + return hash +end +function zencode!(encoded::Vector{UInt8}, data::Vector{UInt8}, c::CRC32cCodec) + output = IOBuffer(encoded, read=false, write=true) + input = IOBuffer(data, read=true, write=false) + zencode!(output, input, c) + return take!(output) +end +function zencode!(output::IO, input::IO, c::CRC32cCodec) + hash = crc32c_stream!(output, input) + write(output, htol(hash)) # checksum trailer is stored little-endian + return output +end +function zdecode!(data::Vector{UInt8}, encoded::Vector{UInt8}, c::CRC32cCodec) + output = IOBuffer(data, read=false, write=true) + input = IOBuffer(encoded, read=true, write=true) + zdecode!(output, input, c) + return take!(output) +end +function zdecode!(output::IOBuffer, input::IOBuffer, c::CRC32cCodec) + input_vec = 
take!(input) + truncated_input = IOBuffer(@view(input_vec[1:end-4]); read=true, write=false) + hash = crc32c_stream!(output, truncated_input) + if input_vec[end-3:end] != reinterpret(UInt8, [htol(hash)]) + error("CRC32c hash does not match") + end + return output +end + +""" + ShardingCodec{N} + +Sharding codec for Zarr v3. Each shard is split into smaller inner chunks, which are stored +together in a single file with an index mapping inner-chunk coordinates to byte ranges. + +# Fields +- `chunk_shape`: Shape of each inner chunk within a shard (NTuple{N,Int}) +- `codecs`: Vector of codecs to apply to each inner chunk (e.g., [BytesCodec(), GzipCodec()]) +- `index_codecs`: Vector of codecs to apply to the index (e.g., [BytesCodec()]) +- `index_location`: Location of index in shard file, either `:start` or `:end` + +# Implementation Notes +Sharding works by: +1. Taking a shard of data and splitting it into inner chunks based on `chunk_shape` +2. Encoding each inner chunk using the `codecs` pipeline +3. Creating an index that maps inner-chunk coordinates -> (offset, nbytes) in the shard file +4. Encoding the index using `index_codecs` +5. Writing the shard file with index at `index_location` (start or end) + +""" +struct ShardingCodec{N} <: V3Codec{:array, :bytes} + chunk_shape::NTuple{N,Int} # Shape of each inner chunk + codecs::Vector{V3Codec} # Codecs to apply to each inner chunk + index_codecs::Vector{V3Codec} # Codecs to apply to the index + index_location::Symbol # :start or :end +end +name(::ShardingCodec) = "sharding_indexed" + +""" + JSON.lower(c::ShardingCodec) + +Serialize ShardingCodec to JSON format for Zarr v3 metadata. 
+""" +function JSON.lower(c::ShardingCodec) + return Dict( + "name" => "sharding_indexed", + "configuration" => Dict( + "chunk_shape" => collect(c.chunk_shape), + "codecs" => [JSON.lower(codec) for codec in c.codecs], + "index_codecs" => [JSON.lower(codec) for codec in c.index_codecs], + "index_location" => string(c.index_location) + ) + ) +end + +""" + getCodec(::Type{ShardingCodec}, d::Dict) + +Deserialize ShardingCodec from JSON configuration dict. +""" +function getCodec(::Type{ShardingCodec}, d::Dict) + config = d["configuration"] + N = length(config["chunk_shape"]) + chunk_shape = NTuple{N,Int}(config["chunk_shape"]) + codecs = [getCodec(codec_dict) for codec_dict in config["codecs"]] + index_codecs = [getCodec(codec_dict) for codec_dict in config["index_codecs"]] + index_location = Symbol(get(config, "index_location", "end")) + return ShardingCodec{N}(chunk_shape, codecs, index_codecs, index_location) +end + +const MAX_UINT64 = typemax(UInt64) + +""" + ChunkShardInfo + +Information about a chunk's location within a shard. +""" +struct ChunkShardInfo + offset::UInt64 # Byte offset within shard where chunk begins + nbytes::UInt64 # Number of bytes the chunk occupies +end + +ChunkShardInfo() = ChunkShardInfo(MAX_UINT64, MAX_UINT64) # Empty chunk marker + +""" + ShardIndex{N} + +Internal structure representing the shard index. +Stores chunk location info for an N-dimensional grid of chunks. +Empty chunks are marked with ChunkShardInfo(MAX_UINT64, MAX_UINT64) +""" +struct ShardIndex{N} + chunks::Array{ChunkShardInfo, N} # N-dimensional array of chunk info +end + +""" + ShardIndex(chunks_per_shard::NTuple{N,Int}) + +Create an empty shard index with all chunks marked as empty. 
+""" +function ShardIndex(chunks_per_shard::NTuple{N,Int}) where N + chunks = fill(ChunkShardInfo(), chunks_per_shard) + return ShardIndex{N}(chunks) +end + +""" + get_chunk_slice(idx::ShardIndex, chunk_coords::NTuple{N,Int}) + +Get the byte range (offset, offset+nbytes) for a chunk, or nothing if empty. +""" +function get_chunk_slice(idx::ShardIndex, chunk_coords::NTuple{N,Int}) where N + info = idx.chunks[chunk_coords...] + + if info.offset == MAX_UINT64 && info.nbytes == MAX_UINT64 + return nothing + end + + return (Int(info.offset), Int(info.offset + info.nbytes)) +end + +""" + set_chunk_slice!(idx::ShardIndex, chunk_coords::NTuple{N,Int}, offset::Int, nbytes::Int) + +Set the byte range for a chunk in the index. +""" +function set_chunk_slice!(idx::ShardIndex, chunk_coords::NTuple{N,Int}, offset::Int, nbytes::Int) where N + idx.chunks[chunk_coords...] = ChunkShardInfo(UInt64(offset), UInt64(nbytes)) +end + +""" + set_chunk_empty!(idx::ShardIndex, chunk_coords::NTuple{N,Int}) + +Mark a chunk as empty in the index. +""" +function set_chunk_empty!(idx::ShardIndex, chunk_coords::NTuple{N,Int}) where N + idx.chunks[chunk_coords...] = ChunkShardInfo() +end + +""" + calculate_chunks_per_shard(shard_shape::NTuple{N,Int}, chunk_shape::NTuple{N,Int}) + +Calculate how many chunks fit in each shard dimension. +""" +function calculate_chunks_per_shard(shard_shape::NTuple{N,Int}, chunk_shape::NTuple{N,Int}) where N + return ntuple(i -> div(shard_shape[i], chunk_shape[i]), N) +end + +""" + get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, shard_shape::NTuple{N,Int}) + +Get the array slice ranges for a chunk within a shard. +chunk_coords are 1-based indices. 
+""" +function get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, shard_shape::NTuple{N,Int}) where N + return ntuple(N) do i + start_idx = (chunk_coords[i] - 1) * chunk_shape[i] + 1 + end_idx = min(chunk_coords[i] * chunk_shape[i], shard_shape[i]) + start_idx:end_idx + end +end + +""" + apply_codec_chain(data, codecs::Vector{V3Codec}) + +Apply codec pipeline in forward order (encoding). +""" +function apply_codec_chain(data, codecs::Vector{V3Codec}) + result = data + for codec in codecs + result = zencode(result, codec) + end + return result +end + +""" + reverse_codec_chain(data, codecs::Vector{V3Codec}) + +Apply codec pipeline in reverse order (decoding). +""" +function reverse_codec_chain(data, codecs::Vector{V3Codec}) + result = data + for codec in reverse(codecs) + result = zdecode(result, codec) + end + return result +end + +""" + encode_shard_index(index::ShardIndex, index_codecs::Vector{V3Codec}) + +Encode the shard index using the index codec pipeline. + +Per Zarr v3 spec, the index is linearized in C-order (row-major) with alternating +offset/nbytes values: [chunk_0_offset, chunk_0_nbytes, chunk_1_offset, chunk_1_nbytes, ...] +``` +""" +function encode_shard_index(index::ShardIndex{N}, index_codecs::Vector{V3Codec}) where N + # Pre-allocate buffer for index data + n_chunks = length(index.chunks) + index_data = Vector{UInt64}(undef, 2 * n_chunks) + + # Iterate in C-order (row-major) and interleave offset/nbytes + idx = 1 + for cart_idx in CartesianIndices(index.chunks) + info = index.chunks[cart_idx] + index_data[idx] = info.offset + index_data[idx + 1] = info.nbytes + idx += 2 + end + + # Convert to bytes + index_bytes = reinterpret(UInt8, index_data) + + # Apply index codecs + encoded = apply_codec_chain(index_bytes, index_codecs) + + return encoded +end + +""" + decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) + +Decode the shard index from bytes. 
+ +The bytes are in C-order with alternating offset/nbytes: +[offset0, nbytes0, offset1, nbytes1, ...] +""" +function decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) where N + # Decode using index codecs (in reverse order) + decoded_bytes = reverse_codec_chain(index_bytes, index_codecs) + + # Expected size: 16 bytes (2 * UInt64) per chunk + n_chunks = prod(chunks_per_shard) + expected_length = n_chunks * 2 * sizeof(UInt64) + + if length(decoded_bytes) != expected_length + throw(DimensionMismatch("Index size mismatch: expected $expected_length, got $(length(decoded_bytes))")) + end + + # Reinterpret as UInt64 array: [offset1, nbytes1, offset1, nbytes1, ...] + index_data = reinterpret(UInt64, decoded_bytes) + + # Reconstruct the N-dimensional array of ChunkShardInfo + chunks = Array{ChunkShardInfo, N}(undef, chunks_per_shard) + + idx = 1 + for cart_idx in CartesianIndices(chunks) + offset = index_data[idx] + nbytes = index_data[idx + 1] + chunks[cart_idx] = ChunkShardInfo(offset, nbytes) + idx += 2 + end + + return ShardIndex{N}(chunks) +end + +""" + compute_encoded_index_size(chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) + +Compute the byte size of the encoded shard index. +Per spec: "The size of the index can be determined by applying c.compute_encoded_size +for each index codec recursively. The initial size is the byte size of the index array, +i.e. 16 * chunks per shard." 
+""" +function compute_encoded_index_size(chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) where N + # Initial size: 16 bytes per chunk (2 * UInt64) + n_chunks = prod(chunks_per_shard) + size = n_chunks * 16 + + # Apply each codec's size transformation + # For most codecs, we need to actually encode to know the size + # For simplicity, we encode an empty index + index = ShardIndex(chunks_per_shard) + encoded = encode_shard_index(index, index_codecs) + + return length(encoded) +end + +""" + zencode!(encoded::Vector{UInt8}, data::AbstractArray, c::ShardingCodec) + +Encode array data using sharding codec following Zarr v3 spec. + +Per spec: "In the sharding_indexed binary format, inner chunks are written successively +in a shard, where unused space between them is allowed, followed by an index referencing them." +""" +function zencode!(encoded::Vector{UInt8}, data::AbstractArray, c::ShardingCodec{N}) where N + shard_shape = size(data) + chunks_per_shard = calculate_chunks_per_shard(shard_shape, c.chunk_shape) + + # Create empty index + index = ShardIndex(chunks_per_shard) + + # Buffers for encoded chunks + chunk_buffers = Vector{UInt8}[] + current_offset = 0 + + # Process chunks in C order (row-major) + # Per spec: "The actual order of the chunk content is not fixed" + for cart_idx in CartesianIndices(chunks_per_shard) + chunk_coords = Tuple(cart_idx) + + # Extract chunk data from shard + slice_ranges = get_chunk_slice_in_shard(chunk_coords, c.chunk_shape, shard_shape) + chunk_data = data[slice_ranges...] 
+ + # Encode chunk using codec pipeline + encoded_chunk = apply_codec_chain(chunk_data, c.codecs) + + # Skip if chunk is empty (no bytes) + if isempty(encoded_chunk) + set_chunk_empty!(index, chunk_coords) + continue + end + + nbytes = length(encoded_chunk) + + # Record offset and length in index + set_chunk_slice!(index, chunk_coords, current_offset, nbytes) + + push!(chunk_buffers, encoded_chunk) + current_offset += nbytes + end + + # Encode the index + encoded_index = encode_shard_index(index, c.index_codecs) + index_size = length(encoded_index) + + # If index is at start, adjust all offsets to account for index size + if c.index_location == :start + # Add index_size to all non-empty chunk offsets + for cart_idx in CartesianIndices(chunks_per_shard) + chunk_coords = Tuple(cart_idx) + info = index.chunks[cart_idx] + if info.offset != MAX_UINT64 + index.chunks[cart_idx] = ChunkShardInfo(info.offset + index_size, info.nbytes) + end + end + # Re-encode index with corrected offsets + encoded_index = encode_shard_index(index, c.index_codecs) + end + + # If all chunks are empty, return empty buffer (no shard) + if isempty(chunk_buffers) + resize!(encoded, 0) + return encoded + end + + # Assemble final shard: [index] + chunks or chunks + [index] + total_size = (c.index_location == :start ? index_size : 0) + + current_offset + + (c.index_location == :end ? index_size : 0) + + resize!(encoded, total_size) + output = IOBuffer(encoded, write=true) + + if c.index_location == :start + write(output, encoded_index) + for buf in chunk_buffers + write(output, buf) + end + else # :end + for buf in chunk_buffers + write(output, buf) + end + write(output, encoded_index) + end + + return encoded +end + +""" + zdecode!(data::AbstractArray, encoded::Vector{UInt8}, c::ShardingCodec) + +Decode sharded data back to array following Zarr v3 spec. 
+ +Per spec: "A simple implementation to decode inner chunks in a shard would +(a) read the entire value from the store into a byte buffer, +(b) parse the shard index from the beginning or end of the buffer and +(c) cut out the relevant bytes that belong to the requested chunk." +""" +function zdecode!(data::AbstractArray, encoded::Vector{UInt8}, c::ShardingCodec{N}) where N + # Handle empty shard (no data) + if isempty(encoded) + fill!(data, zero(eltype(data))) # Fill with zeros (or should use fill_value from spec) + return data + end + + shard_shape = size(data) + chunks_per_shard = calculate_chunks_per_shard(shard_shape, c.chunk_shape) + + # Compute encoded index size + index_size = compute_encoded_index_size(chunks_per_shard, c.index_codecs) + + # Extract index bytes based on location + if c.index_location == :start + index_bytes = encoded[1:index_size] + chunk_data_offset = 0 # index offsets are absolute (zencode! already adds index_size) + else # :end + index_bytes = encoded[end-index_size+1:end] + chunk_data_offset = 0 + end + + # Decode the index + index = decode_shard_index(index_bytes, chunks_per_shard, c.index_codecs) + + # Decode each chunk and place into output array + for cart_idx in CartesianIndices(chunks_per_shard) + chunk_coords = Tuple(cart_idx) + + # Get chunk byte range from index + chunk_slice = get_chunk_slice(index, chunk_coords) + + # Get array slice for this chunk + array_slice = get_chunk_slice_in_shard(chunk_coords, c.chunk_shape, shard_shape) + + if chunk_slice === nothing + # Empty chunk - fill with zeros (or fill_value) + # Per spec: "Empty inner chunks are interpreted as being filled with the fill value" + data[array_slice...] 
.= zero(eltype(data)) + continue + end + + # Extract chunk bytes + # Offsets in index are absolute byte positions measured from the start of the shard + offset_start, offset_end = chunk_slice + + # Convert the 0-based half-open range to Julia's 1-based inclusive range + byte_start = chunk_data_offset + offset_start + 1 # Julia 1-based indexing + byte_end = chunk_data_offset + offset_end + + encoded_chunk = encoded[byte_start:byte_end] + + # Decode chunk using codec pipeline (in reverse) + decoded_chunk = reverse_codec_chain(encoded_chunk, c.codecs) + + # Place decoded chunk into output array + expected_shape = length.(array_slice) + data[array_slice...] = reshape(decoded_chunk, expected_shape) + end + + return data +end + +struct TransposeCodec <: V3Codec{:array, :array} +end +name(::TransposeCodec) = "transpose" + + +end diff --git a/src/Compressors/Compressors.jl b/src/Compressors/Compressors.jl index 1854128..c647eff 100644 --- a/src/Compressors/Compressors.jl +++ b/src/Compressors/Compressors.jl @@ -49,10 +49,13 @@ const compressortypes = Dict{Union{String,Nothing}, Type{<: Compressor}}() include("blosc.jl") include("zlib.jl") include("zstd.jl") +include("v3.jl") # ## Fallback definitions for the compressor interface # Define fallbacks and generic methods for the compressor interface -getCompressor(compdict::Dict) = getCompressor(compressortypes[compdict["id"]],compdict) +getCompressor(compdict::Dict) = haskey(compdict, "id") ? 
+ getCompressor(compressortypes[compdict["id"]], compdict) : + getCompressor(compressortypes[compdict["name"]], compdict["configuration"]) getCompressor(::Nothing) = NoCompressor() # Compression when no filter is given @@ -104,4 +107,4 @@ end JSON.lower(::NoCompressor) = nothing -compressortypes[nothing] = NoCompressor \ No newline at end of file +compressortypes[nothing] = NoCompressor diff --git a/src/Compressors/v3.jl b/src/Compressors/v3.jl new file mode 100644 index 0000000..fa8c1ef --- /dev/null +++ b/src/Compressors/v3.jl @@ -0,0 +1,58 @@ +""" + Compressor v3{C <: Compressor} <: Compressor + +Wrapper to indicate Zarr v3 of a compressor +""" +struct Compressor_v3{C} <: Compressor + parent::C +end +Base.parent(c::Compressor_v3) = c.parent + +function zuncompress(a, z::Compressor_v3, T) + zuncompress(a, parent(z), T) +end + +function zuncompress!(data::DenseArray, compressed, z::Compressor_v3) + zuncompress!(data, compressed, parent(z)) +end + +function zcompress(a, z::Compressor_v3) + zcompress(a, parent(z)) +end + + +function JSON.lower(c::Compressor_v3{BloscCompressor}) + p = parent(c) + return Dict( + "name" => "blosc", + "configuration" => Dict( + "cname" => p.cname, + "clevel" => p.clevel, + "shuffle" => p.shuffle, +# TODO: Evaluate if we can encode typesize +# "typesize" => p.typesize, + "blocksize" => p.blocksize + ) + ) +end + +function JSON.lower(c::Compressor_v3{ZlibCompressor}) + p = parent(c) + return Dict( + "name" => "gzip", + "configuration" => Dict( + "level" => p.config.level + ) + ) +end + +function JSON.lower(c::Compressor_v3{ZstdCompressor}) + p = parent(c) + return Dict( + "name" => "zstd", + "configuration" => Dict( + "level" => p.config.compressionlevel, + "checksum" => p.config.checksum + ) + ) +end diff --git a/src/Compressors/zstd.jl b/src/Compressors/zstd.jl index c0e0254..6cd80a0 100644 --- a/src/Compressors/zstd.jl +++ b/src/Compressors/zstd.jl @@ -4,6 +4,7 @@ This file implements a Zstd compressor via ChunkCodecLibZstd.jl. 
=# + using ChunkCodecLibZstd: ZstdEncodeOptions using ChunkCodecCore: encode, decode, decode! @@ -51,4 +52,4 @@ function JSON.lower(z::ZstdCompressor) end end -Zarr.compressortypes["zstd"] = ZstdCompressor \ No newline at end of file +Zarr.compressortypes["zstd"] = ZstdCompressor diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl index c113b05..0543bb2 100644 --- a/src/Storage/Storage.jl +++ b/src/Storage/Storage.jl @@ -78,31 +78,25 @@ Returns the keys of files in the given store. """ function subkeys end +# Function to construct the full path to a chunk given the base path, Cartesian Index i, and the chunk ecoding +store_readchunk(s::AbstractStore, p, i::CartesianIndex, e::ChunkEncoding) = s[p, citostring(e, i)] +store_deletechunk(s::AbstractStore, p, i::CartesianIndex, e::ChunkEncoding) = delete!(s, p, citostring(e, i)) +store_writechunk(s::AbstractStore, v, p, i::CartesianIndex, e::ChunkEncoding) = s[p, citostring(e, i)] = v +store_isinitialized(s::AbstractStore, p, i::CartesianIndex, e::ChunkEncoding) = isinitialized(s, p, citostring(e, i)) -""" - Base.delete!(d::AbstractStore, k::String) - -Deletes the given key from the store. -""" -citostring(i::CartesianIndex) = join(reverse((i - oneunit(i)).I), '.') -citostring(::CartesianIndex{0}) = "0" -_concatpath(p,s) = isempty(p) ? 
s : rstrip(p,'/') * '/' * s - -Base.getindex(s::AbstractStore, p, i::CartesianIndex) = s[p, citostring(i)] -Base.getindex(s::AbstractStore, p, i) = s[_concatpath(p,i)] -Base.delete!(s::AbstractStore, p, i::CartesianIndex) = delete!(s, p, citostring(i)) -Base.delete!(s::AbstractStore, p, i) = delete!(s, _concatpath(p,i)) -Base.haskey(s::AbstractStore, k) = isinitialized(s,k) -Base.setindex!(s::AbstractStore,v,p,i) = setindex!(s,v,_concatpath(p,i)) -Base.setindex!(s::AbstractStore,v,p,i::CartesianIndex) = s[p, citostring(i)]=v +#Functions to concat path and key +Base.getindex(s::AbstractStore, p, i::AbstractString) = s[_concatpath(p, i)] +Base.delete!(s::AbstractStore, p, i::AbstractString) = delete!(s, _concatpath(p, i)) +Base.haskey(s::AbstractStore, k::AbstractString) = isinitialized(s, k) +Base.setindex!(s::AbstractStore, v, p, i::AbstractString) = setindex!(s, v, _concatpath(p, i)) maybecopy(x) = copy(x) maybecopy(x::String) = x -function getattrs(s::AbstractStore, p) +function getattrs(::ZarrFormat{2}, s::AbstractStore, p) atts = s[p,".zattrs"] if atts === nothing Dict() @@ -110,7 +104,18 @@ function getattrs(s::AbstractStore, p) JSON.parse(replace(String(maybecopy(atts)),": NaN,"=>": \"NaN\","); dicttype = Dict{String,Any}) end end -function writeattrs(s::AbstractStore, p, att::Dict; indent_json::Bool= false) + +function getattrs(::ZarrFormat{3}, s::AbstractStore, p) + md = s[p, "zarr.json"] + if md === nothing + error("zarr.json not found") + else + md = JSON.parse(replace(String(maybecopy(md)), ": NaN," => ": \"NaN\",")) + return get(md, "attributes", Dict{String,Any}()) + end +end + +function writeattrs(::ZarrFormat{2}, s::AbstractStore, p, att::Dict; indent_json::Bool=false) b = IOBuffer() if indent_json @@ -123,15 +128,50 @@ function writeattrs(s::AbstractStore, p, att::Dict; indent_json::Bool= false) att end -is_zgroup(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup")) -is_zarray(s::AbstractStore, p) = 
isinitialized(s,_concatpath(p,".zarray")) +function writeattrs(::ZarrFormat{3}, s::AbstractStore, p, att::Dict; indent_json::Bool=false) + # This is messy, we need to open zarr.json and replace the attributes section + md = s[p, "zarr.json"] + if md === nothing + error("zarr.json not found") + else + md = JSON.parse(replace(String(maybecopy(md)), ": NaN," => ": \"NaN\",")) + end + md = Dict(md) + md["attributes"] = att + + b = IOBuffer() + + if indent_json + JSON.print(b, md, 4) + else + JSON.print(b, md) + end + + s[p, "zarr.json"] = take!(b) + att +end + +is_zarr3(s::AbstractStore, p) = isinitialized(s,_concatpath(p,"zarr.json")) +is_zarr2(s::AbstractStore, p) = is_zarray(ZarrFormat(Val(2)), s, p) || is_zgroup(ZarrFormat((Val(2))), s, p) + +is_zgroup(::ZarrFormat{2}, s::AbstractStore, p) = isinitialized(s, _concatpath(p, ".zgroup")) +is_zarray(::ZarrFormat{2}, s::AbstractStore, p) = isinitialized(s, _concatpath(p, ".zarray")) +is_zgroup(::ZarrFormat{3}, s::AbstractStore, p, metadata=getmetadata(s, p, false)) = + isinitialized(s, _concatpath(p, "zarr.json")) && + metadata.node_type == "group" +is_zarray(::ZarrFormat{3}, s::AbstractStore, p, metadata=getmetadata(s, p, false)) = + isinitialized(s, _concatpath(p, "zarr.json")) && + metadata.node_type == "array" -isinitialized(s::AbstractStore, p, i::CartesianIndex)=isinitialized(s,p,citostring(i)) -isinitialized(s::AbstractStore, p, i) = isinitialized(s,_concatpath(p,i)) -isinitialized(s::AbstractStore, i) = s[i] !== nothing -getmetadata(s::AbstractStore, p,fill_as_missing) = Metadata(String(maybecopy(s[p,".zarray"])),fill_as_missing) -function writemetadata(s::AbstractStore, p, m::Metadata; indent_json::Bool= false) +isinitialized(s::AbstractStore, p, i::AbstractString) = isinitialized(s, _concatpath(p, i)) +isinitialized(s::AbstractStore, i::AbstractString) = s[i] !== nothing + +getmetadata(::ZarrFormat{2}, s::AbstractStore, p, fill_as_missing) = Metadata(String(maybecopy(s[p, ".zarray"])), fill_as_missing) + 
+getmetadata(::ZarrFormat{3}, s::AbstractStore, p, fill_as_missing) = Metadata(String(maybecopy(s[p, "zarr.json"])), fill_as_missing) + +function writemetadata(::ZarrFormat{2}, s::AbstractStore, p, m::AbstractMetadata; indent_json::Bool=false) met = IOBuffer() if indent_json @@ -143,6 +183,19 @@ function writemetadata(s::AbstractStore, p, m::Metadata; indent_json::Bool= fals s[p,".zarray"] = take!(met) m end +function writemetadata(::ZarrFormat{3}, s::AbstractStore, p, m::AbstractMetadata; indent_json::Bool=false) + met = IOBuffer() + + if indent_json + JSON.print(met, m, 4) + else + JSON.print(met, m) + end + + s[p, "zarr.json"] = take!(met) + m +end + ## Handling sequential vs parallel IO @@ -156,50 +209,50 @@ channelsize(s) = channelsize(store_read_strategy(s)) channelsize(::SequentialRead) = 0 channelsize(c::ConcurrentRead) = c.ntasks -read_items!(s::AbstractStore,c::AbstractChannel, p, i) = read_items!(s,c,store_read_strategy(s),p,i) -function read_items!(s::AbstractStore,c::AbstractChannel, ::SequentialRead ,p,i) +read_items!(s::AbstractStore, c::AbstractChannel, e::ChunkEncoding, p, i) = read_items!(s, c, store_read_strategy(s), e, p, i) +function read_items!(s::AbstractStore, c::AbstractChannel, ::SequentialRead, e::ChunkEncoding, p, i) for ii in i - res = s[p,ii] + res = store_readchunk(s, p, ii, e) put!(c,(ii=>res)) end end -function read_items!(s::AbstractStore,c::AbstractChannel, r::ConcurrentRead ,p,i) +function read_items!(s::AbstractStore, c::AbstractChannel, r::ConcurrentRead, e::ChunkEncoding, p, i) ntasks = r.ntasks #@show ntasks asyncmap(i,ntasks = ntasks) do ii #@show ii,objectid(current_task),p - res = s[p,ii] + res = store_readchunk(s, p, ii, e) #@show ii,length(res) put!(c,(ii=>res)) nothing end end -write_items!(s::AbstractStore,c::AbstractChannel, p, i) = write_items!(s,c,store_read_strategy(s),p,i) -function write_items!(s::AbstractStore,c::AbstractChannel, ::SequentialRead ,p,i) +write_items!(s::AbstractStore, c::AbstractChannel, 
e::ChunkEncoding, p, i) = write_items!(s, c, store_read_strategy(s), e, p, i) +function write_items!(s::AbstractStore, c::AbstractChannel, ::SequentialRead, e::ChunkEncoding, p, i) for _ in 1:length(i) ii,data = take!(c) if data === nothing - if isinitialized(s,p,ii) - delete!(s,p,ii) + if store_isinitialized(s, p, ii, e) + store_deletechunk(s, p, ii, e) end else - s[p,ii] = data + store_writechunk(s, data, p, ii, e) end end close(c) end -function write_items!(s::AbstractStore,c::AbstractChannel, r::ConcurrentRead ,p,i) +function write_items!(s::AbstractStore, c::AbstractChannel, r::ConcurrentRead, e::ChunkEncoding, p, i) ntasks = r.ntasks asyncmap(i,ntasks = ntasks) do _ ii,data = take!(c) if data === nothing - if isinitialized(s,ii) - delete!(s,ii) + if store_isinitialized(s, p, ii, e) + store_deletechunk(s, p, ii, e) end else - s[p,ii] = data + store_writechunk(s, data, p, ii, e) end nothing end @@ -213,6 +266,7 @@ isemptysub(s::AbstractStore, p) = isempty(subkeys(s,p)) && isempty(subdirs(s,p)) storageregexlist = Pair[] push!(storageregexlist, r"^s3://" => S3Store) +#include("formattedstore.jl") include("directorystore.jl") include("dictstore.jl") include("gcstore.jl") diff --git a/src/Storage/http.jl b/src/Storage/http.jl index 9b68cb1..980284f 100644 --- a/src/Storage/http.jl +++ b/src/Storage/http.jl @@ -13,8 +13,8 @@ python package. 
In case you experience performance issues, one can try to use struct HTTPStore <: AbstractStore url::String allowed_codes::Set{Int} + HTTPStore(url, allowed_codes = Set((404,))) = new(url, allowed_codes) end -HTTPStore(url) = HTTPStore(url,Set((404,))) function Base.getindex(s::HTTPStore, k::String) r = HTTP.request("GET",string(s.url,"/",k),status_exception = false,socket_type_tls=OpenSSL.SSLStream) @@ -39,7 +39,21 @@ end push!(storageregexlist,r"^https://"=>HTTPStore) push!(storageregexlist,r"^http://"=>HTTPStore) -storefromstring(::Type{<:HTTPStore}, s,_) = ConsolidatedStore(HTTPStore(s),""),"" +function storefromstring(::Type{<:HTTPStore}, s,_) + http_store = HTTPStore(s) + try + if http_store["", ".zmetadata"] !== nothing + http_store = ConsolidatedStore(http_store,"") + end + if is_zarray(http_store, "") + meta = getmetadata(http_store, "", false) + http_store = FormattedStore{meta.zarr_format, meta.dimension_separator}(http_store) + end + catch err + @warn exception=err "Additional metadata was not available for HTTPStore." 
+ end + return http_store,"" +end """ missing_chunk_return_code!(s::HTTPStore, code::Union{Int,AbstractVector{Int}}) diff --git a/src/ZArray.jl b/src/ZArray.jl index 95035d6..ca687f2 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -30,10 +30,8 @@ Base.IndexStyle(::Type{<:SenMissArray})=Base.IndexLinear() # Struct representing a Zarr Array in Julia, note that # chunks(chunk size) and size are always in Julia column-major order -# Currently this is not an AbstractArray, because indexing single elements is -# would be really slow, although most AbstractArray interface functions are implemented -struct ZArray{T, N, C<:Compressor, S<:AbstractStore} <: AbstractDiskArray{T,N} - metadata::Metadata{T, N, C} +struct ZArray{T,N,S<:AbstractStore,M<:AbstractMetadata{T,N}} <: AbstractDiskArray{T,N} + metadata::M storage::S path::String attrs::Dict @@ -42,20 +40,20 @@ end Base.eltype(::ZArray{T}) where {T} = T Base.ndims(::ZArray{<:Any,N}) where {N} = N -Base.size(z::ZArray) = z.metadata.shape[] -function Base.size(z::ZArray,i) +Base.size(z::ZArray{<:Any,N}) where {N} = z.metadata.shape[]::NTuple{N, Int} +function Base.size(z::ZArray{<:Any,N}, i::Integer) where {N} len = length(z.metadata.shape[]) if 0 < i <= len - z.metadata.shape[][i] + z.metadata.shape[][i]::Int elseif i > len 1 else error("arraysize: dimension out of range") end end -Base.length(z::ZArray) = prod(z.metadata.shape[]) -Base.lastindex(z::ZArray,n) = size(z,n) -Base.lastindex(z::ZArray{<:Any,1}) = size(z,1) +Base.length(z::ZArray) = prod(z.metadata.shape[])::Int +Base.lastindex(z::ZArray{<:Any,N}, n::Integer) where {N} = size(z, n)::Int +Base.lastindex(z::ZArray{<:Any,1}) = size(z, 1)::Int function Base.show(io::IO,z::ZArray) print(io, "ZArray{", eltype(z) ,"} of size ",join(string.(size(z)), " x ")) @@ -95,7 +93,7 @@ nobytes(z::ZArray{<:String}) = "unknown" zinfo(z::ZArray) = zinfo(stdout,z) function zinfo(io::IO,z::ZArray) ninit = sum(chunkindices(z)) do i - isinitialized(z.storage,z.path,i) + 
store_isinitialized(z.storage, z.path, i, z.metadata.chunk_encoding) end allinfos = [ "Type" => "ZArray", @@ -117,15 +115,22 @@ function zinfo(io::IO,z::ZArray) end end -function ZArray(s::T, mode="r",path="";fill_as_missing=false) where T <: AbstractStore - metadata = getmetadata(s,path,fill_as_missing) - attrs = getattrs(s,path) +function ZArray(s::T, mode="r", path="", zarr_format=:auto; fill_as_missing=false) where T<:AbstractStore + zv = if zarr_format == :auto + ZarrFormat(s, path) + else + ZarrFormat(zarr_format) + end + metadata = getmetadata(zv, s, path, fill_as_missing) + attrs = getattrs(zv, s, path) writeable = mode == "w" startswith(path,"/") && error("Paths should never start with a leading '/'") - ZArray{eltype(metadata), length(metadata.shape[]), typeof(metadata.compressor), T}( - metadata, s, path, attrs, writeable) + ZArray(metadata, s, string(path), attrs, writeable) end +zarr_format(z::ZArray) = zarr_format(z.metadata) +dimension_separator(z::ZArray) = dimension_separator(z.metadata) + """ trans_ind(r, bs) @@ -174,7 +179,7 @@ function readblock!(aout::AbstractArray{<:Any,N}, z::ZArray{<:Any, N}, r::Cartes c = Channel{Pair{eltype(blockr),Union{Nothing,Vector{UInt8}}}}(channelsize(z.storage)) task = @async begin - read_items!($z.storage,c, $z.path, $blockr) + read_items!($(z.storage), c, $(z.metadata.chunk_encoding), $(z.path), $(blockr)) end bind(c,task) @@ -210,14 +215,14 @@ function writeblock!(ain::AbstractArray{<:Any,N}, z::ZArray{<:Any, N}, r::Cartes readchannel = Channel{Pair{eltype(blockr),Union{Nothing,Vector{UInt8}}}}(channelsize(z.storage)) readtask = @async begin - read_items!(z.storage,readchannel, z.path, blockr) + read_items!(z.storage, readchannel, z.metadata.chunk_encoding, z.path, blockr) end bind(readchannel,readtask) writechannel = Channel{Pair{eltype(blockr),Union{Nothing,Vector{UInt8}}}}(channelsize(z.storage)) writetask = @async begin - write_items!(z.storage,writechannel,z.path,blockr) + write_items!(z.storage, 
writechannel, z.metadata.chunk_encoding, z.path, blockr) end bind(writechannel,writetask) @@ -312,6 +317,7 @@ Creates a new empty zarr array with element type `T` and array dimensions `dims` * `path=""` directory name to store a persistent array. If left empty, an in-memory array will be created * `name=""` name of the zarr array, defaults to the directory name +* `zarr_format`=$(DV) Zarr format version (2 or 3) * `storagetype` determines the storage to use, current options are `DirectoryStore` or `DictStore` * `chunks=dims` size of the individual array chunks, must be a tuple of length `length(dims)` * `fill_value=nothing` value to represent missing values @@ -321,23 +327,28 @@ Creates a new empty zarr array with element type `T` and array dimensions `dims` * `attrs=Dict()` a dict containing key-value pairs with metadata attributes associated to the array * `writeable=true` determines if the array is opened in read-only or write mode * `indent_json=false` determines if indents are added to format the json files `.zarray` and `.zattrs`. This makes them more readable, but increases file size. +* `dimension_separator='.'` sets how chunks are encoded. The Zarr v2 default is '.' such that the first 3D chunk would be `0.0.0`. The Zarr v3 default is `/`. """ function zcreate(::Type{T}, dims::Integer...; name="", path=nothing, + zarr_format=DV, + dimension_separator=default_sep(zarr_format), kwargs... ) where T + if path===nothing store = DictStore() else - store = DirectoryStore(joinpath(path,name)) + store = DirectoryStore(joinpath(path, name)) end - zcreate(T, store, dims...; kwargs...) + zcreate(T, store, dims...; zarr_format, dimension_separator, kwargs...) 
end function zcreate(::Type{T},storage::AbstractStore, dims...; path = "", + zarr_format = DV, chunks=dims, fill_value=nothing, fill_as_missing=false, @@ -345,32 +356,45 @@ function zcreate(::Type{T},storage::AbstractStore, filters = filterfromtype(T), attrs=Dict(), writeable=true, - indent_json=false - ) where T + indent_json=false, + dimension_separator=nothing + ) where {T} + + v = ZarrFormat(zarr_format) + if isnothing(dimension_separator) + dimension_separator = default_sep(v) + end + if dimension_separator isa AbstractString + # Convert AbstractString to Char + dimension_separator = only(dimension_separator) + end + chunk_encoding = ChunkEncoding(dimension_separator, default_prefix(v)) length(dims) == length(chunks) || throw(DimensionMismatch("Dims must have the same length as chunks")) N = length(dims) C = typeof(compressor) - T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} - metadata = Metadata{T2, N, C, typeof(filters)}( - 2, - dims, - chunks, - typestr(T), - compressor, - fill_value, - 'C', - filters, + + # Create a dummy array to use with Metadata constructor + # This allows us to leverage the multiple dispatch in Metadata constructors + dummy_array = Array{T,N}(undef, dims...) 
+ metadata = Metadata(dummy_array, chunks, v; + compressor=compressor, + fill_value=fill_value, + filters=filters, + fill_as_missing=fill_as_missing, + chunk_encoding=chunk_encoding ) + # Extract the element type from the metadata (handles T2 calculation) + T2 = eltype(metadata) + isemptysub(storage,path) || error("$storage $path is not empty") - writemetadata(storage, path, metadata, indent_json=indent_json) + writemetadata(v, storage, path, metadata, indent_json=indent_json) - writeattrs(storage, path, attrs, indent_json=indent_json) + writeattrs(v, storage, path, attrs, indent_json=indent_json) - ZArray{T2, N, typeof(compressor), typeof(storage)}( - metadata, storage, path, attrs, writeable) + ZArray(metadata, storage, path, attrs, writeable) end filterfromtype(::Type{<:Any}) = nothing @@ -413,7 +437,7 @@ function zzeros(T,dims...;kwargs...) data_encoded = compress_raw(as,z) p = z.path for i in chunkindices(z) - z.storage[p,i] = data_encoded + store_writechunk(z.storage, data_encoded, p, i, z.metadata.chunk_encoding) end z end @@ -431,9 +455,9 @@ function Base.resize!(z::ZArray{T,N}, newsize::NTuple{N}) where {T,N} z.metadata.shape[] = newsize #Check if array was shrunk if any(map(<,newsize, oldsize)) - prune_oob_chunks(z.storage,z.path,oldsize,newsize, z.metadata.chunks) + prune_oob_chunks(z.storage, z.path, oldsize, newsize, z.metadata.chunks, z.metadata.chunk_encoding) end - writemetadata(z.storage, z.path, z.metadata) + writemetadata(zarr_format(z), z.storage, z.path, z.metadata) nothing end Base.resize!(z::ZArray, newsize::Integer...) 
= resize!(z,newsize) @@ -474,14 +498,14 @@ function Base.append!(z::ZArray{<:Any, N},a;dims = N) where N nothing end -function prune_oob_chunks(s::AbstractStore,path,oldsize, newsize, chunks) +function prune_oob_chunks(s::AbstractStore, path, oldsize, newsize, chunks, chunk_encoding) dimstoshorten = findall(map(<,newsize, oldsize)) for idim in dimstoshorten delrange = (fld1(newsize[idim],chunks[idim])+1):(fld1(oldsize[idim],chunks[idim])) allchunkranges = map(i->1:fld1(oldsize[i],chunks[i]),1:length(oldsize)) r = (allchunkranges[1:idim-1]..., delrange, allchunkranges[idim+1:end]...) for cI in CartesianIndices(r) - delete!(s,path,cI) + store_deletechunk(s, path, cI, chunk_encoding) end end end diff --git a/src/ZGroup.jl b/src/ZGroup.jl index be2b0d1..a85bc3c 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -13,45 +13,60 @@ ZGroup(storage, path::AbstractString, arrays, groups, attrs, writeable) = zname(g::ZGroup) = zname(g.path) + + #Open an existing ZGroup -function ZGroup(s::T,mode="r",path="";fill_as_missing=false) where T <: AbstractStore +function ZGroup(s::T, mode="r", path="", zarr_format=:auto; fill_as_missing=false) where T<:AbstractStore arrays = Dict{String, ZArray}() groups = Dict{String, ZGroup}() - + zv = if zarr_format == :auto + ZarrFormat(s, path) + else + ZarrFormat(zarr_format) + end for d in subdirs(s,path) dshort = split(d,'/')[end] - m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) - if isa(m, ZArray) + subpath = _concatpath(path,dshort) + if is_zarray(zv, s, subpath) + m = zopen_noerr(s, mode, zv, path=_concatpath(path, dshort), fill_as_missing=fill_as_missing) arrays[dshort] = m - elseif isa(m, ZGroup) + elseif is_zgroup(s, subpath) + m = zopen_noerr(s, mode, zv, path=_concatpath(path, dshort), fill_as_missing=fill_as_missing) groups[dshort] = m end end - attrs = getattrs(s,path) + attrs = getattrs(zv, s, path) startswith(path,"/") && error("Paths should never start with a leading '/'") ZGroup(s, path, 
arrays, groups, attrs,mode=="w") end +#Function to guess a Zarr format from a store and a path, useful for guessing format when trying to open a group/array +ZarrFormat(s::AbstractStore, path) = is_zarr2(s, path) ? ZarrFormat(2) : + is_zarr3(s, path) ? ZarrFormat(3) : + throw(ArgumentError("Specified store $s in path $(path) is neither a ZArray nor a ZGroup in a recognized zarr format.")) + + """ zopen_noerr(AbstractStore, mode = "r"; consolidated = false) Works like `zopen` with the single difference that no error is thrown when the path or store does not point to a valid zarr array or group, but nothing -is returned instead. +is returned instead. """ -function zopen_noerr(s::AbstractStore, mode="r"; +function zopen_noerr(s::AbstractStore, mode, zv::ZarrFormat; consolidated = false, path="", lru = 0, - fill_as_missing) - consolidated && isinitialized(s,".zmetadata") && return zopen(ConsolidatedStore(s, path), mode, path=path,lru=lru,fill_as_missing=fill_as_missing) - if lru !== 0 - error("LRU caches are not supported anymore by the current Zarr version. Please use an earlier version of Zarr for now and open an issue at Zarr.jl if you need this functionality") - end - if is_zarray(s, path) - return ZArray(s,mode,path;fill_as_missing=fill_as_missing) - elseif is_zgroup(s,path) - return ZGroup(s,mode,path;fill_as_missing=fill_as_missing) + fill_as_missing=false) + + consolidated && isinitialized(s, ".zmetadata") && return zopen(ConsolidatedStore(s, path), mode, path=path, lru=lru, fill_as_missing=fill_as_missing) + if lru !== 0 + error("LRU caches are not supported anymore by the current Zarr version. 
Please use an earlier version of Zarr for now and open an issue at Zarr.jl if you need this functionality") + end + if is_zarray(zv, s, path) + return ZArray(s, mode, path, zv; fill_as_missing=fill_as_missing) + elseif is_zgroup(zv, s, path) + return ZGroup(s, mode, path, zv; fill_as_missing=fill_as_missing) else return nothing end @@ -76,6 +91,7 @@ function Base.getindex(g::ZGroup, k) end end + """ zopen(s::AbstractStore, mode="r"; consolidated = false, path = "", lru = 0) @@ -84,20 +100,29 @@ Zarr will search for a consolidated metadata field as created by the python zarr `consolidate_metadata` function. This can substantially speed up metadata parsing of large zarr groups. Setting `lru` to a value > 0 means that chunks that have been accessed before will be cached and consecutive reads will happen from the cache. -Here, `lru` denotes the number of chunks that remain in memory. +Here, `lru` denotes the number of chunks that remain in memory. The expected zarr version +can be supplied through `zarr_format` and defaults to `:auto` which tries to detect +if the zarr version is v2 or v3. 
""" function zopen(s::AbstractStore, mode="r"; + zarr_format=:auto, consolidated = false, path = "", lru = 0, fill_as_missing = false) - # add interfaces to Stores later - r = zopen_noerr(s,mode; consolidated=consolidated, path=path, lru=lru, fill_as_missing=fill_as_missing) - if r === nothing - throw(ArgumentError("Specified store $s in path $(path) is neither a ZArray nor a ZGroup")) - else - return r - end + + zarr_format = if zarr_format == :auto + ZarrFormat(s, path) + else + ZarrFormat(zarr_format) + end + # add interfaces to Stores later + r = zopen_noerr(s, mode, zarr_format; consolidated=consolidated, path=path, lru=lru, fill_as_missing=fill_as_missing) + if r === nothing + throw(ArgumentError("Specified store $s in path $(path) is neither a ZArray nor a ZGroup")) + else + return r + end end """ @@ -128,8 +153,8 @@ end Create a new zgroup in the store `s` """ -function zgroup(s::AbstractStore, path::String=""; attrs=Dict(), indent_json::Bool= false) - d = Dict("zarr_format"=>2) +function zgroup(s::AbstractStore, path::String="", zarr_format=ZarrFormat(2); attrs=Dict(), indent_json::Bool=false) + d = Dict("zarr_format" => Int(DV)) isemptysub(s, path) || error("Store is not empty") b = IOBuffer() @@ -140,7 +165,7 @@ function zgroup(s::AbstractStore, path::String=""; attrs=Dict(), indent_json::Bo end s[path,".zgroup"]=take!(b) - writeattrs(s,path,attrs, indent_json=indent_json) + writeattrs(DV, s, path, attrs, indent_json=indent_json) ZGroup(s, path, Dict{String,ZArray}(), Dict{String,ZGroup}(), attrs,true) end diff --git a/src/Zarr.jl b/src/Zarr.jl index dbdeb9a..40196c9 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -3,8 +3,20 @@ module Zarr import JSON import Blosc +struct ZarrFormat{V} + version::Val{V} +end +Base.Int(v::ZarrFormat{V}) where V = V +@inline ZarrFormat(v::Int) = ZarrFormat(Val(v)) +ZarrFormat(v::ZarrFormat) = v +#Default Zarr Version +const DV = ZarrFormat(Val(2)) + +include("chunkencoding.jl") include("metadata.jl") +include("metadata3.jl") 
include("Compressors/Compressors.jl") +include("Codecs/Codecs.jl") include("Storage/Storage.jl") include("Filters/Filters.jl") include("ZArray.jl") diff --git a/src/chunkencoding.jl b/src/chunkencoding.jl new file mode 100644 index 0000000..911cd36 --- /dev/null +++ b/src/chunkencoding.jl @@ -0,0 +1,27 @@ + +struct ChunkEncoding + sep::Char + prefix::Bool +end + +# Default Zarr v2 separator +const DS2 = '.' +# Default Zarr v3 separator +const DS3 = '/' + +default_sep(::ZarrFormat{2}) = DS2 +default_sep(::ZarrFormat{3}) = DS3 +default_prefix(::ZarrFormat{2}) = false +default_prefix(::ZarrFormat{3}) = true +const DS = default_sep(DV) + +@inline function citostring(e::ChunkEncoding, i::CartesianIndex) + if e.prefix + "c$(e.sep)" * join(reverse((i - oneunit(i)).I), e.sep) + else + join(reverse((i - oneunit(i)).I), e.sep) + end +end +@inline citostring(e::ChunkEncoding, ::CartesianIndex{0}) = e.prefix ? "c$(e.sep)0" : "0" + +_concatpath(p,s) = isempty(p) ? s : rstrip(p,'/') * '/' * s diff --git a/src/metadata.jl b/src/metadata.jl index 17e48c4..da94b04 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -86,16 +86,31 @@ function typestr(s::AbstractString, filterlist=nothing) end end + """Metadata configuration of the stored array Each array requires essential configuration metadata to be stored, enabling correct interpretation of the stored data. This metadata is encoded using JSON and stored as the -value of the “.zarray” key within an array store. +value of the ".zarray" key within an array store. 
+ +# Type Parameters +* T - element type of the array +* N - dimensionality of the array +* C - compressor +* F - filters + +# See Also https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata """ -struct Metadata{T, N, C, F} +abstract type AbstractMetadata{T,N,C,F} end +Base.ndims(::AbstractMetadata{<:Any,N}) where N = N + + +"""Metadata for Zarr version 2 arrays""" +struct MetadataV2{T,N,C,F} <: AbstractMetadata{T,N,C,F} zarr_format::Int + node_type::String shape::Base.RefValue{NTuple{N, Int}} chunks::NTuple{N, Int} dtype::String # structured data types not yet supported @@ -103,50 +118,123 @@ struct Metadata{T, N, C, F} fill_value::Union{T, Nothing} order::Char filters::F # not yet supported - function Metadata{T2, N, C, F}(zarr_format, shape, chunks, dtype, compressor,fill_value, order, filters) where {T2,N,C,F} - #We currently only support version - zarr_format == 2 || throw(ArgumentError("Zarr.jl currently only support v2 of the protocol")) + chunk_encoding::ChunkEncoding + function MetadataV2{T2,N,C,F}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) where {T2,N,C,F} + zarr_format == 2 || throw(ArgumentError("MetadataV2 only functions if zarr_format == 2")) #Do some sanity checks to make sure we have a sane array any(<(0), shape) && throw(ArgumentError("Size must be positive")) any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) - new{T2, N, C, F}(zarr_format, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) + new{T2,N,C,F}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) end end +zarr_format(::MetadataV2) = ZarrFormat(Val(2)) + +"""Metadata for Zarr version 3 arrays""" +struct MetadataV3{T,N,C,F} <: AbstractMetadata{T,N,C,F} + zarr_format::Int + 
node_type::String + shape::Base.RefValue{NTuple{N, Int}} + chunks::NTuple{N, Int} + dtype::String # data_type in v3 + compressor::C + fill_value::Union{T, Nothing} + order::Char + filters::F # not yet supported + chunk_encoding::ChunkEncoding + function MetadataV3{T2,N,C,F}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) where {T2,N,C,F} + zarr_format == 3 || throw(ArgumentError("MetadataV3 only functions if zarr_format == 3")) + #Do some sanity checks to make sure we have a sane array + any(<(0), shape) && throw(ArgumentError("Size must be positive")) + any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) + order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) + new{T2,N,C,F}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor, fill_value, order, filters, chunk_encoding) + end +end +zarr_format(::MetadataV3) = ZarrFormat(Val(3)) + +# Type alias for backward compatibility +const Metadata = AbstractMetadata #To make unit tests pass with ref shape import Base.== -function ==(m1::Metadata, m2::Metadata) +function ==(m1::AbstractMetadata, m2::AbstractMetadata) m1.zarr_format == m2.zarr_format && + m1.node_type == m2.node_type && m1.shape[] == m2.shape[] && m1.chunks == m2.chunks && m1.dtype == m2.dtype && m1.compressor == m2.compressor && m1.fill_value == m2.fill_value && m1.order == m2.order && - m1.filters == m2.filters + m1.filters == m2.filters && + m1.chunk_encoding == m2.chunk_encoding end "Construct Metadata based on your data" -function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; - zarr_format::Integer=2, +function Metadata(A::AbstractArray{T,N}, chunks::NTuple{N,Int}, zarr_format=DV; + node_type::String="array", compressor::C=BloscCompressor(), fill_value::Union{T, Nothing}=nothing, order::Char='C', - filters::Nothing=nothing, + filters=nothing, fill_as_missing = false, + 
dimension_separator::Char = '.' ) where {T, N, C} + return Metadata(A, chunks, ZarrFormat(zarr_format); + node_type=node_type, + compressor=compressor, + fill_value=fill_value, + order=order, + filters=filters, + fill_as_missing=fill_as_missing, + chunk_encoding=ChunkEncoding(dimension_separator, default_prefix(ZarrFormat(zarr_format))) + ) +end + +# V2 constructor +function Metadata(A::AbstractArray{T,N}, chunks::NTuple{N,Int}, ::ZarrFormat{2}; + node_type::String="array", + compressor::C=BloscCompressor(), + fill_value::Union{T, Nothing}=nothing, + order::Char='C', + filters::F=nothing, + fill_as_missing = false, + chunk_encoding=ChunkEncoding('.', false) + ) where {T, N, C, F} T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} - Metadata{T2, N, C, typeof(filters)}( - zarr_format, + MetadataV2{T2,N,C,typeof(filters)}( + 2, + node_type, size(A), chunks, typestr(eltype(A)), compressor, fill_value, order, - filters + filters, + chunk_encoding, + ) +end + +function Metadata(A::AbstractArray{T,N}, chunks::NTuple{N,Int}, ::ZarrFormat{3}; + node_type::String="array", + compressor::C=BloscCompressor(), + fill_value::Union{T, Nothing}=nothing, + order::Char='C', + filters::F=nothing, + fill_as_missing = false, + chunk_encoding::ChunkEncoding=ChunkEncoding('/', true) + ) where {T, N, C, F} + return Metadata3(A, chunks; + node_type=node_type, + compressor=compressor, + fill_value=fill_value, + order=order, + filters=filters, + fill_as_missing=fill_as_missing, + chunk_encoding=chunk_encoding ) end @@ -154,7 +242,15 @@ Metadata(s::Union{AbstractString, IO}, fill_as_missing) = Metadata(JSON.parse(s; "Construct Metadata from Dict" function Metadata(d::AbstractDict, fill_as_missing) - # create a Metadata struct from it + zarr_format = d["zarr_format"]::Int + zarr_format ∉ (2, 3) && throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification")) + return Metadata(d, fill_as_missing, ZarrFormat(zarr_format)) +end + +# V2 constructor from 
Dict +function Metadata(d::AbstractDict, fill_as_missing, ::ZarrFormat{2}) + # Zarr v2 metadata is only for arrays + node_type = "array" compdict = d["compressor"] if isnothing(compdict) @@ -176,8 +272,11 @@ function Metadata(d::AbstractDict, fill_as_missing) TU = (fv === nothing || !fill_as_missing) ? T : Union{T,Missing} - Metadata{TU, N, C, F}( + dim_sep = only(get(d, "dimension_separator", '.')) + + MetadataV2{TU,N,C,F}( d["zarr_format"], + node_type, NTuple{N, Int}(d["shape"]) |> reverse, NTuple{N, Int}(d["chunks"]) |> reverse, d["dtype"], @@ -185,23 +284,35 @@ function Metadata(d::AbstractDict, fill_as_missing) fv, first(d["order"]), filters, + ChunkEncoding(dim_sep, false), ) end +# V3 constructor from Dict - delegate to metadata3.jl +function Metadata(d::AbstractDict, fill_as_missing, ::ZarrFormat{3}) + return Metadata3(d, fill_as_missing) +end + "Describes how to lower Metadata to JSON, used in json(::Metadata)" -function JSON.lower(md::Metadata) +function JSON.lower(md::MetadataV2) Dict{String, Any}( - "zarr_format" => md.zarr_format, + "zarr_format" => Int(md.zarr_format), + "node_type" => md.node_type, "shape" => md.shape[] |> reverse, "chunks" => md.chunks |> reverse, "dtype" => md.dtype, "compressor" => md.compressor, "fill_value" => fill_value_encoding(md.fill_value), "order" => md.order, - "filters" => md.filters + "filters" => md.filters, + "dimension_separator" => md.chunk_encoding.sep ) end +function JSON.lower(md::MetadataV3) + return lower3(md) +end + # Fill value encoding and decoding as described in # https://zarr.readthedocs.io/en/stable/spec/v2.html#fill-value-encoding @@ -217,7 +328,7 @@ function fill_value_encoding(v::AbstractFloat) end end -Base.eltype(::Metadata{T}) where T = T +Base.eltype(::AbstractMetadata{T}) where T = T # this correctly parses "NaN" and "Infinity" fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v) diff --git a/src/metadata3.jl b/src/metadata3.jl new file mode 100644 index 0000000..c7b3caf --- 
/dev/null +++ b/src/metadata3.jl @@ -0,0 +1,337 @@ +""" +Prototype Zarr version 3 support +""" + +const typemap3 = Dict{String, DataType}() +foreach([Bool, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64]) do t + typemap3[lowercase(string(t))] = t +end +typemap3["complex64"] = ComplexF32 +typemap3["complex128"] = ComplexF64 + +function typestr3(t::Type) + return lowercase(string(t)) +end +# TODO: Check raw types +function typestr3(::Type{NTuple{N,UInt8}}) where {N} + return "r$(N*8)" +end + +function typestr3(s::AbstractString, codecs=nothing) + if !haskey(typemap3, s) + if startswith(s, "r") + num_bits = tryparse(Int, s[2:end]) + if isnothing(num_bits) + raise(ArgumentError("$s is not a known type")) + end + if mod(num_bits, 8) == 0 + return NTuple{num_bits÷8,UInt8} + else + raise(ArgumentError("$s must describe a raw type with bit size that is a multiple of 8 bits")) + end + end + end + return typemap3[s] +end + +function check_keys(d::AbstractDict, keys) + for key in keys + if !haskey(d, key) + throw(ArgumentError("Zarr v3 metadata must have a key called $key")) + end + end +end + +function Metadata3(d::AbstractDict, fill_as_missing) + check_keys(d, ("zarr_format", "node_type")) + + zarr_format = d["zarr_format"]::Int + + node_type = d["node_type"]::String + if node_type ∉ ("group", "array") + throw(ArgumentError("Unknown node_type of $node_type")) + end + + zarr_format == 3 || throw(ArgumentError("Metadata3 only functions if zarr_format == 3")) + + # Groups + if node_type == "group" + # Groups only need zarr_format and node_type + # Optionally they can have attributes + for key in keys(d) + if key ∉ ("zarr_format", "node_type", "attributes") + throw(ArgumentError("Zarr v3 group metadata cannot have a key called $key")) + end + end + + return MetadataV3{Int,0,Nothing,Nothing}(zarr_format, node_type, (), (), "", nothing, 0, 'C', nothing, ChunkEncoding('/', true)) + end + + # Array keys + mandatory_keys = [ + "zarr_format", + 
"node_type", + "shape", + "data_type", + "chunk_grid", + "chunk_key_encoding", + "fill_value", + "codecs", + ] + optional_keys = [ + "attributes", + "storage_transformers", + "dimension_names", + ] + + check_keys(d, mandatory_keys) + for key in keys(d) + if key ∉ mandatory_keys && key ∉ optional_keys + throw(ArgumentError("Zarr v3 metadata cannot have a key called $key")) + end + end + + # Shape + shape = Int.(d["shape"]) + + # Datatype + data_type = d["data_type"]::String + + # Chunk Grid + chunk_grid = d["chunk_grid"] + if chunk_grid["name"] == "regular" + chunks = Int.(chunk_grid["configuration"]["chunk_shape"]) + if length(shape) != length(chunks) + throw(ArgumentError("Shape has rank $(length(shape)) which does not match the chunk_shape rank of $(length(chunks))")) + end + else + throw(ArgumentError("Unknown chunk_grid of name, $(chunk_grid["name"])")) + end + + # Chunk Key Encoding + chunk_key_encoding = d["chunk_key_encoding"] + if chunk_key_encoding["name"] ∉ ("default", "v2") + throw(ArgumentError("Unknown chunk_key_encoding of name, $(chunk_key_encoding["name"])")) + end + + + # Codecs + compdict = nothing + + # For transpose codec permutation tracking + default_dim_perm = Tuple(1:length(shape)) + dim_perm = default_dim_perm + + codec_data_type = Ref(:array) + + function check_codec_data_type(codec_name, from, to) + codec_data_type[] == from || + throw(ArgumentError("$codec_name found by codec_data_type is $(codec_data_type[])")) + codec_data_type[] = to + return nothing + end + + for codec in d["codecs"] + codec_name = codec["name"] + if codec_name == "bytes" + # array -> bytes + check_codec_data_type(codec_name, :array, :bytes) + if haskey(codec, "configuration") + codec["configuration"]["endian"] == "little" || + throw(ArgumentError("Zarr.jl currently only supports little endian for the bytes codec")) + end + elseif codec_name == "zstd" + # bytes -> bytes + check_codec_data_type(codec_name, :bytes, :bytes) + compdict = codec + elseif codec_name == 
"blosc" + # bytes -> bytes + check_codec_data_type(codec_name, :bytes, :bytes) + compdict = codec + elseif codec_name == "gzip" + # bytes -> bytes + check_codec_data_type(codec_name, :bytes, :bytes) + compdict = codec + elseif codec_name == "transpose" + # array -> array + check_codec_data_type(codec_name, :array, :array) + _dim_order = codec["configuration"]["order"] + if _dim_order == "C" + @warn "Transpose codec dimension order of $_dim_order is deprecated" + _dim_order = 1:length(shape) + elseif _dim_order == "F" + @warn "Transpose codec dimension order of $_dim_order is deprecated" + _dim_order = reverse(1:length(shape)) + else + _dim_order = Int.(codec["configuration"]["order"]) .+ 1 + end + dim_perm = dim_perm[_dim_order] + elseif codec_name == "sharding_indexed" + # array -> bytes + check_codec_data_type(codec_name, :array, :bytes) + # TODO: Implement sharding codec support + # See implementation suggestions in src/Codecs/V3/V3.jl for ShardingCodec + throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec. See src/Codecs/V3/V3.jl for implementation suggestions.")) + elseif codec_name == "crc32c" + # bytes -> bytes + check_codec_data_type(codec_name, :bytes, :bytes) + throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) + else + throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec")) + end + end + + if dim_perm == default_dim_perm + order = 'C' + elseif dim_perm == reverse(default_dim_perm) + order = 'F' + else + throw(ArgumentError("Dimension permutation of $dim_perm is not implemented")) + end + + compressor = getCompressor(compdict) + + # Filters (NOT IMPLEMENTED) + # For v3, filters are not yet implemented, so we return nothing + filters = nothing + + # Type Parameters + T = typestr3(data_type) + N = length(shape) + C = typeof(compressor) + F = typeof(filters) + + fv = fill_value_decoding(d["fill_value"], T)::T + + TU = (fv === nothing || !fill_as_missing) ? 
T : Union{T,Missing} + + cke_configuration = get(chunk_key_encoding, "configuration") do + Dict{String,Any}() + end + # V2 uses '.' while default CKE uses '/' by default + if chunk_key_encoding["name"] == "v2" + separator = only(get(cke_configuration, "separator", '.')) + chunk_encoding = ChunkEncoding(separator, false) + elseif chunk_key_encoding["name"] == "default" + chunk_encoding = ChunkEncoding(only(get(cke_configuration, "separator", '/')), true) + end + + MetadataV3{TU, N, C, F, S}( + zarr_format, + node_type, + NTuple{N, Int}(shape) |> reverse, + NTuple{N, Int}(chunks) |> reverse, + data_type, + compressor, + fv, + order, + filters, + chunk_encoding, + ) +end + +"Construct MetadataV3 based on your data" +function Metadata3(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; + node_type::String="array", + compressor::C=BloscCompressor(), + fill_value::Union{T, Nothing}=nothing, + order::Char='C', + filters::F=nothing, + fill_as_missing = false, + dimension_separator::Char = '/' + ) where {T, N, C, F} + @warn("Zarr v3 support is experimental") + T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} + if fill_value === nothing + fill_value = zero(T) + end + MetadataV3{T2,N,C,typeof(filters)}( + 3, + node_type, + size(A), + chunks, + typestr3(eltype(A)), + compressor, + fill_value, + order, + filters, + ChunkEncoding(dimension_separator, true) + ) +end + +function lower3(md::MetadataV3{T}) where T + + mandatory_keys = [ + "zarr_format", + "node_type", + "shape", + "data_type", + "chunk_grid", + "chunk_key_encoding", + "fill_value", + "codecs", + ] + optional_keys = [ + "attributes", + "storage_transformers", + "dimension_names", + ] + + chunk_grid = Dict{String,Any}( + "name" => "regular", + "configuration" => Dict{String,Any}( + "chunk_shape" => md.chunks |> reverse + ) + ) + + chunk_key_encoding = Dict{String,Any}( + "name" => isa(md.dimension_separator, Char) ? "default" : + isa(md.dimension_separator, V2ChunkKeyEncoding) ? 
"v2" : + error("Unknown encoding for $(md.dimension_separator)"), + "configuration" => Dict{String,Any}( + "separator" => separator(md.dimension_separator) + ) + ) + + # TODO: Incorporate filters + codecs = Dict{String,Any}[] + + default_dim_perm = Tuple(0:length(md.shape[])-1) + + # Encode the order as a single transpose codec (array to array) + push!(codecs, + Dict{String,Any}( + "name" => "transpose", + "configuration" => Dict( + "order" => md.order == 'C' ? default_dim_perm : + md.order == 'F' ? reverse(default_dim_perm) : + error("Unable to encode order $(md.order)") + ) + ) + ) + + # Convert from array to bytes + push!(codecs, + Dict{String,Any}( + "name" => "bytes", + "configuration" => Dict{String, Any}( + "endian" => "little" + ) + ) + ) + # Compress bytes to bytes (only if not NoCompressor) + if !(md.compressor isa NoCompressor) + push!(codecs, JSON.lower(Compressor_v3(md.compressor))) + end + + Dict{String, Any}( + "zarr_format" => Int(md.zarr_format), + "node_type" => md.node_type, + "shape" => md.shape[] |> reverse, + "data_type" => typestr3(T), + "chunk_grid" => chunk_grid, + "chunk_key_encoding" => chunk_key_encoding, + "fill_value" => fill_value_encoding(md.fill_value)::T, + "codecs" => codecs + ) +end diff --git a/test/runtests.jl b/test/runtests.jl index f730495..f5342c5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,12 +12,13 @@ using Dates @testset "ZArray" begin @testset "fields" begin z = zzeros(Int64, 2, 3) - @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.DictStore} - + @test z isa ZArray{Int64,2,Zarr.DictStore,Zarr.MetadataV2{Int64,2,Zarr.BloscCompressor,Nothing}} + @test :a ∈ propertynames(z.storage) @test length(z.storage.a) === 3 @test length(z.storage.a["0.0"]) === 64 @test eltype(z.storage.a["0.0"]) === UInt8 + @test z.metadata.zarr_format === 2 + @test z.metadata.node_type === "array" @test z.metadata.shape[] === (2, 3) @test z.metadata.order === 'C' @test z.metadata.chunks === (2, 3) @@ -29,17 +30,15 @@ using 
Dates @test z.metadata.compressor.shuffle === 1 @test z.attrs == Dict{Any, Any}() @test z.writeable === true + @test z.metadata.chunk_encoding === Zarr.ChunkEncoding(Zarr.default_sep(Zarr.DV), Zarr.default_prefix(Zarr.DV)) @test_throws ArgumentError zzeros(Int64,2,3, chunks = (0,1)) @test_throws ArgumentError zzeros(Int64,0,-1) - @test_throws ArgumentError Zarr.Metadata(zeros(2,2), (2,2), zarr_format = 3) @test_throws ArgumentError Zarr.Metadata(zeros(2,2), (2,2), order = 'F') end @testset "methods" begin z = zzeros(Int64, 2, 3) - @test z isa ZArray{Int64, 2, Zarr.BloscCompressor, - Zarr.DictStore} - + @test z isa ZArray{Int64,2,Zarr.DictStore,Zarr.MetadataV2{Int64,2,Zarr.BloscCompressor,Nothing}} @test eltype(z) === Int64 @test ndims(z) === 2 @test size(z) === (2, 3) @@ -60,7 +59,7 @@ using Dates compressor=Zarr.NoCompressor()) @test z.metadata.compressor === Zarr.NoCompressor() - @test z.storage === Zarr.DirectoryStore("$dir/$name") + @test z.storage === Zarr.DirectoryStore("$dir/$name") @test isdir("$dir/$name") @test ispath("$dir/$name/.zarray") @test ispath("$dir/$name/.zattrs") @@ -69,12 +68,15 @@ using Dates @test JSON.parsefile("$dir/$name/.zarray") == Dict{String, Any}( "dtype" => " nothing, - "shape" => [3, 2], + "shape" => Any[3, 2], "order" => "C", "zarr_format" => 2, - "chunks" => [3, 2], + "node_type" => "array", + "chunks" => Any[3, 2], "fill_value" => nothing, - "compressor" => nothing) + "compressor" => nothing, + "dimension_separator" => "." 
+ ) # call gc to avoid unlink: operation not permitted (EPERM) on Windows # might be because files are left open # from https://github.com/JuliaLang/julia/blob/f6344d32d3ebb307e2b54a77e042559f42d2ebf6/stdlib/SharedArrays/test/runtests.jl#L146 @@ -87,8 +89,8 @@ end store = DirectoryStore(tempname()) g = zgroup(store,"mygroup") g2 = zgroup(g,"asubgroup",attrs = Dict("a1"=>5)) - @test Zarr.is_zgroup(store,"mygroup") - @test Zarr.is_zgroup(store,"mygroup/asubgroup") + @test Zarr.is_zgroup(Zarr.DV, store, "mygroup") + @test Zarr.is_zgroup(Zarr.DV, store, "mygroup/asubgroup") @test g2.attrs["a1"]==5 @test isdir(joinpath(store.folder,"mygroup")) @test isdir(joinpath(store.folder,"mygroup","asubgroup")) @@ -176,7 +178,7 @@ end @test all(ismissing,amiss[:,2]) @test all(i->isequal(i...),zip(amiss[1:3,4],[1,missing,3])) # Test that chunk containing only missings is not initialized - @test !Zarr.isinitialized(amiss.storage,Zarr.citostring(CartesianIndex((1,5)))) + @test !Zarr.isinitialized(amiss.storage, Zarr.citostring(Zarr.ChunkEncoding('/', false), CartesianIndex((1, 5)))) # amiss = zcreate(Int64, 10,10,chunks=(5,2), fill_value=-1, fill_as_missing=false) amiss[:,1] = 1:10 @@ -188,7 +190,7 @@ end @test all(==(-1),amiss[:,2]) @test all(i->isequal(i...),zip(amiss[1:3,4],[1,-1,3])) # Test that chunk containing only fill values is not initialized - @test !Zarr.isinitialized(amiss.storage,Zarr.citostring(CartesianIndex((1,5)))) + @test !Zarr.isinitialized(amiss.storage, Zarr.citostring(Zarr.ChunkEncoding('/', false), CartesianIndex((1, 5)))) end @testset "resize" begin diff --git a/test/storage.jl b/test/storage.jl index d233301..ad1ee39 100644 --- a/test/storage.jl +++ b/test/storage.jl @@ -14,40 +14,73 @@ using AWSS3 @test Zarr.normalize_path("/path/to/a") == "/path/to/a" end +@testset "Version and Dimension Separator" begin + dot_noprefix = Zarr.ChunkEncoding('.', false) + dot_prefix = Zarr.ChunkEncoding('.', true) + slash_noprefix = Zarr.ChunkEncoding('/', false) + 
slash_prefix = Zarr.ChunkEncoding('/', true) + let ci = CartesianIndex() + @test Zarr.citostring(dot_noprefix, ci) == "0" + @test Zarr.citostring(dot_prefix, ci) == "c.0" + @test Zarr.citostring(slash_noprefix, ci) == "0" + @test Zarr.citostring(slash_prefix, ci) == "c/0" + end + let ci = CartesianIndex(1,1,1) + @test Zarr.citostring(dot_noprefix, ci) == "0.0.0" + @test Zarr.citostring(dot_prefix, ci) == "c.0.0.0" + @test Zarr.citostring(slash_noprefix, ci) == "0/0/0" + @test Zarr.citostring(slash_prefix, ci) == "c/0/0/0" + end + let ci = CartesianIndex(1,3,5) + @test Zarr.citostring(dot_noprefix, ci) == "4.2.0" + @test Zarr.citostring(dot_prefix, ci) == "c.4.2.0" + @test Zarr.citostring(slash_noprefix, ci) == "4/2/0" + @test Zarr.citostring(slash_prefix, ci) == "c/4/2/0" + end +end + """ Function to test the interface of AbstractStore. Every complete implementation should pass this test. """ -function test_store_common(ds) - @test !Zarr.is_zgroup(ds,"") +function test_store_common(ds::Zarr.AbstractStore) + V = Zarr.DV + enc = Zarr.ChunkEncoding(Zarr.default_sep(V), Zarr.default_prefix(V)) + + @test !Zarr.is_zgroup(V, ds, "") ds[".zgroup"]=rand(UInt8,50) @test haskey(ds,".zgroup") - @test Zarr.is_zgroup(ds,"") - @test !Zarr.is_zarray(ds,"") + @test Zarr.is_zgroup(V, ds, "") + @test !Zarr.is_zarray(V, ds, "") @test isempty(Zarr.subdirs(ds,"")) @test sort(collect(Zarr.subkeys(ds,"")))==[".zgroup"] #Create a subgroup - @test !Zarr.is_zarray(ds,"bar") + @test !Zarr.is_zarray(V, ds, "bar") ds["bar/.zarray"] = rand(UInt8,50) - @test Zarr.is_zarray(ds,"bar") + @test Zarr.is_zarray(V, ds, "bar") @test Zarr.subdirs(ds,"") == ["bar"] @test Zarr.subdirs(ds,"bar") == String[] #Test getindex and setindex data = rand(UInt8,50) - ds["bar/0.0.0"] = data + + first_ci_str = Zarr.citostring(enc, CartesianIndex(1, 1, 1)) + second_ci_str = Zarr.citostring(enc, CartesianIndex(2, 1, 1)) + ds["bar/" * first_ci_str] = data @test ds["bar/0.0.0"]==data @test Zarr.storagesize(ds,"bar")==50 - 
@test Zarr.isinitialized(ds,"bar/0.0.0") - @test !Zarr.isinitialized(ds,"bar/0.0.1") - Zarr.writeattrs(ds,"bar",Dict("a"=>"b")) - @test Zarr.getattrs(ds,"bar")==Dict("a"=>"b") - delete!(ds,"bar/0.0.0") - @test !Zarr.isinitialized(ds,"bar",CartesianIndex((0,0,0))) - @test !Zarr.isinitialized(ds,"bar/0.0.0") - ds["bar/0.0.0"] = data + @test Zarr.isinitialized(ds,"bar/" * first_ci_str) + @test !Zarr.isinitialized(ds,"bar/" * second_ci_str) + Zarr.writeattrs(V, ds, "bar", Dict("a" => "b")) + @test Zarr.getattrs(V, ds, "bar") == Dict("a" => "b") + delete!(ds,"bar/" * first_ci_str) + @test !Zarr.store_isinitialized(ds, "bar", CartesianIndex((1, 1, 1)), enc) + @test !Zarr.isinitialized(ds,"bar/" * first_ci_str) + ds["bar/" * first_ci_str] = data + @test !Zarr.store_isinitialized(ds, "bar", CartesianIndex(0, 0, 0), enc) + @test Zarr.store_isinitialized(ds, "bar", CartesianIndex(1, 1, 1), enc) #Add tests for empty storage @test Zarr.isemptysub(ds,"ba") @test Zarr.isemptysub(ds,"ba/") @@ -63,9 +96,11 @@ Function to test the interface of a read only AbstractStore. Every complete impl `closer` is a function that gets called to close the read only store. 
""" function test_read_only_store_common(converter, closer=Returns(nothing)) + V = Zarr.DV + enc = Zarr.ChunkEncoding(Zarr.default_sep(V), Zarr.default_prefix(V)) ds = Zarr.DictStore() rs = converter(ds) - @test !Zarr.is_zgroup(rs,"") + @test !Zarr.is_zgroup(V, rs, "") closer(rs) ds[".zgroup"]=rand(UInt8,50) @@ -73,20 +108,20 @@ function test_read_only_store_common(converter, closer=Returns(nothing)) @test haskey(rs,".zgroup") - @test Zarr.is_zgroup(rs,"") - @test !Zarr.is_zarray(rs,"") + @test Zarr.is_zgroup(V, rs, "") + @test !Zarr.is_zarray(V, rs, "") @test isempty(Zarr.subdirs(rs,"")) @test sort(collect(Zarr.subkeys(rs,"")))==[".zgroup"] #Create a subgroup - @test !Zarr.is_zarray(rs,"bar") + @test !Zarr.is_zarray(V, rs, "bar") closer(rs) ds["bar/.zarray"] = rand(UInt8,50) rs = converter(ds) - @test Zarr.is_zarray(rs,"bar") + @test Zarr.is_zarray(V, rs, "bar") @test Zarr.subdirs(rs,"") == ["bar"] @test Zarr.subdirs(rs,"bar") == String[] #Test getindex and setindex @@ -102,16 +137,16 @@ function test_read_only_store_common(converter, closer=Returns(nothing)) @test !Zarr.isinitialized(rs,"bar/0.0.1") closer(rs) - Zarr.writeattrs(ds,"bar",Dict("a"=>"b")) + Zarr.writeattrs(V, ds, "bar", Dict("a" => "b")) rs = converter(ds) - @test Zarr.getattrs(rs,"bar")==Dict("a"=>"b") + @test Zarr.getattrs(V, rs, "bar") == Dict("a" => "b") closer(rs) delete!(ds,"bar/0.0.0") rs = converter(ds) - @test !Zarr.isinitialized(rs,"bar",CartesianIndex((0,0,0))) + @test !Zarr.store_isinitialized(rs, "bar", CartesianIndex((0, 0, 0)), enc) @test !Zarr.isinitialized(rs,"bar/0.0.0") closer(rs) @@ -157,6 +192,7 @@ end @testset "Minio S3 storage" begin + @info "Testing Minio S3 storage" A = fill(1.0, 30, 20) chunks = (5,10) metadata = Zarr.Metadata(A, chunks; fill_value=-1.5) @@ -177,9 +213,11 @@ end end @testset "AWS S3 Storage" begin + V = Zarr.DV + @info "Testing AWS S3 storage" AWSS3.AWS.global_aws_config(AWSS3.AWS.AWSConfig(creds=nothing, region="us-west-2")) S3, p = 
Zarr.storefromstring("s3://mur-sst/zarr-v1") - @test Zarr.is_zgroup(S3, p) + @test Zarr.is_zgroup(V, S3, p) @test storagesize(S3, p) == 10551 S3group = zopen(S3,path=p) S3Array = S3group["time"] @@ -189,6 +227,7 @@ end end @testset "GCS Storage" begin + @info "Testing GCS storage" for s in ( "gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/highresSST-present/r1i1p1f1/6hrPlev/psl/gn/v20170706", "https://storage.googleapis.com/cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/highresSST-present/r1i1p1f1/6hrPlev/psl/gn/v20170706", @@ -210,6 +249,7 @@ end end @testset "HTTP Storage" begin + @info "Testing HTTP Storage" s = Zarr.DictStore() g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5)) @@ -237,14 +277,16 @@ end g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5),fill_value = -1) @async HTTP.serve(Zarr.zarr_req_handler(s,g.path,403),ip,port,server=server) - g3 = zopen("http://$ip:$port") - @test_throws "Received error code 403" g3["a"][:,:] - Zarr.missing_chunk_return_code!(g3.storage,403) + httpstore = Zarr.ConsolidatedStore(Zarr.HTTPStore("http://$ip:$port"), "") + @test_throws "Received error code 403" zopen(httpstore) + Zarr.missing_chunk_return_code!(httpstore, 403) + g3 = zopen(httpstore) @test all(==(-1),g3["a"][:,:]) close(server) end @testset "Zip Storage" begin + @info "Testing Zip Storage" s = Zarr.DictStore() g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5)) @@ -265,4 +307,5 @@ end Zarr.writezip(io, ds) Zarr.ZipStore(take!(io)) end + @info "Finished testing ZipStore" end diff --git a/test/v3_julia.jl b/test/v3_julia.jl new file mode 100644 index 0000000..ec73502 --- /dev/null +++ b/test/v3_julia.jl @@ -0,0 +1,308 @@ +# Julia script to generate Zarr v3 fixtures using pure Julia +# Mirrors the examples from v3_python.jl + +using Zarr +using JSON + +# Paths +path_v3 = joinpath(@__DIR__, 
"v3_julia", "data.zarr") + +# Remove existing +if isdir(path_v3) + rm(path_v3, recursive=true) +end + +# Create store and root group for v3 +store = Zarr.FormattedStore{3, '/'}(Zarr.DirectoryStore(path_v3)) +# Manually create v3 group metadata (zgroup defaults to v2) # TODO: we need to fix this! +group_meta = Dict("zarr_format" => 3, "node_type" => "group") +b = IOBuffer() +JSON.print(b, group_meta) +store["", "zarr.json"] = take!(b) + +# Helper: create array and set data +function create_and_fill(store, name, data; + dtype=nothing, + shape=nothing, + chunks=nothing, + compressor=Zarr.BloscCompressor(), + fill_value=nothing, + zarr_format=3, + dimension_separator='/') + + # Create the array + z = zcreate(eltype(data), store, shape...; + path=name, + chunks=chunks, + compressor=compressor, + fill_value=fill_value, + zarr_format=zarr_format, + dimension_separator=dimension_separator) + # Fill the array with the data + z[:] = data + return z +end + +# 1d.contiguous.gzip.i2 +create_and_fill(store, "1d.contiguous.gzip.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.blosc.i2 +create_and_fill(store, "1d.contiguous.blosc.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.raw.i2 +create_and_fill(store, "1d.contiguous.raw.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.NoCompressor(), +) + +# 1d.contiguous.i4 +create_and_fill(store, "1d.contiguous.i4", Int32[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.u1 +create_and_fill(store, "1d.contiguous.u1", UInt8[255,0,255,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f2.le +create_and_fill(store, "1d.contiguous.f2.le", Float16[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # 
noshuffle +) + +# 1d.contiguous.f4.le +create_and_fill(store, "1d.contiguous.f4.le", Float32[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f4.be +# Note: Big endian is not directly supported in Julia, but we can create the array +# The actual endianness is handled by the bytes codec in v3 +create_and_fill(store, "1d.contiguous.f4.be", Float32[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f8 +create_and_fill(store, "1d.contiguous.f8", Float64[1.5,2.5,3.5,4.5]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.b1 +create_and_fill(store, "1d.contiguous.b1", Bool[true,false,true,false]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.chunked.i2 +z = create_and_fill(store, "1d.chunked.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(2,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# Adjust zarr.json to set dimension_names = null +meta_path = joinpath(path_v3, "1d.chunked.i2", "zarr.json") +meta = JSON.parsefile(meta_path; dicttype = Dict{String,Any}) +meta["dimension_names"] = nothing +open(meta_path, "w") do io + JSON.print(io, meta) +end + +# 1d.chunked.ragged.i2 +create_and_fill(store, "1d.chunked.ragged.i2", Int16[1,2,3,4,5]; + shape=(5,), + chunks=(2,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.contiguous.i2 +create_and_fill(store, "2d.contiguous.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(2,2), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.chunked.i2 +create_and_fill(store, "2d.chunked.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(1,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.chunked.ragged.i2 +create_and_fill(store, "2d.chunked.ragged.i2", Int16[1 2 3; 4 5 6; 7 8 9]; + 
shape=(3,3), + chunks=(2,2), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.contiguous.i2 +create_and_fill(store, "3d.contiguous.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,3), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.i2 +create_and_fill(store, "3d.chunked.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(1,1,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.mixed.i2.C +create_and_fill(store, "3d.chunked.mixed.i2.C", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.mixed.i2.F +# Note: Column-major order (F) is simulated with transpose filter in Python +# In Julia, we create with C order as that's what's currently supported +create_and_fill(store, "3d.chunked.mixed.i2.F", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +##### Sharded/compressed examples +# Note: Sharding is not yet fully implemented in Zarr.jl, so these examples +# may not produce the exact same structure as the Python version. +# They are included for completeness but may need adjustment once sharding is supported. 
+ +# 1d.contiguous.compressed.sharded.i2 +create_and_fill(store, "1d.contiguous.compressed.sharded.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.i4 +create_and_fill(store, "1d.contiguous.compressed.sharded.i4", Int32[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.u1 +create_and_fill(store, "1d.contiguous.compressed.sharded.u1", UInt8[255,0,255,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.f4 +create_and_fill(store, "1d.contiguous.compressed.sharded.f4", Float32[-1000.5,0,1000.5,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.f8 +create_and_fill(store, "1d.contiguous.compressed.sharded.f8", Float64[1.5,2.5,3.5,4.5]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.b1 +create_and_fill(store, "1d.contiguous.compressed.sharded.b1", Bool[true,false,true,false]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.chunked.compressed.sharded.i2 +create_and_fill(store, "1d.chunked.compressed.sharded.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(1,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.chunked.filled.compressed.sharded.i2 +create_and_fill(store, "1d.chunked.filled.compressed.sharded.i2", Int16[1,2,0,0]; + shape=(4,), + chunks=(1,), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.contiguous.compressed.sharded.i2 +create_and_fill(store, "2d.contiguous.compressed.sharded.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(2,2), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.compressed.sharded.filled.i2 +create_and_fill(store, "2d.chunked.compressed.sharded.filled.i2", reshape(Int16.(0:15), 4, 4); + shape=(4,4), + chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.compressed.sharded.i2 
+create_and_fill(store, "2d.chunked.compressed.sharded.i2", reshape(Int16.(1:16), 4, 4); + shape=(4,4), + chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.ragged.compressed.sharded.i2 +create_and_fill(store, "2d.chunked.ragged.compressed.sharded.i2", reshape(Int16.(1:9), 3, 3); + shape=(3,3), + chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.contiguous.compressed.sharded.i2 +create_and_fill(store, "3d.contiguous.compressed.sharded.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,3), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.chunked.compressed.sharded.i2 +create_and_fill(store, "3d.chunked.compressed.sharded.i2", reshape(Int16.(0:63), 4, 4, 4); + shape=(4,4,4), + chunks=(1,1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.chunked.mixed.compressed.sharded.i2 +create_and_fill(store, "3d.chunked.mixed.compressed.sharded.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.ZlibCompressor(), +) + +# Group with spaces in the name +group_path = "my group with spaces" +group_meta2 = Dict("zarr_format" => 3, "node_type" => "group", "attributes" => Dict("description" => "A group with spaces in the name")) +b2 = IOBuffer() +JSON.print(b2, group_meta2) +store[group_path, "zarr.json"] = take!(b2) + +@info "Zarr v3 fixtures generated at: $path_v3" \ No newline at end of file diff --git a/test/v3_python.jl b/test/v3_python.jl new file mode 100644 index 0000000..247d181 --- /dev/null +++ b/test/v3_python.jl @@ -0,0 +1,479 @@ +# Julia script to generate Zarr v3 fixtures using PythonCall + CondaPkg +# Adapted from: https://github.com/manzt/zarrita.js/blob/23abb3bee9094aabbe60985626caef2802360963/scripts/generate-v3.py + +using CondaPkg +using JSON + +# Install Python deps into Conda env used by PythonCall (zarr v3 and numpy) +CondaPkg.add("numpy") +CondaPkg.add("zarr"; version="3.*") +CondaPkg.add("numcodecs") + +using PythonCall +# Import Python modules +np = pyimport("numpy") +zarr = 
pyimport("zarr") +codecs = pyimport("zarr.codecs") +storage = pyimport("zarr.storage") +json = pyimport("json") +shutil = pyimport("shutil") +pathlib = pyimport("pathlib") +builtins = pyimport("builtins") + +# Paths +path_v3 = joinpath(@__DIR__, "v3_python", "data.zarr") + +# deterministic RNG for numpy +np.random.seed(42) + +# remove existing +try + shutil.rmtree(path_v3) +catch + # ignore +end + +# create store and path_v3 group +store = storage.LocalStore(path_v3) +zarr.create_group(store) + +# helper: create array and set data (value should be a numpy array or convertible) +function create_and_fill(store; name, dtype=nothing, shape=nothing, chunks=nothing, + serializer=nothing, compressors=nothing, filters=nothing, shards=nothing, data) + # Build NamedTuple of only non-nothing keyword arguments + kwargs = (; name=name) + if dtype !== nothing + kwargs = merge(kwargs, (; dtype=dtype)) + end + if shape !== nothing + kwargs = merge(kwargs, (; shape=shape)) + end + if chunks !== nothing + kwargs = merge(kwargs, (; chunks=chunks)) + end + if serializer !== nothing + kwargs = merge(kwargs, (; serializer=serializer)) + end + if compressors !== nothing + kwargs = merge(kwargs, (; compressors=compressors)) + end + if filters !== nothing + kwargs = merge(kwargs, (; filters=filters)) + end + if shards !== nothing + kwargs = merge(kwargs, (; shards=shards)) + end + + # create the array + a = zarr.create_array(store; kwargs...) + + # ensure numpy array + arr = data isa Py ? 
data : np.array(data) + + # assign content + a.__setitem__(builtins.Ellipsis, arr) + + return a +end + +# 1d.contiguous.gzip.i2 +create_and_fill(store; + name="1d.contiguous.gzip.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=[1,2,3,4], +) + +# 1d.contiguous.blosc.i2 +create_and_fill(store; + name="1d.contiguous.blosc.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=[1,2,3,4], +) + +# 1d.contiguous.raw.i2 +create_and_fill(store; + name="1d.contiguous.raw.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=nothing, + data=[1,2,3,4], +) + +# 1d.contiguous.i4 +create_and_fill(store; + name="1d.contiguous.i4", + dtype="int32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=[1,2,3,4], +) + +# 1d.contiguous.u1 +create_and_fill(store; + name="1d.contiguous.u1", + dtype="uint8", + shape=(4,), + chunks=(4,), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([255,0,255,0], dtype="u1") +) + +# 1d.contiguous.f2.le +create_and_fill(store; + name="1d.contiguous.f2.le", + dtype="float16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f2"), +) + +# 1d.contiguous.f4.le +create_and_fill(store; + name="1d.contiguous.f4.le", + dtype="float32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f4"), +) + +# 1d.contiguous.f4.be +create_and_fill(store; + 
name="1d.contiguous.f4.be", + dtype="float32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="big"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f4"), +) + +# 1d.contiguous.f8 +create_and_fill(store; + name="1d.contiguous.f8", + dtype="float64", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([1.5,2.5,3.5,4.5], dtype="f8"), +) + +# 1d.contiguous.b1 +create_and_fill(store; + name="1d.contiguous.b1", + dtype="bool", + shape=(4,), + chunks=(4,), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([true,false,true,false], dtype="bool"), +) + +# 1d.chunked.i2 +create_and_fill(store; + name="1d.chunked.i2", + dtype="int16", + shape=(4,), + chunks=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([1,2,3,4], dtype="i2"), +) + +# adjust zarr.json to set dimension_names = null +meta_path = joinpath(path_v3, "1d.chunked.i2", "zarr.json") +meta = JSON.parsefile(meta_path; dicttype = Dict{String,Any}) +meta["dimension_names"] = nothing +open(meta_path, "w") do io + JSON.print(io, meta) +end + +# 1d.chunked.ragged.i2 +create_and_fill(store; + name="1d.chunked.ragged.i2", + dtype="int16", + shape=(5,), + chunks=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([1,2,3,4,5], dtype="i2"), +) + +# 2d.contiguous.i2 +create_and_fill(store; + name="2d.contiguous.i2", + dtype="int16", + shape=(2,2), + chunks=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data= np.array([ [1,2], [3,4] ] |> pylist, dtype="i2"), +) + +# 2d.chunked.i2 +create_and_fill(store; + name="2d.chunked.i2", + dtype="int16", + 
shape=(2,2), + chunks=(1,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([[1,2],[3,4]] |> pylist, dtype="i2"), +) + +# 2d.chunked.ragged.i2 +create_and_fill(store; + name="2d.chunked.ragged.i2", + dtype="int16", + shape=(3,3), + chunks=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([[1,2,3],[4,5,6],[7,8,9]] |> pylist, dtype="i2"), +) + +# 3d.contiguous.i2 +create_and_fill(store; + name="3d.contiguous.i2", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,3), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.i2 +create_and_fill(store; + name="3d.chunked.i2", + dtype="int16", + shape=(3,3,3), + chunks=(1,1,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.mixed.i2.C +create_and_fill(store; + name="3d.chunked.mixed.i2.C", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.mixed.i2.F (with transpose filter to simulate column-major) +transpose_filter = codecs.TransposeCodec(order=[2,1,0]) +create_and_fill(store; + name="3d.chunked.mixed.i2.F", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,1), + filters=[transpose_filter], + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +##### Sharded/compressed examples +# 1d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,3,4], 
dtype="i2").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i2"), +) + +# 1d.contiguous.compressed.sharded.i4 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.i4", + shape=(4,), + dtype=np.array([1,2,3,4], dtype="i4").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i4"), +) + +# 1d.contiguous.compressed.sharded.u1 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.u1", + shape=(4,), + dtype=np.array([255,0,255,0], dtype="u1").dtype, + chunks=(4,), + shards=(4,), + compressors=[codecs.GzipCodec()], + data=np.array([255,0,255,0], dtype="u1"), +) + +# 1d.contiguous.compressed.sharded.f4 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.f4", + shape=(4,), + dtype=np.array([-1000.5,0,1000.5,0], dtype="f4").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([-1000.5,0,1000.5,0], dtype="f4"), +) + +# 1d.contiguous.compressed.sharded.f8 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.f8", + shape=(4,), + dtype=np.array([1.5,2.5,3.5,4.5], dtype="f8").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1.5,2.5,3.5,4.5], dtype="f8"), +) + +# 1d.contiguous.compressed.sharded.b1 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.b1", + shape=(4,), + dtype="bool", + chunks=(4,), + shards=(4,), + compressors=[codecs.GzipCodec()], + data=np.array([true,false,true,false], dtype="bool"), +) + +# 1d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="1d.chunked.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,3,4], dtype="i2").dtype, + chunks=(1,), + shards=(2,), + 
serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i2"), +) + +# 1d.chunked.filled.compressed.sharded.i2 +create_and_fill(store; + name="1d.chunked.filled.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,0,0], dtype="i2").dtype, + chunks=(1,), + shards=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,0,0], dtype="i2"), +) + +# 2d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="2d.contiguous.compressed.sharded.i2", + shape=(2,2), + dtype=np.arange(1,5, dtype="i2").dtype, + chunks=(2,2), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(1,5, dtype="i2").reshape(2,2), +) + +# 2d.chunked.compressed.sharded.filled.i2 +create_and_fill(store; + name="2d.chunked.compressed.sharded.filled.i2", + shape=(4,4), + dtype=np.arange(16, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(16, dtype="i2").reshape(4,4), +) + +# 2d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="2d.chunked.compressed.sharded.i2", + shape=(4,4), + dtype=np.arange(16, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=(np.arange(16, dtype="i2").reshape(4,4) + 1), +) + +# 2d.chunked.ragged.compressed.sharded.i2 +create_and_fill(store; + name="2d.chunked.ragged.compressed.sharded.i2", + shape=(3,3), + dtype=np.arange(1,10, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(1,10, dtype="i2").reshape(3,3), +) + +# 3d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="3d.contiguous.compressed.sharded.i2", + shape=(3,3,3), + dtype=np.arange(27, dtype="i2").dtype, + 
chunks=(3,3,3), + shards=(3,3,3), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(27, dtype="i2").reshape(3,3,3), +) + +# 3d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="3d.chunked.compressed.sharded.i2", + shape=(4,4,4), + dtype=np.arange(64, dtype="i2").dtype, + chunks=(1,1,1), + shards=(2,2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(64, dtype="i2").reshape(4,4,4), +) + +# 3d.chunked.mixed.compressed.sharded.i2 +create_and_fill(store; + name="3d.chunked.mixed.compressed.sharded.i2", + shape=(3,3,3), + dtype=np.arange(27, dtype="i2").dtype, + chunks=(3,3,1), + shards=(3,3,3), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(27, dtype="i2").reshape(3,3,3), +) + +# Group with spaces in the name +g = zarr.create_group(store, path="my group with spaces") +g.attrs["description"] = "A group with spaces in the name" + +@info "Zarr v3 fixtures generated at: $path_v3"