diff --git a/.changeset/numcodecs-namespace.md b/.changeset/numcodecs-namespace.md new file mode 100644 index 00000000..c332556c --- /dev/null +++ b/.changeset/numcodecs-namespace.md @@ -0,0 +1,27 @@ +--- +"zarrita": minor +--- + +Add `numcodecs.` namespace for v2 codec registry and built-in shuffle/delta codecs + +V2 filters and compressors are now registered under a `numcodecs.` prefix +(e.g., `numcodecs.blosc`, `numcodecs.zlib`) matching zarr-python's convention. +This separates v2-specific codecs from the v3 codec namespace. + +Adds built-in `numcodecs.shuffle` and `numcodecs.delta` codecs for reading v2 +data that uses these common numcodecs filters. Both are pure JS with no WASM +dependencies. + +Custom v2 codecs can be registered under the same namespace: + +```ts +zarr.registry.set("numcodecs.my-filter", async () => ({ + fromConfig(config) { + return { + kind: "bytes_to_bytes", + encode(data) { /* ... */ }, + decode(data) { /* ... */ }, + }; + }, +})); +``` diff --git a/fixtures/v2/data.zarr/.zmetadata b/fixtures/v2/data.zarr/.zmetadata index 9e21c3af..961daa2f 100644 --- a/fixtures/v2/data.zarr/.zmetadata +++ b/fixtures/v2/data.zarr/.zmetadata @@ -166,6 +166,56 @@ ], "zarr_format": 2 }, + "1d.contiguous.delta.i2/.zarray": { + "chunks": [ + 4 + ], + "compressor": { + "id": "zlib", + "level": 1 + }, + "dtype": " { + it("roundtrips with elementSize=4", () => { + let codec = ShuffleCodec.fromConfig({ elementsize: 4 }); + let input = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + let encoded = codec.encode(input); + let decoded = codec.decode(encoded); + expect(decoded).toEqual(input); + }); + + it("roundtrips with elementSize=2", () => { + let codec = ShuffleCodec.fromConfig({ elementsize: 2 }); + let input = new Uint8Array([1, 2, 3, 4, 5, 6]); + let encoded = codec.encode(input); + let decoded = codec.decode(encoded); + expect(decoded).toEqual(input); + }); + + it("shuffles bytes correctly", () => { + let codec = ShuffleCodec.fromConfig({ 
elementsize: 4 }); + // 3 elements of 4 bytes each: [A0 A1 A2 A3] [B0 B1 B2 B3] [C0 C1 C2 C3] + let input = new Uint8Array([ + 0xa0, 0xa1, 0xa2, 0xa3, 0xb0, 0xb1, 0xb2, 0xb3, 0xc0, 0xc1, 0xc2, 0xc3, + ]); + let encoded = codec.encode(input); + // Shuffled: [A0 B0 C0] [A1 B1 C1] [A2 B2 C2] [A3 B3 C3] + expect(encoded).toEqual( + new Uint8Array([ + 0xa0, 0xb0, 0xc0, 0xa1, 0xb1, 0xc1, 0xa2, 0xb2, 0xc2, 0xa3, 0xb3, 0xc3, + ]), + ); + }); + + it("defaults to elementSize=4", () => { + let codec = ShuffleCodec.fromConfig({}); + let input = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8]); + let decoded = codec.decode(codec.encode(input)); + expect(decoded).toEqual(input); + }); +}); + +describe("delta", () => { + it("roundtrips int32 data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "int32" }); + let buf = new ArrayBuffer(16); + let view = new Int32Array(buf); + view.set([10, 20, 25, 30]); + let input = new Uint8Array(buf); + let encoded = codec.encode(input); + let decoded = codec.decode(encoded); + let result = new Int32Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength / 4, + ); + expect(Array.from(result)).toEqual([10, 20, 25, 30]); + }); + + it("encodes as differences", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "int32" }); + let buf = new ArrayBuffer(16); + new Int32Array(buf).set([10, 20, 25, 30]); + let encoded = codec.encode(new Uint8Array(buf)); + let deltas = new Int32Array( + encoded.buffer, + encoded.byteOffset, + encoded.byteLength / 4, + ); + expect(Array.from(deltas)).toEqual([10, 10, 5, 5]); + }); + + it("roundtrips float32 data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "float32" }); + let buf = new ArrayBuffer(12); + new Float32Array(buf).set([1.5, 2.5, 4.0]); + let input = new Uint8Array(buf); + let decoded = codec.decode(codec.encode(input)); + let result = new Float32Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength / 4, + ); + expect(Array.from(result)).toEqual([1.5, 
2.5, 4.0]); + }); + + it("roundtrips int16 data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "int16" }); + let buf = new ArrayBuffer(8); + new Int16Array(buf).set([100, 200, 150, 300]); + let input = new Uint8Array(buf); + let decoded = codec.decode(codec.encode(input)); + let result = new Int16Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength / 2, + ); + expect(Array.from(result)).toEqual([100, 200, 150, 300]); + }); + + it("roundtrips uint16 data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "uint16" }); + let buf = new ArrayBuffer(8); + new Uint16Array(buf).set([65000, 100, 50000, 40000]); + let input = new Uint8Array(buf); + let decoded = codec.decode(codec.encode(input)); + let result = new Uint16Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength / 2, + ); + expect(Array.from(result)).toEqual([65000, 100, 50000, 40000]); + }); + + it("roundtrips uint32 data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "uint32" }); + let buf = new ArrayBuffer(12); + new Uint32Array(buf).set([3000000000, 100, 4000000000]); + let input = new Uint8Array(buf); + let decoded = codec.decode(codec.encode(input)); + let result = new Uint32Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength / 4, + ); + expect(Array.from(result)).toEqual([3000000000, 100, 4000000000]); + }); + + it("roundtrips float64 data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "float64" }); + let buf = new ArrayBuffer(24); + new Float64Array(buf).set([1.1, 2.2, 3.3]); + let input = new Uint8Array(buf); + let decoded = codec.decode(codec.encode(input)); + let result = new Float64Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength / 8, + ); + expect(Array.from(result)).toEqual([1.1, 2.2, 3.3]); + }); + + it("roundtrips int8 data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "int8" }); + let buf = new ArrayBuffer(4); + new Int8Array(buf).set([-50, 0, 50, 100]); + 
let input = new Uint8Array(buf); + let decoded = codec.decode(codec.encode(input)); + let result = new Int8Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength, + ); + expect(Array.from(result)).toEqual([-50, 0, 50, 100]); + }); + + it("roundtrips int64 (bigint) data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "int64" }); + let buf = new ArrayBuffer(24); + new BigInt64Array(buf).set([10n, 20n, 30n]); + let input = new Uint8Array(buf); + let decoded = codec.decode(codec.encode(input)); + let result = new BigInt64Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength / 8, + ); + expect(Array.from(result)).toEqual([10n, 20n, 30n]); + }); + + it("roundtrips uint64 (bigint) data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "uint64" }); + let buf = new ArrayBuffer(24); + new BigUint64Array(buf).set([0n, 9007199254740993n, 18014398509481984n]); + let input = new Uint8Array(buf); + let decoded = codec.decode(codec.encode(input)); + let result = new BigUint64Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength / 8, + ); + expect(Array.from(result)).toEqual([ + 0n, + 9007199254740993n, + 18014398509481984n, + ]); + }); + + it("handles empty data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "int32" }); + let input = new Uint8Array(0); + let encoded = codec.encode(input); + expect(encoded.length).toBe(0); + }); + + it("throws on unsupported dtype", () => { + expect(() => + DeltaCodec.fromConfig( + {}, + { + // @ts-expect-error - invalid dtype + data_type: "complex64", + }, + ), + ).toThrow("Unknown or unsupported data_type"); + }); + + it("throws on unaligned data", () => { + let codec = DeltaCodec.fromConfig({}, { data_type: "int32" }); + let input = new Uint8Array(5); // not a multiple of 4 + expect(() => codec.encode(input)).toThrow("not a multiple of element size"); + }); +}); diff --git a/packages/zarrita/__tests__/consolidated.test.ts 
b/packages/zarrita/__tests__/consolidated.test.ts index 0431451f..467a5b87 100644 --- a/packages/zarrita/__tests__/consolidated.test.ts +++ b/packages/zarrita/__tests__/consolidated.test.ts @@ -25,12 +25,15 @@ describe("withConsolidated", () => { "/1d.contiguous.U7" => "array", "/1d.contiguous.b1" => "array", "/1d.contiguous.blosc.i2" => "array", + "/1d.contiguous.delta.i2" => "array", + "/1d.contiguous.delta.shuffle.i2" => "array", "/1d.contiguous.f4.be" => "array", "/1d.contiguous.f4.le" => "array", "/1d.contiguous.f8" => "array", "/1d.contiguous.i4" => "array", "/1d.contiguous.lz4.i2" => "array", "/1d.contiguous.raw.i2" => "array", + "/1d.contiguous.shuffle.i2" => "array", "/1d.contiguous.u1" => "array", "/1d.contiguous.zlib.i2" => "array", "/1d.contiguous.zstd.i2" => "array", diff --git a/packages/zarrita/__tests__/open.test.ts b/packages/zarrita/__tests__/open.test.ts index aaceacf2..d3ff4675 100644 --- a/packages/zarrita/__tests__/open.test.ts +++ b/packages/zarrita/__tests__/open.test.ts @@ -533,6 +533,39 @@ describe("v2", () => { }); }); + it("1d.contiguous.shuffle.i2", async () => { + let arr = await open.v2(store.resolve("/1d.contiguous.shuffle.i2"), { + kind: "array", + }); + expect(await arr.getChunk([0])).toStrictEqual({ + data: new Int16Array([1, 2, 3, 4]), + shape: [4], + stride: [1], + }); + }); + + it("1d.contiguous.delta.i2", async () => { + let arr = await open.v2(store.resolve("/1d.contiguous.delta.i2"), { + kind: "array", + }); + expect(await arr.getChunk([0])).toStrictEqual({ + data: new Int16Array([1, 2, 3, 4]), + shape: [4], + stride: [1], + }); + }); + + it("1d.contiguous.delta.shuffle.i2", async () => { + let arr = await open.v2(store.resolve("/1d.contiguous.delta.shuffle.i2"), { + kind: "array", + }); + expect(await arr.getChunk([0])).toStrictEqual({ + data: new Int16Array([10, 20, 30, 40]), + shape: [4], + stride: [1], + }); + }); + it("opens group from root", async () => { let grp = await open(store, { kind: "group" }); 
expect(grp.path).toBe("/"); diff --git a/packages/zarrita/__tests__/util.test.ts b/packages/zarrita/__tests__/util.test.ts index 04b775ee..d3a2726c 100644 --- a/packages/zarrita/__tests__/util.test.ts +++ b/packages/zarrita/__tests__/util.test.ts @@ -506,7 +506,7 @@ describe("v2_to_v3_array_metadata", () => { "configuration": { "level": 5, }, - "name": "zlib", + "name": "numcodecs.zlib", }, ], "data_type": "float32", @@ -550,7 +550,7 @@ describe("v2_to_v3_array_metadata", () => { "configuration": { "dtype": " { "configuration": { "dtype": ">f4", }, - "name": "delta", + "name": "numcodecs.delta", }, { "configuration": { "level": 1, }, - "name": "zlib", + "name": "numcodecs.zlib", }, ], "data_type": "float32", diff --git a/packages/zarrita/src/codecs.ts b/packages/zarrita/src/codecs.ts index b6681adc..968c16f5 100644 --- a/packages/zarrita/src/codecs.ts +++ b/packages/zarrita/src/codecs.ts @@ -2,8 +2,10 @@ import type { Codec as _Codec } from "numcodecs"; import { BitroundCodec } from "./codecs/bitround.js"; import { BytesCodec } from "./codecs/bytes.js"; import { Crc32cCodec } from "./codecs/crc32c.js"; +import { DeltaCodec } from "./codecs/delta.js"; import { GzipCodec } from "./codecs/gzip.js"; import { JsonCodec } from "./codecs/json2.js"; +import { ShuffleCodec } from "./codecs/shuffle.js"; import { TransposeCodec } from "./codecs/transpose.js"; import { VLenUTF8 } from "./codecs/vlen-utf8.js"; import { ZlibCodec } from "./codecs/zlib.js"; @@ -24,18 +26,35 @@ type CodecEntry = { type Codec = _Codec & { kind: CodecEntry["kind"] }; function create_default_registry(): Map Promise> { - return new Map() - .set("blosc", () => import("numcodecs/blosc").then((m) => m.default)) - .set("lz4", () => import("numcodecs/lz4").then((m) => m.default)) - .set("zstd", () => import("numcodecs/zstd").then((m) => m.default)) - .set("gzip", () => GzipCodec) - .set("zlib", () => ZlibCodec) - .set("transpose", () => TransposeCodec) - .set("bytes", () => BytesCodec) - .set("crc32c", () 
=> Crc32cCodec) - .set("vlen-utf8", () => VLenUTF8) - .set("json2", () => JsonCodec) - .set("bitround", () => BitroundCodec); + let blosc = () => import("numcodecs/blosc").then((m) => m.default); + let lz4 = () => import("numcodecs/lz4").then((m) => m.default); + let zstd = () => import("numcodecs/zstd").then((m) => m.default); + let gzip = () => GzipCodec; + let zlib = () => ZlibCodec; + return ( + new Map() + // v3 codecs + .set("blosc", blosc) + .set("lz4", lz4) + .set("zstd", zstd) + .set("gzip", gzip) + .set("zlib", zlib) + .set("transpose", () => TransposeCodec) + .set("bytes", () => BytesCodec) + .set("crc32c", () => Crc32cCodec) + .set("vlen-utf8", () => VLenUTF8) + .set("json2", () => JsonCodec) + .set("bitround", () => BitroundCodec) + // numcodecs (v2 compat) + .set("numcodecs.blosc", blosc) + .set("numcodecs.lz4", lz4) + .set("numcodecs.zstd", zstd) + .set("numcodecs.gzip", gzip) + .set("numcodecs.zlib", zlib) + .set("numcodecs.vlen-utf8", () => VLenUTF8) + .set("numcodecs.shuffle", () => ShuffleCodec) + .set("numcodecs.delta", () => DeltaCodec) + ); } export const registry: Map Promise> = diff --git a/packages/zarrita/src/codecs/delta.ts b/packages/zarrita/src/codecs/delta.ts new file mode 100644 index 00000000..a02ea664 --- /dev/null +++ b/packages/zarrita/src/codecs/delta.ts @@ -0,0 +1,73 @@ +import type { + BigintDataType, + NumberDataType, + TypedArrayConstructor, +} from "../metadata.js"; +import { assert, get_ctr } from "../util.js"; + +/** + * Delta filter codec (numcodecs compat). + * + * Stores differences between consecutive elements rather than the elements + * themselves. 
/**
 * Delta filter codec (numcodecs compat).
 *
 * Stores differences between consecutive elements rather than the elements
 * themselves: the first element is kept verbatim and every later element is
 * replaced by its difference from the previous one. Decoding is the running
 * sum of the encoded values.
 *
 * The filter operates on raw chunk bytes. When the numcodecs `dtype`
 * configuration carries an explicit byte-order prefix (`<` or `>`) that
 * differs from the platform's, elements are byte-swapped before and after the
 * arithmetic so multi-byte values are interpreted in their stored order.
 * Without a prefix the platform's native order is used.
 */
export class DeltaCodec<D extends NumberDataType | BigintDataType> {
  kind = "bytes_to_bytes";
  #TypedArray: TypedArrayConstructor<D>;
  #BYTES_PER_ELEMENT: number;
  /** True when stored element bytes are in the opposite order to the platform's. */
  #swap: boolean;

  /**
   * @param configuration numcodecs `Delta` configuration. Only the byte-order
   *   prefix of `dtype` is consulted; the element type comes from `meta`.
   * @param meta Array metadata; `data_type` selects the typed array used for
   *   the arithmetic.
   * @throws If no typed-array constructor is available for `data_type`
   *   (presumably raised by `get_ctr`) or the dtype is not fixed-size.
   */
  constructor(configuration: { dtype?: string }, meta: { data_type: D }) {
    this.#TypedArray = get_ctr(meta.data_type);
    let sample = new this.#TypedArray(0);
    assert(
      "BYTES_PER_ELEMENT" in sample,
      `Delta codec requires a fixed-size dtype, got "${meta.data_type}"`,
    );
    this.#BYTES_PER_ELEMENT = sample.BYTES_PER_ELEMENT;

    // Optional chaining keeps callers that pass no configuration working.
    let order = configuration?.dtype?.charAt(0);
    let little = delta_platform_is_little_endian();
    this.#swap =
      this.#BYTES_PER_ELEMENT > 1 &&
      ((order === ">" && little) || (order === "<" && !little));
  }

  /** Factory used by the codec registry; equivalent to `new DeltaCodec(...)`. */
  static fromConfig<D extends NumberDataType | BigintDataType>(
    configuration: { dtype?: string },
    meta: { data_type: D },
  ): DeltaCodec<D> {
    return new DeltaCodec(configuration, meta);
  }

  /**
   * Replaces every element after the first with its difference from the
   * previous element.
   *
   * @throws If `data.length` is not a multiple of the element size.
   */
  encode(data: Uint8Array): Uint8Array {
    return this.#apply(data, "encode");
  }

  /**
   * Reconstructs the original elements as the running sum of the deltas.
   *
   * @throws If `data.length` is not a multiple of the element size.
   */
  decode(data: Uint8Array): Uint8Array {
    return this.#apply(data, "decode");
  }

  /** Shared implementation; always returns a new buffer and never mutates `data`. */
  #apply(data: Uint8Array, mode: "encode" | "decode"): Uint8Array {
    let bpe = this.#BYTES_PER_ELEMENT;
    if (data.length % bpe !== 0) {
      throw new Error(
        `Data length (${data.length}) is not a multiple of element size (${bpe})`,
      );
    }
    let n = data.length / bpe;
    if (n === 0) return new Uint8Array(0);

    // A typed-array view requires byteOffset to be a multiple of the element
    // size, and byte-swapping must not touch the caller's buffer. In either
    // case work on a private copy, which always starts at offset 0.
    let source = data;
    if (this.#swap || data.byteOffset % bpe !== 0) {
      source = new Uint8Array(data);
      if (this.#swap) delta_byteswap_inplace(source, bpe);
    }

    let input = new this.#TypedArray(source.buffer, source.byteOffset, n);
    let result = new Uint8Array(data.length);
    let output = new this.#TypedArray(result.buffer, 0, n);

    output[0] = input[0];
    if (mode === "encode") {
      for (let i = 1; i < n; i++) {
        // @ts-expect-error - we know the types are the same
        output[i] = input[i] - input[i - 1];
      }
    } else {
      for (let i = 1; i < n; i++) {
        // @ts-expect-error - we know the types are the same
        output[i] = output[i - 1] + input[i];
      }
    }

    // Return bytes in the same order they were stored in.
    if (this.#swap) delta_byteswap_inplace(result, bpe);
    return result;
  }
}

/** Reports whether multi-byte typed arrays on this platform are little-endian. */
function delta_platform_is_little_endian(): boolean {
  return new Uint8Array(new Uint16Array([1]).buffer)[0] === 1;
}

/** Reverses the bytes of every `bpe`-sized element of `bytes` in place. */
function delta_byteswap_inplace(bytes: Uint8Array, bpe: number): void {
  for (let start = 0; start < bytes.length; start += bpe) {
    for (let lo = start, hi = start + bpe - 1; lo < hi; lo++, hi--) {
      let tmp = bytes[lo];
      bytes[lo] = bytes[hi];
      bytes[hi] = tmp;
    }
  }
}
/**
 * Shuffle filter codec (numcodecs compat).
 *
 * Reorders bytes so that corresponding bytes of each element are grouped
 * together, improving compression of typed data.
 *
 * The element width is resolved in this order: an explicit `elementsize` in
 * the codec configuration, then the byte width of `meta.data_type`, then 4.
 */
export class ShuffleCodec<D extends DataType> {
  kind = "bytes_to_bytes";
  #BYTES_PER_ELEMENT: number;

  /**
   * @param configuration numcodecs `Shuffle` configuration. `elementsize` is
   *   the stride the data was shuffled with and wins over the dtype width.
   * @param meta Optional array metadata, used only when `elementsize` is absent.
   * @throws If `elementsize` is not a positive integer, or if it is absent and
   *   `meta.data_type` is not a fixed-size dtype.
   */
  constructor(
    configuration: { elementsize?: number },
    meta?: { data_type: D },
  ) {
    // Optional chaining keeps callers that pass only `meta` working.
    let elementsize = configuration?.elementsize;
    if (elementsize != null) {
      assert(
        Number.isInteger(elementsize) && elementsize > 0,
        `Shuffle codec requires a positive integer elementsize, got ${elementsize}`,
      );
      this.#BYTES_PER_ELEMENT = elementsize;
    } else if (meta) {
      let sample = new (get_ctr(meta.data_type))(0);
      assert(
        "BYTES_PER_ELEMENT" in sample,
        `Shuffle codec requires a fixed-size dtype, got "${meta.data_type}"`,
      );
      this.#BYTES_PER_ELEMENT = sample.BYTES_PER_ELEMENT as number;
    } else {
      this.#BYTES_PER_ELEMENT = 4;
    }
  }

  /** Factory used by the codec registry; equivalent to `new ShuffleCodec(...)`. */
  static fromConfig<D extends DataType>(
    configuration: { elementsize?: number },
    meta?: { data_type: D },
  ): ShuffleCodec<D> {
    return new ShuffleCodec(configuration, meta);
  }

  /**
   * Groups the k-th byte of every element together.
   *
   * @throws If `data.length` is not a multiple of the element size.
   */
  encode(data: Uint8Array): Uint8Array {
    return shuffle(data, this.#BYTES_PER_ELEMENT);
  }

  /**
   * Inverse of {@link ShuffleCodec.encode}.
   *
   * @throws If `data.length` is not a multiple of the element size.
   */
  decode(data: Uint8Array): Uint8Array {
    return unshuffle(data, this.#BYTES_PER_ELEMENT);
  }
}

/**
 * Returns how many whole elements fit in `length` bytes.
 *
 * Rejects partial trailing elements rather than dropping them, because the
 * shuffled layout has no place to put leftover bytes.
 */
function shuffle_element_count(length: number, elementSize: number): number {
  if (length % elementSize !== 0) {
    throw new Error(
      `Data length (${length}) is not a multiple of element size (${elementSize})`,
    );
  }
  return length / elementSize;
}

/** Writes byte `b` of element `i` to position `b * nElements + i`. */
function shuffle(data: Uint8Array, elementSize: number): Uint8Array {
  let length = data.length;
  let nElements = shuffle_element_count(length, elementSize);
  let result = new Uint8Array(length);
  for (let byte = 0; byte < elementSize; byte++) {
    for (let i = 0; i < nElements; i++) {
      result[byte * nElements + i] = data[i * elementSize + byte];
    }
  }
  return result;
}

/** Reads byte `b` of element `i` back from position `b * nElements + i`. */
function unshuffle(data: Uint8Array, elementSize: number): Uint8Array {
  let length = data.length;
  let nElements = shuffle_element_count(length, elementSize);
  let result = new Uint8Array(length);
  for (let byte = 0; byte < elementSize; byte++) {
    for (let i = 0; i < nElements; i++) {
      result[i * elementSize + byte] = data[byte * nElements + i];
    }
  }
  return result;
}
codecs.push({ name: "bytes", configuration: { endian: "big" } }); } for (let { id, ...configuration } of meta.filters ?? []) { - codecs.push({ name: id, configuration }); + codecs.push({ name: `numcodecs.${id}`, configuration }); } if (meta.compressor) { let { id, ...configuration } = meta.compressor; - codecs.push({ name: id, configuration }); + codecs.push({ name: `numcodecs.${id}`, configuration }); } let dimension_names: string[] | undefined; if (globalThis.Array.isArray(attributes._ARRAY_DIMENSIONS)) { diff --git a/scripts/generate-v2.py b/scripts/generate-v2.py index 7045a370..81394764 100644 --- a/scripts/generate-v2.py +++ b/scripts/generate-v2.py @@ -1,5 +1,5 @@ # /// script -# requires-python = ">=3.13" +# requires-python = "==3.13" # dependencies = [ # "zarr==2.18.1", # ] @@ -12,7 +12,7 @@ import zarr import numpy as np -from numcodecs import Zlib, Blosc, LZ4, Zstd, VLenUTF8 +from numcodecs import Zlib, Blosc, LZ4, Zstd, VLenUTF8, Shuffle, Delta SELF_DIR = pathlib.Path(__file__).parent @@ -187,6 +187,36 @@ chunks=(1, 1, 2), ) +# 1d.contiguous.shuffle.i2 +root.create_dataset( + "1d.contiguous.shuffle.i2", + data=[1, 2, 3, 4], + dtype="i2", + chunks=(4,), + compressor=Zlib(), + filters=[Shuffle(elementsize=2)], +) + +# 1d.contiguous.delta.i2 +root.create_dataset( + "1d.contiguous.delta.i2", + data=[1, 2, 3, 4], + dtype="i2", + chunks=(4,), + compressor=Zlib(), + filters=[Delta(dtype="i2")], +) + +# 1d.contiguous.delta.shuffle.i2 +root.create_dataset( + "1d.contiguous.delta.shuffle.i2", + data=[10, 20, 30, 40], + dtype="i2", + chunks=(4,), + compressor=Zlib(), + filters=[Delta(dtype="i2"), Shuffle(elementsize=2)], +) + # Group with spaces in the name g = root.create_group("my group with spaces") g.attrs["description"] = "A group with spaces in the name"