diff --git a/packages/ipfs-unixfs-importer/src/dir-flat.ts b/packages/ipfs-unixfs-importer/src/dir-flat.ts index 71937061..eb5ba250 100644 --- a/packages/ipfs-unixfs-importer/src/dir-flat.ts +++ b/packages/ipfs-unixfs-importer/src/dir-flat.ts @@ -1,6 +1,8 @@ import { encode, prepare } from '@ipld/dag-pb' import { UnixFS } from 'ipfs-unixfs' import { Dir } from './dir.ts' +import { InvalidShardingStrategyError } from './index.ts' +import { dataFieldSerializedSize, linkSerializedSize, utf8ByteLength } from './utils/pb-size.ts' import { persist } from './utils/persist.ts' import type { DirProps } from './dir.ts' import type { ImportResult, InProgressImportResult } from './index.ts' @@ -9,6 +11,24 @@ import type { PBLink, PBNode } from '@ipld/dag-pb' import type { Blockstore } from 'interface-blockstore' import type { CID } from 'multiformats/cid' +function estimateLinkSize (nameBytes: number, child: InProgressImportResult | Dir | undefined): number { + if (child?.cid != null && child?.size != null) { + return nameBytes + child.cid.byteLength + } + + return 0 +} + +function calculateLinkSize (nameBytes: number, child: InProgressImportResult | Dir | undefined): number { + if (child?.cid != null && child?.size != null) { + return linkSerializedSize( + nameBytes, child.cid.byteLength, Number(child.size) + ) + } + + return 0 +} + export class DirFlat extends Dir { private readonly _children: Map @@ -19,10 +39,29 @@ export class DirFlat extends Dir { } async put (name: string, value: InProgressImportResult | Dir): Promise { + if (this.nodeSize !== undefined) { + const oldChild = this._children.get(name) + const nameBytes = utf8ByteLength(name) + + const strategy = this.options?.shardSplitStrategy + if (strategy === 'links-bytes') { + this.nodeSize -= estimateLinkSize(nameBytes, oldChild) + this.nodeSize += estimateLinkSize(nameBytes, value) + } else if (strategy === 'block-bytes') { + this.nodeSize -= calculateLinkSize(nameBytes, oldChild) + this.nodeSize += 
calculateLinkSize(nameBytes, value) + } else { + throw new InvalidShardingStrategyError(`Invalid shardSplitStrategy: ${strategy}`) + } + + // safety: reset on underflow to force recomputation + if (this.nodeSize < 0) { + this.nodeSize = undefined + } + } + this.cid = undefined this.size = undefined - this.nodeSize = undefined - this._children.set(name, value) } @@ -89,18 +128,25 @@ export class DirFlat extends Dir { return this.nodeSize } - this.nodeSize = 0 - - if (this.options?.shardSplitStrategy === 'links-bytes') { - // estimate size only based on DAGLink name and CID byte lengths + const strategy = this.options?.shardSplitStrategy + if (strategy === 'links-bytes') { + // estimate size based on DAGLink name (UTF-8 byte length) and CID byte lengths // @see https://github.com/ipfs/go-unixfsnode/blob/37b47f1f917f1b2f54c207682f38886e49896ef9/data/builder/directory.go#L81-L96 + this.nodeSize = 0 + for (const [name, child] of this._children.entries()) { - if (child.size != null && (child.cid != null)) { - this.nodeSize += name.length + child.cid.byteLength - } + this.nodeSize += estimateLinkSize(utf8ByteLength(name), child) + } + } else if (strategy === 'block-bytes') { + // compute exact serialized size arithmetically + // (matches marshal().byteLength without allocating byte arrays) + this.nodeSize = dataFieldSerializedSize(this.mode, this.mtime) + + for (const [name, child] of this._children.entries()) { + this.nodeSize += calculateLinkSize(utf8ByteLength(name), child) } } else { - this.nodeSize = this.marshal().byteLength + throw new InvalidShardingStrategyError(`Invalid shardSplitStrategy: ${strategy}`) } return this.nodeSize diff --git a/packages/ipfs-unixfs-importer/src/errors.ts b/packages/ipfs-unixfs-importer/src/errors.ts index a8facda8..1c273dda 100644 --- a/packages/ipfs-unixfs-importer/src/errors.ts +++ b/packages/ipfs-unixfs-importer/src/errors.ts @@ -52,3 +52,10 @@ export class InvalidContentError extends Error { super(message) } } + +export class 
InvalidShardingStrategyError extends Error { + static name = 'InvalidShardingStrategyError' + static code = 'ERR_SHARDING_STRATEGY' + name = InvalidShardingStrategyError.name + code = InvalidShardingStrategyError.code +} diff --git a/packages/ipfs-unixfs-importer/src/utils/pb-size.ts b/packages/ipfs-unixfs-importer/src/utils/pb-size.ts new file mode 100644 index 00000000..c921270c --- /dev/null +++ b/packages/ipfs-unixfs-importer/src/utils/pb-size.ts @@ -0,0 +1,158 @@ +/** + * Protobuf size calculation utilities for DAG-PB nodes. + * + * Computes exact serialized sizes matching @ipld/dag-pb's encoding + * without allocating byte arrays. Used by DirFlat to avoid O(N) + * re-serialization on every file insert. + * + * Ported from @ipld/dag-pb/src/pb-encode.js (sov, len64, sizeLink, sizeNode) + * and boxo's directory.go estimatedSize logic. + */ + +import type { Mtime } from 'ipfs-unixfs' + +// --- varint helpers (from @ipld/dag-pb/src/pb-encode.js:166-214) --- + +const maxInt32 = 2 ** 32 + +const len8tab = [ + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 +] + +function len64 (x: number): number { + let n = 0 + if (x >= maxInt32) { + x = Math.floor(x / maxInt32) + n = 32 + } + if (x >= (1 << 16)) { + x >>>= 16 + n += 16 + } + if (x >= (1 << 8)) { + 
x >>>= 8 + n += 8 + } + return n + len8tab[x] +} + +/** + * Protobuf varint byte size, matching @ipld/dag-pb's sov(). + */ +export function varintLen (x: number): number { + if (x % 2 === 0) { + x++ + } + return Math.floor((len64(x) + 6) / 7) +} + +/** + * Compute UTF-8 byte length of a JS string without allocation. + * + * Safe to assume UTF-8 because @ipld/dag-pb always encodes PBLink.Name + * via TextEncoder (UTF-8) and decodes via TextDecoder (UTF-8). + * This produces the same result as textEncoder.encode(str).length + * without the Uint8Array allocation on every put() call. + */ +export function utf8ByteLength (str: string): number { + let len = 0 + for (let i = 0; i < str.length; i++) { + const c = str.charCodeAt(i) + if (c < 0x80) { + // ASCII: 1 UTF-8 byte + len++ + } else if (c < 0x800) { + // U+0080 - U+07FF: 2 UTF-8 bytes + len += 2 + } else if (c >= 0xD800 && c <= 0xDBFF && i + 1 < str.length) { + // Surrogate pair (JS encodes code points above U+FFFF as two + // UTF-16 surrogates). The pair maps to one code point that takes + // 4 UTF-8 bytes. Lone surrogates cannot occur here because names + // always round-trip through @ipld/dag-pb's TextEncoder/TextDecoder + // which only produce valid UTF-8 strings. + i++ + len += 4 + } else { + // U+0800 - U+FFFF: 3 UTF-8 bytes + len += 3 + } + } + return len +} + +/** + * Exact bytes a single PBLink adds to the PBNode encoding. 
+ * + * Matches sizeLink() + its wrapper in sizeNode() from pb-encode.js: + * linkLen = Hash(1+sov(cidLen)+cidLen) + Name(1+sov(nameLen)+nameLen) + Tsize(1+sov(tsize)) + * total = 1 + sov(linkLen) + linkLen + */ +export function linkSerializedSize (nameByteLen: number, cidByteLength: number, tsize: number): number { + // Hash field: tag(1) + varint(cidLen) + cidBytes + let linkLen = 1 + varintLen(cidByteLength) + cidByteLength + // Name field: tag(1) + varint(nameLen) + nameBytes + linkLen += 1 + varintLen(nameByteLen) + nameByteLen + // Tsize field: tag(1) + varint(tsize) + linkLen += 1 + varintLen(tsize) + // PBNode Links wrapper: tag(1) + varint(linkLen) + linkBytes + return 1 + varintLen(linkLen) + linkLen +} + +// Default mode for directories (0o755 = 493) +const DIR_DEFAULT_MODE = 0o755 + +/** + * Exact bytes the PBNode Data field adds for a UnixFS directory. + * + * Directory-only: the type field is hardcoded to directory (2 bytes) and + * the default mode is 0o755. Do not use for file nodes (different type + * byte, different default mode 0o644). + * + * For the common case (no mode, no mtime) this is always 4 bytes: + * innerSize=2 [0x08,0x01], wrapper 1+1+2=4. 
+ */ +export function dataFieldSerializedSize (mode?: number, mtime?: Mtime): number { + // UnixFS inner: type field [0x08, 0x01] = 2 bytes for directory + let innerSize = 2 + + // mode (field 7, varint) -- only encoded if set and not the default + if (mode !== undefined && mode !== DIR_DEFAULT_MODE) { + innerSize += 1 + varintLen(mode) + } + + // mtime (field 8, nested UnixTime message) + if (mtime != null) { + let mtimeInner = 0 + // Seconds (field 1, int64 varint) + const secs = Number(mtime.secs) + if (secs < 0) { + // negative int64 always takes 10 bytes in protobuf two's complement + mtimeInner += 1 + 10 + } else { + mtimeInner += 1 + varintLen(secs) + } + // FractionalNanoseconds (field 2, fixed32) -- optional + if (mtime.nsecs != null) { + mtimeInner += 1 + 4 + } + innerSize += 1 + varintLen(mtimeInner) + mtimeInner + } + + // PBNode Data wrapper: tag(1) + varint(innerSize) + innerBytes + return 1 + varintLen(innerSize) + innerSize +} diff --git a/packages/ipfs-unixfs-importer/test/ipip-499-profiles.spec.ts b/packages/ipfs-unixfs-importer/test/ipip-499-profiles.spec.ts index 9d7a3769..7ba945f1 100644 --- a/packages/ipfs-unixfs-importer/test/ipip-499-profiles.spec.ts +++ b/packages/ipfs-unixfs-importer/test/ipip-499-profiles.spec.ts @@ -43,9 +43,6 @@ const v0Options: ImporterOptions = { profile: 'unixfs-v0-2015' } const v1Options: ImporterOptions = { profile: 'unixfs-v1-2025' } describe('IPIP-499 CID Profiles', function () { - // TODO: remove after https://github.com/ipfs/js-ipfs-unixfs/pull/458 - this.timeout(640_000_000) - // Pre-compute deterministic filenames once (shared across tests) let v0Names4096: string[] let v0Names4033: string[] diff --git a/packages/ipfs-unixfs-importer/test/pb-size.spec.ts b/packages/ipfs-unixfs-importer/test/pb-size.spec.ts new file mode 100644 index 00000000..c7606cea --- /dev/null +++ b/packages/ipfs-unixfs-importer/test/pb-size.spec.ts @@ -0,0 +1,244 @@ +import { encode, prepare } from '@ipld/dag-pb' +import { expect } from 
'aegir/chai' +import { UnixFS } from 'ipfs-unixfs' +import { CID } from 'multiformats/cid' +import { sha256 } from 'multiformats/hashes/sha2' +import { + varintLen, + utf8ByteLength, + linkSerializedSize, + dataFieldSerializedSize +} from '../src/utils/pb-size.js' +import type { PBNode } from '@ipld/dag-pb' + +// helper: create a CID from arbitrary bytes for testing +async function fakeCid (version: 0 | 1, bytes: Uint8Array): Promise { + const hash = await sha256.digest(bytes) + return CID.create(version, version === 0 ? 0x70 : 0x55, hash) +} + +describe('pb-size', () => { + describe('varintLen', () => { + it('returns 1 for small values (0-127)', () => { + expect(varintLen(0)).to.equal(1) + expect(varintLen(1)).to.equal(1) + expect(varintLen(127)).to.equal(1) + }) + + it('returns 2 for values 128-16383', () => { + expect(varintLen(128)).to.equal(2) + expect(varintLen(255)).to.equal(2) + expect(varintLen(16383)).to.equal(2) + }) + + it('returns 3 for values 16384-2097151', () => { + expect(varintLen(16384)).to.equal(3) + expect(varintLen(262144)).to.equal(3) + }) + + it('handles large values', () => { + expect(varintLen(2 ** 32)).to.equal(5) + expect(varintLen(Number.MAX_SAFE_INTEGER)).to.equal(8) + }) + }) + + describe('utf8ByteLength', () => { + const textEncoder = new TextEncoder() + + it('counts ASCII strings correctly', () => { + expect(utf8ByteLength('')).to.equal(0) + expect(utf8ByteLength('hello')).to.equal(5) + expect(utf8ByteLength('hello world')).to.equal(11) + }) + + it('matches TextEncoder for ASCII', () => { + const str = 'abcdefghijklmnopqrstuvwxyz0123456789' + expect(utf8ByteLength(str)).to.equal(textEncoder.encode(str).length) + }) + + it('counts 2-byte characters correctly', () => { + // U+00E9 (e-acute) = 2 UTF-8 bytes + const str = '\u00e9' + expect(utf8ByteLength(str)).to.equal(2) + expect(utf8ByteLength(str)).to.equal(textEncoder.encode(str).length) + }) + + it('counts 3-byte characters correctly', () => { + // U+4E16 (CJK character) = 3 UTF-8 
bytes + const str = '\u4e16' + expect(utf8ByteLength(str)).to.equal(3) + expect(utf8ByteLength(str)).to.equal(textEncoder.encode(str).length) + }) + + it('counts surrogate pairs (4-byte characters) correctly', () => { + // U+1F600 (grinning face emoji) = 4 UTF-8 bytes, stored as surrogate pair + const str = '\uD83D\uDE00' + expect(utf8ByteLength(str)).to.equal(4) + expect(utf8ByteLength(str)).to.equal(textEncoder.encode(str).length) + }) + + it('matches TextEncoder for mixed content', () => { + const str = 'hello-\u00e9\u4e16\uD83D\uDE00-world' + expect(utf8ByteLength(str)).to.equal(textEncoder.encode(str).length) + }) + }) + + describe('linkSerializedSize', () => { + it('matches encode(prepare(node)) for a single CIDv0 link', async () => { + const cid = await fakeCid(0, new Uint8Array([1, 2, 3])) + const name = 'test-file.txt' + const tsize = 1024 + + const node: PBNode = { + Data: new Uint8Array(0), + Links: [{ Hash: cid, Name: name, Tsize: tsize }] + } + const encoded = encode(prepare(node)) + // subtract the empty-data overhead (1 tag + 1 varint + 0 bytes = 2) + const dataOverhead = 2 + const expected = encoded.byteLength - dataOverhead + + const nameBytes = utf8ByteLength(name) + expect(linkSerializedSize(nameBytes, cid.byteLength, tsize)).to.equal(expected) + }) + + it('matches encode(prepare(node)) for a single CIDv1 link', async () => { + const cid = await fakeCid(1, new Uint8Array([4, 5, 6])) + const name = 'short' + const tsize = 42 + + const node: PBNode = { + Data: new Uint8Array(0), + Links: [{ Hash: cid, Name: name, Tsize: tsize }] + } + const encoded = encode(prepare(node)) + const dataOverhead = 2 + const expected = encoded.byteLength - dataOverhead + + const nameBytes = utf8ByteLength(name) + expect(linkSerializedSize(nameBytes, cid.byteLength, tsize)).to.equal(expected) + }) + + it('matches encode(prepare(node)) for multiple links', async () => { + const cid1 = await fakeCid(1, new Uint8Array([1])) + const cid2 = await fakeCid(1, new 
Uint8Array([2])) + const cid3 = await fakeCid(1, new Uint8Array([3])) + + const links = [ + { Hash: cid1, Name: 'aaa', Tsize: 100 }, + { Hash: cid2, Name: 'bbb', Tsize: 200 }, + { Hash: cid3, Name: 'ccc', Tsize: 300 } + ] + + const node: PBNode = { Data: new Uint8Array(0), Links: links } + const encoded = encode(prepare(node)) + const dataOverhead = 2 + + const sum = links.reduce((acc, l) => { + return acc + linkSerializedSize( + utf8ByteLength(l.Name), l.Hash.byteLength, l.Tsize + ) + }, 0) + + expect(sum).to.equal(encoded.byteLength - dataOverhead) + }) + + it('handles tsize = 0', async () => { + const cid = await fakeCid(1, new Uint8Array([7])) + const name = 'empty' + + const node: PBNode = { + Data: new Uint8Array(0), + Links: [{ Hash: cid, Name: name, Tsize: 0 }] + } + const encoded = encode(prepare(node)) + const dataOverhead = 2 + const expected = encoded.byteLength - dataOverhead + + expect(linkSerializedSize(utf8ByteLength(name), cid.byteLength, 0)).to.equal(expected) + }) + + it('handles large tsize', async () => { + const cid = await fakeCid(1, new Uint8Array([8])) + const name = 'big' + const tsize = 1_073_741_824 // 1 GiB + + const node: PBNode = { + Data: new Uint8Array(0), + Links: [{ Hash: cid, Name: name, Tsize: tsize }] + } + const encoded = encode(prepare(node)) + const dataOverhead = 2 + const expected = encoded.byteLength - dataOverhead + + expect(linkSerializedSize(utf8ByteLength(name), cid.byteLength, tsize)).to.equal(expected) + }) + }) + + describe('dataFieldSerializedSize', () => { + it('returns 4 for a plain directory (no mode, no mtime)', () => { + const unixfs = new UnixFS({ type: 'directory' }) + const node: PBNode = { Data: unixfs.marshal(), Links: [] } + const encoded = encode(prepare(node)) + + expect(dataFieldSerializedSize()).to.equal(encoded.byteLength) + expect(dataFieldSerializedSize(undefined, undefined)).to.equal(4) + }) + + it('omits default directory mode (0o755)', () => { + const unixfs = new UnixFS({ type: 'directory', 
mode: 0o755 }) + const node: PBNode = { Data: unixfs.marshal(), Links: [] } + const encoded = encode(prepare(node)) + + expect(dataFieldSerializedSize(0o755)).to.equal(encoded.byteLength) + // should be same as no mode + expect(dataFieldSerializedSize(0o755)).to.equal(dataFieldSerializedSize()) + }) + + it('includes non-default mode', () => { + const mode = 0o555 + const unixfs = new UnixFS({ type: 'directory', mode }) + const node: PBNode = { Data: unixfs.marshal(), Links: [] } + const encoded = encode(prepare(node)) + + expect(dataFieldSerializedSize(mode)).to.equal(encoded.byteLength) + expect(dataFieldSerializedSize(mode)).to.be.greaterThan(dataFieldSerializedSize()) + }) + + it('includes mtime with seconds only', () => { + const mtime = { secs: 1000000n } + const unixfs = new UnixFS({ type: 'directory', mtime }) + const node: PBNode = { Data: unixfs.marshal(), Links: [] } + const encoded = encode(prepare(node)) + + expect(dataFieldSerializedSize(undefined, mtime)).to.equal(encoded.byteLength) + }) + + it('includes mtime with seconds and nanoseconds', () => { + const mtime = { secs: 1000000n, nsecs: 500000 } + const unixfs = new UnixFS({ type: 'directory', mtime }) + const node: PBNode = { Data: unixfs.marshal(), Links: [] } + const encoded = encode(prepare(node)) + + expect(dataFieldSerializedSize(undefined, mtime)).to.equal(encoded.byteLength) + }) + + it('matches full directory node size (data + links)', async () => { + const cid = await fakeCid(1, new Uint8Array([1])) + const name = 'test' + const tsize = 42 + + const unixfs = new UnixFS({ type: 'directory' }) + const node: PBNode = { + Data: unixfs.marshal(), + Links: [{ Hash: cid, Name: name, Tsize: tsize }] + } + const encoded = encode(prepare(node)) + + const computed = dataFieldSerializedSize() + + linkSerializedSize(utf8ByteLength(name), cid.byteLength, tsize) + + expect(computed).to.equal(encoded.byteLength) + }) + }) +})