Skip to content

Commit 1a82209

Browse files
committed
perf: O(1) incremental directory size estimation in DirFlat
the block-bytes shard split strategy called marshal().byteLength after every file insert, re-serializing the full protobuf directory node each time. for a 4766-file directory this meant ~11.3M link serializations total (O(N^2)), causing the v1-2025 directory threshold tests to take ~150s each.

src/utils/pb-size.ts:
- new file with pure arithmetic functions ported from @ipld/dag-pb's pb-encode.js (varintLen, linkSerializedSize, dataFieldSerializedSize)
- utf8ByteLength computes UTF-8 byte count without TextEncoder allocation, matching what @ipld/dag-pb uses for PBLink.Name encoding

src/dir-flat.ts:
- estimateNodeSize() now computes exact serialized size arithmetically instead of calling marshal(), matching pb-encode.js byte-for-byte
- put() incrementally adjusts nodeSize (O(1) per insert) instead of invalidating it (which forced O(N) recomputation on next estimate)
- both strategies use explicit if/else with throw on unknown value
- links-bytes: use utf8ByteLength(name) instead of name.length (correctness fix for non-ASCII names, matches Go's len(name))

test/pb-size.spec.ts:
- unit tests for all four pb-size.ts functions, verified against @ipld/dag-pb's encode(prepare(node)) output

test/ipip-499-profiles.spec.ts:
- pre-compute shared filename arrays once in before() hook instead of calling deterministicFilenames 7 times with duplicate params
- per-describe blockstores to reduce memory accumulation
- removed explicit timeouts (no longer needed)
- updated stale comments about block-bytes re-serialization

v1-2025 directory tests: ~150s -> ~150ms (1000x faster)
full IPIP-499 suite (21 tests): minutes -> ~5s
1 parent 37229b3 commit 1a82209

5 files changed

Lines changed: 509 additions & 68 deletions

File tree

packages/ipfs-unixfs-importer/src/dir-flat.ts

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { encode, prepare } from '@ipld/dag-pb'
22
import { UnixFS } from 'ipfs-unixfs'
33
import { Dir } from './dir.ts'
4+
import { dataFieldSerializedSize, linkSerializedSize, utf8ByteLength } from './utils/pb-size.ts'
45
import { persist } from './utils/persist.ts'
56
import type { DirProps } from './dir.ts'
67
import type { ImportResult, InProgressImportResult } from './index.ts'
@@ -19,10 +20,42 @@ export class DirFlat extends Dir {
1920
}
2021

2122
/**
 * Adds the entry `name` to this directory, replacing any existing entry
 * with the same name.
 *
 * If a node size estimate is currently cached it is adjusted in place:
 * the contribution of the replaced entry (if any) is removed and the
 * contribution of the new entry is added, so the estimate does not have
 * to be recomputed from all children. Only entries that have both a cid
 * and a size contribute.
 *
 * The cached cid and size of this directory are always cleared.
 *
 * @throws {Error} when a size estimate is cached and `shardSplitStrategy`
 * is neither 'links-bytes' nor 'block-bytes'
 */
async put (name: string, value: InProgressImportResult | Dir): Promise<void> {
  const cachedSize = this.nodeSize

  if (cachedSize !== undefined) {
    const strategy = this.options?.shardSplitStrategy
    const exact = strategy === 'block-bytes'

    if (!exact && strategy !== 'links-bytes') {
      throw new Error(`unknown shardSplitStrategy: ${strategy}`)
    }

    const replaced = this._children.get(name)
    const nameBytes = utf8ByteLength(name)
    let adjusted = cachedSize

    // take out what the entry being overwritten contributes
    if (replaced?.cid != null && replaced?.size != null) {
      adjusted -= exact
        ? linkSerializedSize(nameBytes, replaced.cid.byteLength, Number(replaced.size))
        : nameBytes + replaced.cid.byteLength
    }

    // account for the incoming entry
    if (value.cid != null && value.size != null) {
      adjusted += exact
        ? linkSerializedSize(nameBytes, value.cid.byteLength, Number(value.size))
        : nameBytes + value.cid.byteLength
    }

    // a negative total means the bookkeeping drifted; drop the cached
    // value so the next estimate recomputes it from scratch
    this.nodeSize = adjusted < 0 ? undefined : adjusted
  }

  this.cid = undefined
  this.size = undefined
  this._children.set(name, value)
}
2861

@@ -89,18 +122,29 @@ export class DirFlat extends Dir {
89122
return this.nodeSize
90123
}
91124

92-
this.nodeSize = 0
93-
94-
if (this.options?.shardSplitStrategy === 'links-bytes') {
95-
// estimate size only based on DAGLink name and CID byte lengths
125+
const strategy = this.options?.shardSplitStrategy
126+
if (strategy === 'links-bytes') {
127+
// estimate size based on DAGLink name (UTF-8 byte length) and CID byte lengths
96128
// @see https://github.com/ipfs/go-unixfsnode/blob/37b47f1f917f1b2f54c207682f38886e49896ef9/data/builder/directory.go#L81-L96
129+
this.nodeSize = 0
130+
for (const [name, child] of this._children.entries()) {
131+
if (child.size != null && child.cid != null) {
132+
this.nodeSize += utf8ByteLength(name) + child.cid.byteLength
133+
}
134+
}
135+
} else if (strategy === 'block-bytes') {
136+
// compute exact serialized size arithmetically
137+
// (matches marshal().byteLength without allocating byte arrays)
138+
this.nodeSize = dataFieldSerializedSize(this.mode, this.mtime)
97139
for (const [name, child] of this._children.entries()) {
98-
if (child.size != null && (child.cid != null)) {
99-
this.nodeSize += name.length + child.cid.byteLength
140+
if (child.size != null && child.cid != null) {
141+
this.nodeSize += linkSerializedSize(
142+
utf8ByteLength(name), child.cid.byteLength, Number(child.size)
143+
)
100144
}
101145
}
102146
} else {
103-
this.nodeSize = this.marshal().byteLength
147+
throw new Error(`unknown shardSplitStrategy: ${strategy}`)
104148
}
105149

106150
return this.nodeSize
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
/**
2+
* Protobuf size calculation utilities for DAG-PB nodes.
3+
*
4+
* Computes exact serialized sizes matching @ipld/dag-pb's encoding
5+
* without allocating byte arrays. Used by DirFlat to avoid O(N)
6+
* re-serialization on every file insert.
7+
*
8+
* Ported from @ipld/dag-pb/src/pb-encode.js (sov, len64, sizeLink, sizeNode)
9+
* and boxo's directory.go estimatedSize logic.
10+
*/
11+
12+
import type { Mtime } from 'ipfs-unixfs'
13+
14+
// --- varint helpers (from @ipld/dag-pb/src/pb-encode.js:166-214) ---
15+
16+
// 2^32: first value that no longer fits in 32 bits
const maxInt32 = 2 ** 32

// len8tab[b] is the number of bits needed to represent byte value b
// (0 for b = 0, 8 for b in 128..255)
const len8tab = Array.from({ length: 256 }, (_, byte) => 32 - Math.clz32(byte))

/**
 * Minimum number of bits needed to represent the non-negative integer `x`
 * (0 for x = 0).
 *
 * The value is narrowed in steps (upper 32 bits, then 16, then 8) until
 * what remains fits the byte lookup table.
 */
function len64 (x: number): number {
  let bits = 0
  let rest = x

  if (rest >= maxInt32) {
    // bitwise operators only see 32 bits, so drop the low word by division
    rest = Math.floor(rest / maxInt32)
    bits = 32
  }

  if (rest >= (1 << 16)) {
    rest >>>= 16
    bits += 16
  }

  if (rest >= (1 << 8)) {
    rest >>>= 8
    bits += 8
  }

  return bits + len8tab[rest]
}
53+
54+
/**
 * Number of bytes a protobuf varint encoding of `x` occupies.
 *
 * Mirrors sov() from @ipld/dag-pb.
 */
export function varintLen (x: number): number {
  // Setting the lowest bit never changes the bit length of a positive
  // number, but it turns 0 into 1 so that zero still measures one byte.
  const measured = x % 2 === 0 ? x + 1 : x

  // ceil(bits / 7): every varint byte carries 7 bits of payload
  return Math.floor((len64(measured) + 6) / 7)
}
63+
64+
/**
 * Compute the UTF-8 byte length of a JS string without allocation.
 *
 * Produces the same result as `new TextEncoder().encode(str).length`
 * without creating a Uint8Array, including for strings that are not
 * well-formed UTF-16: an unpaired surrogate is counted as 3 bytes, the
 * size of the U+FFFD replacement character TextEncoder emits for it.
 *
 * @param str - the string to measure
 * @returns the number of bytes `str` occupies when encoded as UTF-8
 */
export function utf8ByteLength (str: string): number {
  let len = 0

  for (let i = 0; i < str.length; i++) {
    const c = str.charCodeAt(i)

    if (c < 0x80) {
      // ASCII: 1 UTF-8 byte
      len++
    } else if (c < 0x800) {
      // U+0080 - U+07FF: 2 UTF-8 bytes
      len += 2
    } else if (c >= 0xD800 && c <= 0xDBFF) {
      // High surrogate: only forms a code point above U+FFFF when the
      // next unit is a low surrogate. charCodeAt() returns NaN past the
      // end of the string, which fails both comparisons.
      const next = str.charCodeAt(i + 1)

      if (next >= 0xDC00 && next <= 0xDFFF) {
        // valid pair: one code point, 4 UTF-8 bytes, consume both units
        i++
        len += 4
      } else {
        // unpaired high surrogate: encoded as U+FFFD (3 bytes); the
        // following unit is measured on its own in the next iteration
        len += 3
      }
    } else {
      // U+0800 - U+FFFF: 3 UTF-8 bytes (an unpaired low surrogate lands
      // here too and is encoded as U+FFFD, also 3 bytes)
      len += 3
    }
  }

  return len
}
97+
98+
/**
 * Exact number of bytes one PBLink contributes to an encoded PBNode.
 *
 * Follows sizeLink() and the per-link wrapper in sizeNode() from
 * pb-encode.js: the link body is the sum of its Hash, Name and Tsize
 * fields, and the body is then embedded in the node as a
 * length-delimited Links entry.
 *
 * @param nameByteLen - UTF-8 byte length of the link name
 * @param cidByteLength - byte length of the link's CID
 * @param tsize - the link's Tsize value
 */
export function linkSerializedSize (nameByteLen: number, cidByteLength: number, tsize: number): number {
  // Hash: tag(1) + varint(cidLen) + cid bytes
  const hashField = 1 + varintLen(cidByteLength) + cidByteLength
  // Name: tag(1) + varint(nameLen) + name bytes
  const nameField = 1 + varintLen(nameByteLen) + nameByteLen
  // Tsize: tag(1) + varint(tsize)
  const tsizeField = 1 + varintLen(tsize)

  const linkLen = hashField + nameField + tsizeField

  // Links entry in PBNode: tag(1) + varint(linkLen) + link bytes
  return 1 + varintLen(linkLen) + linkLen
}
115+
116+
// Mode a directory has when none is given: 0o755 (493 decimal)
const DIR_DEFAULT_MODE = 0o755

/**
 * Exact number of bytes the PBNode Data field occupies for a UnixFS
 * directory.
 *
 * Only valid for directories: the type field is fixed at the 2 bytes a
 * directory uses and the mode is compared against the directory default
 * (0o755). File nodes use a different type byte and default mode (0o644)
 * and must not be measured with this function.
 *
 * With neither mode nor mtime set the result is always 4 bytes: a 2-byte
 * inner message [0x08, 0x01] plus a 1-byte tag and a 1-byte length.
 *
 * @param mode - directory mode; ignored when undefined or equal to 0o755
 * @param mtime - modification time; omitted from the size when null/undefined
 */
export function dataFieldSerializedSize (mode?: number, mtime?: Mtime): number {
  // type field [0x08, 0x01]
  const typeField = 2

  // mode (field 7, varint) is only written when it differs from the default
  const modeField = mode === undefined || mode === DIR_DEFAULT_MODE
    ? 0
    : 1 + varintLen(mode)

  // mtime (field 8) is a nested UnixTime message
  let mtimeField = 0
  if (mtime != null) {
    const secs = Number(mtime.secs)
    // Seconds (field 1, int64 varint): a negative int64 is always 10 bytes
    // in protobuf's two's complement varint form
    const secondsField = 1 + (secs < 0 ? 10 : varintLen(secs))
    // FractionalNanoseconds (field 2, fixed32) is optional
    const nanosField = mtime.nsecs != null ? 1 + 4 : 0
    const unixTimeLen = secondsField + nanosField

    mtimeField = 1 + varintLen(unixTimeLen) + unixTimeLen
  }

  const innerSize = typeField + modeField + mtimeField

  // Data entry in PBNode: tag(1) + varint(innerSize) + inner bytes
  return 1 + varintLen(innerSize) + innerSize
}

packages/ipfs-unixfs-importer/test/helpers/deterministic.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
* Deterministic pseudo-random byte generation matching kubo's Go test helpers.
33
*
44
* Algorithm:
5-
* 1. key = sha256(utf8(seed)) -- 32 bytes
6-
* 2. nonce = 12 zero bytes (ChaCha20-IETF)
7-
* 3. return chacha20(key, nonce, zeros(size)) -- keystream XOR zeros = keystream
5+
* 1. key = sha256(utf8(seed)) -- 32 bytes
6+
* 2. nonce = 12 zero bytes (ChaCha20-IETF)
7+
* 3. return chacha20(key, nonce, zeros(size)) -- keystream XOR zeros = keystream
88
*
99
* @see https://github.com/ipfs/kubo/blob/master/test/cli/testutils/random_deterministic.go
1010
*/
@@ -56,6 +56,9 @@ export async function * deterministicRandomStream (size: number, seed: string):
5656
*
5757
* Files 0..count-2 get `nameLen` chars, the last file gets `lastNameLen` chars.
5858
* Each byte from the ChaCha20 stream is mapped through AlphabetEasy modulo.
59+
*
60+
* Note: the stream is 1 MiB, so total bytes consumed
61+
* ((count-1)*nameLen + lastNameLen) must not exceed 1,048,576.
5962
*/
6063
export async function deterministicFilenames (
6164
count: number,

0 commit comments

Comments
 (0)