Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/ipfs-unixfs-importer/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@
"uint8arrays": "^5.1.0"
},
"devDependencies": {
"@noble/ciphers": "^1.2.1",
Comment thread
achingbrain marked this conversation as resolved.
Outdated
"aegir": "^47.0.16",
"it-last": "^3.0.9",
"wherearewe": "^2.0.1"
Expand Down
62 changes: 53 additions & 9 deletions packages/ipfs-unixfs-importer/src/dir-flat.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { encode, prepare } from '@ipld/dag-pb'
import { UnixFS } from 'ipfs-unixfs'
import { Dir } from './dir.ts'
import { dataFieldSerializedSize, linkSerializedSize, utf8ByteLength } from './utils/pb-size.ts'
import { persist } from './utils/persist.ts'
import type { DirProps } from './dir.ts'
import type { ImportResult, InProgressImportResult } from './index.ts'
Expand All @@ -19,10 +20,42 @@ export class DirFlat extends Dir {
}

async put (name: string, value: InProgressImportResult | Dir): Promise<void> {
if (this.nodeSize !== undefined) {
Comment thread
achingbrain marked this conversation as resolved.
const oldChild = this._children.get(name)

const strategy = this.options?.shardSplitStrategy
if (strategy === 'links-bytes') {
const nameBytes = utf8ByteLength(name)
if (oldChild?.cid != null && oldChild?.size != null) {
this.nodeSize -= nameBytes + oldChild.cid.byteLength
}
if (value.cid != null && value.size != null) {
this.nodeSize += nameBytes + value.cid.byteLength
}
} else if (strategy === 'block-bytes') {
const nameBytes = utf8ByteLength(name)
if (oldChild?.cid != null && oldChild?.size != null) {
this.nodeSize -= linkSerializedSize(
nameBytes, oldChild.cid.byteLength, Number(oldChild.size)
)
}
if (value.cid != null && value.size != null) {
this.nodeSize += linkSerializedSize(
nameBytes, value.cid.byteLength, Number(value.size)
)
}
} else {
throw new Error(`unknown shardSplitStrategy: ${strategy}`)
}

// safety: reset on underflow to force recomputation
if (this.nodeSize < 0) {
this.nodeSize = undefined
}
}

this.cid = undefined
this.size = undefined
this.nodeSize = undefined

this._children.set(name, value)
}

Expand Down Expand Up @@ -89,18 +122,29 @@ export class DirFlat extends Dir {
return this.nodeSize
}

this.nodeSize = 0

if (this.options?.shardSplitStrategy === 'links-bytes') {
// estimate size only based on DAGLink name and CID byte lengths
const strategy = this.options?.shardSplitStrategy
if (strategy === 'links-bytes') {
// estimate size based on DAGLink name (UTF-8 byte length) and CID byte lengths
// @see https://github.com/ipfs/go-unixfsnode/blob/37b47f1f917f1b2f54c207682f38886e49896ef9/data/builder/directory.go#L81-L96
this.nodeSize = 0
for (const [name, child] of this._children.entries()) {
if (child.size != null && child.cid != null) {
this.nodeSize += utf8ByteLength(name) + child.cid.byteLength
}
}
} else if (strategy === 'block-bytes') {
// compute exact serialized size arithmetically
// (matches marshal().byteLength without allocating byte arrays)
this.nodeSize = dataFieldSerializedSize(this.mode, this.mtime)
for (const [name, child] of this._children.entries()) {
if (child.size != null && (child.cid != null)) {
this.nodeSize += name.length + child.cid.byteLength
if (child.size != null && child.cid != null) {
this.nodeSize += linkSerializedSize(
utf8ByteLength(name), child.cid.byteLength, Number(child.size)
)
}
}
} else {
this.nodeSize = this.marshal().byteLength
throw new Error(`unknown shardSplitStrategy: ${strategy}`)
}

return this.nodeSize
Expand Down
158 changes: 158 additions & 0 deletions packages/ipfs-unixfs-importer/src/utils/pb-size.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/**
* Protobuf size calculation utilities for DAG-PB nodes.
*
* Computes exact serialized sizes matching @ipld/dag-pb's encoding
* without allocating byte arrays. Used by DirFlat to avoid O(N)
* re-serialization on every file insert.
*
* Ported from @ipld/dag-pb/src/pb-encode.js (sov, len64, sizeLink, sizeNode)
* and boxo's directory.go estimatedSize logic.
*/

import type { Mtime } from 'ipfs-unixfs'

// --- varint helpers (from @ipld/dag-pb/src/pb-encode.js:166-214) ---

// 2^32 (one more than the largest unsigned 32-bit value)
const maxInt32 = 2 ** 32

// len8tab[b] is the number of bits needed to represent the byte value b:
// 0 for b = 0, 1 for b = 1, 2 for b = 2..3, ... 8 for b = 128..255.
// Generated rather than spelled out; the contents are identical to the
// 256-entry literal table in @ipld/dag-pb's pb-encode.js.
const len8tab: number[] = Array.from({ length: 256 }, (_, b) => 32 - Math.clz32(b))

/**
 * Number of bits needed to represent the non-negative integer `x`
 * (0 for x = 0).
 *
 * The value is reduced to a single byte in up to three steps, counting the
 * bits dropped at each step, and the remaining byte is looked up in len8tab.
 */
function len64 (x: number): number {
  let width = 0
  let rest = x

  // values of 2^32 and above cannot go through the 32-bit shift operators,
  // so the upper half is extracted by division
  if (rest >= maxInt32) {
    rest = Math.floor(rest / maxInt32)
    width = 32
  }

  if (rest >= 0x10000) {
    rest >>>= 16
    width += 16
  }

  if (rest >= 0x100) {
    rest >>>= 8
    width += 8
  }

  return width + len8tab[rest]
}

/**
 * Number of bytes a protobuf varint needs to hold `x`, matching
 * @ipld/dag-pb's sov().
 *
 * @param x - non-negative integer to be encoded
 * @returns the encoded length in bytes (7 payload bits per byte, rounded up)
 */
export function varintLen (x: number): number {
  // an even value has the same bit length as the odd value right above it,
  // except for 0 which this bumps to 1 so that it still occupies one byte
  const odd = x % 2 === 0 ? x + 1 : x

  return Math.floor((len64(odd) + 6) / 7)
}

/**
 * Compute the UTF-8 byte length of a JS string without allocation.
 *
 * @ipld/dag-pb encodes PBLink.Name via TextEncoder (UTF-8), so the value
 * returned here has to equal `new TextEncoder().encode(str).length` for
 * every input, while avoiding the Uint8Array allocation on every put() call.
 *
 * That includes ill-formed strings: TextEncoder replaces an unpaired
 * surrogate with U+FFFD, which is 3 UTF-8 bytes, so a high surrogate is only
 * treated as the start of a 4-byte sequence when it is immediately followed
 * by a low surrogate.
 *
 * @param str - the string to measure
 * @returns the number of bytes `str` occupies when encoded as UTF-8
 */
export function utf8ByteLength (str: string): number {
  let len = 0
  for (let i = 0; i < str.length; i++) {
    const c = str.charCodeAt(i)
    if (c < 0x80) {
      // ASCII: 1 UTF-8 byte
      len++
    } else if (c < 0x800) {
      // U+0080 - U+07FF: 2 UTF-8 bytes
      len += 2
    } else if (c >= 0xD800 && c <= 0xDBFF) {
      // High surrogate: only forms a pair with a directly following low
      // surrogate. charCodeAt() yields NaN past the end of the string, which
      // fails both comparisons below.
      const next = str.charCodeAt(i + 1)
      if (next >= 0xDC00 && next <= 0xDFFF) {
        // Valid pair -> one code point above U+FFFF: 4 UTF-8 bytes.
        // Skip the low surrogate, it has been accounted for.
        i++
        len += 4
      } else {
        // Unpaired high surrogate -> encoded as U+FFFD: 3 UTF-8 bytes.
        // The following code unit is measured on its own.
        len += 3
      }
    } else {
      // U+0800 - U+FFFF: 3 UTF-8 bytes (an unpaired low surrogate lands
      // here too and is encoded as U+FFFD, also 3 bytes)
      len += 3
    }
  }
  return len
}

/**
 * Exact number of bytes one PBLink contributes to the encoded PBNode.
 *
 * Mirrors sizeLink() plus the per-link wrapper applied by sizeNode() in
 * @ipld/dag-pb's pb-encode.js:
 *
 *   link  = Hash  (tag + varint(cidLen)  + cidLen)
 *         + Name  (tag + varint(nameLen) + nameLen)
 *         + Tsize (tag + varint(tsize))
 *   total = tag + varint(link) + link
 *
 * Every tag is a single byte.
 *
 * @param nameByteLen - UTF-8 byte length of the link name
 * @param cidByteLength - byte length of the link's CID
 * @param tsize - the link's Tsize value
 * @returns bytes added to the PBNode encoding by this link
 */
export function linkSerializedSize (nameByteLen: number, cidByteLength: number, tsize: number): number {
  const hashField = 1 + varintLen(cidByteLength) + cidByteLength
  const nameField = 1 + varintLen(nameByteLen) + nameByteLen
  const tsizeField = 1 + varintLen(tsize)

  const linkLen = hashField + nameField + tsizeField

  // length-delimited entry in PBNode.Links
  return 1 + varintLen(linkLen) + linkLen
}

// Mode that is treated as the directory default (0o755 = 493) and therefore
// not counted when sizing the Data field
const DIR_DEFAULT_MODE = 0o755

/**
 * Exact number of bytes the PBNode Data field occupies for a UnixFS
 * directory.
 *
 * Directory-only: the type field is counted as a fixed 2 bytes and the mode
 * is compared against the directory default 0o755. Do not use for file nodes
 * (different type byte, different default mode 0o644).
 *
 * With neither mode nor mtime the result is always 4 bytes: a 2 byte UnixFS
 * message [0x08, 0x01] wrapped as tag(1) + length(1) + payload(2).
 *
 * @param mode - directory mode; ignored when undefined or equal to 0o755
 * @param mtime - modification time; ignored when null/undefined
 * @returns bytes added to the PBNode encoding by the Data field
 */
export function dataFieldSerializedSize (mode?: number, mtime?: Mtime): number {
  // type field [0x08, 0x01] of the inner UnixFS message
  let unixfsLen = 2

  // mode (field 7, varint) is only present when set to a non-default value
  if (!(mode === undefined || mode === DIR_DEFAULT_MODE)) {
    unixfsLen += 1 + varintLen(mode)
  }

  // mtime (field 8) is a nested UnixTime message
  if (mtime != null) {
    const seconds = Number(mtime.secs)

    // Seconds (field 1, int64 varint): a negative int64 is always 10 bytes
    // in protobuf's two's complement varint form
    const secondsField = 1 + (seconds < 0 ? 10 : varintLen(seconds))

    // FractionalNanoseconds (field 2, fixed32) is optional
    const nanosField = mtime.nsecs != null ? 1 + 4 : 0

    const unixTimeLen = secondsField + nanosField
    unixfsLen += 1 + varintLen(unixTimeLen) + unixTimeLen
  }

  // length-delimited PBNode.Data: tag + varint(length) + payload
  return 1 + varintLen(unixfsLen) + unixfsLen
}
86 changes: 86 additions & 0 deletions packages/ipfs-unixfs-importer/test/helpers/deterministic.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/**
* Deterministic pseudo-random byte generation matching kubo's Go test helpers.
*
* Algorithm:
* 1. key = sha256(utf8(seed)) -- 32 bytes
* 2. nonce = 12 zero bytes (ChaCha20-IETF)
* 3. return chacha20(key, nonce, zeros(size)) -- keystream XOR zeros = keystream
*
* @see https://github.com/ipfs/kubo/blob/master/test/cli/testutils/random_deterministic.go
*/

import { chacha20 } from '@noble/ciphers/chacha'
Comment thread
achingbrain marked this conversation as resolved.
Outdated
import { sha256 } from 'multiformats/hashes/sha2'

/**
 * Alphabet used for generated names, matching Go's testutils.AlphabetEasy
 * exactly: a-z, the digit run "01234567890" (so "0" appears twice), then
 * "-" and "_" for a total of 39 characters.
 */
const ALPHABET_EASY = 'abcdefghijklmnopqrstuvwxyz' + '01234567890' + '-_'

// Bytes of keystream covered by one step of the ChaCha20 block counter
const CHACHA20_BLOCK_LEN = 64

/**
 * Produce `size` deterministic pseudo-random bytes derived from `seed`.
 * Matches Go's DeterministicRandomReaderBytes.
 *
 * The key is the SHA-256 digest of the UTF-8 encoded seed and the nonce is
 * all zeros; encrypting a zero-filled buffer returns the raw keystream.
 *
 * @param size - number of bytes to produce
 * @param seed - seed string the key is derived from
 * @returns a buffer of `size` keystream bytes
 */
export async function deterministicRandomBytes (size: number, seed: string): Promise<Uint8Array> {
  const seedBytes = new TextEncoder().encode(seed)
  const { digest: key } = await sha256.digest(seedBytes) // 32 byte key
  const zeroNonce = new Uint8Array(12)
  const zeros = new Uint8Array(size)

  return chacha20(key, zeroNonce, zeros)
}

/**
 * Produce `size` deterministic pseudo-random bytes as an async iterable,
 * yielding chunks of at most 1 MiB so no huge buffer is allocated at once.
 *
 * The ChaCha20 block counter is carried over from chunk to chunk so that the
 * concatenated output is one continuous keystream, matching Go's streaming
 * reader.
 *
 * @param size - total number of bytes to produce
 * @param seed - seed string the key is derived from
 */
export async function * deterministicRandomStream (size: number, seed: string): AsyncGenerator<Uint8Array> {
  const chunkSize = 1_048_576 // 1 MiB
  const { digest: key } = await sha256.digest(new TextEncoder().encode(seed))
  const nonce = new Uint8Array(12)

  let blockCounter = 0
  for (let left = size; left > 0;) {
    const take = Math.min(left, chunkSize)
    yield chacha20(key, nonce, new Uint8Array(take), undefined, blockCounter)

    // advance by the number of keystream blocks this chunk consumed
    blockCounter += Math.ceil(take / CHACHA20_BLOCK_LEN)
    left -= take
  }
}

/**
 * Generate deterministic filenames matching kubo's createDeterministicFiles.
 *
 * Files 0..count-2 get `nameLen` chars, the last file gets `lastNameLen`
 * chars. Each byte from the ChaCha20 stream is mapped through AlphabetEasy
 * modulo its length.
 *
 * The stream is always 1 MiB, so the total bytes consumed
 * ((count-1)*nameLen + lastNameLen) must not exceed 1,048,576. This is
 * checked up front: reading past the end of the stream would otherwise
 * silently append the text "undefined" to names instead of failing.
 *
 * @param count - number of filenames to generate
 * @param nameLen - length of every filename except the last
 * @param lastNameLen - length of the last filename
 * @param seed - seed string for the deterministic byte stream
 * @returns the generated filenames, in order
 * @throws {RangeError} if the names need more bytes than the stream holds
 */
export async function deterministicFilenames (
  count: number,
  nameLen: number,
  lastNameLen: number,
  seed: string
): Promise<string[]> {
  // Match Go: DeterministicRandomReader("1MiB", seed) - always 1 MiB stream
  const streamSize = 1_048_576

  const requiredBytes = count > 0 ? (count - 1) * nameLen + lastNameLen : 0
  if (requiredBytes > streamSize) {
    throw new RangeError(`deterministic filenames need ${requiredBytes} bytes but the stream only has ${streamSize}`)
  }

  const stream = await deterministicRandomBytes(streamSize, seed)

  const names: string[] = []
  let offset = 0

  for (let i = 0; i < count; i++) {
    const currentLen = (i === count - 1) ? lastNameLen : nameLen
    let name = ''
    for (let j = 0; j < currentLen; j++) {
      name += ALPHABET_EASY[stream[offset + j] % ALPHABET_EASY.length]
    }
    names.push(name)
    offset += currentLen
  }

  return names
}
Loading
Loading