Skip to content

Commit 7efcec9

Browse files
committed
rf: Abstract out UTF8-enforcing stream handling
1 parent da2c6e6 commit 7efcec9

File tree

9 files changed

+225
-30
lines changed

9 files changed

+225
-30
lines changed

deno.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"@std/io": "jsr:@std/[email protected]",
4242
"@std/log": "jsr:@std/[email protected]",
4343
"@std/path": "jsr:@std/[email protected]",
44+
"@std/streams": "jsr:@std/[email protected]",
4445
"@std/yaml": "jsr:@std/yaml@^1.0.4"
4546
},
4647
"tasks": {

src/files/deno.test.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ import { readAll, readerFromStreamReader } from '@std/io'
33
import { basename, dirname, fromFileUrl, join } from '@std/path'
44
import { EOL } from '@std/fs'
55
import type { FileTree } from '../types/filetree.ts'
6-
import { BIDSFileDeno, readFileTree, UnicodeDecodeError } from './deno.ts'
6+
import { BIDSFileDeno, readFileTree } from './deno.ts'
7+
import { UnicodeDecodeError } from './streams.ts'
78
import { requestReadPermission } from '../setup/requestPermissions.ts'
89
import { FileIgnoreRules } from './ignore.ts'
910

src/files/deno.ts

Lines changed: 8 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,9 @@ import { type BIDSFile, FileTree } from '../types/filetree.ts'
77
import { requestReadPermission } from '../setup/requestPermissions.ts'
88
import { FileIgnoreRules, readBidsIgnore } from './ignore.ts'
99
import { logger } from '../utils/logger.ts'
10+
import { createUTF8Stream } from './streams.ts'
1011
export { type BIDSFile, FileTree }
1112

12-
/**
13-
* Thrown when a text file is decoded as UTF-8 but contains UTF-16 characters
14-
*/
15-
export class UnicodeDecodeError extends Error {
16-
constructor(message: string) {
17-
super(message)
18-
this.name = 'UnicodeDecode'
19-
}
20-
}
21-
2213
/**
2314
* Deno implementation of BIDSFile
2415
*/
@@ -67,27 +58,17 @@ export class BIDSFileDeno implements BIDSFile {
6758
* Read the entire file and decode as utf-8 text
6859
*/
6960
async text(): Promise<string> {
70-
const streamReader = this.stream
71-
.pipeThrough(new TextDecoderStream('utf-8'))
72-
.getReader()
73-
let data = ''
61+
const reader = this.stream.pipeThrough(createUTF8Stream()).getReader()
62+
const chunks: string[] = []
7463
try {
75-
// Read once to check for unicode issues
76-
const { done, value } = await streamReader.read()
77-
// Check for UTF-16 BOM
78-
if (value && value.startsWith('\uFFFD')) {
79-
throw new UnicodeDecodeError('This file appears to be UTF-16')
80-
}
81-
if (done) return data
82-
data += value
83-
// Continue reading the rest of the file if no unicode issues were found
8464
while (true) {
85-
const { done, value } = await streamReader.read()
86-
if (done) return data
87-
data += value
65+
const { done, value } = await reader.read()
66+
if (done) break
67+
chunks.push(value)
8868
}
69+
return chunks.join('')
8970
} finally {
90-
streamReader.releaseLock()
71+
reader.releaseLock()
9172
}
9273
}
9374

src/files/filetree.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ import { FileIgnoreRules } from './ignore.ts'
55

66
const nullFile = {
77
size: 0,
8-
stream: new ReadableStream(),
8+
stream: new ReadableStream({
9+
start(controller) {
10+
controller.close()
11+
}
12+
}),
913
text: () => Promise.resolve(''),
1014
readBytes: async (size: number, offset?: number) => new Uint8Array(),
1115
parent: new FileTree('', '/'),

src/files/json.test.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import { type assert, assertObjectMatch } from '@std/assert'
2-
import type { BIDSFileDeno, UnicodeDecodeError } from './deno.ts'
32
import type { BIDSFile } from '../types/filetree.ts'
43
import type { FileIgnoreRules } from './ignore.ts'
54

src/files/streams.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import { assert, assertEquals } from '@std/assert'
2+
import { createUTF8Stream, UnicodeDecodeError } from './streams.ts'
3+
import { streamFromUint8Array, streamFromString } from '../tests/utils.ts'
4+
5+
// Tests for createUTF8Stream: construction, decoding of valid UTF-8, and
// rejection of input that starts with a UTF-16 byte-order mark.
Deno.test('createUTF8Stream', async (t) => {
  await t.step('should return a TransformStream with UTF8StreamTransformer', () => {
    const stream = createUTF8Stream()
    assertEquals(stream instanceof TransformStream, true)
  })

  await t.step('should correctly transform UTF-8 input', async () => {
    const rawstream = streamFromString('Hello, world!')
    const reader = rawstream.pipeThrough(createUTF8Stream()).getReader()
    const { value } = await reader.read()
    assertEquals(value, 'Hello, world!')

    await reader.cancel()
  })

  await t.step('should throw UnicodeDecodeError for UTF-16 input', async () => {
    // 0xFF 0xFE is the UTF-16 (little-endian) byte-order mark
    const rawStream = streamFromUint8Array(new Uint8Array([0xFF, 0xFE, 0x00, 0x00]))

    let reader: ReadableStreamDefaultReader<string> | undefined
    let value: string | undefined
    let caught: unknown
    try {
      // The exception can't be localized to either of the following lines
      // but is raised before the second returns
      reader = rawStream.pipeThrough(createUTF8Stream()).getReader()
      const result = await reader.read()
      value = result.value
    } catch (e) {
      caught = e
    } finally {
      // Actually invoke cancel(). On a stream that has already errored,
      // cancel() rejects with the stored error, so that rejection is
      // deliberately ignored here; the error itself is asserted on below.
      await reader?.cancel().catch(() => {})
    }

    // Assert outside the try/catch so that a missing exception is reported as
    // such instead of being caught and re-checked as if it were the error.
    assert(caught instanceof UnicodeDecodeError, 'Expected UnicodeDecodeError, got ' + value)
    assertEquals(caught.message, 'This file appears to be UTF-16')
  })
})

src/files/streams.ts

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/**
 * Error signalling that a file read as UTF-8 text looks like it is actually
 * encoded as UTF-16.
 *
 * The `name` reported on instances is 'UnicodeDecode'.
 */
export class UnicodeDecodeError extends Error {
  override name = 'UnicodeDecode'

  constructor(message: string) {
    super(message)
  }
}
10+
11+
/**
 * A transformer that incrementally decodes UTF-8 bytes into strings and
 * throws a UnicodeDecodeError if the input appears to be UTF-16.
 *
 * Detection is heuristic: a UTF-16 byte-order mark (e.g. 0xFF 0xFE) is not
 * valid UTF-8, so the decoder replaces it with U+FFFD. If the first decoded
 * character of the stream is U+FFFD, the stream is rejected.
 */
export class UTF8StreamTransformer implements Transformer<Uint8Array, string> {
  private decoder: TextDecoder
  // True until the first non-empty piece of decoded text has been inspected
  private firstChunk: boolean

  constructor() {
    this.decoder = new TextDecoder('utf-8')
    this.firstChunk = true
  }

  /**
   * Decode one chunk of bytes and enqueue the resulting text.
   *
   * @param chunk Raw bytes read from the source stream
   * @param controller Controller used to enqueue the decoded text
   * @throws UnicodeDecodeError if the first decoded character of the stream is U+FFFD
   */
  transform(chunk: Uint8Array, controller: TransformStreamDefaultController<string>) {
    // stream: true keeps an incomplete trailing multi-byte sequence buffered
    // in the decoder so it can be completed by the next chunk
    const decoded = this.decoder.decode(chunk, { stream: true })

    // Only inspect (and then stop inspecting) once text has actually been
    // produced. A chunk that decodes to '' (zero-length chunk, or a UTF-8 BOM
    // that the decoder strips) must not switch the check off, otherwise the
    // first real characters would never be examined.
    if (this.firstChunk && decoded.length > 0) {
      if (decoded.startsWith('\uFFFD')) {
        throw new UnicodeDecodeError('This file appears to be UTF-16')
      }
      this.firstChunk = false
    }

    controller.enqueue(decoded)
  }

  /**
   * Emit whatever the decoder still has buffered once the input ends.
   *
   * @param controller Controller used to enqueue the remaining text
   */
  flush(controller: TransformStreamDefaultController<string>) {
    // decode() without arguments ends the streaming session and returns any
    // text for bytes still held by the decoder
    const final = this.decoder.decode()
    if (final) {
      controller.enqueue(final)
    }
  }
}
45+
46+
/**
 * Build a new TransformStream backed by a fresh UTF8StreamTransformer, for
 * piping a byte stream through to obtain decoded, checked UTF-8 text.
 */
export function createUTF8Stream() {
  const transformer = new UTF8StreamTransformer()
  return new TransformStream(transformer)
}

src/files/tsv.test.ts

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import { assert, assertEquals, assertObjectMatch } from '@std/assert'
2+
import { pathToFile } from './filetree.ts'
3+
import { loadTSV } from './tsv.ts'
4+
import { streamFromString } from '../tests/utils.ts'
5+
import { ColumnsMap } from '../types/columns.ts'
6+
7+
// Tests for loadTSV: header/column parsing, malformed-row issues, and the
// maxRows limit. loadTSV is memoized, so the cache is cleared whenever the
// same file object is reloaded with different content or arguments.
Deno.test('TSV loading', async (t) => {
  await t.step('Empty file produces empty map', async () => {
    const file = pathToFile('/empty.tsv')
    file.stream = streamFromString('')

    const map = await loadTSV(file)
    // `map.size` is resolved as a column lookup rather than an entry count,
    // so count the keys instead
    assertEquals(Object.keys(map).length, 0)
  })

  await t.step('Single row file produces header-only map', async () => {
    const file = pathToFile('/single_row.tsv')
    file.stream = streamFromString('a\tb\tc\n')

    const map = await loadTSV(file)
    assertEquals(map.a, [])
    assertEquals(map.b, [])
    assertEquals(map.c, [])
  })

  await t.step('Single column file produces single column map', async () => {
    const file = pathToFile('/single_column.tsv')
    file.stream = streamFromString('a\n1\n2\n3\n')

    const map = await loadTSV(file)
    assertEquals(map.a, ['1', '2', '3'])
  })

  await t.step('Missing final newline is ignored', async () => {
    const file = pathToFile('/missing_newline.tsv')
    file.stream = streamFromString('a\n1\n2\n3')

    const map = await loadTSV(file)
    assertEquals(map.a, ['1', '2', '3'])
  })

  await t.step('Empty row throws issue', async () => {
    const file = pathToFile('/empty_row.tsv')
    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n\n4\t5\t6\n')

    // Capture the error and assert afterwards, so the step fails if loadTSV
    // resolves without throwing
    let thrown: unknown
    try {
      await loadTSV(file)
    } catch (e) {
      thrown = e
    }
    assert(typeof thrown === 'object' && thrown !== null, 'Expected loadTSV to throw TSV_EMPTY_LINE')
    assertObjectMatch(thrown as Record<PropertyKey, unknown>, { key: 'TSV_EMPTY_LINE', line: 3 })
  })

  await t.step('Mismatched row length throws issue', async () => {
    const file = pathToFile('/mismatched_row.tsv')
    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n4\t5\n')

    // Capture the error and assert afterwards, so the step fails if loadTSV
    // resolves without throwing
    let thrown: unknown
    try {
      await loadTSV(file)
    } catch (e) {
      thrown = e
    }
    assert(typeof thrown === 'object' && thrown !== null, 'Expected loadTSV to throw TSV_EQUAL_ROWS')
    assertObjectMatch(thrown as Record<PropertyKey, unknown>, { key: 'TSV_EQUAL_ROWS', line: 3 })
  })

  await t.step('maxRows limits the number of rows read', async () => {
    const file = pathToFile('/long.tsv')
    // Use 1500 to avoid overlap with default initial capacity
    const text = 'a\tb\tc\n' + '1\t2\t3\n'.repeat(1500)
    file.stream = streamFromString(text)

    let map = await loadTSV(file, 0)
    assertEquals(map.a, [])
    assertEquals(map.b, [])
    assertEquals(map.c, [])

    // Clear memoization cache. We currently do not key on maxRows.
    loadTSV.cache.clear()
    file.stream = streamFromString(text)
    map = await loadTSV(file, 1)
    assertEquals(map.a, ['1'])
    assertEquals(map.b, ['2'])
    assertEquals(map.c, ['3'])

    loadTSV.cache.clear()
    file.stream = streamFromString(text)
    map = await loadTSV(file, 2)
    assertEquals(map.a, ['1', '1'])
    assertEquals(map.b, ['2', '2'])
    assertEquals(map.c, ['3', '3'])

    loadTSV.cache.clear()
    file.stream = streamFromString(text)
    map = await loadTSV(file, -1)
    assertEquals(map.a, Array(1500).fill('1'))
    assertEquals(map.b, Array(1500).fill('2'))
    assertEquals(map.c, Array(1500).fill('3'))

    loadTSV.cache.clear()
    // Check that maxRows does not truncate shorter files
    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n4\t5\t6\n7\t8\t9\n')
    map = await loadTSV(file, 4)
    assertEquals(map.a, ['1', '4', '7'])
    assertEquals(map.b, ['2', '5', '8'])
    assertEquals(map.c, ['3', '6', '9'])
  })

  // Tests will have populated the memoization cache
  loadTSV.cache.clear()
})

src/tests/utils.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/**
 * Wrap a byte array in a ReadableStream that delivers the array as a single
 * chunk and then closes.
 */
export function streamFromUint8Array(arr: Uint8Array): ReadableStream<Uint8Array> {
  return new ReadableStream<Uint8Array>({
    start: (ctrl) => {
      // Everything is available up front: emit it once and end the stream
      ctrl.enqueue(arr)
      ctrl.close()
    },
  })
}
9+
10+
/**
 * Encode a string with TextEncoder and expose the resulting bytes as a
 * ReadableStream via streamFromUint8Array.
 */
export function streamFromString(str: string): ReadableStream<Uint8Array> {
  const encoded = new TextEncoder().encode(str)
  return streamFromUint8Array(encoded)
}

0 commit comments

Comments
 (0)