Skip to content

Commit 0320631

Browse files
committed
Throw a UTF-8 encoding error for TSV files; add non-UTF-8 test data and a test.
1 parent a115575 commit 0320631

File tree

5 files changed

+79
-7
lines changed

5 files changed

+79
-7
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<!--
2+
A new scriv changelog fragment.
3+
4+
Uncomment the section that is right (remove the HTML comment wrapper).
5+
For top level release notes, leave all the headers commented out.
6+
-->
7+
8+
<!--
9+
### Added
10+
11+
- A bullet item for the Added category.
12+
13+
-->
14+
<!--
15+
### Changed
16+
17+
- A bullet item for the Changed category.
18+
19+
-->
20+
### Fixed
21+
22+
- Throw a UTF-8 encoding error for TSV files that are not valid UTF-8, as is already done for JSON files.
23+
24+
<!--
25+
### Deprecated
26+
27+
- A bullet item for the Deprecated category.
28+
29+
-->
30+
<!--
31+
### Removed
32+
33+
- A bullet item for the Removed category.
34+
35+
-->
36+
<!--
37+
### Security
38+
39+
- A bullet item for the Security category.
40+
41+
-->
42+
<!--
43+
### Infrastructure
44+
45+
- A bullet item for the Infrastructure category.
46+
47+
-->

src/files/streams.ts

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,23 @@ export class UTF8StreamTransformer implements Transformer<Uint8Array, string> {
1616
private decoder: TextDecoder
1717
private firstChunk: boolean
1818

19-
constructor() {
20-
this.decoder = new TextDecoder('utf-8')
19+
constructor(fatal: boolean = false) {
20+
this.decoder = new TextDecoder('utf-8', {fatal})
2121
this.firstChunk = true
2222
}
2323

2424
transform(chunk: Uint8Array, controller: TransformStreamDefaultController<string>) {
2525
// Check first chunk for UTF-16 BOM
2626
if (this.firstChunk) {
27-
const decoded = this.decoder.decode(chunk, { stream: true })
27+
let decoded
28+
try {
29+
decoded = this.decoder.decode(chunk, { stream: true })
30+
} catch {
31+
throw { code: 'INVALID_FILE_ENCODING' }
32+
}
33+
if (decoded === undefined) {
34+
throw { code: 'FILE_DECODING_ERROR' }
35+
}
2836
if (decoded.startsWith('\uFFFD')) {
2937
throw new UnicodeDecodeError('This file appears to be UTF-16')
3038
}
@@ -46,6 +54,6 @@ export class UTF8StreamTransformer implements Transformer<Uint8Array, string> {
4654
/**
4755
* Creates a TransformStream that validates and decodes UTF-8 text
4856
*/
49-
export function createUTF8Stream() {
50-
return new TransformStream(new UTF8StreamTransformer())
57+
export function createUTF8Stream(fatal: boolean = false) {
58+
return new TransformStream(new UTF8StreamTransformer(fatal))
5159
}

src/files/tsv.test.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
assertStrictEquals,
77
} from '@std/assert'
88
import { pathToFile } from './filetree.ts'
9+
import { BIDSFileDeno } from './deno.ts'
910
import { loadTSV, loadTSVGZ } from './tsv.ts'
1011
import { streamFromString } from '../tests/utils.ts'
1112
import { ColumnsMap } from '../types/columns.ts'
@@ -182,6 +183,18 @@ Deno.test('TSV loading', async (t) => {
182183
}
183184
})
184185

186+
await t.step('Raises issue on non utf-8', async () => {
187+
const file = new BIDSFileDeno('', './tests/data/iso8859.tsv')
188+
189+
try {
190+
await loadTSV(file)
191+
assert(false, 'Expected error')
192+
} catch (e: any) {
193+
assertObjectMatch(e, { code: 'INVALID_FILE_ENCODING' })
194+
}
195+
})
196+
197+
185198
// Tests will have populated the memoization cache
186199
loadTSV.cache.clear()
187200
})

src/files/tsv.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import type { BIDSFile } from '../types/filetree.ts'
88
import { filememoizeAsync } from '../utils/memoize.ts'
99
import { createUTF8Stream } from './streams.ts'
1010
import { openStream } from './access.ts'
11+
import { BIDSFileDeno } from './deno.ts'
1112

1213
async function loadColumns(
1314
reader: ReadableStreamDefaultReader<string>,
@@ -58,7 +59,7 @@ export async function loadTSVGZ(
5859
): Promise<ColumnsMap> {
5960
const reader = openStream(file)
6061
.pipeThrough(new DecompressionStream('gzip'))
61-
.pipeThrough(createUTF8Stream())
62+
.pipeThrough(createUTF8Stream(true))
6263
.pipeThrough(new TextLineStream())
6364
.getReader()
6465

@@ -77,7 +78,7 @@ export async function loadTSVGZ(
7778

7879
async function _loadTSV(file: BIDSFile, maxRows: number = -1): Promise<ColumnsMap> {
7980
const reader = openStream(file)
80-
.pipeThrough(createUTF8Stream())
81+
.pipeThrough(createUTF8Stream(true))
8182
.pipeThrough(new TextLineStream())
8283
.getReader()
8384

tests/data/iso8859.tsv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Name Age City
2+
José 30 São Paulo
3+
Müller 25 München

0 commit comments

Comments
 (0)