Skip to content

Commit a01f420

Browse files
authored
Merge pull request bids-standard#264 from bids-standard/enh/error_on_non_utf8_tsv
Throw a UTF-8 encoding error for TSV files; add non-UTF-8 test data and a test.
2 parents d798ffc + f92bd22 commit a01f420

File tree

5 files changed

+81
-7
lines changed

5 files changed

+81
-7
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<!--
2+
A new scriv changelog fragment.
3+
4+
Uncomment the section that is right (remove the HTML comment wrapper).
5+
For top level release notes, leave all the headers commented out.
6+
-->
7+
8+
<!--
9+
### Added
10+
11+
- A bullet item for the Added category.
12+
13+
-->
14+
<!--
15+
### Changed
16+
17+
- A bullet item for the Changed category.
18+
19+
-->
20+
### Fixed
21+
22+
- Throw a UTF-8 encoding error for TSV files, as is already done for JSON files.
23+
24+
<!--
25+
### Deprecated
26+
27+
- A bullet item for the Deprecated category.
28+
29+
-->
30+
<!--
31+
### Removed
32+
33+
- A bullet item for the Removed category.
34+
35+
-->
36+
<!--
37+
### Security
38+
39+
- A bullet item for the Security category.
40+
41+
-->
42+
<!--
43+
### Infrastructure
44+
45+
- A bullet item for the Infrastructure category.
46+
47+
-->

src/files/streams.ts

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@ export class UnicodeDecodeError extends Error {
88
}
99
}
1010

11+
const _decode = TextDecoder.prototype.decode
12+
13+
TextDecoder.prototype.decode = function(input, options) {
14+
try {
15+
return _decode.call(this, input, options)
16+
} catch (error) {
17+
throw { code: 'INVALID_FILE_ENCODING', message: error}
18+
}
19+
}
20+
1121
/**
1222
* A transformer that ensures the input stream is valid UTF-8 and throws
1323
* a UnicodeDecodeError if a UTF-16 BOM is detected
@@ -16,15 +26,15 @@ export class UTF8StreamTransformer implements Transformer<Uint8Array, string> {
1626
private decoder: TextDecoder
1727
private firstChunk: boolean
1828

19-
constructor() {
20-
this.decoder = new TextDecoder('utf-8')
29+
constructor(options = {fatal: false}) {
30+
this.decoder = new TextDecoder('utf-8', options)
2131
this.firstChunk = true
2232
}
2333

2434
transform(chunk: Uint8Array, controller: TransformStreamDefaultController<string>) {
2535
// Check first chunk for UTF-16 BOM
2636
if (this.firstChunk) {
27-
const decoded = this.decoder.decode(chunk, { stream: true })
37+
let decoded = this.decoder.decode(chunk, { stream: true })
2838
if (decoded.startsWith('\uFFFD')) {
2939
throw new UnicodeDecodeError('This file appears to be UTF-16')
3040
}
@@ -46,6 +56,6 @@ export class UTF8StreamTransformer implements Transformer<Uint8Array, string> {
4656
/**
4757
* Creates a TransformStream that validates and decodes UTF-8 text
4858
*/
49-
export function createUTF8Stream() {
50-
return new TransformStream(new UTF8StreamTransformer())
59+
export function createUTF8Stream(options = {fatal: false}) {
60+
return new TransformStream(new UTF8StreamTransformer(options))
5161
}

src/files/tsv.test.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
assertStrictEquals,
77
} from '@std/assert'
88
import { pathToFile } from './filetree.ts'
9+
import { BIDSFileDeno } from './deno.ts'
910
import { loadTSV, loadTSVGZ } from './tsv.ts'
1011
import { streamFromString } from '../tests/utils.ts'
1112
import { ColumnsMap } from '../types/columns.ts'
@@ -182,6 +183,18 @@ Deno.test('TSV loading', async (t) => {
182183
}
183184
})
184185

186+
await t.step('Raises issue on non utf-8', async () => {
187+
const file = new BIDSFileDeno('', './tests/data/iso8859.tsv')
188+
189+
try {
190+
await loadTSV(file)
191+
assert(false, 'Expected error')
192+
} catch (e: any) {
193+
assertObjectMatch(e, { code: 'INVALID_FILE_ENCODING' })
194+
}
195+
})
196+
197+
185198
// Tests will have populated the memoization cache
186199
loadTSV.cache.clear()
187200
})

src/files/tsv.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import type { BIDSFile } from '../types/filetree.ts'
88
import { filememoizeAsync } from '../utils/memoize.ts'
99
import { createUTF8Stream } from './streams.ts'
1010
import { openStream } from './access.ts'
11+
import { BIDSFileDeno } from './deno.ts'
1112

1213
async function loadColumns(
1314
reader: ReadableStreamDefaultReader<string>,
@@ -58,7 +59,7 @@ export async function loadTSVGZ(
5859
): Promise<ColumnsMap> {
5960
const reader = openStream(file)
6061
.pipeThrough(new DecompressionStream('gzip'))
61-
.pipeThrough(createUTF8Stream())
62+
.pipeThrough(createUTF8Stream({ fatal: true }))
6263
.pipeThrough(new TextLineStream())
6364
.getReader()
6465

@@ -77,7 +78,7 @@ export async function loadTSVGZ(
7778

7879
async function _loadTSV(file: BIDSFile, maxRows: number = -1): Promise<ColumnsMap> {
7980
const reader = openStream(file)
80-
.pipeThrough(createUTF8Stream())
81+
.pipeThrough(createUTF8Stream({ fatal: true }))
8182
.pipeThrough(new TextLineStream())
8283
.getReader()
8384

tests/data/iso8859.tsv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Name Age City
2+
José 30 São Paulo
3+
Müller 25 München

0 commit comments

Comments
 (0)