Skip to content

Commit b649c0c

Browse files
committed
rf: Read TSV files as streams
1 parent 1e6d650 commit b649c0c

File tree

4 files changed

+115
-33
lines changed

4 files changed

+115
-33
lines changed

src/files/tsv.test.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import { assert, assertEquals, assertObjectMatch } from '@std/assert'
2+
import { pathToFile } from './filetree.ts'
3+
import { loadTSV } from './tsv.ts'
4+
import { streamFromString } from '../tests/utils.ts'
5+
import { ColumnsMap } from '../types/columns.ts'
6+
7+
/**
 * Exercises `loadTSV` against in-memory streams: empty input, header-only input,
 * single-column input, a missing trailing newline, and the two malformed-row
 * issues (`TSV_EMPTY_LINE`, `TSV_EQUAL_ROWS`).
 */
Deno.test('TSV loading', async (t) => {
  /**
   * Runs `run` and returns whatever it threw.
   * Throws if `run` resolves, so an error-path step cannot pass vacuously.
   */
  const thrownBy = async (
    run: () => Promise<unknown>,
  ): Promise<Record<PropertyKey, unknown>> => {
    try {
      await run()
    } catch (e: unknown) {
      return e as Record<PropertyKey, unknown>
    }
    throw new Error('Expected loadTSV to throw, but it resolved')
  }

  await t.step('Empty file produces empty map', async () => {
    const file = pathToFile('/empty.tsv')
    file.stream = streamFromString('')

    const map = await loadTSV(file)
    // map.size looks for a column called size, so work around it
    assertEquals(Object.keys(map).length, 0)
  })

  await t.step('Single row file produces header-only map', async () => {
    const file = pathToFile('/single_row.tsv')
    file.stream = streamFromString('a\tb\tc\n')

    const map = await loadTSV(file)
    assertEquals(map.a, [])
    assertEquals(map.b, [])
    assertEquals(map.c, [])
  })

  await t.step('Single column file produces single column map', async () => {
    const file = pathToFile('/single_column.tsv')
    file.stream = streamFromString('a\n1\n2\n3\n')

    const map = await loadTSV(file)
    assertEquals(map.a, ['1', '2', '3'])
  })

  await t.step('Missing final newline is ignored', async () => {
    const file = pathToFile('/missing_newline.tsv')
    file.stream = streamFromString('a\n1\n2\n3')

    const map = await loadTSV(file)
    assertEquals(map.a, ['1', '2', '3'])
  })

  await t.step('Empty row throws issue', async () => {
    const file = pathToFile('/empty_row.tsv')
    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n\n4\t5\t6\n')

    // The blank line is the third line of the file (header is line 1)
    const issue = await thrownBy(() => loadTSV(file))
    assertObjectMatch(issue, { key: 'TSV_EMPTY_LINE', line: 3 })
  })

  await t.step('Mismatched row length throws issue', async () => {
    const file = pathToFile('/mismatched_row.tsv')
    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n4\t5\n')

    // The short row is the third line of the file (header is line 1)
    const issue = await thrownBy(() => loadTSV(file))
    assertObjectMatch(issue, { key: 'TSV_EQUAL_ROWS', line: 3 })
  })

  // Tests will have populated the memoization cache
  await loadTSV.cache.clear()
})

src/files/tsv.ts

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,58 @@
22
* TSV
33
* Module for parsing TSV
44
*/
5+
import { TextLineStream } from '@std/streams'
56
import { ColumnsMap } from '../types/columns.ts'
67
import type { BIDSFile } from '../types/filetree.ts'
78
import { filememoizeAsync } from '../utils/memoize.ts'
8-
import type { WithCache } from '../utils/memoize.ts'
9-
10-
const normalizeEOL = (str: string): string => str.replace(/\r\n/g, '\n').replace(/\r/g, '\n')
11-
// Typescript resolved `row && !/^\s*$/.test(row)` as `string | boolean`
12-
const isContentfulRow = (row: string): boolean => !!(row && !/^\s*$/.test(row))
9+
import { createUTF8Stream } from './streams.ts'
1310

1411
/**
 * Parse a TSV file into a ColumnsMap by reading its stream one line at a time.
 *
 * The first line is split on tabs to obtain the column headers; every later
 * line is split on tabs and appended to the matching columns. A single empty
 * line at the very end of the stream is tolerated.
 *
 * @param file File whose `stream` is decoded with `createUTF8Stream` and split
 *   into lines with `TextLineStream`.
 * @returns A ColumnsMap with one entry per header, each holding the values read
 *   for that column in row order. An empty stream yields an empty map.
 * @throws `{ key: 'TSV_COLUMN_HEADER_DUPLICATE', evidence }` when a header name
 *   appears more than once.
 * @throws `{ key: 'TSV_EMPTY_LINE', line }` when an empty line is followed by
 *   more content. `line` is 1-based, counting the header as line 1.
 * @throws `{ key: 'TSV_EQUAL_ROWS', line }` when a row has a different number
 *   of fields than the header.
 */
async function _loadTSV(file: BIDSFile): Promise<ColumnsMap> {
  const reader = file.stream
    .pipeThrough(createUTF8Stream())
    .pipeThrough(new TextLineStream())
    .getReader()

  try {
    const headerRow = await reader.read()
    const headers = (headerRow.done || !headerRow.value) ? [] : headerRow.value.split('\t')

    // The map is built from [header, column] entries below, so a repeated header
    // would silently overwrite the earlier column. Report it instead.
    if (new Set(headers).size !== headers.length) {
      throw { key: 'TSV_COLUMN_HEADER_DUPLICATE', evidence: headers.join(', ') }
    }

    // Initialize columns in array for construction efficiency
    const initialCapacity = 1000
    const columns: string[][] = headers.map(() => new Array<string>(initialCapacity))

    let rowIndex = 0 // Keep in scope after loop
    for (; ; rowIndex++) {
      const { done, value } = await reader.read()
      if (done) break

      // Expect a newline at the end of the file, but otherwise error on empty lines
      if (!value) {
        const nextRow = await reader.read()
        if (nextRow.done) break
        // rowIndex counts data rows from 0 and the header is line 1, hence + 2
        throw { key: 'TSV_EMPTY_LINE', line: rowIndex + 2 }
      }

      const values = value.split('\t')
      if (values.length !== headers.length) {
        throw { key: 'TSV_EQUAL_ROWS', line: rowIndex + 2 }
      }
      columns.forEach((column, columnIndex) => {
        // Double array size if we exceed the current capacity
        if (rowIndex >= column.length) {
          column.length = column.length * 2
        }
        column[rowIndex] = values[columnIndex]
      })
    }

    // Construct map, truncating columns to number of rows read
    return new ColumnsMap(
      headers.map((header, index) => [header, columns[index].slice(0, rowIndex)]),
    )
  } finally {
    // Cancel before unlocking so an early throw propagates cancellation through
    // the pipe to the source stream instead of leaving it half-read. Cancelling
    // a stream that is already closed resolves immediately. A rejection here is
    // ignored so it cannot replace the issue thrown above.
    await reader.cancel().catch(() => {})
    reader.releaseLock()
  }
}
58+
59+
export const loadTSV = filememoizeAsync(_loadTSV)

src/tests/regression.test.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { assert } from '@std/assert'
22
import { pathsToTree } from '../files/filetree.ts'
33
import { validate } from '../validators/bids.ts'
44
import type { BIDSFile } from '../types/filetree.ts'
5+
import { streamFromString } from './utils.ts'
56

67
Deno.test('Regression tests', async (t) => {
78
await t.step('Verify ignored files in scans.tsv do not trigger error', async () => {
@@ -17,7 +18,7 @@ Deno.test('Regression tests', async (t) => {
1718
// Without ignore, NOT_INCLUDED is triggered for CT, but the scans file is happy
1819
let ds = pathsToTree(paths)
1920
let scans_tsv = ds.get('sub-01/sub-01_scans.tsv') as BIDSFile
20-
scans_tsv.text = () => Promise.resolve(scans_content)
21+
scans_tsv.stream = streamFromString(scans_content)
2122
let result = await validate(ds, {
2223
datasetPath: '/dataset',
2324
debug: 'ERROR',
@@ -30,7 +31,7 @@ Deno.test('Regression tests', async (t) => {
3031
// With ignore, NOT_INCLUDED is not triggered for CT, and the scans file is still happy
3132
ds = pathsToTree(paths, ignore)
3233
scans_tsv = ds.get('sub-01/sub-01_scans.tsv') as BIDSFile
33-
scans_tsv.text = () => Promise.resolve(scans_content)
34+
scans_tsv.stream = streamFromString(scans_content)
3435
result = await validate(ds, {
3536
datasetPath: '/dataset',
3637
debug: 'ERROR',

src/types/columns.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
/**
 * Map of column name to column values that can also be accessed as an object
 * (e.g. `columns.name` as well as `columns.get('name')`).
 *
 * The constructor does not return `this`: it returns a Proxy wrapping a plain
 * Map populated from `iterable`, using the `columnMapAccessorProxy` handler.
 */
export class ColumnsMap extends Map<string, string[]> {
  // Property access yields either a Map member or a column's values
  [key: string]: Map<string, string[]>[keyof Map<string, string[]>] | string[]

  /**
   * @param iterable Optional `[name, values]` entries used to populate the map.
   */
  constructor(iterable?: Iterable<readonly [string, string[]]>) {
    super()
    const backing = new Map<string, string[]>(iterable) as ColumnsMap
    return new Proxy<ColumnsMap>(backing, columnMapAccessorProxy)
  }
}

0 commit comments

Comments
 (0)