/**
 * TSV
 * Module for parsing TSV
 */
| 5 | +import { TextLineStream } from '@std/streams' |
5 | 6 | import { ColumnsMap } from '../types/columns.ts' |
6 | 7 | import type { BIDSFile } from '../types/filetree.ts' |
7 | 8 | import { filememoizeAsync } from '../utils/memoize.ts' |
8 | | -import type { WithCache } from '../utils/memoize.ts' |
9 | | - |
10 | | -const normalizeEOL = (str: string): string => str.replace(/\r\n/g, '\n').replace(/\r/g, '\n') |
11 | | -// Typescript resolved `row && !/^\s*$/.test(row)` as `string | boolean` |
12 | | -const isContentfulRow = (row: string): boolean => !!(row && !/^\s*$/.test(row)) |
| 9 | +import { createUTF8Stream } from './streams.ts' |
13 | 10 |
|
14 | 11 | async function _loadTSV(file: BIDSFile): Promise<ColumnsMap> { |
15 | | - return await file.text().then(parseTSV) |
16 | | -} |
| 12 | + const reader = file.stream |
| 13 | + .pipeThrough(createUTF8Stream()) |
| 14 | + .pipeThrough(new TextLineStream()) |
| 15 | + .getReader() |
17 | 16 |
|
18 | | -export const loadTSV = filememoizeAsync(_loadTSV) |
| 17 | + try { |
| 18 | + const headerRow = await reader.read() |
| 19 | + const headers = (headerRow.done || !headerRow.value) ? [] : headerRow.value.split('\t') |
19 | 20 |
|
20 | | -function parseTSV(contents: string) { |
21 | | - const columns = new ColumnsMap() |
22 | | - const rows: string[][] = normalizeEOL(contents) |
23 | | - .split('\n') |
24 | | - .filter(isContentfulRow) |
25 | | - .map((str) => str.split('\t')) |
26 | | - const headers = rows.length ? rows[0] : [] |
| 21 | + // Initialize columns in array for construction efficiency |
| 22 | + const initialCapacity = 1000 |
| 23 | + const columns: string[][] = headers.map(() => new Array<string>(initialCapacity)) |
27 | 24 |
|
28 | | - if (rows.some((row) => row.length !== headers.length)) { |
29 | | - throw { key: 'TSV_EQUAL_ROWS' } |
30 | | - } |
| 25 | + let rowIndex = 0 // Keep in scope after loop |
| 26 | + for (; ; rowIndex++) { |
| 27 | + const { done, value } = await reader.read() |
| 28 | + if (done) break |
31 | 29 |
|
32 | | - headers.map((x) => { |
33 | | - columns[x] = [] |
34 | | - }) |
35 | | - if (headers.length !== Object.keys(columns).length) { |
36 | | - throw { key: 'TSV_COLUMN_HEADER_DUPLICATE', evidence: headers.join(', ') } |
37 | | - } |
38 | | - for (let i = 1; i < rows.length; i++) { |
39 | | - for (let j = 0; j < headers.length; j++) { |
40 | | - const col = columns[headers[j]] as string[] |
41 | | - col.push(rows[i][j]) |
| 30 | + // Expect a newline at the end of the file, but otherwise error on empty lines |
| 31 | + if (!value) { |
| 32 | + const nextRow = await reader.read() |
| 33 | + if (nextRow.done) break |
| 34 | + throw { key: 'TSV_EMPTY_LINE', line: rowIndex + 2 } |
| 35 | + } |
| 36 | + |
| 37 | + const values = value.split('\t') |
| 38 | + if (values.length !== headers.length) { |
| 39 | + throw { key: 'TSV_EQUAL_ROWS', line: rowIndex + 2 } |
| 40 | + } |
| 41 | + columns.forEach((column, columnIndex) => { |
| 42 | + // Double array size if we exceed the current capacity |
| 43 | + if (rowIndex >= column.length) { |
| 44 | + column.length = column.length * 2 |
| 45 | + } |
| 46 | + column[rowIndex] = values[columnIndex] |
| 47 | + }) |
42 | 48 | } |
| 49 | + |
| 50 | + // Construct map, truncating columns to number of rows read |
| 51 | + return new ColumnsMap( |
| 52 | + headers.map((header, index) => [header, columns[index].slice(0, rowIndex)]), |
| 53 | + ) |
| 54 | + } finally { |
| 55 | + reader.releaseLock() |
43 | 56 | } |
44 | | - return columns |
45 | 57 | } |

// Memoized entry point — presumably caches the parsed ColumnsMap per file
// (see filememoizeAsync); verify cache invalidation against the memoize util.
export const loadTSV = filememoizeAsync(_loadTSV)
0 commit comments