Skip to content

Commit 96c2b52

Browse files
authored
Merge pull request #240 from effigies/feat/tsv-gz
feat: Enable type checking on tsv.gz files
2 parents c1bec83 + eec3496 commit 96c2b52

File tree

6 files changed

+279
-58
lines changed

6 files changed

+279
-58
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<!--
2+
A new scriv changelog fragment.
3+
4+
Uncomment the section that is right (remove the HTML comment wrapper).
5+
For top level release notes, leave all the headers commented out.
6+
-->
7+
8+
### Added
9+
10+
- Load `.tsv.gz` column contents for validation.
11+
12+
<!--
13+
### Changed
14+
15+
- A bullet item for the Changed category.
16+
17+
-->
18+
<!--
19+
### Fixed
20+
21+
- A bullet item for the Fixed category.
22+
23+
-->
24+
<!--
25+
### Deprecated
26+
27+
- A bullet item for the Deprecated category.
28+
29+
-->
30+
<!--
31+
### Removed
32+
33+
- A bullet item for the Removed category.
34+
35+
-->
36+
<!--
37+
### Security
38+
39+
- A bullet item for the Security category.
40+
41+
-->
42+
<!--
43+
### Infrastructure
44+
45+
- A bullet item for the Infrastructure category.
46+
47+
-->

src/files/tsv.test.ts

Lines changed: 111 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import {
66
assertStrictEquals,
77
} from '@std/assert'
88
import { pathToFile } from './filetree.ts'
9-
import { loadTSV } from './tsv.ts'
9+
import { loadTSV, loadTSVGZ } from './tsv.ts'
1010
import { streamFromString } from '../tests/utils.ts'
1111
import { ColumnsMap } from '../types/columns.ts'
1212

@@ -64,7 +64,7 @@ Deno.test('TSV loading', async (t) => {
6464
try {
6565
await loadTSV(file)
6666
} catch (e: any) {
67-
assertObjectMatch(e, { code: 'TSV_EQUAL_ROWS', location: '/mismatched_row.tsv', line: 3 })
67+
assertObjectMatch(e, { code: 'TSV_EQUAL_ROWS', line: 3 })
6868
}
6969
})
7070

@@ -178,3 +178,112 @@ Deno.test('TSV loading', async (t) => {
178178
// Tests will have populated the memoization cache
179179
loadTSV.cache.clear()
180180
})
181+
182+
Deno.test('TSVGZ loading', async (t) => {
  // Build the gzip-compressed byte stream a real `.tsv.gz` file would provide.
  const gzipStream = (text: string) =>
    streamFromString(text).pipeThrough(new CompressionStream('gzip'))

  // Assert that `load` rejects with an object matching `expected`.
  // A bare try/catch around the assertion passes silently when nothing is
  // thrown, so failing to throw is treated as a test failure here.
  const assertThrowsIssue = async (
    load: () => Promise<unknown>,
    expected: Record<PropertyKey, unknown>,
  ): Promise<void> => {
    let thrown: unknown
    let didThrow = false
    try {
      await load()
    } catch (e) {
      thrown = e
      didThrow = true
    }
    if (!didThrow) {
      throw new Error(
        `Expected an issue matching ${JSON.stringify(expected)}, but nothing was thrown`,
      )
    }
    assertObjectMatch(thrown as Record<PropertyKey, unknown>, expected)
  }

  await t.step('No header and empty file produces empty map', async () => {
    const file = pathToFile('/empty.tsv.gz')
    file.stream = gzipStream('')

    const map = await loadTSVGZ(file, [])
    // map.size looks for a column called size, so work around it
    assertEquals(Object.keys(map).length, 0)
  })

  await t.step('Empty file produces header-only map', async () => {
    const file = pathToFile('/empty.tsv.gz')
    file.stream = gzipStream('')

    const map = await loadTSVGZ(file, ['a', 'b', 'c'])
    assertEquals(map.a, [])
    assertEquals(map.b, [])
    assertEquals(map.c, [])
  })

  await t.step('Single column file produces single column maps', async () => {
    const file = pathToFile('/single_column.tsv.gz')
    file.stream = gzipStream('1\n2\n3\n')

    const map = await loadTSVGZ(file, ['a'])
    assertEquals(map.a, ['1', '2', '3'])
  })

  await t.step('Mismatched header length throws issue', async () => {
    const file = pathToFile('/single_column.tsv.gz')
    file.stream = gzipStream('1\n2\n3\n')

    // .tsv.gz files have no header row, so the first data row is line 1
    await assertThrowsIssue(
      () => loadTSVGZ(file, ['a', 'b']),
      { code: 'TSV_EQUAL_ROWS', line: 1 },
    )
  })

  await t.step('Missing final newline is ignored', async () => {
    const file = pathToFile('/missing_newline.tsv.gz')
    file.stream = gzipStream('1\n2\n3')

    const map = await loadTSVGZ(file, ['a'])
    assertEquals(map.a, ['1', '2', '3'])
  })

  await t.step('Empty row throws issue', async () => {
    const file = pathToFile('/empty_row.tsv.gz')
    file.stream = gzipStream('1\t2\t3\n\n4\t5\t6\n')

    await assertThrowsIssue(
      () => loadTSVGZ(file, ['a', 'b', 'c']),
      { code: 'TSV_EMPTY_LINE', line: 2 },
    )
  })

  await t.step('Mislabeled TSV throws issue', async () => {
    const file = pathToFile('/mismatched_row.tsv.gz')
    // Deliberately NOT compressed: plain text behind a .tsv.gz name
    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n4\t5\n')

    await assertThrowsIssue(
      () => loadTSVGZ(file, ['a', 'b', 'c']),
      { code: 'INVALID_GZIP' },
    )
  })

  await t.step('maxRows limits the number of rows read', async () => {
    const file = pathToFile('/long.tsv.gz')
    // Use 1500 to avoid overlap with default initial capacity
    const headers = ['a', 'b', 'c']
    const text = '1\t2\t3\n'.repeat(1500)

    // A stream can only be consumed once, so rebuild it before every load
    file.stream = gzipStream(text)
    let map = await loadTSVGZ(file, headers, 0)
    assertEquals(map.a, [])
    assertEquals(map.b, [])
    assertEquals(map.c, [])

    file.stream = gzipStream(text)
    map = await loadTSVGZ(file, headers, 1)
    assertEquals(map.a, ['1'])
    assertEquals(map.b, ['2'])
    assertEquals(map.c, ['3'])

    file.stream = gzipStream(text)
    map = await loadTSVGZ(file, headers, 2)
    assertEquals(map.a, ['1', '1'])
    assertEquals(map.b, ['2', '2'])
    assertEquals(map.c, ['3', '3'])

    // A negative limit means "read everything"
    file.stream = gzipStream(text)
    map = await loadTSVGZ(file, headers, -1)
    assertEquals(map.a, Array(1500).fill('1'))
    assertEquals(map.b, Array(1500).fill('2'))
    assertEquals(map.c, Array(1500).fill('3'))

    // Check that maxRows does not truncate shorter files
    file.stream = gzipStream('1\t2\t3\n4\t5\t6\n7\t8\t9\n')
    map = await loadTSVGZ(file, headers, 4)
    assertEquals(map.a, ['1', '4', '7'])
    assertEquals(map.b, ['2', '5', '8'])
    assertEquals(map.c, ['3', '6', '9'])
  })
})

src/files/tsv.ts

Lines changed: 67 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,72 @@ import type { BIDSFile } from '../types/filetree.ts'
88
import { filememoizeAsync } from '../utils/memoize.ts'
99
import { createUTF8Stream } from './streams.ts'
1010

11+
/**
 * Read tab-separated rows from a line reader and collect them into columns.
 *
 * @param reader - Reader expected to yield one line of text per read
 *   (callers in this module pipe the file through `TextLineStream`).
 * @param headers - Column names, in file order. Every row must split into
 *   exactly this many tab-separated values.
 * @param maxRows - Maximum number of rows to read. A negative value means
 *   no limit.
 * @param startRow - Number of lines already consumed from the file before
 *   the first row read here (e.g. 1 when a header line was read), used only
 *   to report 1-based line numbers in thrown issues.
 * @returns A `ColumnsMap` holding, per header, the values of the rows read.
 * @throws `{ code: 'TSV_EMPTY_LINE', line }` when an empty line is followed
 *   by more content.
 * @throws `{ code: 'TSV_EQUAL_ROWS', line }` when a row does not have one
 *   value per header.
 */
async function loadColumns(
  reader: ReadableStreamDefaultReader<string>,
  headers: string[],
  maxRows: number,
  startRow: number = 0,
): Promise<ColumnsMap> {
  const defaultCapacity = 1000
  const rowLimit = maxRows >= 0 ? maxRows : Infinity

  // Initialize columns in array for construction efficiency, but never beyond
  // the default capacity: `new Array(n)` throws a RangeError for fractional,
  // infinite or very large n, and a generous row limit says nothing about how
  // many rows the file actually contains.
  const initialCapacity = Math.min(Math.ceil(rowLimit), defaultCapacity)
  const columns: string[][] = headers.map(() => new Array<string>(initialCapacity))

  let rowIndex = 0 // Keep in scope after loop
  for (; rowIndex < rowLimit; rowIndex++) {
    const { done, value } = await reader.read()
    if (done) break

    // Expect a newline at the end of the file, but otherwise error on empty lines
    if (!value) {
      const nextRow = await reader.read()
      if (nextRow.done) break
      throw { code: 'TSV_EMPTY_LINE', line: rowIndex + startRow + 1 }
    }

    const values = value.split('\t')
    if (values.length !== headers.length) {
      throw { code: 'TSV_EQUAL_ROWS', line: rowIndex + startRow + 1 }
    }
    columns.forEach((column, columnIndex) => {
      // Double array size if we exceed the current capacity
      // (and always make room for this row, even when starting from 0)
      if (rowIndex >= column.length) {
        column.length = Math.max(column.length * 2, rowIndex + 1)
      }
      column[rowIndex] = values[columnIndex]
    })
  }

  // Construct map, truncating columns to number of rows read
  return new ColumnsMap(
    headers.map((header, index) => [header, columns[index].slice(0, rowIndex)]),
  )
}
52+
53+
/**
 * Load the columns of a gzip-compressed, headerless TSV file.
 *
 * The file stream is decompressed, decoded and split into lines before the
 * rows are collected by `loadColumns`.
 *
 * @param file - File whose `stream` holds the gzip-compressed contents.
 * @param headers - Column names to assign to the values of each row.
 * @param maxRows - Maximum number of rows to read. Negative (the default)
 *   means no limit.
 * @returns The columns read from the file.
 * @throws Any error carrying a truthy `code` is rethrown unchanged (e.g. the
 *   TSV issues raised by `loadColumns`).
 * @throws `{ code: 'INVALID_GZIP', location }` for any other failure while
 *   reading the stream.
 */
export async function loadTSVGZ(
  file: BIDSFile,
  headers: string[],
  maxRows: number = -1,
): Promise<ColumnsMap> {
  const reader = file.stream
    .pipeThrough(new DecompressionStream('gzip'))
    .pipeThrough(createUTF8Stream())
    .pipeThrough(new TextLineStream())
    .getReader()

  // Cancelling is best-effort cleanup: a failure to cancel must not replace
  // the result or the issue being reported.
  const cancelQuietly = () => reader.cancel().catch(() => {})

  let columns: ColumnsMap
  try {
    columns = await loadColumns(reader, headers, maxRows)
  } catch (e: unknown) {
    const hasCode = typeof e === 'object' && e !== null &&
      Boolean((e as { code?: unknown }).code)
    // Cancel the reader if we interrupted the read
    // Cancelling for I/O errors will just re-trigger the error
    if (hasCode) {
      await cancelQuietly()
      throw e
    }
    throw { code: 'INVALID_GZIP', location: file.path }
  }

  // maxRows may have stopped the read before the end of the stream, so release
  // it here as well, matching the cancel-on-exit behavior of _loadTSV.
  await cancelQuietly()
  return columns
}
76+
1177
async function _loadTSV(file: BIDSFile, maxRows: number = -1): Promise<ColumnsMap> {
1278
const reader = file.stream
1379
.pipeThrough(createUTF8Stream())
@@ -26,40 +92,7 @@ async function _loadTSV(file: BIDSFile, maxRows: number = -1): Promise<ColumnsMa
2692
}
2793
}
2894

29-
// Initialize columns in array for construction efficiency
30-
const initialCapacity = maxRows >= 0 ? maxRows : 1000
31-
const columns: string[][] = headers.map(() => new Array<string>(initialCapacity))
32-
33-
maxRows = maxRows >= 0 ? maxRows : Infinity
34-
let rowIndex = 0 // Keep in scope after loop
35-
for (; rowIndex < maxRows; rowIndex++) {
36-
const { done, value } = await reader.read()
37-
if (done) break
38-
39-
// Expect a newline at the end of the file, but otherwise error on empty lines
40-
if (!value) {
41-
const nextRow = await reader.read()
42-
if (nextRow.done) break
43-
throw { code: 'TSV_EMPTY_LINE', location: file.path, line: rowIndex + 2 }
44-
}
45-
46-
const values = value.split('\t')
47-
if (values.length !== headers.length) {
48-
throw { code: 'TSV_EQUAL_ROWS', location: file.path, line: rowIndex + 2 }
49-
}
50-
columns.forEach((column, columnIndex) => {
51-
// Double array size if we exceed the current capacity
52-
if (rowIndex >= column.length) {
53-
column.length = column.length * 2
54-
}
55-
column[rowIndex] = values[columnIndex]
56-
})
57-
}
58-
59-
// Construct map, truncating columns to number of rows read
60-
return new ColumnsMap(
61-
headers.map((header, index) => [header, columns[index].slice(0, rowIndex)]),
62-
)
95+
return await loadColumns(reader, headers, maxRows, 1)
6396
} finally {
6497
await reader.cancel()
6598
}

src/issues/list.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ export const bidsIssues: IssueDefinitionRecord = {
133133
reason:
134134
'A column required in a TSV file has been redefined in a sidecar file. This redefinition is being ignored.',
135135
},
136+
INVALID_GZIP: {
137+
severity: 'error',
138+
reason: 'The gzip file could not be decompressed. It may be corrupt or misnamed.',
139+
},
136140
MULTIPLE_INHERITABLE_FILES: {
137141
severity: 'error',
138142
reason: 'Multiple files in a directory were found to be valid candidates for inheritance.',

src/schema/context.ts

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import { readEntities } from './entities.ts'
1818
import { DatasetIssues } from '../issues/datasetIssues.ts'
1919
import { walkBack } from '../files/inheritance.ts'
2020
import { parseGzip } from '../files/gzip.ts'
21-
import { loadTSV } from '../files/tsv.ts'
21+
import { loadTSV, loadTSVGZ } from '../files/tsv.ts'
2222
import { parseTIFF } from '../files/tiff.ts'
2323
import { loadJSON } from '../files/json.ts'
2424
import { loadHeader } from '../files/nifti.ts'
@@ -254,21 +254,40 @@ export class BIDSContext implements Context {
254254
}
255255

256256
/**
 * Populate `this.columns` from the current file when it is a `.tsv` or
 * `.tsv.gz` file; other extensions are left untouched.
 *
 * `.tsv.gz` files take their column names from the sidecar `Columns` field,
 * so the sidecar must be loaded before this is called.
 *
 * Load failures never reject: errors carrying a `code` are recorded as
 * dataset issues located at this file, the failure is logged, and an empty
 * column map is stored instead.
 */
async loadColumns(): Promise<void> {
  // Shared recovery for both loaders; `label` only affects the log message.
  // `error` stays `any` because it is whatever the loader rejected with.
  // deno-lint-ignore no-explicit-any
  const recover = (label: string) => (error: any) => {
    if (error.code) {
      this.dataset.issues.add({ ...error, location: this.file.path })
    }
    logger.warn(
      `${label} file could not be opened by loadColumns '${this.file.path}'`,
    )
    logger.debug(error)
    return new Map<string, string[]>() as ColumnsMap
  }

  if (this.extension === '.tsv') {
    this.columns = await loadTSV(this.file, this.dataset.options?.maxRows)
      .catch(recover('tsv')) as Record<string, string[]>
  } else if (this.extension === '.tsv.gz') {
    const headers = this.sidecar.Columns
    if (!Array.isArray(headers) || this.size === 0) {
      // Missing Columns will be caught by sidecar rules
      // Note that these rules currently select for suffix, and will need to be generalized
      // or duplicated for new .tsv.gz files
      // A non-array Columns value cannot name columns either; loading with it would
      // fail inside the loader and be misreported as INVALID_GZIP
      // (presumably the sidecar rules flag the wrong type as well; verify)
      // `this.size === 0` will show as `EMPTY_FILE`, so do not add INVALID_GZIP
      return
    }
    this.columns = await loadTSVGZ(
      this.file,
      headers as string[],
      this.dataset.options?.maxRows,
    )
      .catch(recover('tsv.gz')) as Record<string, string[]>
  }
}
274293

@@ -340,15 +359,24 @@ export class BIDSContext implements Context {
340359
}
341360

342361
/**
 * Run every asynchronous loader for this context.
 *
 * All loaders except `loadColumns` are started immediately and run
 * concurrently. `loadColumns` reads the sidecar, so it is started only after
 * the sidecar and association loaders have settled. A failing loader never
 * rejects this method.
 */
async asyncLoads() {
  // Loaders that may be depended on by other loaders
  const initial = [
    this.loadSidecar(),
    this.loadAssociations(),
  ]
  // Loaders that do not depend on other loaders
  const independent = [
    this.loadSubjects(),
    this.loadNiftiHeader(),
    this.loadJSON(),
    this.loadGzip(),
    this.loadTIFF(),
  ]

  await Promise.allSettled(initial)

  // Loaders with dependencies. Settle rather than await directly so an
  // unexpected rejection cannot abort this method and leave the independent
  // loaders unawaited, matching how every other loader is treated.
  const [columns] = await Promise.allSettled([this.loadColumns()])
  if (columns.status === 'rejected') {
    logger.debug(columns.reason)
  }

  await Promise.allSettled(independent)
}
354382
}

0 commit comments

Comments
 (0)