Skip to content

Commit 5cfbb02

Browse files
committed
Add more tests
1 parent 6bd7344 commit 5cfbb02

16 files changed

Lines changed: 1193 additions & 124 deletions

src/externalSort.ts

Lines changed: 39 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import fs from 'fs'
22
import path from 'path'
33
import { Readable, Writable } from 'stream'
4+
import { pipeline } from 'stream/promises'
45

56
const EOF = Symbol('EOF')
67

@@ -15,6 +16,8 @@ class FileParser {
1516
private bytesRead = 0
1617
private file: string
1718
private delimiter: string
19+
private fh: fs.promises.FileHandle | undefined
20+
private eof = false
1821

1922
constructor(file: string, delimiter: string) {
2023
this.file = file
@@ -38,45 +41,49 @@ class FileParser {
3841
return this.checkBuffer()
3942
}
4043

41-
const fh = await fs.promises.open(this.file, 'r')
44+
if (this.eof) {
45+
return EOF
46+
}
47+
48+
if (!this.fh) {
49+
this.fh = await fs.promises.open(this.file, 'r')
50+
}
51+
4252
const cBuffer = Buffer.alloc(512)
4353
let readed: { bytesRead: number }
4454

45-
try {
46-
while (
47-
(readed = await fh.read(cBuffer, 0, 512, this.bytesRead)).bytesRead > 0
48-
) {
49-
this.bbuffer = Buffer.concat([
50-
this.bbuffer,
51-
cBuffer.subarray(0, readed.bytesRead),
52-
])
53-
this.bytesRead += readed.bytesRead
54-
const dIndex = this.bbuffer.indexOf(this.delimiter)
55-
if (dIndex === -1) {
56-
continue
57-
}
58-
this.buffer = this.bbuffer.subarray(0, dIndex + 1).toString('utf8')
59-
this.bbuffer = this.bbuffer.subarray(dIndex + 1)
60-
return this.checkBuffer()
61-
}
62-
63-
if (this.bbuffer.length > 0 && this.bbuffer.includes(this.delimiter)) {
64-
this.buffer = this.bbuffer.toString('utf8')
65-
this.bbuffer = Buffer.from('')
66-
return this.checkBuffer()
55+
while (
56+
(readed = await this.fh.read(cBuffer, 0, 512, this.bytesRead)).bytesRead >
57+
0
58+
) {
59+
this.bbuffer = Buffer.concat([
60+
this.bbuffer,
61+
cBuffer.subarray(0, readed.bytesRead),
62+
])
63+
this.bytesRead += readed.bytesRead
64+
const dIndex = this.bbuffer.indexOf(this.delimiter)
65+
if (dIndex === -1) {
66+
continue
6767
}
68+
this.buffer = this.bbuffer.subarray(0, dIndex + 1).toString('utf8')
69+
this.bbuffer = this.bbuffer.subarray(dIndex + 1)
70+
return this.checkBuffer()
71+
}
6872

69-
return EOF
70-
} finally {
71-
await fh.close()
73+
if (this.bbuffer.length > 0 && this.bbuffer.includes(this.delimiter)) {
74+
this.buffer = this.bbuffer.toString('utf8')
75+
this.bbuffer = Buffer.from('')
76+
return this.checkBuffer()
7277
}
78+
79+
this.eof = true
80+
await this.fh.close()
81+
return EOF
7382
}
7483
}
7584

7685
function swap(harr: HeapItem[], a: number, b: number) {
77-
const temp = harr[a]
78-
harr[a] = harr[b]!
79-
harr[b] = temp
86+
;[harr[a], harr[b]] = [harr[b], harr[a]]
8087
}
8188

8289
function compare(a: string | typeof EOF, b: string | typeof EOF): number {
@@ -86,13 +93,7 @@ function compare(a: string | typeof EOF, b: string | typeof EOF): number {
8693
if (b === EOF) {
8794
return -1
8895
}
89-
if (a < b) {
90-
return -1
91-
}
92-
if (a === b) {
93-
return 0
94-
}
95-
return 1
96+
return a < b ? -1 : a > b ? 1 : 0
9697
}
9798

9899
function heapify(harr: HeapItem[], i: number, heapSize: number) {
@@ -189,19 +190,8 @@ async function mergeSortedFiles(
189190
const flen = filesPath.length
190191

191192
if (flen === 1) {
192-
await new Promise<void>((resolve, reject) => {
193-
const rs = fs.createReadStream(filesPath[0], 'utf8')
194-
rs.on('open', () => {
195-
rs.pipe(output)
196-
})
197-
rs.on('error', err => {
198-
output.end()
199-
reject(err)
200-
})
201-
rs.on('end', () => {
202-
resolve()
203-
})
204-
})
193+
const rs = fs.createReadStream(filesPath[0], 'utf8')
194+
await pipeline(rs, output)
205195
return
206196
}
207197

src/makeIx.ts

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
import { spawn } from 'child_process'
22
import fs from 'fs'
3-
import { Readable } from 'stream'
3+
import { PassThrough, Readable } from 'stream'
44
import { pipeline } from 'stream/promises'
55

66
import { sync as commandExistsSync } from 'command-exists'
@@ -48,10 +48,14 @@ async function makeIxWithExternalSort(
4848
async function makeIxWithJsSort(fileStream: Readable, outIxFilename: string) {
4949
const out = fs.createWriteStream(outIxFilename)
5050

51-
// Transform input
52-
const transformedInput = fileStream
53-
.pipe(split2())
54-
.pipe(new TrixInputTransform())
51+
// Transform input using pipeline for proper error handling
52+
const transformedInput = new PassThrough()
53+
const inputDone = pipeline(
54+
fileStream,
55+
split2(),
56+
new TrixInputTransform(),
57+
transformedInput,
58+
)
5559

5660
// Sort lines using external merge sort
5761
const sortedOutput = split2()
@@ -60,7 +64,7 @@ async function makeIxWithJsSort(fileStream: Readable, outIxFilename: string) {
6064
// Transform sorted output and write to file
6165
const writeDone = pipeline(sortedOutput, new TrixOutputTransform(), out)
6266

63-
await Promise.all([sortDone, writeDone])
67+
await Promise.all([inputDone, sortDone, writeDone])
6468
}
6569

6670
export async function makeIxStream(

src/sortLines.ts

Lines changed: 0 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -24,60 +24,3 @@ export async function sortLinesExternal(
2424
}
2525
}
2626
}
27-
28-
/**
29-
* Sort lines in memory. Only use for small inputs.
30-
*/
31-
export async function sortLinesInMemory(
32-
input: Readable,
33-
output: Writable,
34-
): Promise<void> {
35-
const lines: string[] = []
36-
37-
await new Promise<void>((resolve, reject) => {
38-
let buffer = ''
39-
40-
input.on('data', function onData(chunk: Buffer | string) {
41-
buffer += chunk.toString()
42-
const parts = buffer.split('\n')
43-
buffer = parts.pop()!
44-
for (const line of parts) {
45-
if (line) {
46-
lines.push(line)
47-
}
48-
}
49-
})
50-
51-
input.on('end', function onEnd() {
52-
if (buffer) {
53-
lines.push(buffer)
54-
}
55-
resolve()
56-
})
57-
58-
input.on('error', reject)
59-
})
60-
61-
lines.sort()
62-
63-
await new Promise<void>((resolve, reject) => {
64-
let i = 0
65-
66-
function writeNext() {
67-
let ok = true
68-
while (i < lines.length && ok) {
69-
ok = output.write(lines[i] + '\n')
70-
i++
71-
}
72-
if (i < lines.length) {
73-
output.once('drain', writeNext)
74-
} else {
75-
output.end()
76-
resolve()
77-
}
78-
}
79-
80-
output.on('error', reject)
81-
writeNext()
82-
})
83-
}

test/TrixInputTransform.test.ts

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,59 @@
1+
import { Readable } from 'stream'
2+
import { pipeline } from 'stream/promises'
3+
4+
import split2 from 'split2'
5+
import { describe, expect, test } from 'vitest'
6+
7+
import { TrixInputTransform } from '../src/TrixInputTransform.ts'
8+
9+
async function transformInput(lines: string[]): Promise<string> {
10+
const input = Readable.from(lines.map(l => l + '\n'))
11+
const chunks: string[] = []
12+
13+
await pipeline(input, split2(), new TrixInputTransform(), async function* (
14+
source,
15+
) {
16+
for await (const chunk of source) {
17+
chunks.push(chunk.toString())
18+
}
19+
})
20+
21+
return chunks.join('')
22+
}
23+
24+
describe('TrixInputTransform', () => {
25+
test('transforms single word line', async () => {
26+
const result = await transformInput(['id123 hello'])
27+
expect(result).toBe('hello id123\n')
28+
})
29+
30+
test('transforms multiple words', async () => {
31+
const result = await transformInput(['id123 hello world test'])
32+
expect(result).toBe('hello id123\nworld id123\ntest id123\n')
33+
})
34+
35+
test('lowercases words', async () => {
36+
const result = await transformInput(['id123 HELLO World TeSt'])
37+
expect(result).toBe('hello id123\nworld id123\ntest id123\n')
38+
})
39+
40+
test('handles line with only id (no words)', async () => {
41+
const result = await transformInput(['id123'])
42+
expect(result).toBe('')
43+
})
44+
45+
test('handles multiple lines', async () => {
46+
const result = await transformInput(['id1 apple', 'id2 banana'])
47+
expect(result).toBe('apple id1\nbanana id2\n')
48+
})
49+
50+
test('handles multiple whitespace between words', async () => {
51+
const result = await transformInput(['id123  hello   world'])
52+
expect(result).toBe('hello id123\nworld id123\n')
53+
})
54+
55+
test('handles empty input', async () => {
56+
const result = await transformInput([])
57+
expect(result).toBe('')
58+
})
59+
})
Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,88 @@
1+
import { Readable } from 'stream'
2+
import { pipeline } from 'stream/promises'
3+
4+
import split2 from 'split2'
5+
import { describe, expect, test } from 'vitest'
6+
7+
import { TrixInputTransform } from '../src/TrixInputTransform.ts'
8+
9+
async function transformInput(lines: string[]): Promise<string> {
10+
const input = Readable.from(lines.map(l => l + '\n'))
11+
const chunks: string[] = []
12+
13+
await pipeline(input, split2(), new TrixInputTransform(), async function* (
14+
source,
15+
) {
16+
for await (const chunk of source) {
17+
chunks.push(chunk.toString())
18+
}
19+
})
20+
21+
return chunks.join('')
22+
}
23+
24+
describe('TrixInputTransform edge cases', () => {
25+
test('filters out empty strings from split', async () => {
26+
// Multiple spaces create empty strings when split
27+
const result = await transformInput(['id1  word1   word2'])
28+
// Should only have word1 and word2, not empty entries
29+
const lines = result.trim().split('\n')
30+
expect(lines.every(l => l.length > 0)).toBe(true)
31+
expect(lines).toHaveLength(2)
32+
})
33+
34+
test('handles tab characters as whitespace', async () => {
35+
const result = await transformInput(['id1\tword1\tword2'])
36+
expect(result).toContain('word1')
37+
expect(result).toContain('word2')
38+
})
39+
40+
test('handles mixed whitespace', async () => {
41+
const result = await transformInput(['id1 \t word1 \t word2'])
42+
const lines = result.trim().split('\n').filter(l => l.length > 0)
43+
expect(lines).toHaveLength(2)
44+
})
45+
46+
test('preserves numbers in words', async () => {
47+
const result = await transformInput(['id1 abc123 456def'])
48+
expect(result).toContain('abc123')
49+
expect(result).toContain('456def')
50+
})
51+
52+
test('handles hyphenated words', async () => {
53+
const result = await transformInput(['id1 well-known self-aware'])
54+
expect(result).toContain('well-known')
55+
expect(result).toContain('self-aware')
56+
})
57+
58+
test('handles words with underscores', async () => {
59+
const result = await transformInput(['id1 snake_case SCREAMING_SNAKE'])
60+
expect(result).toContain('snake_case')
61+
expect(result).toContain('screaming_snake')
62+
})
63+
64+
test('handles unicode words', async () => {
65+
const result = await transformInput(['id1 café naïve résumé'])
66+
expect(result).toContain('café')
67+
expect(result).toContain('naïve')
68+
expect(result).toContain('résumé')
69+
})
70+
71+
test('handles CJK characters', async () => {
72+
const result = await transformInput(['id1 日本語 中文'])
73+
expect(result).toContain('日本語')
74+
expect(result).toContain('中文')
75+
})
76+
77+
test('handles emoji', async () => {
78+
const result = await transformInput(['id1 hello🎉 world🌍'])
79+
expect(result).toContain('hello🎉')
80+
expect(result).toContain('world🌍')
81+
})
82+
83+
test('lowercases unicode letters', async () => {
84+
const result = await transformInput(['id1 MÜNCHEN ZÜRICH'])
85+
expect(result).toContain('münchen')
86+
expect(result).toContain('zürich')
87+
})
88+
})

0 commit comments

Comments
 (0)