Skip to content

Commit 4a720b2

Browse files
committed
K1
1 parent 6c7eaf3 commit 4a720b2

6 files changed

Lines changed: 219 additions & 109 deletions

File tree

benchmark/string-comparison.bench.ts

Lines changed: 154 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,25 @@
11
import { bench, describe } from 'vitest'
22

3+
const garbageFields =
4+
'DUMMY1=value1;DUMMY2=value2;DUMMY3=value3;DUMMY4=value4;DUMMY5=value5;' +
5+
'DUMMY6=value6;DUMMY7=value7;DUMMY8=value8;DUMMY9=value9;DUMMY10=value10;' +
6+
'DUMMY11=value11;DUMMY12=value12;DUMMY13=value13;DUMMY14=value14;DUMMY15=value15;' +
7+
'DUMMY16=value16;DUMMY17=value17;DUMMY18=value18;DUMMY19=value19;DUMMY20=value20;' +
8+
'DUMMY21=value21;DUMMY22=value22;DUMMY23=value23;DUMMY24=value24;DUMMY25=value25;' +
9+
'DUMMY26=value26;DUMMY27=value27;DUMMY28=value28;DUMMY29=value29;DUMMY30=value30;' +
10+
'DUMMY31=value31;DUMMY32=value32;DUMMY33=value33;DUMMY34=value34;DUMMY35=value35;' +
11+
'DUMMY36=value36;DUMMY37=value37;DUMMY38=value38;DUMMY39=value39;DUMMY40=value40'
12+
313
const sampleInfoFields = [
4-
'AC=1;AF=0.5;AN=2;DP=100;END=12345;MQ=60;NS=2',
5-
'SVTYPE=TRA;CHR2=chr2;END=54321;CT=3to5',
6-
'IMPRECISE;SVTYPE=DEL;END=10000;SVLEN=-500;CIPOS=-10,10',
7-
'END=5000;SVTYPE=INS;SVLEN=300;HOMLEN=10;HOMSEQ=ACGT',
8-
'DP=50;VDB=0.5;RPB=1.0;MQB=0.9;BQB=0.8;MQSB=0.85;SGB=-0.693147;MQ0F=0;END=8000',
9-
'SVTYPE=DUP;END=20000;SVLEN=1000;IMPRECISE;CIEND=-50,50',
10-
'AC=2;AF=1.0;AN=2;DP=200;END=15000;MQ=70;NS=3;DB',
11-
'INDEL;IDV=10;IMF=0.5;DP=100;VDB=0.8;RPB=0.9;END=7000',
14+
'AC=1;AF=0.5;AN=2;DP=100;END=12345;MQ=60;NS=2;' + garbageFields,
15+
'SVTYPE=TRA;CHR2=chr2;END=54321;CT=3to5;' + garbageFields,
16+
'IMPRECISE;SVTYPE=DEL;END=10000;SVLEN=-500;CIPOS=-10,10;' + garbageFields,
17+
'END=5000;SVTYPE=INS;SVLEN=300;HOMLEN=10;HOMSEQ=ACGT;' + garbageFields,
18+
'DP=50;VDB=0.5;RPB=1.0;MQB=0.9;BQB=0.8;MQSB=0.85;SGB=-0.693147;MQ0F=0;END=8000;' +
19+
garbageFields,
20+
'SVTYPE=DUP;END=20000;SVLEN=1000;IMPRECISE;CIEND=-50,50;' + garbageFields,
21+
'AC=2;AF=1.0;AN=2;DP=200;END=15000;MQ=70;NS=3;DB;' + garbageFields,
22+
'INDEL;IDV=10;IMF=0.5;DP=100;VDB=0.8;RPB=0.9;END=7000;' + garbageFields,
1223
]
1324

1425
describe('String comparison methods', () => {
@@ -125,10 +136,64 @@ describe('String comparison methods', () => {
125136
}
126137
}
127138
})
139+
140+
bench('using includes() then indexOf()', () => {
141+
for (const info of sampleInfoFields) {
142+
let endCoordinate = 0
143+
if (info.includes(';END=') || info.startsWith('END=')) {
144+
let pos = info.indexOf(';END=')
145+
if (pos === -1 && info.startsWith('END=')) {
146+
const valueEnd = info.indexOf(';')
147+
endCoordinate = Number.parseInt(
148+
info.slice(4, valueEnd === -1 ? info.length : valueEnd),
149+
10,
150+
)
151+
} else if (pos !== -1) {
152+
pos += 1
153+
let valueEnd = info.indexOf(';', pos + 4)
154+
if (valueEnd === -1) {
155+
valueEnd = info.length
156+
}
157+
endCoordinate = Number.parseInt(info.slice(pos + 4, valueEnd), 10)
158+
}
159+
}
160+
if (endCoordinate) {
161+
// do something
162+
}
163+
}
164+
})
165+
166+
bench('using regex (cached)', () => {
167+
const endRegex = /(?:^|;)END=([^;]+)/
168+
for (const info of sampleInfoFields) {
169+
let endCoordinate = 0
170+
const match = endRegex.exec(info)
171+
if (match) {
172+
endCoordinate = Number.parseInt(match[1], 10)
173+
}
174+
if (endCoordinate) {
175+
// do something
176+
}
177+
}
178+
})
179+
180+
bench('using regex (created each iteration)', () => {
181+
for (const info of sampleInfoFields) {
182+
let endCoordinate = 0
183+
const endRegex = /(?:^|;)END=([^;]+)/
184+
const match = endRegex.exec(info)
185+
if (match) {
186+
endCoordinate = Number.parseInt(match[1], 10)
187+
}
188+
if (endCoordinate) {
189+
// do something
190+
}
191+
}
192+
})
128193
})
129194

130195
describe('Combined scenario (realistic _getVcfEnd)', () => {
131-
bench('original approach with includes + slice', () => {
196+
bench('includes for TRA + manual loop with slice', () => {
132197
for (const info of sampleInfoFields) {
133198
const startCoordinate = 1000
134199
const refSeq = 'ACGT'
@@ -157,7 +222,7 @@ describe('String comparison methods', () => {
157222
}
158223
})
159224

160-
bench('current approach with includes + character comparison', () => {
225+
bench('includes for TRA + manual loop with char-by-char', () => {
161226
for (const info of sampleInfoFields) {
162227
const startCoordinate = 1000
163228
const refSeq = 'ACGT'
@@ -192,7 +257,7 @@ describe('String comparison methods', () => {
192257
}
193258
})
194259

195-
bench('using includes + indexOf', () => {
260+
bench('includes for TRA + indexOf for END', () => {
196261
for (const info of sampleInfoFields) {
197262
const startCoordinate = 1000
198263
const refSeq = 'ACGT'
@@ -223,5 +288,83 @@ describe('String comparison methods', () => {
223288
}
224289
}
225290
})
291+
292+
bench('includes for TRA + includes guard + indexOf for END', () => {
293+
for (const info of sampleInfoFields) {
294+
const startCoordinate = 1000
295+
const refSeq = 'ACGT'
296+
let endCoordinate = startCoordinate + refSeq.length
297+
298+
const isTRA = info.includes('SVTYPE=TRA')
299+
if (info[0] !== '.' && !isTRA) {
300+
if (info.includes(';END=') || info.startsWith('END=')) {
301+
let pos = info.indexOf(';END=')
302+
if (pos === -1 && info.startsWith('END=')) {
303+
const valueEnd = info.indexOf(';')
304+
endCoordinate = Number.parseInt(
305+
info.slice(4, valueEnd === -1 ? info.length : valueEnd),
306+
10,
307+
)
308+
} else if (pos !== -1) {
309+
pos += 1
310+
let valueEnd = info.indexOf(';', pos + 4)
311+
if (valueEnd === -1) {
312+
valueEnd = info.length
313+
}
314+
endCoordinate = Number.parseInt(info.slice(pos + 4, valueEnd), 10)
315+
}
316+
}
317+
} else if (isTRA) {
318+
endCoordinate = startCoordinate + 1
319+
}
320+
if (endCoordinate) {
321+
// do something
322+
}
323+
}
324+
})
325+
326+
bench('includes for TRA + regex for END (cached)', () => {
327+
const endRegex = /(?:^|;)END=([^;]+)/
328+
for (const info of sampleInfoFields) {
329+
const startCoordinate = 1000
330+
const refSeq = 'ACGT'
331+
let endCoordinate = startCoordinate + refSeq.length
332+
333+
const isTRA = info.includes('SVTYPE=TRA')
334+
if (info[0] !== '.' && !isTRA) {
335+
const match = endRegex.exec(info)
336+
if (match) {
337+
endCoordinate = Number.parseInt(match[1], 10)
338+
}
339+
} else if (isTRA) {
340+
endCoordinate = startCoordinate + 1
341+
}
342+
if (endCoordinate) {
343+
// do something
344+
}
345+
}
346+
})
347+
348+
bench('includes for TRA + regex for END (created each iteration)', () => {
349+
for (const info of sampleInfoFields) {
350+
const startCoordinate = 1000
351+
const refSeq = 'ACGT'
352+
let endCoordinate = startCoordinate + refSeq.length
353+
354+
const isTRA = info.includes('SVTYPE=TRA')
355+
if (info[0] !== '.' && !isTRA) {
356+
const endRegex = /(?:^|;)END=([^;]+)/
357+
const match = endRegex.exec(info)
358+
if (match) {
359+
endCoordinate = Number.parseInt(match[1], 10)
360+
}
361+
} else if (isTRA) {
362+
endCoordinate = startCoordinate + 1
363+
}
364+
if (endCoordinate) {
365+
// do something
366+
}
367+
}
368+
})
226369
})
227370
})

eslint.config.mjs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ export default defineConfig(
4141
curly: 'error',
4242

4343
semi: ['error', 'never'],
44+
'unicorn/number-literal-case': 'off',
4445
'unicorn/text-encoding-identifier-case': 'off',
4546
'unicorn/no-new-array': 'off',
4647
'unicorn/prefer-module': 'off',

src/csi.ts

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -87,31 +87,6 @@ export default class CSI extends IndexFile {
8787
}
8888
}
8989

90-
_parseNameBytes(namesBytes: Uint8Array) {
91-
let currRefId = 0
92-
let currNameStart = 0
93-
const refIdToName = []
94-
const refNameToId: Record<string, number> = {}
95-
const decoder = new TextDecoder('utf8')
96-
for (let i = 0; i < namesBytes.length; i += 1) {
97-
if (!namesBytes[i]) {
98-
if (currNameStart < i) {
99-
const refName = this.renameRefSeq(
100-
decoder.decode(namesBytes.subarray(currNameStart, i)),
101-
)
102-
refIdToName[currRefId] = refName
103-
refNameToId[refName] = currRefId
104-
}
105-
currNameStart = i + 1
106-
currRefId += 1
107-
}
108-
}
109-
return {
110-
refNameToId,
111-
refIdToName,
112-
}
113-
}
114-
11590
async _parse(opts: Options = {}) {
11691
const bytes = await unzip(await this.filehandle.readFile(opts))
11792
const dataView = new DataView(bytes.buffer)

src/indexFile.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,29 @@ export default abstract class IndexFile {
7676
const idx = await this.parse(opts)
7777
return !!idx.indices[seqId]?.binIndex
7878
}
79+
80+
_parseNameBytes(namesBytes: Uint8Array) {
81+
let currRefId = 0
82+
let currNameStart = 0
83+
const refIdToName: string[] = []
84+
const refNameToId: Record<string, number> = {}
85+
const decoder = new TextDecoder('utf8')
86+
for (let i = 0; i < namesBytes.length; i += 1) {
87+
if (!namesBytes[i]) {
88+
if (currNameStart < i) {
89+
const refName = this.renameRefSeq(
90+
decoder.decode(namesBytes.subarray(currNameStart, i)),
91+
)
92+
refIdToName[currRefId] = refName
93+
refNameToId[refName] = currRefId
94+
}
95+
currNameStart = i + 1
96+
currRefId += 1
97+
}
98+
}
99+
return {
100+
refNameToId,
101+
refIdToName,
102+
}
103+
}
79104
}

0 commit comments

Comments
 (0)