Skip to content

Commit 4fe3580

Browse files
committed
feat: add stats on row duplication
1 parent 7ee7b9b commit 4fe3580

File tree

5 files changed

+148
-16
lines changed

5 files changed

+148
-16
lines changed

README.md

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,8 @@ You can resolve the conflict by keeping the first or last row of the duplicates:
327327
```Typescript
328328
import { diff } from 'tabular-data-differ';
329329
const stats = await diff({
330-
oldSource: './tests/a.csv',
331-
newSource: './tests/b.csv',
330+
oldSource: './tests/a2.csv',
331+
newSource: './tests/b2.csv',
332332
keys: ['id'],
333333
duplicateKeyHandling: 'keepFirstRow', // or 'keepLastRow'
334334
}).to('console');
@@ -339,17 +339,34 @@ Or, if you need more control in the row selection, then you can provide your own
339339
```Typescript
340340
import { diff } from 'tabular-data-differ';
341341
const stats = await diff({
342-
oldSource: './tests/a.csv',
343-
newSource: './tests/b.csv',
342+
oldSource: './tests/a2.csv',
343+
newSource: './tests/b2.csv',
344344
keys: ['id'],
345345
duplicateKeyHandling: (rows) => rows[0], // same as 'keepFirstRow'
346346
duplicateRowBufferSize: 2000,
347-
}).to('console');
347+
}).to('null');
348348
console.log(stats);
349349
```
350350
351351
Note that you can specify the size of the buffer if you know that it cannot exceed this quantity, otherwise you can enable the **duplicateRowBufferOverflow** option,
352-
which will remove the first entries when it exceeds the allocated capacity, to avoid a failure.
352+
which will remove the first entries when it exceeds the allocated capacity, to avoid any failure.
353+
354+
Finally, you can inspect the source stats to check the duplication metrics:
355+
```Typescript
356+
import { diff } from 'tabular-data-differ';
357+
const ctx = await diff({
358+
oldSource: './tests/a2.csv',
359+
newSource: './tests/b2.csv',
360+
keys: ['id'],
361+
duplicateKeyHandling: 'keepFirstRow', // or 'keepLastRow'
362+
}).start();
363+
const stats = await ctx.to('null');
364+
console.log(stats);
365+
console.log(ctx.oldStats);
366+
console.log(ctx.newStats);
367+
368+
```
369+
353370
354371
### Order 2 CSV files and diff them on the console
355372
@@ -632,6 +649,14 @@ Returns the current column names.
632649
633650
Returns the current stats.
634651
652+
#### oldSourceStats
653+
654+
Returns the stats accumulated while parsing the old source.
655+
656+
#### newSourceStats
657+
658+
Returns the stats accumulated while parsing the new source.
659+
635660
#### to
636661
637662
Initiates the comparison between the old and new sources and sends the diffs to the specified output.

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "tabular-data-differ",
3-
"version": "1.1.3",
3+
"version": "1.1.4",
44
"description": "A very efficient library for diffing two sorted streams of tabular data, such as CSV files.",
55
"keywords": [
66
"table",

src/differ.test.ts

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ describe('differ', () => {
168168
keys: ['ID'],
169169
})).rejects.toThrowError(new UniqueKeyViolationError(`Expected rows to be unique by "ID" in old source but received:
170170
previous=3,dave,44
171-
current=3,dave bis,444`));
171+
current=3,dave bis,444
172+
Note that you can resolve this conflict automatically using the duplicateKeyHandling option.`));
172173
});
173174
test('should detect primary key violation in new source', async () => {
174175
await expect(() => diffStrings({
@@ -189,7 +190,8 @@ describe('differ', () => {
189190
keys: ['ID'],
190191
})).rejects.toThrowError(new UniqueKeyViolationError(`Expected rows to be unique by "ID" in new source but received:
191192
previous=3,dave,44
192-
current=3,dave bis,444`));
193+
current=3,dave bis,444
194+
Note that you can resolve this conflict automatically using the duplicateKeyHandling option.`));
193195
});
194196
test('should detect duplicate keys and return the first row', async () => {
195197
const writer = await diffStrings({
@@ -1849,7 +1851,8 @@ describe('differ', () => {
18491851
keys: ['id'],
18501852
duplicateKeyHandling: 'keepFirstRow',
18511853
});
1852-
await differ.to({
1854+
const ctx = await differ.start();
1855+
await ctx.to({
18531856
destination: {
18541857
format: 'custom',
18551858
writer: output,
@@ -1878,6 +1881,28 @@ describe('differ', () => {
18781881
modified: 1,
18791882
same: 5
18801883
});
1884+
expect(ctx.oldSourceStats).toEqual({
1885+
parsedRows: 12,
1886+
duplicateParsedRows: 3,
1887+
uniqueRows: 9,
1888+
uniqueRowsWithDuplicates: 3,
1889+
duplicationPercent: 25,
1890+
uniqueRowDuplicationPercent: 33.3333,
1891+
maxDuplicatesPerUniqueKey: 1,
1892+
minDuplicatesPerUniqueKey: 1,
1893+
averageDuplicatesPerUniqueKey: 1
1894+
});
1895+
expect(ctx.newSourceStats).toEqual({
1896+
parsedRows: 10,
1897+
duplicateParsedRows: 3,
1898+
uniqueRows: 7,
1899+
uniqueRowsWithDuplicates: 2,
1900+
duplicationPercent: 30,
1901+
uniqueRowDuplicationPercent: 28.5714,
1902+
maxDuplicatesPerUniqueKey: 2,
1903+
minDuplicatesPerUniqueKey: 1,
1904+
averageDuplicatesPerUniqueKey: 1.5
1905+
});
18811906
});
18821907
test('should work with http streams (CSV)', async () => {
18831908
const currentDir = process.cwd().replaceAll('\\', '/');

src/differ.ts

Lines changed: 86 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import {
3030
IterableFormatReaderOptions,
3131
IterableFormatReader,
3232
BufferedFormatReader,
33+
roundDecimals,
3334
} from "./formats";
3435

3536
export class UnorderedStreamsError extends Error {
@@ -171,6 +172,57 @@ export type DuplicateKeyHandler = (rows: Row[]) => Row;
171172

172173
export type DuplicateKeyHandling = 'fail' |'keepFirstRow' | 'keepLastRow' | DuplicateKeyHandler;
173174

175+
export class SourceStats {
176+
parsedRows = 0;
177+
duplicateParsedRows = 0;
178+
uniqueRows = 0;
179+
uniqueRowsWithDuplicates = 0;
180+
181+
duplicationPercent = 0;
182+
uniqueRowDuplicationPercent = 0;
183+
184+
maxDuplicatesPerUniqueKey = 0;
185+
minDuplicatesPerUniqueKey = 0;
186+
averageDuplicatesPerUniqueKey = 0;
187+
188+
incParsedRows() {
189+
this.parsedRows += 1;
190+
}
191+
192+
incDuplicateParsedRows() {
193+
this.duplicateParsedRows += 1;
194+
}
195+
196+
incUniqueRows() {
197+
this.uniqueRows += 1;
198+
}
199+
200+
incUniqueRowsWithDuplicates() {
201+
this.uniqueRowsWithDuplicates += 1;
202+
}
203+
204+
incDuplicates(value: number) {
205+
this.maxDuplicatesPerUniqueKey = Math.max(this.maxDuplicatesPerUniqueKey, value);
206+
if (this.minDuplicatesPerUniqueKey === 0) {
207+
this.minDuplicatesPerUniqueKey = value;
208+
} else {
209+
this.minDuplicatesPerUniqueKey = Math.min(this.minDuplicatesPerUniqueKey, value);
210+
}
211+
}
212+
213+
calcStats() {
214+
if (this.uniqueRowsWithDuplicates) {
215+
this.averageDuplicatesPerUniqueKey = roundDecimals(this.duplicateParsedRows / this.uniqueRowsWithDuplicates, 4);
216+
}
217+
if (this.parsedRows) {
218+
this.duplicationPercent = roundDecimals((this.duplicateParsedRows / this.parsedRows) * 100, 4);
219+
}
220+
if ( this.uniqueRows) {
221+
this.uniqueRowDuplicationPercent = roundDecimals((this.uniqueRowsWithDuplicates / this.uniqueRows) * 100, 4);
222+
}
223+
}
224+
}
225+
174226
/**
175227
* Options for configuring the Differ object that will traverse two input streams in parallel in order to compare their rows
176228
* and produce a change set.
@@ -378,6 +430,8 @@ export class DifferContext {
378430
private normalizeNewRow: RowNormalizer = row => row;
379431
private duplicateKeyHandling: DuplicateKeyHandling;
380432
private duplicateRowBufferSize: number;
433+
private _oldSourceStats = new SourceStats();
434+
private _newSourceStats = new SourceStats();
381435

382436
constructor(private options: DifferOptions) {
383437
this.oldSource = new BufferedFormatReader(createSource(options.oldSource));
@@ -394,6 +448,8 @@ export class DifferContext {
394448
async [OpenSymbol](): Promise<void> {
395449
if (!this._isOpen) {
396450
this._isOpen = true;
451+
this._oldSourceStats = new SourceStats();
452+
this._newSourceStats = new SourceStats();
397453
await this.oldSource.open();
398454
await this.newSource.open();
399455
await this.extractHeaders();
@@ -436,6 +492,22 @@ export class DifferContext {
436492
get stats(): DiffStats {
437493
return this._stats;
438494
}
495+
496+
/**
497+
* gets the stats accumulated while parsing the old source
498+
* @returns the source stats
499+
*/
500+
get oldSourceStats(): SourceStats {
501+
return this._oldSourceStats;
502+
}
503+
504+
/**
505+
* gets the stats accumulated while parsing the new source
506+
* @returns the source stats
507+
*/
508+
get newSourceStats(): SourceStats {
509+
return this._newSourceStats;
510+
}
439511

440512
/**
441513
* Iterates over the changes and sends them to the submitted output.
@@ -542,6 +614,8 @@ export class DifferContext {
542614
previousPair = pair;
543615
}
544616
} finally {
617+
this.oldSourceStats.calcStats();
618+
this.newSourceStats.calcStats();
545619
this.close();
546620
}
547621
}
@@ -609,11 +683,13 @@ export class DifferContext {
609683
return result;
610684
}
611685

612-
async getNextRow(source: BufferedFormatReader): Promise<Row | undefined> {
686+
async getNextRow(source: BufferedFormatReader, stats: SourceStats): Promise<Row | undefined> {
613687
const row = await source.readRow();
614688
if (!row) {
615689
return row;
616690
}
691+
stats.incParsedRows();
692+
stats.incUniqueRows();
617693
if (this.duplicateKeyHandling === 'fail') {
618694
// Note that it will be further processed in ensureRowsAreInAscendingOrder
619695
return row;
@@ -626,9 +702,14 @@ export class DifferContext {
626702
if (isDuplicate) {
627703
const duplicateRows: Row[] = [];
628704
duplicateRows.push(row);
705+
stats.incUniqueRowsWithDuplicates();
706+
let duplicateCount = 0;
629707
while(isDuplicate) {
630708
const duplicateRow = await source.readRow();
631709
if (duplicateRow) {
710+
duplicateCount += 1;
711+
stats.incParsedRows();
712+
stats.incDuplicateParsedRows();
632713
if (this.duplicateKeyHandling !== 'keepFirstRow') {
633714
// we don't need to accumulate duplicate rows when we just have to return the first row!
634715
duplicateRows.push(duplicateRow);
@@ -649,6 +730,7 @@ export class DifferContext {
649730
const nextRow = await source.peekRow();
650731
isDuplicate = !!nextRow && this.comparer(this.keys, nextRow, row) === 0;
651732
}
733+
stats.incDuplicates(duplicateCount);
652734
if (this.duplicateKeyHandling === 'keepFirstRow') {
653735
return duplicateRows[0];
654736
}
@@ -661,11 +743,11 @@ export class DifferContext {
661743
}
662744

663745
private getNextOldRow(): Promise<Row | undefined> {
664-
return this.getNextRow(this.oldSource);
746+
return this.getNextRow(this.oldSource, this._oldSourceStats);
665747
}
666748

667749
private getNextNewRow(): Promise<Row | undefined> {
668-
return this.getNextRow(this.newSource);
750+
return this.getNextRow(this.newSource, this._newSourceStats);
669751
}
670752

671753
private async getNextPair():Promise<RowPair> {
@@ -693,7 +775,7 @@ export class DifferContext {
693775
const oldDelta = this.comparer(this.keys, previous, current);
694776
if (oldDelta === 0) {
695777
const cols = this.keys.map(key => key.name);
696-
throw new UniqueKeyViolationError(`Expected rows to be unique by "${cols}" in ${source} source but received:\n previous=${previous}\n current=${current}`);
778+
throw new UniqueKeyViolationError(`Expected rows to be unique by "${cols}" in ${source} source but received:\n previous=${previous}\n current=${current}\nNote that you can resolve this conflict automatically using the duplicateKeyHandling option.`);
697779
}
698780
if (oldDelta > 0) {
699781
const colOrder = this.keys.map(key => `${key.name} ${key.sortDirection ?? 'ASC'}`);

0 commit comments

Comments
 (0)