
Commit 7ee7b9b

feat: optimize buffer management for duplicate key handling
1 parent 51d3a75 commit 7ee7b9b

File tree: 5 files changed (+149 additions, -63 deletions)


README.md

Lines changed: 13 additions & 11 deletions
@@ -331,7 +331,6 @@ const stats = await diff({
     newSource: './tests/b.csv',
     keys: ['id'],
     duplicateKeyHandling: 'keepFirstRow', // or 'keepLastRow'
-    duplicateRowBufferSize: 2000,
 }).to('console');
 console.log(stats);
 ```
@@ -349,6 +348,8 @@ const stats = await diff({
 console.log(stats);
 ```
 
+Note that you can specify the size of the buffer if you know that the number of duplicates cannot exceed it; otherwise you can enable the **duplicateRowBufferOverflow** option,
+which will remove the first entries of the buffer when it exceeds the allocated capacity, to avoid a failure.
 
 ### Order 2 CSV files and diff them on the console
 
@@ -580,16 +581,17 @@ sortDirection| no | ASC | specifies if the column is sorted in ascen
 
 ### Differ options
 
-Name                  |Required|Default value|Description
-----------------------|--------|-------------|-----------
-oldSource             | yes    |             | either a string filename, a URL or a SourceOptions
-newSource             | yes    |             | either a string filename, a URL or a SourceOptions
-keys                  | yes    |             | the list of columns that form the primary key. This is required for comparing the rows. A key can be a string name or a {ColumnDefinition}
-includedColumns       | no     |             | the list of columns to keep from the input sources. If not specified, all columns are selected.
-excludedColumns       | no     |             | the list of columns to exclude from the input sources.
-rowComparer           | no     |             | specifies a custom row comparer.
-duplicateKeyHandling  | no     | fail        | specifies how to handle duplicate rows in a source. It will fail by default and throw a UniqueKeyViolationError exception. But you can ignore, keep the first or last row, or even provide your own function that will receive the duplicates and select the best candidate.
-duplicateRowBufferSize| no     | 1000        | specifies the maximum size of the buffer used to accumulate duplicate rows.
+Name                      |Required|Default value|Description
+--------------------------|--------|-------------|-----------
+oldSource                 | yes    |             | either a string filename, a URL or a SourceOptions
+newSource                 | yes    |             | either a string filename, a URL or a SourceOptions
+keys                      | yes    |             | the list of columns that form the primary key. This is required for comparing the rows. A key can be a string name or a {ColumnDefinition}
+includedColumns           | no     |             | the list of columns to keep from the input sources. If not specified, all columns are selected.
+excludedColumns           | no     |             | the list of columns to exclude from the input sources.
+rowComparer               | no     |             | specifies a custom row comparer.
+duplicateKeyHandling      | no     | fail        | specifies how to handle duplicate rows in a source. It will fail by default and throw a UniqueKeyViolationError exception. But you can ignore, keep the first or last row, or even provide your own function that will receive the duplicates and select the best candidate.
+duplicateRowBufferSize    | no     | 1000        | specifies the maximum size of the buffer used to accumulate duplicate rows.
+duplicateRowBufferOverflow| no     | false       | specifies if we can remove the first entries of the buffer to continue adding new duplicate entries when reaching maximum capacity, to avoid throwing an error and halting the process.
 
 ### diff function
 
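The new option composes with the existing ones as in this minimal usage sketch, assuming the `diff` entry point shown in the README excerpts above (the import name is inferred from the package name, and the file paths are placeholders):

```ts
import { diff } from 'tabular-data-differ';

// Sketch: tolerate arbitrarily long runs of duplicate keys by letting the
// duplicate buffer overflow instead of throwing 'Too many duplicate rows'.
const stats = await diff({
    oldSource: './tests/a.csv',   // placeholder path
    newSource: './tests/b.csv',   // placeholder path
    keys: ['id'],
    // a custom handler receives the buffered duplicates and picks one winner
    duplicateKeyHandling: (rows) => rows[rows.length - 1],
    duplicateRowBufferSize: 100,      // keep at most 100 duplicates in memory
    duplicateRowBufferOverflow: true, // drop the oldest entries past that cap
}).to('console');
console.log(stats);
```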

package-lock.json

Lines changed: 2 additions & 2 deletions
Generated file; diff not rendered.

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
     "name": "tabular-data-differ",
-    "version": "1.1.2",
+    "version": "1.1.3",
     "description": "A very efficient library for diffing two sorted streams of tabular data, such as CSV files.",
     "keywords": [
         "table",

src/differ.test.ts

Lines changed: 68 additions & 1 deletion
@@ -328,6 +328,73 @@ describe('differ', () => {
             [ '3', 'dave bis', '444' ]
         ]);
     });
+    test('should detect duplicate keys and call aggregate function, with buffer overflow', async () => {
+        const dups = [];
+        for (let i = 0; i < 100; i++) {
+            dups.push(`3,dave bis${i},444`);
+        }
+        let duplicateRows: Row[] = [];
+        const duplicateKeyHandler: DuplicateKeyHandler = (rows) => {
+            if (duplicateRows.length === 0) {
+                duplicateRows = rows;
+            }
+            return rows[rows.length - 1];
+        };
+        const writer = await diffStrings({
+            oldLines: [
+                'ID,NAME,AGE',
+                '1,john,33',
+                '2,rachel,22',
+                '3,dave,44',
+                ...dups,
+                '4,noemie,11',
+            ],
+            newLines: [
+                'ID,NAME,AGE',
+                '1,john,33',
+                '2,rachel,22',
+                '3,dave,44',
+            ],
+            keys: ['ID'],
+            duplicateKeyHandling: duplicateKeyHandler,
+            duplicateRowBufferOverflow: true,
+            duplicateRowBufferSize: 10,
+            keepSameRows: true,
+        });
+        expect(writer.diffs).toEqual([
+            {
+                delta: 0,
+                status: 'same',
+                oldRow: [ '1', 'john', '33' ],
+                newRow: [ '1', 'john', '33' ]
+            },
+            {
+                delta: 0,
+                status: 'same',
+                oldRow: [ '2', 'rachel', '22' ],
+                newRow: [ '2', 'rachel', '22' ]
+            },
+            {
+                delta: 0,
+                status: 'modified',
+                oldRow: [ '3', 'dave bis99', '444' ],
+                newRow: [ '3', 'dave', '44' ]
+            },
+            { delta: -1, status: 'deleted', oldRow: [ '4', 'noemie', '11' ] }
+        ]);
+        expect(duplicateRows).toEqual([
+            [ '3', 'dave bis90', '444' ],
+            [ '3', 'dave bis91', '444' ],
+            [ '3', 'dave bis92', '444' ],
+            [ '3', 'dave bis93', '444' ],
+            [ '3', 'dave bis94', '444' ],
+            [ '3', 'dave bis95', '444' ],
+            [ '3', 'dave bis96', '444' ],
+            [ '3', 'dave bis97', '444' ],
+            [ '3', 'dave bis98', '444' ],
+            [ '3', 'dave bis99', '444' ]
+        ]);
+    });
     test('should detect duplicate keys and throw an error when the buffer exceeds the limit', async () => {
         const dups = [];
         for (let i = 0; i < 10; i++) {
@@ -349,7 +416,7 @@ describe('differ', () => {
                 '3,dave,44',
             ],
             keys: ['ID'],
-            duplicateKeyHandling: 'keepLastRow',
+            duplicateKeyHandling: (rows) => rows[0],
             duplicateRowBufferSize: 5,
             keepSameRows: true,
         })).rejects.toThrowError('Too many duplicate rows');
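For reference, the overflow test above feeds 101 rows sharing the key '3' (the original '3,dave,44' plus the 100 generated 'dave bisN' rows) through a buffer capped at 10 entries, so the differ repeatedly discards the oldest buffered duplicate and the custom handler only ever sees the last ten rows, 'dave bis90' through 'dave bis99'.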

src/differ.ts

Lines changed: 65 additions & 48 deletions
@@ -210,10 +210,21 @@ export interface DifferOptions {
     duplicateKeyHandling?: DuplicateKeyHandling;
     /**
      * specifies the maximum size of the buffer used to accumulate duplicate rows.
+     * Note that the buffer size matters only when you provide a custom function to duplicateKeyHandling, since it will receive the accumulated duplicates
+     * as an input parameter.
      * @default 1000
      * @see duplicateKeyHandling
      */
     duplicateRowBufferSize?: number;
+    /**
+     * specifies if we can remove the first entries of the buffer to continue adding new duplicate entries when reaching maximum capacity,
+     * to avoid throwing an error and halting the process.
+     * Note that the buffer size matters only when you provide a custom function to duplicateKeyHandling, since it will receive the accumulated duplicates
+     * as an input parameter.
+     * @default false
+     * @see duplicateRowBufferSize
+     */
+    duplicateRowBufferOverflow?: boolean;
 }
 
 /**
@@ -373,7 +384,7 @@ export class DifferContext {
         this.newSource = new BufferedFormatReader(createSource(options.newSource));
         this.comparer = options.rowComparer ?? defaultRowComparer;
         this.duplicateKeyHandling = options.duplicateKeyHandling ?? 'fail';
-        this.duplicateRowBufferSize = options.duplicateRowBufferSize ?? 1000;
+        this.duplicateRowBufferSize = Math.max(5, options.duplicateRowBufferSize ?? 1000);
     }
 
     /**
@@ -598,12 +609,63 @@ export class DifferContext {
         return result;
     }
 
+    async getNextRow(source: BufferedFormatReader): Promise<Row | undefined> {
+        const row = await source.readRow();
+        if (!row) {
+            return row;
+        }
+        if (this.duplicateKeyHandling === 'fail') {
+            // Note that it will be further processed in ensureRowsAreInAscendingOrder
+            return row;
+        }
+        const nextRow = await source.peekRow();
+        if (!nextRow) {
+            return row;
+        }
+        let isDuplicate = this.comparer(this.keys, nextRow, row) === 0;
+        if (isDuplicate) {
+            const duplicateRows: Row[] = [];
+            duplicateRows.push(row);
+            while (isDuplicate) {
+                const duplicateRow = await source.readRow();
+                if (duplicateRow) {
+                    if (this.duplicateKeyHandling !== 'keepFirstRow') {
+                        // we don't need to accumulate duplicate rows when we just have to return the first row!
+                        duplicateRows.push(duplicateRow);
+                    }
+                    if (this.duplicateKeyHandling === 'keepLastRow') {
+                        // we don't need to accumulate the previous rows when we just have to return the last row!
+                        duplicateRows.shift();
+                    }
+                    if (duplicateRows.length > this.duplicateRowBufferSize) {
+                        if (this.options.duplicateRowBufferOverflow) {
+                            // remove the first entry when we can overflow
+                            duplicateRows.shift();
+                        } else {
+                            throw new Error('Too many duplicate rows');
+                        }
+                    }
+                }
+                const nextRow = await source.peekRow();
+                isDuplicate = !!nextRow && this.comparer(this.keys, nextRow, row) === 0;
+            }
+            if (this.duplicateKeyHandling === 'keepFirstRow') {
+                return duplicateRows[0];
+            }
+            if (this.duplicateKeyHandling === 'keepLastRow') {
+                return duplicateRows[duplicateRows.length - 1];
+            }
+            return this.duplicateKeyHandling(duplicateRows);
+        }
+        return row;
+    }
+
     private getNextOldRow(): Promise<Row | undefined> {
-        return getNextRow(this.oldSource, this.duplicateKeyHandling, this.comparer, this.keys, this.duplicateRowBufferSize);
+        return this.getNextRow(this.oldSource);
     }
 
     private getNextNewRow(): Promise<Row | undefined> {
-        return getNextRow(this.newSource, this.duplicateKeyHandling, this.comparer, this.keys, this.duplicateRowBufferSize);
+        return this.getNextRow(this.newSource);
     }
 
     private async getNextPair():Promise<RowPair> {
@@ -674,48 +736,3 @@ export function sameArrays(a: string[], b: string[]) {
     }
     return true;
 }
-
-async function getNextRow(
-    source: BufferedFormatReader,
-    duplicateKeyHandling: DuplicateKeyHandling,
-    comparer: RowComparer,
-    keys: Column[],
-    duplicateRowBufferSize: number,
-): Promise<Row | undefined> {
-    const row = await source.readRow();
-    if (!row) {
-        return row;
-    }
-    if (duplicateKeyHandling === 'fail') {
-        // Note that it will be further processed in ensureRowsAreInAscendingOrder
-        return row;
-    }
-    const nextRow = await source.peekRow();
-    if (!nextRow) {
-        return row;
-    }
-    let isDuplicate = comparer(keys, nextRow, row) === 0;
-    if (isDuplicate) {
-        const duplicateRows: Row[] = [];
-        duplicateRows.push(row);
-        while (isDuplicate) {
-            const duplicateRow = await source.readRow();
-            if (duplicateRow) {
-                duplicateRows.push(duplicateRow);
-                if (duplicateRows.length > duplicateRowBufferSize) {
-                    throw new Error('Too many duplicate rows');
-                }
-            }
-            const nextRow = await source.peekRow();
-            isDuplicate = !!nextRow && comparer(keys, nextRow, row) === 0;
-        }
-        if (duplicateKeyHandling === 'keepFirstRow') {
-            return duplicateRows[0];
-        }
-        if (duplicateKeyHandling === 'keepLastRow') {
-            return duplicateRows[duplicateRows.length - 1];
-        }
-        return duplicateKeyHandling(duplicateRows);
-    }
-    return row;
-}
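Since duplicateKeyHandling also accepts a custom DuplicateKeyHandler (as the tests above demonstrate), here is a minimal sketch of one; the selection rule and the handler name are illustrative, not part of the library:

```ts
// Illustrative handler: among the buffered duplicates, keep the row with the
// largest numeric value in its third column (the AGE column in the examples).
// With duplicateRowBufferOverflow enabled, `rows` holds at most
// duplicateRowBufferSize entries, so this scan stays bounded.
const keepHighestAge: DuplicateKeyHandler = (rows) =>
    rows.reduce((best, row) => (Number(row[2]) > Number(best[2]) ? row : best));
```

Note that the keepFirstRow and keepLastRow policies never let the buffer grow past a single row, so duplicateRowBufferSize and duplicateRowBufferOverflow only come into play with a custom handler, as the new JSDoc comments state.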
