test: bypass jsforce request buffering

cristiand391 · cristiand391 · commit 5fd3b4cfd935 · 2026-02-24T11:53:36.000-03:00
diff --git a/package.json b/package.json
@@ -135,6 +135,7 @@
     "csv-stringify": "^6.6.0",
     "form-data": "^4.0.5",
     "terminal-link": "^3.0.0",
+    "undici": "^7.22.0",
     "zod": "^4.3.6"
   },
   "devDependencies": {
diff --git a/src/bulkUtils.ts b/src/bulkUtils.ts
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-import { Transform, Readable } from 'node:stream';
+import { Transform, Readable, TransformCallback } from 'node:stream';
 import { createInterface } from 'node:readline';
 import { pipeline } from 'node:stream/promises';
 import * as fs from 'node:fs';
 import { EOL } from 'node:os';
-import { HttpApi } from '@jsforce/jsforce-node/lib/http-api.js';
+import { fetch } from 'undici';
 import { HttpResponse } from '@jsforce/jsforce-node';
 import {
   IngestJobV2Results,
@@ -75,28 +75,91 @@ export enum ColumnDelimiter {
 
 export type ColumnDelimiterKeys = keyof typeof ColumnDelimiter;
 
-async function bulkRequest(conn: Connection, url: string): Promise<{ body: string; headers: HttpResponse['headers'] }> {
-  const httpApi = new HttpApi(conn, {
-    responseType: 'text/plain', // this ensures jsforce doesn't try parsing the body
-  });
+/**
+ * Transform stream that skips the first line of CSV data (the header row).
+ * Used when processing subsequent bulk result pages to avoid duplicate headers.
+ *
+ * The header is located by scanning raw bytes for `LF` (0x0a) instead of decoding chunks
+ * to strings: 0x0a never appears inside a multi-byte UTF-8 sequence, so a character split
+ * across a chunk boundary is forwarded byte-for-byte rather than being replaced by U+FFFD.
+ * `CRLF` input is handled as well because the header's `CR` sits before the `LF`.
+ *
+ * Header bytes are discarded as they arrive, so nothing is buffered. If the input ends
+ * before any `LF` is seen, nothing is emitted.
+ */
+export class SkipFirstLineTransform extends Transform {
+  /** Set once the header's terminating `LF` has been consumed. */
+  private firstLineSkipped = false;
 
-  let headers: HttpResponse['headers'] | undefined;
+  /**
+   * Drops bytes up to and including the first `LF`, then forwards every later chunk unchanged.
+   *
+   * @param chunk next piece of the incoming byte stream
+   * @param _encoding unused; the chunk is handled as a Buffer
+   * @param callback signals completion and optionally pushes output downstream
+   */
+  public _transform(chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback): void {
+    if (this.firstLineSkipped) {
+      // Header already removed: pass the data through untouched.
+      callback(null, chunk);
+      return;
+    }
 
-  httpApi.on('response', (response: HttpResponse) => {
-    headers = response.headers;
-  });
+    // Search at the byte level so no partial character is ever decoded.
+    const newlineIndex = chunk.indexOf(0x0a);
+    if (newlineIndex === -1) {
+      // Still inside the header: discard this chunk and wait for the next one.
+      callback();
+      return;
+    }
 
-  const body = await httpApi.request<string>({
-    url: conn.normalizeUrl(url),
+    // End of header found: emit only what follows it in this chunk (possibly nothing).
+    // `subarray` returns a view, so the remaining bytes are neither copied nor re-encoded.
+    this.firstLineSkipped = true;
+    callback(null, chunk.subarray(newlineIndex + 1));
+  }
+}
+
+/**
+ * Requests `url` with undici's `fetch` and returns the body as a stream, bypassing jsforce,
+ * whose `Transport.httpRequest()` adds a 'complete' listener that buffers the whole response.
+ *
+ * @param conn connection providing `normalizeUrl` and the access token
+ * @param url endpoint to request
+ * @returns the body as a Node.js `Readable` plus the response headers as a plain object
+ * @throws Error when the status is not 2xx or the response has no body
+ */
+async function bulkRequest(
+  conn: Connection,
+  url: string
+): Promise<{ stream: Readable; headers: HttpResponse['headers'] }> {
+  const headers: { [name: string]: string } = {
+    Accept: 'text/csv', // a GET has no payload, so CSV is requested via `Accept`
+    'content-Type': 'text/csv',
+  };
+  if (conn.accessToken) {
+    headers.Authorization = `Bearer ${conn.accessToken}`;
+  }
+  const response = await fetch(conn.normalizeUrl(url), {
     method: 'GET',
+    headers,
   });
 
-  if (!headers) throw new Error('failed to get HTTP headers for bulk query');
+  if (!response.ok) {
+    // Free the connection: undici holds it until the body is consumed or cancelled.
+    await response.body?.cancel();
+    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+  }
 
-  return {
-    body,
-    headers,
-  };
+  if (!response.body) {
+    throw new Error('No body was returned');
+  }
+  // Copy the fetch `Headers` into the plain-object shape jsforce's `HttpResponse` uses.
+  const responseHeaders: HttpResponse['headers'] = {};
+  response.headers.forEach((value: string, key: string) => {
+    responseHeaders[key] = value;
+  });
+  return { stream: Readable.fromWeb(response.body), headers: responseHeaders };
 }
 
 export async function exportRecords(
@@ -124,6 +187,9 @@ export async function exportRecords(
 
   let recordsWritten = 0;
 
+  // refresh here because `bulkRequest` uses undici for fetching results.
+  await conn.refreshAuth();
+
   while (locator !== 'null') {
     // we can't parallelize this because we:
     // 1. need to get 1 batch to know the locator for the next one
@@ -151,7 +217,7 @@ export async function exportRecords(
 
       // eslint-disable-next-line no-await-in-loop
       await pipeline(
-        Readable.from(res.body),
+        res.stream,
         new csvParse({ columns: true, delimiter: ColumnDelimiter[outputInfo.columnDelimiter] }),
         new Transform({
           objectMode: true,
@@ -173,18 +239,15 @@ export async function exportRecords(
       await pipeline(
         locator
           ? [
-              // Skip the 1st row (CSV header) by finding the index of the first `LF`
-              // occurence and move the position 1 char ahead.
-              //
-              // CSVs using `CRLF` are still handled correctly because `CR` and `LF` are different chars in the string.
-              Readable.from(res.body.slice(res.body.indexOf('\n') + 1)),
+              res.stream,
+              new SkipFirstLineTransform(),
               fs.createWriteStream(outputInfo.filePath, {
                 // Open file for appending. The file is created if it does not exist.
                 // https://nodejs.org/api/fs.html#file-system-flags
                 flags: 'a', // append mode
               }),
             ]
-          : [Readable.from(res.body), fs.createWriteStream(outputInfo.filePath)]
+          : [res.stream, fs.createWriteStream(outputInfo.filePath)]
       );
     }
 
diff --git a/test/bulkUtils.test.ts b/test/bulkUtils.test.ts
@@ -14,9 +14,12 @@
  * limitations under the License.
  */
 
+import { Readable } from 'node:stream';
+import { pipeline } from 'node:stream/promises';
+
 import { expect } from 'chai';
 
-import { detectDelimiter } from '../src/bulkUtils.js';
+import { detectDelimiter, SkipFirstLineTransform } from '../src/bulkUtils.js';
 
 describe('bulkUtils', () => {
   describe('csv', () => {
@@ -31,4 +34,104 @@ describe('bulkUtils', () => {
       expect(await detectDelimiter('./test/test-files/csv/tab.csv')).to.equal('TAB');
     });
   });
+
+  describe('SkipFirstLineTransform', () => {
+    // Drains a stream and returns everything it emitted as one UTF-8 string.
+    async function streamToString(readable: Readable): Promise<string> {
+      const chunks: Buffer[] = [];
+      for await (const chunk of readable) {
+        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk as string));
+      }
+      return Buffer.concat(chunks).toString('utf8');
+    }
+
+    it('skips first line with LF endings', async () => {
+      const input = 'Header1,Header2,Header3\nRow1Col1,Row1Col2,Row1Col3\nRow2Col1,Row2Col2,Row2Col3\n';
+      const expected = 'Row1Col1,Row1Col2,Row1Col3\nRow2Col1,Row2Col2,Row2Col3\n';
+
+      const result = await streamToString(Readable.from(input).pipe(new SkipFirstLineTransform()));
+
+      expect(result).to.equal(expected);
+    });
+
+    it('skips first line with CRLF endings', async () => {
+      const input = 'Header1,Header2,Header3\r\nRow1Col1,Row1Col2,Row1Col3\r\nRow2Col1,Row2Col2,Row2Col3\r\n';
+      const expected = 'Row1Col1,Row1Col2,Row1Col3\r\nRow2Col1,Row2Col2,Row2Col3\r\n';
+
+      const result = await streamToString(Readable.from(input).pipe(new SkipFirstLineTransform()));
+
+      expect(result).to.equal(expected);
+    });
+
+    it('handles header split across chunks', async () => {
+      // Simulate a stream where the header is split across multiple chunks
+      const chunk1 = 'Header1,Head';
+      const chunk2 = 'er2,Header3\nRow1Col1,Row1Col2,Row1Col3\n';
+      const expected = 'Row1Col1,Row1Col2,Row1Col3\n';
+
+      const input = Readable.from([chunk1, chunk2]);
+      const result = await streamToString(input.pipe(new SkipFirstLineTransform()));
+
+      expect(result).to.equal(expected);
+    });
+
+    it('handles empty stream after header', async () => {
+      const input = 'Header1,Header2,Header3\n';
+      const expected = '';
+
+      const result = await streamToString(Readable.from(input).pipe(new SkipFirstLineTransform()));
+
+      expect(result).to.equal(expected);
+    });
+
+    it('handles single-line input without newline', async () => {
+      // Edge case: header with no newline at all
+      const input = 'Header1,Header2,Header3';
+      const expected = '';
+
+      const result = await streamToString(Readable.from(input).pipe(new SkipFirstLineTransform()));
+
+      expect(result).to.equal(expected);
+    });
+
+    it('handles multi-byte UTF-8 characters in header', async () => {
+      const input = 'Header1,Hëàdér2,Header3\nRow1Col1,Row1Col2,Row1Col3\n';
+      const expected = 'Row1Col1,Row1Col2,Row1Col3\n';
+
+      const result = await streamToString(Readable.from(input).pipe(new SkipFirstLineTransform()));
+
+      expect(result).to.equal(expected);
+    });
+
+    it('handles very long header line', async () => {
+      // Create a header with many columns
+      const headerCols = Array.from({ length: 100 }, (_, i) => `Header${i}`).join(',');
+      const dataCols = Array.from({ length: 100 }, (_, i) => `Data${i}`).join(',');
+      const input = `${headerCols}\n${dataCols}\n`;
+      const expected = `${dataCols}\n`;
+
+      const result = await streamToString(Readable.from(input).pipe(new SkipFirstLineTransform()));
+
+      expect(result).to.equal(expected);
+    });
+
+    it('passes through data correctly in pipeline', async () => {
+      const input = 'Id,Name,Email\n1,John,john@example.com\n2,Jane,jane@example.com\n';
+      const expected = '1,John,john@example.com\n2,Jane,jane@example.com\n';
+
+      const chunks: string[] = [];
+      await pipeline(
+        Readable.from(input),
+        new SkipFirstLineTransform(),
+        async (source: AsyncIterable<Buffer>): Promise<void> => {
+          for await (const chunk of source) {
+            const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk as string);
+            chunks.push(buffer.toString('utf8'));
+          }
+        }
+      );
+
+      expect(chunks.join('')).to.equal(expected);
+    });
+  });
 });
diff --git a/yarn.lock b/yarn.lock
@@ -7553,6 +7553,11 @@ undici-types@~6.19.2:
   resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-6.19.6.tgz#e218c3df0987f4c0e0008ca00d6b6472d9b89b36"
   integrity sha512-e/vggGopEfTKSvj4ihnOLTsqhrKRN3LeO6qSN/GxohhuRv8qH9bNQ4B8W7e/vFL+0XTnmHPB4/kegunZGA4Org==
 
+undici@^7.22.0:
+  version "7.22.0"
+  resolved "https://registry.yarnpkg.com/undici/-/undici-7.22.0.tgz#7a82590a5908e504a47d85c60b0f89ca14240e60"
+  integrity sha512-RqslV2Us5BrllB+JeiZnK4peryVTndy9Dnqq62S3yYRRTj0tFQCwEniUy2167skdGOy3vqRzEvl1Dm4sV2ReDg==
+
 unicorn-magic@^0.3.0:
   version "0.3.0"
   resolved "https://registry.yarnpkg.com/unicorn-magic/-/unicorn-magic-0.3.0.tgz#4efd45c85a69e0dd576d25532fbfa22aa5c8a104"