Make results consistent, add utf-16be

ChALkeR · ChALkeR · commit d5625a800d37 · 2026-02-12T17:11:57.000+04:00
diff --git a/lib/index.ts b/lib/index.ts
@@ -1,7 +1,11 @@
+import { utf16fromStringLoose, utf16toStringLoose } from "@exodus/bytes/utf16.js";
+import { utf8fromStringLoose, utf8toStringLoose } from "@exodus/bytes/utf8.js";
+
 export type SupportedEncoding =
   | "utf-8"
   | "utf8"
   | "utf-16le"
+  | "utf-16be"
   | "us-ascii"
   | "ascii"
   | "latin1"
@@ -21,19 +25,6 @@ for (const [code, char] of Object.entries(WINDOWS_1252_EXTRA)) {
   WINDOWS_1252_REVERSE[char] = Number.parseInt(code, 10);
 }
 
-// ---------- Cached decoders/encoders ----------
-let _utf8Decoder: TextDecoder | undefined;
-let _utf8Encoder: TextEncoder | undefined;
-
-function utf8Decoder(): TextDecoder | undefined {
-  if (typeof globalThis.TextDecoder === "undefined") return undefined;
-  return (_utf8Decoder ??= new globalThis.TextDecoder("utf-8"));
-}
-function utf8Encoder(): TextEncoder | undefined {
-  if (typeof globalThis.TextEncoder === "undefined") return undefined;
-  return (_utf8Encoder ??= new globalThis.TextEncoder());
-}
-
 // Safe chunk size well under your measured ~105k cliff.
 // 32k keeps memory reasonable and is plenty fast.
 const CHUNK = 32 * 1024;
@@ -49,12 +40,17 @@ export function textDecode(
 ): string {
   switch (encoding.toLowerCase() as SupportedEncoding) {
     case "utf-8":
-    case "utf8": {
-      const dec = utf8Decoder();
-      return dec ? dec.decode(bytes) : decodeUTF8(bytes);
-    }
+    case "utf8":
+      return utf8toStringLoose(bytes);
     case "utf-16le":
-      return decodeUTF16LE(bytes);
+    case "utf-16be": {
+      // Normalize like the switch does, so e.g. "UTF-16BE" is not decoded as little-endian.
+      const format = encoding.toLowerCase() === "utf-16be" ? "uint8-be" : "uint8-le";
+      const odd = bytes.length % 2 === 1;
+      // A dangling odd byte cannot form a code unit: drop it and append U+FFFD instead.
+      const text = utf16toStringLoose(odd ? bytes.subarray(0, -1) : bytes, format);
+      return odd ? text + "\uFFFD" : text;
+    }
     case "us-ascii":
     case "ascii":
       return decodeASCII(bytes);
@@ -74,12 +70,12 @@ export function textEncode(
 ): Uint8Array {
   switch (encoding.toLowerCase() as SupportedEncoding) {
     case "utf-8":
-    case "utf8": {
-      const enc = utf8Encoder();
-      return enc ? enc.encode(input) : encodeUTF8(input);
-    }
+    case "utf8":
+      return utf8fromStringLoose(input);
     case "utf-16le":
-      return encodeUTF16LE(input);
+      return utf16fromStringLoose(input, "uint8-le");
+    case "utf-16be":
+      return utf16fromStringLoose(input, "uint8-be");
     case "us-ascii":
     case "ascii":
       return encodeASCII(input);
@@ -95,64 +91,6 @@ export function textEncode(
 
 // --- Internal helpers ---
 
-function decodeUTF8(bytes: Uint8Array): string {
-  const parts: string[] = [];
-  let out = "";
-  let i = 0;
-  while (i < bytes.length) {
-    const b1 = bytes[i++];
-    if (b1 < 0x80) {
-      out += String.fromCharCode(b1);
-    } else if (b1 < 0xe0) {
-      const b2 = bytes[i++] & 0x3f;
-      out += String.fromCharCode(((b1 & 0x1f) << 6) | b2);
-    } else if (b1 < 0xf0) {
-      const b2 = bytes[i++] & 0x3f;
-      const b3 = bytes[i++] & 0x3f;
-      out += String.fromCharCode(((b1 & 0x0f) << 12) | (b2 << 6) | b3);
-    } else {
-      const b2 = bytes[i++] & 0x3f;
-      const b3 = bytes[i++] & 0x3f;
-      const b4 = bytes[i++] & 0x3f;
-      let cp = ((b1 & 0x07) << 18) | (b2 << 12) | (b3 << 6) | b4;
-      cp -= 0x10000;
-      out += String.fromCharCode(
-        0xd800 + ((cp >> 10) & 0x3ff),
-        0xdc00 + (cp & 0x3ff)
-      );
-    }
-
-    if (out.length >= CHUNK) {
-      parts.push(out);
-      out = "";
-    }
-  }
-
-  if (out) parts.push(out);
-  return parts.join("");
-}
-
-function decodeUTF16LE(bytes: Uint8Array): string {
-  // Use chunked fromCharCode on 16-bit code units.
-  // If odd length, ignore trailing byte (common behavior).
-  const len = bytes.length & ~1;
-  if (len === 0) return "";
-
-  const parts: string[] = [];
-  // Build a temporary code-unit array per chunk.
-  const maxUnits = CHUNK; // CHUNK code units per chunk
-
-  for (let i = 0; i < len; ) {
-    const unitsThis = Math.min(maxUnits, (len - i) >> 1);
-    const units = new Array<number>(unitsThis);
-    for (let j = 0; j < unitsThis; j++, i += 2) {
-      units[j] = bytes[i] | (bytes[i + 1] << 8);
-    }
-    parts.push(String.fromCharCode.apply(null, units as unknown as number[]));
-  }
-  return parts.join("");
-}
-
 function decodeASCII(bytes: Uint8Array): string {
   // 7-bit ASCII: mask high bit. (Kept to match your original semantics.)
   const parts: string[] = [];
@@ -201,53 +139,6 @@ function decodeWindows1252(bytes: Uint8Array): string {
   return parts.join("");
 }
 
-function encodeUTF8(str: string): Uint8Array {
-  const out: number[] = [];
-  for (let i = 0; i < str.length; i++) {
-    let cp = str.charCodeAt(i);
-
-    // surrogate pair
-    if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < str.length) {
-      const lo = str.charCodeAt(i + 1);
-      if (lo >= 0xdc00 && lo <= 0xdfff) {
-        cp = 0x10000 + ((cp - 0xd800) << 10) + (lo - 0xdc00);
-        i++;
-      }
-    }
-
-    if (cp < 0x80) {
-      out.push(cp);
-    } else if (cp < 0x800) {
-      out.push(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
-    } else if (cp < 0x10000) {
-      out.push(
-        0xe0 | (cp >> 12),
-        0x80 | ((cp >> 6) & 0x3f),
-        0x80 | (cp & 0x3f)
-      );
-    } else {
-      out.push(
-        0xf0 | (cp >> 18),
-        0x80 | ((cp >> 12) & 0x3f),
-        0x80 | ((cp >> 6) & 0x3f),
-        0x80 | (cp & 0x3f)
-      );
-    }
-  }
-  return new Uint8Array(out);
-}
-
-function encodeUTF16LE(str: string): Uint8Array {
-  const out = new Uint8Array(str.length * 2);
-  for (let i = 0; i < str.length; i++) {
-    const code = str.charCodeAt(i);
-    const o = i * 2;
-    out[o] = code & 0xff;
-    out[o + 1] = code >>> 8;
-  }
-  return out;
-}
-
 function encodeASCII(str: string): Uint8Array {
   // 7-bit ASCII: mask high bit
   const out = new Uint8Array(str.length);
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -66,5 +66,8 @@
     "type": "github",
     "url": "https://github.com/sponsors/Borewit"
   },
-  "license": "MIT"
+  "license": "MIT",
+  "dependencies": {
+    "@exodus/bytes": "^1.14.0"
+  }
 }
diff --git a/test/test.ts b/test/test.ts
@@ -1,10 +1,27 @@
 import { expect } from "chai";
+import { toHex } from "@exodus/bytes/hex.js";
 import { textEncode, textDecode, type SupportedEncoding } from '../lib/index.js';
 
+const nonUtf8 = [ // byte sequences that are not valid UTF-8, with the UTF-16 code units textDecode is expected to produce
+  { bytes: [0, 254, 255], charcodes: [0, 0xff_fd, 0xff_fd] }, // 0xFE and 0xFF are each expected as U+FFFD
+  { bytes: [0x80], charcodes: [0xff_fd] },
+  { bytes: [0xf0, 0x90, 0x80], charcodes: [0xff_fd] }, // three bytes expected to collapse into a single U+FFFD
+  { bytes: [0xf0, 0x80, 0x80], charcodes: [0xff_fd, 0xff_fd, 0xff_fd] }, // here every byte is expected as its own U+FFFD
+]
+
+const orphans = [ // strings containing unpaired surrogates: input code units, expected code units after replacement, hex of the expected UTF-8 bytes
+  { charcodes: [0x61, 0x62, 0xd8_00, 0x77, 0x78], replaced: [0x61, 0x62, 0xff_fd, 0x77, 0x78], utf8: '6162efbfbd7778' },
+  { charcodes: [0xd8_00], replaced: [0xff_fd], utf8: 'efbfbd' },
+  { charcodes: [0xd8_00, 0xd8_00], replaced: [0xff_fd, 0xff_fd], utf8: 'efbfbdefbfbd' },
+  { charcodes: [0x61, 0x62, 0xdf_ff, 0x77, 0x78], replaced: [0x61, 0x62, 0xff_fd, 0x77, 0x78], utf8: '6162efbfbd7778' },
+  { charcodes: [0xdf_ff, 0xd8_00], replaced: [0xff_fd, 0xff_fd], utf8: 'efbfbdefbfbd' }, // low surrogate before high surrogate: both are expected to be replaced
+]
+
 describe("Text polyfill encode/decode", () => {
   const encodings: [SupportedEncoding, string][] = [
     ["utf-8", "Hello 🌍"],
     ["utf-16le", "Hello 🌍"],
+    ["utf-16be", "Hello 🌍"],
     ["ascii", "Hello!"],
     ["latin1", "Héllo ¢"],
     ["windows-1252", "Hello €—World"],
@@ -30,6 +47,24 @@ describe("Text polyfill encode/decode", () => {
       const str = "𝄞"; // U+1D11E
       expect(textDecode(textEncode(str, "utf-8"), "utf-8")).to.equal(str);
     });
+    it("should ignore (not remove) BOM", () => {
+      expect(textDecode(Uint8Array.of(0xef, 0xbb, 0xbf), "utf-8"), "utf-8").to.equal("\uFEFF");
+      expect(textDecode(Uint8Array.of(0xef, 0xbb, 0xbf, 0x42), "utf-8"), "utf-8").to.equal("\uFEFFB");
+    });
+    it("textDecode replacement", () => {
+      for (const { bytes, charcodes } of nonUtf8) {
+        const string = String.fromCharCode(...charcodes)
+        expect(textDecode(Uint8Array.from(bytes), "utf-8")).to.equal(string);
+        expect(textDecode(textEncode(string, "utf-8"), "utf-8")).to.equal(string);
+      }
+    });
+    it("textEncode replacement", () => {
+      for (const { charcodes, replaced, utf8 } of orphans) {
+        const bytes = textEncode(String.fromCharCode(...charcodes), "utf-8");
+        expect(toHex(bytes)).to.equal(utf8);
+        expect(textDecode(bytes, "utf-8")).to.equal(String.fromCharCode(...replaced));
+      }
+    });
   });
 
   describe("UTF-16LE", () => {
@@ -41,6 +76,67 @@ describe("Text polyfill encode/decode", () => {
       const str = "😀";
       expect(textDecode(textEncode(str, "utf-16le"), "utf-16le")).to.equal(str);
     });
+    it("should ignore (not remove) BOM", () => {
+      expect(textDecode(Uint8Array.of(0xff, 0xfe), "utf-16le"), "utf-16le").to.equal("\uFEFF");
+      expect(textDecode(Uint8Array.of(0xff, 0xfe, 0x42, 0), "utf-16le"), "utf-16le").to.equal("\uFEFFB");
+    });
+    it("textDecode replacement", () => {
+      for (const { charcodes, replaced } of orphans) {
+        const bytes = new Uint8Array(replaced.length * 2);
+        const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+        for (let i = 0; i < charcodes.length; i++) view.setUint16(i * 2, charcodes[i], true);
+        const string = String.fromCharCode(...replaced);
+        expect(textDecode(bytes, "utf-16le")).to.equal(string);
+        expect(textDecode(textEncode(string, "utf-16le"), "utf-16le")).to.equal(string);
+      }
+    });
+    it("textEncode replacement", () => {
+      for (const { charcodes, replaced } of orphans) {
+        const bytes = textEncode(String.fromCharCode(...charcodes), "utf-16le");
+        const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+        expect(view.byteLength).to.equal(replaced.length * 2);
+        for (let i = 0; i < replaced.length; i++) {
+          expect(view.getUint16(i * 2, true)).to.equal(replaced[i]);
+        }
+        expect(textDecode(bytes, "utf-16le")).to.equal(String.fromCharCode(...replaced));
+      }
+    });
+  });
+
+  describe("UTF-16BE", () => { // round-trip, BOM and unpaired-surrogate checks for the "utf-16be" encoding
+    it("should handle BMP chars", () => {
+      const str = "ABC";
+      expect(textDecode(textEncode(str, "utf-16be"), "utf-16be")).to.equal(str);
+    });
+    it("should handle emoji", () => {
+      const str = "😀";
+      expect(textDecode(textEncode(str, "utf-16be"), "utf-16be")).to.equal(str);
+    });
+    it("should ignore (not remove) BOM", () => { // the decoded string is expected to still start with U+FEFF
+      expect(textDecode(Uint8Array.of(0xfe, 0xff), "utf-16be"), "utf-16be").to.equal("\uFEFF"); // the second argument to expect() is only the assertion message
+      expect(textDecode(Uint8Array.of(0xfe, 0xff, 0, 0x42), "utf-16be"), "utf-16be").to.equal("\uFEFFB");
+    });
+    it("textDecode replacement", () => {
+      for (const { charcodes, replaced } of orphans) {
+        const bytes = new Uint8Array(replaced.length * 2); // two bytes per code unit
+        const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+        for (let i = 0; i < charcodes.length; i++) view.setUint16(i * 2, charcodes[i], false); // littleEndian = false, i.e. big-endian
+        const string = String.fromCharCode(...replaced);
+        expect(textDecode(bytes, "utf-16be")).to.equal(string);
+        expect(textDecode(textEncode(string, "utf-16be"), "utf-16be")).to.equal(string);
+      }
+    });
+    it("textEncode replacement", () => {
+      for (const { charcodes, replaced } of orphans) {
+        const bytes = textEncode(String.fromCharCode(...charcodes), "utf-16be");
+        const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+        expect(view.byteLength).to.equal(replaced.length * 2);
+        for (let i = 0; i < replaced.length; i++) {
+          expect(view.getUint16(i * 2, false)).to.equal(replaced[i]); // read back as big-endian code units
+        }
+        expect(textDecode(bytes, "utf-16be")).to.equal(String.fromCharCode(...replaced));
+      }
+    });
   });
 
   describe("ASCII", () => {