Skip to content

Commit d5625a8

Browse files
committed
Make results consistent, add utf-16be
1 parent ec17227 commit d5625a8

File tree

4 files changed

+141
-131
lines changed

4 files changed

+141
-131
lines changed

lib/index.ts

Lines changed: 19 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1+
import { utf16fromStringLoose, utf16toStringLoose } from "@exodus/bytes/utf16.js";
2+
import { utf8fromStringLoose, utf8toStringLoose } from "@exodus/bytes/utf8.js";
3+
14
export type SupportedEncoding =
25
| "utf-8"
36
| "utf8"
47
| "utf-16le"
8+
| "utf-16be"
59
| "us-ascii"
610
| "ascii"
711
| "latin1"
@@ -21,19 +25,6 @@ for (const [code, char] of Object.entries(WINDOWS_1252_EXTRA)) {
2125
WINDOWS_1252_REVERSE[char] = Number.parseInt(code, 10);
2226
}
2327

24-
// ---------- Cached decoders/encoders ----------
25-
let _utf8Decoder: TextDecoder | undefined;
26-
let _utf8Encoder: TextEncoder | undefined;
27-
28-
function utf8Decoder(): TextDecoder | undefined {
29-
if (typeof globalThis.TextDecoder === "undefined") return undefined;
30-
return (_utf8Decoder ??= new globalThis.TextDecoder("utf-8"));
31-
}
32-
function utf8Encoder(): TextEncoder | undefined {
33-
if (typeof globalThis.TextEncoder === "undefined") return undefined;
34-
return (_utf8Encoder ??= new globalThis.TextEncoder());
35-
}
36-
3728
// Safe chunk size well under your measured ~105k cliff.
3829
// 32k keeps memory reasonable and is plenty fast.
3930
const CHUNK = 32 * 1024;
@@ -49,12 +40,17 @@ export function textDecode(
4940
): string {
5041
switch (encoding.toLowerCase() as SupportedEncoding) {
5142
case "utf-8":
52-
case "utf8": {
53-
const dec = utf8Decoder();
54-
return dec ? dec.decode(bytes) : decodeUTF8(bytes);
55-
}
43+
case "utf8":
44+
return utf8toStringLoose(bytes);
5645
case "utf-16le":
57-
return decodeUTF16LE(bytes);
46+
case "utf-16be": {
47+
let suffix = "";
48+
if (bytes.length % 2 === 1) {
49+
suffix = '\uFFFD';
50+
bytes = bytes.subarray(0, -1);
51+
}
52+
// The switch above matches on encoding.toLowerCase(), so compare the lowercased value here too; otherwise e.g. "UTF-16BE" reaches this branch but is decoded as little-endian.
return utf16toStringLoose(bytes, encoding.toLowerCase() === 'utf-16be' ? 'uint8-be' : 'uint8-le') + suffix;
53+
}
5854
case "us-ascii":
5955
case "ascii":
6056
return decodeASCII(bytes);
@@ -74,12 +70,12 @@ export function textEncode(
7470
): Uint8Array {
7571
switch (encoding.toLowerCase() as SupportedEncoding) {
7672
case "utf-8":
77-
case "utf8": {
78-
const enc = utf8Encoder();
79-
return enc ? enc.encode(input) : encodeUTF8(input);
80-
}
73+
case "utf8":
74+
return utf8fromStringLoose(input);
8175
case "utf-16le":
82-
return encodeUTF16LE(input);
76+
return utf16fromStringLoose(input, "uint8-le");
77+
case "utf-16be":
78+
return utf16fromStringLoose(input, "uint8-be");
8379
case "us-ascii":
8480
case "ascii":
8581
return encodeASCII(input);
@@ -95,64 +91,6 @@ export function textEncode(
9591

9692
// --- Internal helpers ---
9793

98-
function decodeUTF8(bytes: Uint8Array): string {
99-
const parts: string[] = [];
100-
let out = "";
101-
let i = 0;
102-
while (i < bytes.length) {
103-
const b1 = bytes[i++];
104-
if (b1 < 0x80) {
105-
out += String.fromCharCode(b1);
106-
} else if (b1 < 0xe0) {
107-
const b2 = bytes[i++] & 0x3f;
108-
out += String.fromCharCode(((b1 & 0x1f) << 6) | b2);
109-
} else if (b1 < 0xf0) {
110-
const b2 = bytes[i++] & 0x3f;
111-
const b3 = bytes[i++] & 0x3f;
112-
out += String.fromCharCode(((b1 & 0x0f) << 12) | (b2 << 6) | b3);
113-
} else {
114-
const b2 = bytes[i++] & 0x3f;
115-
const b3 = bytes[i++] & 0x3f;
116-
const b4 = bytes[i++] & 0x3f;
117-
let cp = ((b1 & 0x07) << 18) | (b2 << 12) | (b3 << 6) | b4;
118-
cp -= 0x10000;
119-
out += String.fromCharCode(
120-
0xd800 + ((cp >> 10) & 0x3ff),
121-
0xdc00 + (cp & 0x3ff)
122-
);
123-
}
124-
125-
if (out.length >= CHUNK) {
126-
parts.push(out);
127-
out = "";
128-
}
129-
}
130-
131-
if (out) parts.push(out);
132-
return parts.join("");
133-
}
134-
135-
function decodeUTF16LE(bytes: Uint8Array): string {
136-
// Use chunked fromCharCode on 16-bit code units.
137-
// If odd length, ignore trailing byte (common behavior).
138-
const len = bytes.length & ~1;
139-
if (len === 0) return "";
140-
141-
const parts: string[] = [];
142-
// Build a temporary code-unit array per chunk.
143-
const maxUnits = CHUNK; // CHUNK code units per chunk
144-
145-
for (let i = 0; i < len; ) {
146-
const unitsThis = Math.min(maxUnits, (len - i) >> 1);
147-
const units = new Array<number>(unitsThis);
148-
for (let j = 0; j < unitsThis; j++, i += 2) {
149-
units[j] = bytes[i] | (bytes[i + 1] << 8);
150-
}
151-
parts.push(String.fromCharCode.apply(null, units as unknown as number[]));
152-
}
153-
return parts.join("");
154-
}
155-
15694
function decodeASCII(bytes: Uint8Array): string {
15795
// 7-bit ASCII: mask high bit. (Kept to match your original semantics.)
15896
const parts: string[] = [];
@@ -201,53 +139,6 @@ function decodeWindows1252(bytes: Uint8Array): string {
201139
return parts.join("");
202140
}
203141

204-
function encodeUTF8(str: string): Uint8Array {
205-
const out: number[] = [];
206-
for (let i = 0; i < str.length; i++) {
207-
let cp = str.charCodeAt(i);
208-
209-
// surrogate pair
210-
if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < str.length) {
211-
const lo = str.charCodeAt(i + 1);
212-
if (lo >= 0xdc00 && lo <= 0xdfff) {
213-
cp = 0x10000 + ((cp - 0xd800) << 10) + (lo - 0xdc00);
214-
i++;
215-
}
216-
}
217-
218-
if (cp < 0x80) {
219-
out.push(cp);
220-
} else if (cp < 0x800) {
221-
out.push(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
222-
} else if (cp < 0x10000) {
223-
out.push(
224-
0xe0 | (cp >> 12),
225-
0x80 | ((cp >> 6) & 0x3f),
226-
0x80 | (cp & 0x3f)
227-
);
228-
} else {
229-
out.push(
230-
0xf0 | (cp >> 18),
231-
0x80 | ((cp >> 12) & 0x3f),
232-
0x80 | ((cp >> 6) & 0x3f),
233-
0x80 | (cp & 0x3f)
234-
);
235-
}
236-
}
237-
return new Uint8Array(out);
238-
}
239-
240-
function encodeUTF16LE(str: string): Uint8Array {
241-
const out = new Uint8Array(str.length * 2);
242-
for (let i = 0; i < str.length; i++) {
243-
const code = str.charCodeAt(i);
244-
const o = i * 2;
245-
out[o] = code & 0xff;
246-
out[o + 1] = code >>> 8;
247-
}
248-
return out;
249-
}
250-
251142
function encodeASCII(str: string): Uint8Array {
252143
// 7-bit ASCII: mask high bit
253144
const out = new Uint8Array(str.length);

package-lock.json

Lines changed: 22 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,8 @@
6666
"type": "github",
6767
"url": "https://github.com/sponsors/Borewit"
6868
},
69-
"license": "MIT"
69+
"license": "MIT",
70+
"dependencies": {
71+
"@exodus/bytes": "^1.14.0"
72+
}
7073
}

test/test.ts

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
11
import { expect } from "chai";
2+
import { toHex } from "@exodus/bytes/hex.js";
23
import { textEncode, textDecode, type SupportedEncoding } from '../lib/index.js';
34

5+
const nonUtf8 = [
6+
{ bytes: [0, 254, 255], charcodes: [0, 0xff_fd, 0xff_fd] },
7+
{ bytes: [0x80], charcodes: [0xff_fd] },
8+
{ bytes: [0xf0, 0x90, 0x80], charcodes: [0xff_fd] },
9+
{ bytes: [0xf0, 0x80, 0x80], charcodes: [0xff_fd, 0xff_fd, 0xff_fd] },
10+
]
11+
12+
const orphans = [
13+
{ charcodes: [0x61, 0x62, 0xd8_00, 0x77, 0x78], replaced: [0x61, 0x62, 0xff_fd, 0x77, 0x78], utf8: '6162efbfbd7778' },
14+
{ charcodes: [0xd8_00], replaced: [0xff_fd], utf8: 'efbfbd' },
15+
{ charcodes: [0xd8_00, 0xd8_00], replaced: [0xff_fd, 0xff_fd], utf8: 'efbfbdefbfbd' },
16+
{ charcodes: [0x61, 0x62, 0xdf_ff, 0x77, 0x78], replaced: [0x61, 0x62, 0xff_fd, 0x77, 0x78], utf8: '6162efbfbd7778' },
17+
{ charcodes: [0xdf_ff, 0xd8_00], replaced: [0xff_fd, 0xff_fd], utf8: 'efbfbdefbfbd' },
18+
]
19+
420
describe("Text polyfill encode/decode", () => {
521
const encodings: [SupportedEncoding, string][] = [
622
["utf-8", "Hello 🌍"],
723
["utf-16le", "Hello 🌍"],
24+
["utf-16be", "Hello 🌍"],
825
["ascii", "Hello!"],
926
["latin1", "Héllo ¢"],
1027
["windows-1252", "Hello €—World"],
@@ -30,6 +47,24 @@ describe("Text polyfill encode/decode", () => {
3047
const str = "𝄞"; // U+1D11E
3148
expect(textDecode(textEncode(str, "utf-8"), "utf-8")).to.equal(str);
3249
});
50+
it("should ignore (not remove) BOM", () => {
51+
expect(textDecode(Uint8Array.of(0xef, 0xbb, 0xbf), "utf-8"), "utf-8").to.equal("\uFEFF");
52+
expect(textDecode(Uint8Array.of(0xef, 0xbb, 0xbf, 0x42), "utf-8"), "utf-8").to.equal("\uFEFFB");
53+
});
54+
it("textDecode replacement", () => {
55+
for (const { bytes, charcodes } of nonUtf8) {
56+
const string = String.fromCharCode(...charcodes)
57+
expect(textDecode(Uint8Array.from(bytes), "utf-8")).to.equal(string);
58+
expect(textDecode(textEncode(string, "utf-8"), "utf-8")).to.equal(string);
59+
}
60+
});
61+
it("textEncode replacement", () => {
62+
for (const { charcodes, replaced, utf8 } of orphans) {
63+
const bytes = textEncode(String.fromCharCode(...charcodes), "utf-8");
64+
expect(toHex(bytes)).to.equal(utf8);
65+
expect(textDecode(bytes, "utf-8")).to.equal(String.fromCharCode(...replaced));
66+
}
67+
});
3368
});
3469

3570
describe("UTF-16LE", () => {
@@ -41,6 +76,67 @@ describe("Text polyfill encode/decode", () => {
4176
const str = "😀";
4277
expect(textDecode(textEncode(str, "utf-16le"), "utf-16le")).to.equal(str);
4378
});
79+
it("should ignore (not remove) BOM", () => {
80+
expect(textDecode(Uint8Array.of(0xff, 0xfe), "utf-16le"), "utf-16le").to.equal("\uFEFF");
81+
expect(textDecode(Uint8Array.of(0xff, 0xfe, 0x42, 0), "utf-16le"), "utf-16le").to.equal("\uFEFFB");
82+
});
83+
it("textDecode replacement", () => {
84+
for (const { charcodes, replaced } of orphans) {
85+
const bytes = new Uint8Array(replaced.length * 2);
86+
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
87+
for (let i = 0; i < charcodes.length; i++) view.setUint16(i * 2, charcodes[i], true);
88+
const string = String.fromCharCode(...replaced);
89+
expect(textDecode(bytes, "utf-16le")).to.equal(string);
90+
expect(textDecode(textEncode(string, "utf-16le"), "utf-16le")).to.equal(string);
91+
}
92+
});
93+
it("textEncode replacement", () => {
94+
for (const { charcodes, replaced } of orphans) {
95+
const bytes = textEncode(String.fromCharCode(...charcodes), "utf-16le");
96+
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
97+
expect(view.byteLength).to.equal(replaced.length * 2);
98+
for (let i = 0; i < replaced.length; i++) {
99+
expect(view.getUint16(i * 2, true)).to.equal(replaced[i]);
100+
}
101+
expect(textDecode(bytes, "utf-16le")).to.equal(String.fromCharCode(...replaced));
102+
}
103+
});
104+
});
105+
106+
describe("UTF-16BE", () => {
107+
it("should handle BMP chars", () => {
108+
const str = "ABC";
109+
expect(textDecode(textEncode(str, "utf-16be"), "utf-16be")).to.equal(str);
110+
});
111+
it("should handle emoji", () => {
112+
const str = "😀";
113+
expect(textDecode(textEncode(str, "utf-16be"), "utf-16be")).to.equal(str);
114+
});
115+
it("should ignore (not remove) BOM", () => {
116+
expect(textDecode(Uint8Array.of(0xfe, 0xff), "utf-16be"), "utf-16be").to.equal("\uFEFF");
117+
expect(textDecode(Uint8Array.of(0xfe, 0xff, 0, 0x42), "utf-16be"), "utf-16be").to.equal("\uFEFFB");
118+
});
119+
it("textDecode replacement", () => {
120+
for (const { charcodes, replaced } of orphans) {
121+
const bytes = new Uint8Array(replaced.length * 2);
122+
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
123+
for (let i = 0; i < charcodes.length; i++) view.setUint16(i * 2, charcodes[i], false);
124+
const string = String.fromCharCode(...replaced);
125+
expect(textDecode(bytes, "utf-16be")).to.equal(string);
126+
expect(textDecode(textEncode(string, "utf-16be"), "utf-16be")).to.equal(string);
127+
}
128+
});
129+
it("textEncode replacement", () => {
130+
for (const { charcodes, replaced } of orphans) {
131+
const bytes = textEncode(String.fromCharCode(...charcodes), "utf-16be");
132+
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
133+
expect(view.byteLength).to.equal(replaced.length * 2);
134+
for (let i = 0; i < replaced.length; i++) {
135+
expect(view.getUint16(i * 2, false)).to.equal(replaced[i]);
136+
}
137+
expect(textDecode(bytes, "utf-16be")).to.equal(String.fromCharCode(...replaced));
138+
}
139+
});
44140
});
45141

46142
describe("ASCII", () => {

0 commit comments

Comments
 (0)