Skip to content

Commit 9aa082f

Browse files
committed
Implement UTF-16LE encoding, update tests, adjust codec interface
Three major reasons for reimplementing UTF-16 instead of using the native codec: 1. We want to remove StringDecoder & Buffer references due to #235. 2. StringDecoder is inconsistent in its handling of surrogates on Node v6-9. 3. The NPM module string_decoder gives strange results when processing chunks - it sometimes prepends '\u0000', likely due to a bug. Performance was and is a major concern here. The decoder shouldn't be affected because it uses backend methods directly. The encoder is affected because it introduces a character-level loop. It's still very fast (~450Mb/s), so I'm not too worried. If needed, we can make it about 4x faster in Node.js by introducing a dedicated backend method. Browser speeds will be the same.
1 parent e567849 commit 9aa082f

File tree

8 files changed

+464
-100
lines changed

8 files changed

+464
-100
lines changed

backends/web.js

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
"use strict";
2-
// NOTE: This backend uses TextDecoder interface.
2+
// NOTE: This backend uses TextDecoder class.
3+
// NOTE: Web backend differs from Node in handling invalid surrogates when decoding to strings in rawCharsToResult() function.
4+
// Node passes them through unchanged, web backend (actually TextDecoder) replaces them with '�'. I haven't found a
5+
// performant way to unify these behaviors while keeping compatibility with Node <11 where there's no TextDecoder.
6+
// Not too worried as it seems like an edge case mostly concerning utf-16/utf-32/cesu8 codecs, but something to be aware of.
37

48
module.exports = {
59
// Encoder string input: use str directly, .length, .charCodeAt(i).
@@ -38,7 +42,12 @@ module.exports = {
3842
return new Uint16Array(new ArrayBuffer(numChars * Uint16Array.BYTES_PER_ELEMENT));
3943
},
4044
rawCharsToResult(rawChars, finalLen) {
41-
return new TextDecoder("utf-16").decode(rawChars.subarray(0, finalLen));
45+
rawChars = rawChars.subarray(0, finalLen);
46+
// NOTE: TextDecoder will convert all invalid surrogates to '�'-s.
47+
let res = new TextDecoder("utf-16", {ignoreBOM: true}).decode(rawChars);
48+
if (res.length !== finalLen)
49+
throw new Error("TextDecoder returned different length string on array " + rawChars);
50+
return res;
4251
},
4352

4453
// Optimizations

encodings/internal.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ module.exports = {
99
cesu8: { type: "_internal", bomAware: true},
1010
unicode11utf8: "utf8",
1111

12-
ucs2: { type: "_internal", bomAware: true},
13-
utf16le: "ucs2",
12+
// NOTE: utf-16le/ucs2 are in utf16.js.
1413

1514
binary: { type: "_internal" },
1615
base64: { type: "_internal" },

encodings/utf16.js

Lines changed: 212 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,150 @@
11
"use strict";
22

3-
// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
3+
// == UTF16-LE codec. ==========================================================
4+
// Note: We're not using Node.js native codec because StringDecoder implementation is buggy
5+
// (adds \0 in some chunks; doesn't flag non-even number of bytes). We do use raw encoding/decoding
6+
// routines for performance where possible, though.
7+
8+
exports.utf16le = class Utf16LECodec {
9+
createEncoder(options, iconv) {
10+
return new Utf16LEEncoder(iconv.backend);
11+
}
12+
createDecoder(options, iconv) {
13+
return new Utf16LEDecoder(iconv.backend, iconv.defaultCharUnicode);
14+
}
15+
get bomAware() { return true; }
16+
}
17+
18+
// Streaming UTF-16LE encoder: turns each UTF-16 code unit of the input string into
// two bytes, low byte first. Surrogates are passed through unchanged.
class Utf16LEEncoder {
    // backend: object providing allocBytes(numBytes) and bytesToResult(bytes, finalLen).
    constructor(backend) {
        this.backend = backend;
    }

    // Encodes `str` and returns whatever backend.bytesToResult() produces for the filled buffer.
    write(str) {
        const bytes = this.backend.allocBytes(str.length * 2);

        // Store the two bytes explicitly instead of writing through a Uint16Array view:
        //  - a Uint16Array view throws RangeError when bytes.byteOffset is odd;
        //  - a Uint16Array stores in host byte order, which is only little-endian on
        //    little-endian machines.
        for (let i = 0, pos = 0; i < str.length; i++) {
            const codeUnit = str.charCodeAt(i);
            bytes[pos++] = codeUnit & 0xFF;
            bytes[pos++] = codeUnit >> 8;
        }
        return this.backend.bytesToResult(bytes, bytes.length);
    }

    // The encoder keeps no state between chunks, so there is nothing to flush.
    end() {}
}
34+
35+
// Streaming UTF-16LE decoder. Accepts arbitrary byte chunks (code units and surrogate
// pairs may be split across chunks) and returns the decoded string for each chunk.
class Utf16LEDecoder {
    // backend: object providing allocRawChars(numChars) and rawCharsToResult(rawChars, finalLen).
    // defaultChar: string emitted by end() in place of a trailing unpaired byte.
    constructor(backend, defaultChar) {
        this.backend = backend;
        this.defaultChar = defaultChar;
        this.leadByte = -1;             // First byte of a code unit split across chunks; -1 if none.
        this.leadSurrogate = undefined; // High surrogate (1-char string) held back from the previous chunk.
    }

    // Decodes one chunk. `buf` is a byte view (read via .length, indexing, .buffer, .byteOffset).
    // Returns the decoded string; a trailing odd byte or trailing high surrogate is held back
    // until the next write() or end().
    write(buf) {
        // NOTE: This function is mostly the same as Utf16BEDecoder.write() with bytes swapped.
        //       Please keep them in sync.
        // NOTE: The logic here is more complicated than strictly necessary due to several limitations:
        //  1. Input data chunks can split 2-byte code units, making 'leadByte' necessary.
        //  2. Input data chunks can split valid surrogate pairs, making 'leadSurrogate' necessary.
        //  3. rawCharsToResult() of Web backend converts all lone surrogates to '�', so we need to make
        //     sure we don't feed it parts of valid surrogate pairs.
        //  4. For performance reasons we want to use initial buffer as much as we can. This is not
        //     possible if after our calculations the 2-byte memory alignment of a Uint16Array is lost,
        //     in which case we have to do a copy.
        // NOTE(review): the zero-copy path reads bytes through a Uint16Array view, which uses host
        //     byte order; this presumes a little-endian host - confirm for big-endian targets.

        if (buf.length === 0) {
            return '';
        }
        let offset = 0;
        let byteLen = buf.length;

        // Process previous leadByte: combine it with the first byte of this chunk.
        let prefix = '';
        if (this.leadByte !== -1) {
            offset++; byteLen--;
            prefix = String.fromCharCode(this.leadByte | (buf[0] << 8));
        }

        // Set new leadByte if an odd number of bytes remains.
        if (byteLen & 1) {
            this.leadByte = buf[buf.length-1];
            byteLen--;
        } else {
            this.leadByte = -1;
        }

        // Process leadSurrogate (only when this chunk yields at least one complete code unit).
        if (prefix.length || byteLen) {
            // Add high surrogate from previous chunk.
            if (this.leadSurrogate) {
                if (prefix.length) {
                    prefix = this.leadSurrogate + prefix;
                } else {
                    // Make sure 'chars' don't start with a lone low surrogate; it will mess with rawCharsToResult.
                    // byteLen is even and non-zero here, so two bytes are available.
                    prefix = this.leadSurrogate + String.fromCharCode(buf[offset] | (buf[offset+1] << 8));
                    offset += 2; byteLen -= 2;
                }
                this.leadSurrogate = undefined;
            }

            // Slice off a new high surrogate at the end of the current chunk.
            if (byteLen) {
                const lastIdx = offset + byteLen - 2;
                const lastChar = buf[lastIdx] | (buf[lastIdx+1] << 8);
                if (0xD800 <= lastChar && lastChar < 0xDC00) {
                    this.leadSurrogate = String.fromCharCode(lastChar);
                    byteLen -= 2;
                }
            } else { // slice from prefix
                const lastChar = prefix.charCodeAt(prefix.length-1);
                if (0xD800 <= lastChar && lastChar < 0xDC00) {
                    this.leadSurrogate = prefix[prefix.length-1];
                    prefix = prefix.slice(0, -1);
                }
            }
        }

        let chars;
        // Parentheses matter: `x & 1 === 0` parses as `x & (1 === 0)`, which is always 0 and
        // would force the copying branch for every chunk.
        if (((buf.byteOffset + offset) & 1) === 0) {
            // If byteOffset is aligned, just use the ArrayBuffer from input buf.
            chars = new Uint16Array(buf.buffer, buf.byteOffset + offset, byteLen >> 1);
        } else {
            // If byteOffset is NOT aligned, create a new aligned buffer and copy the data.
            chars = this.backend.allocRawChars(byteLen >> 1);
            const srcByteView = new Uint8Array(buf.buffer, buf.byteOffset + offset, byteLen);
            const destByteView = new Uint8Array(chars.buffer, chars.byteOffset, byteLen);
            destByteView.set(srcByteView);
        }

        return prefix + this.backend.rawCharsToResult(chars, chars.length);
    }

    // Flushes pending state: a held-back high surrogate is emitted as-is, a dangling odd byte
    // is emitted as defaultChar. Returns undefined when nothing is pending.
    end() {
        if (this.leadSurrogate || this.leadByte !== -1) {
            const res = (this.leadSurrogate ? this.leadSurrogate : '') + (this.leadByte !== -1 ? this.defaultChar : '');
            this.leadSurrogate = undefined;
            this.leadByte = -1;
            return res;
        }
    }
}
131+
exports.ucs2 = "utf16le"; // Alias
132+
4133

5134
// == UTF16-BE codec. ==========================================================
6135

7136
exports.utf16be = class Utf16BECodec {
8-
get encoder() { return Utf16BEEncoder; }
9-
get decoder() { return Utf16BEDecoder; }
137+
createEncoder(options, iconv) {
138+
return new Utf16BEEncoder(iconv.backend);
139+
}
140+
createDecoder(options, iconv) {
141+
return new Utf16BEDecoder(iconv.backend, iconv.defaultCharUnicode);
142+
}
10143
get bomAware() { return true; }
11144
}
12145

13146
class Utf16BEEncoder {
14-
constructor(opts, codec, backend) {
147+
constructor(backend) {
15148
this.backend = backend;
16149
}
17150

@@ -30,30 +163,86 @@ class Utf16BEEncoder {
30163
}
31164

32165
class Utf16BEDecoder {
33-
constructor(opts, codec, backend) {
166+
constructor(backend, defaultChar) {
34167
this.backend = backend;
35-
this.overflowByte = -1;
168+
this.defaultChar = defaultChar;
169+
this.leadByte = -1;
170+
this.leadSurrogate = undefined;
36171
}
37172

38173
write(buf) {
39-
const chars = this.backend.allocRawChars((buf.length+1) >> 1);
40-
let charsPos = 0, i = 0;
41-
42-
if (this.overflowByte !== -1 && i < buf.length) {
43-
chars[charsPos++] = (this.overflowByte << 8) + buf[i++];
174+
// NOTE: This function is mostly copy/paste from Utf16LEDecoder.write() with bytes swapped.
175+
// Please keep them in sync. Comments in that function apply here too.
176+
if (buf.length === 0) {
177+
return '';
44178
}
45-
46-
for (; i < buf.length-1; i += 2) {
47-
chars[charsPos++] = (buf[i] << 8) + buf[i+1];
179+
180+
let offset = 0;
181+
let byteLen = buf.length;
182+
183+
// Process previous leadByte
184+
let prefix = '';
185+
if (this.leadByte !== -1) {
186+
offset++; byteLen--;
187+
prefix = String.fromCharCode((this.leadByte << 8) | buf[0]);
188+
}
189+
190+
// Set new leadByte
191+
if (byteLen & 1) {
192+
this.leadByte = buf[buf.length-1];
193+
byteLen--;
194+
} else {
195+
this.leadByte = -1;
48196
}
49197

50-
this.overflowByte = (i == buf.length-1) ? buf[i] : -1;
198+
// Process leadSurrogate
199+
if (prefix.length || byteLen) {
200+
// Add high surrogate from previous chunk.
201+
if (this.leadSurrogate) {
202+
if (prefix.length) {
203+
prefix = this.leadSurrogate + prefix;
204+
} else {
205+
// Make sure 'chars' don't start with a lone low surrogate; it will mess with rawCharsToResult.
206+
prefix = this.leadSurrogate + String.fromCharCode((buf[offset] << 8) | buf[offset+1]);
207+
offset += 2; byteLen -= 2;
208+
}
209+
this.leadSurrogate = undefined;
210+
}
211+
212+
// Slice off a new high surrogate at the end of the current chunk.
213+
if (byteLen) {
214+
const lastIdx = offset + byteLen - 2;
215+
const lastChar = (buf[lastIdx] << 8) | buf[lastIdx+1];
216+
if (0xD800 <= lastChar && lastChar < 0xDC00) {
217+
this.leadSurrogate = String.fromCharCode(lastChar);
218+
byteLen -= 2;
219+
}
220+
} else { // slice from prefix
221+
const lastChar = prefix.charCodeAt(prefix.length-1);
222+
if (0xD800 <= lastChar && lastChar < 0xDC00) {
223+
this.leadSurrogate = prefix[prefix.length-1];
224+
prefix = prefix.slice(0, -1);
225+
}
226+
}
227+
}
228+
229+
// Convert the main chunk of bytes
230+
const chars = this.backend.allocRawChars(byteLen >> 1);
231+
const srcBytes = new DataView(buf.buffer, buf.byteOffset + offset, byteLen);
232+
for (let i = 0; i < chars.length; i++) {
233+
chars[i] = srcBytes.getUint16(i*2);
234+
}
51235

52-
return this.backend.rawCharsToResult(chars, charsPos);
236+
return prefix + this.backend.rawCharsToResult(chars, chars.length);
53237
}
54238

55239
end() {
56-
this.overflowByte = -1;
240+
if (this.leadSurrogate || this.leadByte !== -1) {
241+
const res = (this.leadSurrogate ? this.leadSurrogate : '') + (this.leadByte !== -1 ? this.defaultChar : '');
242+
this.leadSurrogate = undefined;
243+
this.leadByte = -1;
244+
return res;
245+
}
57246
}
58247
}
59248

@@ -67,39 +256,25 @@ class Utf16BEDecoder {
67256
// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).
68257

69258
exports.utf16 = class Utf16Codec {
70-
constructor(opts, iconv) {
71-
this.iconv = iconv;
72-
}
73-
get encoder() { return Utf16Encoder; }
74-
get decoder() { return Utf16Decoder; }
75-
}
76-
77-
class Utf16Encoder {
78-
constructor(options, codec) {
259+
createEncoder(options, iconv) {
79260
options = options || {};
80261
if (options.addBOM === undefined)
81262
options.addBOM = true;
82-
this.encoder = codec.iconv.getEncoder(options.use || 'utf-16le', options);
263+
return iconv.getEncoder('utf-16le', options);
83264
}
84-
85-
// Pass-through to this.encoder
86-
write(str) {
87-
return this.encoder.write(str);
88-
}
89-
90-
end() {
91-
return this.encoder.end();
265+
createDecoder(options, iconv) {
266+
return new Utf16Decoder(options, iconv);
92267
}
93268
}
94269

95270
class Utf16Decoder {
96-
constructor(options, codec) {
271+
constructor(options, iconv) {
97272
this.decoder = null;
98273
this.initialBufs = [];
99274
this.initialBufsLen = 0;
100275

101276
this.options = options || {};
102-
this.iconv = codec.iconv;
277+
this.iconv = iconv;
103278
}
104279

105280
write(buf) {

lib/index.js

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,11 @@ iconv._canonicalizeEncoding = function(encoding) {
105105
}
106106

107107
iconv.getEncoder = function getEncoder(encoding, options) {
108-
var codec = iconv.getCodec(encoding),
109-
encoder = new codec.encoder(options, codec, iconv.backend);
108+
const codec = iconv.getCodec(encoding);
109+
110+
let encoder = codec.createEncoder
111+
? codec.createEncoder(options, iconv)
112+
: new codec.encoder(options, codec, iconv.backend);
110113

111114
if (codec.bomAware && options && options.addBOM)
112115
encoder = new bomHandling.PrependBOM(encoder, options);
@@ -115,8 +118,11 @@ iconv.getEncoder = function getEncoder(encoding, options) {
115118
}
116119

117120
iconv.getDecoder = function getDecoder(encoding, options) {
118-
var codec = iconv.getCodec(encoding),
119-
decoder = new codec.decoder(options, codec, iconv.backend);
121+
const codec = iconv.getCodec(encoding);
122+
123+
let decoder = codec.createDecoder
124+
? codec.createDecoder(options, iconv)
125+
: new codec.decoder(options, codec, iconv.backend);
120126

121127
if (codec.bomAware && !(options && options.stripBOM === false))
122128
decoder = new bomHandling.StripBOM(decoder, options);

package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@
3737
"iconv": "^2.3.5",
3838
"mocha": "^3.5.3",
3939
"request": "^2.88.2",
40-
"semver": "^6.3.0",
4140
"unorm": "^1.6.0"
4241
},
4342
"dependencies": {

0 commit comments

Comments
 (0)