pillarjs
diff --git a/backends/web.js b/backends/web.js
Lines changed: 11 additions & 2 deletions
diff --git a/encodings/internal.js b/encodings/internal.js
Lines changed: 1 addition & 2 deletions
diff --git a/encodings/utf16.js b/encodings/utf16.js
Lines changed: 212 additions & 37 deletions
diff --git a/lib/index.js b/lib/index.js
Lines changed: 10 additions & 4 deletions
diff --git a/package.json b/package.json
Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,9 @@
 "use strict";
-// NOTE: This backend uses TextDecoder interface.
+// NOTE: This backend uses TextDecoder class.
+// NOTE: Web backend differs from Node in handling invalid surrogates when decoding to strings in rawCharsToResult() function.
+//   Node passes them through unchanged, web backend (actually TextDecoder) replaces them with '�'. I haven't found a 
+//   performant way to unify these behaviors while keeping compatibility with Node <11 where there's no TextDecoder.
+//   Not too worried as it seems like an edge case mostly concerning utf-16/utf-32/cesu8 codecs, but something to be aware of.
 
 module.exports = {
     // Encoder string input: use str directly, .length, .charCodeAt(i).
@@ -38,7 +42,12 @@ module.exports = {
         return new Uint16Array(new ArrayBuffer(numChars * Uint16Array.BYTES_PER_ELEMENT));
     },
     rawCharsToResult(rawChars, finalLen) {
-        return new TextDecoder("utf-16").decode(rawChars.subarray(0, finalLen));
+        rawChars = rawChars.subarray(0, finalLen);
+        // NOTE: TextDecoder will convert all invalid surrogates to '�'-s.
+        let res = new TextDecoder("utf-16", {ignoreBOM: true}).decode(rawChars);
+        if (res.length !== finalLen)
+            throw new Error("TextDecoder returned different length string on array " + rawChars);
+        return res;
     },
 
     // Optimizations
 
@@ -9,8 +9,7 @@ module.exports = {
     cesu8:  { type: "_internal", bomAware: true},
     unicode11utf8: "utf8",
 
-    ucs2:   { type: "_internal", bomAware: true},
-    utf16le: "ucs2",
+    // NOTE: utf-16le/ucs2 are in utf16.js.
 
     binary: { type: "_internal" },
     base64: { type: "_internal" },
 
@@ -1,17 +1,150 @@
 "use strict";
 
-// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
+// == UTF16-LE codec. ==========================================================
+// Note: We're not using Node.js native codec because StringDecoder implementation is buggy
+// (adds \0 in some chunks; doesn't flag non-even number of bytes). We do use raw encoding/decoding
+// routines for performance where possible, though.
+
+/**
+ * Codec descriptor for UTF-16LE.
+ *
+ * Exposes factory methods that build the streaming encoder/decoder from the
+ * `iconv` object handed in by the caller; `options` is accepted but not used here.
+ */
+exports.utf16le = class Utf16LECodec {
+    // Marks this codec as BOM-aware.
+    get bomAware() {
+        return true;
+    }
+
+    /** Build an encoder bound to the active backend. */
+    createEncoder(options, iconv) {
+        const { backend } = iconv;
+        return new Utf16LEEncoder(backend);
+    }
+
+    /** Build a decoder bound to the active backend and the configured replacement character. */
+    createDecoder(options, iconv) {
+        const { backend, defaultCharUnicode } = iconv;
+        return new Utf16LEDecoder(backend, defaultCharUnicode);
+    }
+}
+
+/**
+ * Stateless UTF-16LE encoder: every UTF-16 code unit of the input string is
+ * written out as two bytes.
+ */
+class Utf16LEEncoder {
+    /**
+     * @param {object} backend - Must provide allocBytes(numBytes) and bytesToResult(bytes, finalLen).
+     */
+    constructor(backend) {
+        this.backend = backend;
+    }
+
+    /**
+     * Encode one chunk of text.
+     * @param {string} str - Text to encode.
+     * @returns Whatever backend.bytesToResult() produces for the filled byte array.
+     */
+    write(str) {
+        const numUnits = str.length;
+        const out = this.backend.allocBytes(numUnits * 2);
+        // View the freshly allocated bytes as 16-bit units so each code unit is stored with one write.
+        // NOTE(review): a Uint16Array stores values in host byte order, so this presumably relies on
+        //   a little-endian host and on allocBytes() returning a 2-byte-aligned view - confirm.
+        const units = new Uint16Array(out.buffer, out.byteOffset, numUnits);
+        let idx = 0;
+        while (idx < numUnits) {
+            units[idx] = str.charCodeAt(idx);
+            idx += 1;
+        }
+        return this.backend.bytesToResult(out, out.length);
+    }
+
+    // Nothing is buffered between writes, so there is nothing to flush.
+    end() {}
+}
+
+/**
+ * Streaming UTF-16LE decoder.
+ *
+ * State carried between write() calls:
+ *  - leadByte: first byte of a 2-byte code unit that was split across chunks, or -1 if none.
+ *  - leadSurrogate: one-character string holding a high surrogate withheld from the previous
+ *    chunk until the following code unit arrives, or undefined if none.
+ */
+class Utf16LEDecoder {
+    /**
+     * @param {object} backend - Must provide allocRawChars(numChars) and rawCharsToResult(rawChars, finalLen).
+     * @param {string} defaultChar - Returned by end() in place of a dangling odd byte.
+     */
+    constructor(backend, defaultChar) {
+        this.backend = backend;
+        this.defaultChar = defaultChar;
+        this.leadByte = -1;
+        this.leadSurrogate = undefined;
+    }
+
+    /**
+     * Decode one chunk of bytes.
+     * @param buf - Byte view exposing .length, .buffer, .byteOffset and numeric indexing.
+     * @returns {string} Text decoded so far; a trailing odd byte or trailing high surrogate is
+     *   kept in the decoder state instead of being emitted.
+     */
+    write(buf) {
+        // NOTE: This function is mostly the same as Utf16BEDecoder.write() with bytes swapped.
+        //   Please keep them in sync.
+        // NOTE: The logic here is more complicated than barely necessary due to several limitations:
+        //  1. Input data chunks can split 2-byte code units, making 'leadByte' necessary.
+        //  2. Input data chunks can split valid surrogate pairs, making 'leadSurrogate' necessary.
+        //  3. rawCharsToResult() of Web backend converts all lone surrogates to '�', so we need to make
+        //     sure we don't feed it parts of valid surrogate pairs.
+        //  4. For performance reasons we want to use initial buffer as much as we can. This is not
+        //     possible if after our calculations the 2-byte memory alignment of a Uint16Array is lost,
+        //     in which case we have to do a copy.
+
+        if (buf.length === 0) {
+            return '';
+        }
+        let offset = 0;
+        let byteLen = buf.length;
+
+        // Process previous leadByte
+        let prefix = '';
+        if (this.leadByte !== -1) {
+            offset++; byteLen--;
+            prefix = String.fromCharCode(this.leadByte | (buf[0] << 8));
+        }
+
+        // Set new leadByte if needed
+        if (byteLen & 1) {
+            this.leadByte = buf[buf.length-1];
+            byteLen--;
+        } else {
+            this.leadByte = -1;
+        }
+
+        // Process leadSurrogate
+        if (prefix.length || byteLen) {
+            // Add high surrogate from previous chunk.
+            if (this.leadSurrogate) {
+                if (prefix.length) {
+                    prefix = this.leadSurrogate + prefix;
+                } else {
+                    // Make sure 'chars' don't start with a lone low surrogate; it will mess with rawCharsToResult.
+                    prefix = this.leadSurrogate + String.fromCharCode(buf[offset] | (buf[offset+1] << 8));
+                    offset += 2; byteLen -= 2;
+                }
+                this.leadSurrogate = undefined;
+            }
+
+            // Slice off a new high surrogate at the end of the current chunk.
+            if (byteLen) {
+                const lastIdx = offset + byteLen - 2;
+                const lastChar = buf[lastIdx] | (buf[lastIdx+1] << 8);
+                if (0xD800 <= lastChar && lastChar < 0xDC00) {
+                    this.leadSurrogate = String.fromCharCode(lastChar);
+                    byteLen -= 2;
+                }
+            } else { // slice from prefix
+                const lastChar = prefix.charCodeAt(prefix.length-1);
+                if (0xD800 <= lastChar && lastChar < 0xDC00) {
+                    this.leadSurrogate = prefix[prefix.length-1];
+                    prefix = prefix.slice(0, -1);
+                }
+            }
+        }
+
+        let chars;
+        // The inner parentheses matter: '===' binds tighter than '&', so without them the test
+        // becomes 'x & (1 === 0)', which is always 0 and forces the copy path for every chunk.
+        if (((buf.byteOffset + offset) & 1) === 0) {
+            // If byteOffset is aligned, just use the ArrayBuffer from input buf.
+            // NOTE(review): a Uint16Array reads in host byte order, so this (like the copy path
+            //   below) presumably assumes a little-endian host - confirm.
+            chars = new Uint16Array(buf.buffer, buf.byteOffset + offset, byteLen >> 1);
+        } else {
+            // If byteOffset is NOT aligned, create a new aligned buffer and copy the data.
+            chars = this.backend.allocRawChars(byteLen >> 1);
+            const srcByteView = new Uint8Array(buf.buffer, buf.byteOffset + offset, byteLen);
+            const destByteView = new Uint8Array(chars.buffer, chars.byteOffset, byteLen);
+            destByteView.set(srcByteView);
+        }
+
+        return prefix + this.backend.rawCharsToResult(chars, chars.length);
+    }
+
+    /**
+     * Flush the decoder state at end of input.
+     * @returns {string|undefined} The withheld high surrogate (if any) followed by defaultChar
+     *   for a dangling odd byte (if any); undefined when nothing was pending.
+     */
+    end() {
+        if (this.leadSurrogate || this.leadByte !== -1) {
+            const res = (this.leadSurrogate ? this.leadSurrogate : '') + (this.leadByte !== -1 ? this.defaultChar : '');
+            this.leadSurrogate = undefined;
+            this.leadByte = -1;
+            return res;
+        }
+    }
+}
+exports.ucs2 = "utf16le";  // Alias
+
 
 // == UTF16-BE codec. ==========================================================
 
 exports.utf16be = class Utf16BECodec {
-    get encoder() { return Utf16BEEncoder; }
-    get decoder() { return Utf16BEDecoder; }
+    createEncoder(options, iconv) {
+        return new Utf16BEEncoder(iconv.backend);
+    }
+    createDecoder(options, iconv) {
+        return new Utf16BEDecoder(iconv.backend, iconv.defaultCharUnicode);
+    }
     get bomAware() { return true; }
 }
 
 class Utf16BEEncoder {
-    constructor(opts, codec, backend) {
+    constructor(backend) {
         this.backend = backend;
     }
 
@@ -30,30 +163,86 @@ class Utf16BEEncoder {
 }
 
 class Utf16BEDecoder {
-    constructor(opts, codec, backend) {
+    constructor(backend, defaultChar) {
         this.backend = backend;
-        this.overflowByte = -1;
+        this.defaultChar = defaultChar;
+        this.leadByte = -1;
+        this.leadSurrogate = undefined;
     }
 
     write(buf) {
-        const chars = this.backend.allocRawChars((buf.length+1) >> 1);
-        let charsPos = 0, i = 0;
-    
-        if (this.overflowByte !== -1 && i < buf.length) {
-            chars[charsPos++] = (this.overflowByte << 8) + buf[i++];
+        // NOTE: This function is mostly copy/paste from Utf16LEDecoder.write() with bytes swapped.
+        // Please keep them in sync. Comments in that function apply here too.
+        if (buf.length === 0) {
+            return '';
         }
-    
-        for (; i < buf.length-1; i += 2) {
-            chars[charsPos++] = (buf[i] << 8) + buf[i+1];
+
+        let offset = 0;
+        let byteLen = buf.length;
+
+        // Process previous leadByte
+        let prefix = '';
+        if (this.leadByte !== -1) {
+            offset++; byteLen--;
+            prefix = String.fromCharCode((this.leadByte << 8) | buf[0]);
+        }
+
+        // Set new leadByte
+        if (byteLen & 1) {
+            this.leadByte = buf[buf.length-1];
+            byteLen--;
+        } else {
+            this.leadByte = -1;
         }
 
-        this.overflowByte = (i == buf.length-1) ? buf[i] : -1;
+        // Process leadSurrogate
+        if (prefix.length || byteLen) {
+            // Add high surrogate from previous chunk.
+            if (this.leadSurrogate) {
+                if (prefix.length) {
+                    prefix = this.leadSurrogate + prefix;
+                } else {
+                    // Make sure 'chars' don't start with a lone low surrogate; it will mess with rawCharsToResult.
+                    prefix = this.leadSurrogate + String.fromCharCode((buf[offset] << 8) | buf[offset+1]);
+                    offset += 2; byteLen -= 2;
+                }
+                this.leadSurrogate = undefined;
+            }
+
+            // Slice off a new high surrogate at the end of the current chunk.
+            if (byteLen) {
+                const lastIdx = offset + byteLen - 2;
+                const lastChar = (buf[lastIdx] << 8) | buf[lastIdx+1];
+                if (0xD800 <= lastChar && lastChar < 0xDC00) {
+                    this.leadSurrogate = String.fromCharCode(lastChar);
+                    byteLen -= 2;
+                }
+            } else { // slice from prefix
+                const lastChar = prefix.charCodeAt(prefix.length-1);
+                if (0xD800 <= lastChar && lastChar < 0xDC00) {
+                    this.leadSurrogate = prefix[prefix.length-1];
+                    prefix = prefix.slice(0, -1);
+                }
+            }
+        }
+
+        // Convert the main chunk of bytes
+        const chars = this.backend.allocRawChars(byteLen >> 1);
+        const srcBytes = new DataView(buf.buffer, buf.byteOffset + offset, byteLen);
+        for (let i = 0; i < chars.length; i++) {
+            chars[i] = srcBytes.getUint16(i*2);
+        }
 
-        return this.backend.rawCharsToResult(chars, charsPos);
+        return prefix + this.backend.rawCharsToResult(chars, chars.length);
     }
 
     end() {
-        this.overflowByte = -1;
+        if (this.leadSurrogate || this.leadByte !== -1) {
+            const res = (this.leadSurrogate ? this.leadSurrogate : '') + (this.leadByte !== -1 ? this.defaultChar : '');
+            this.leadSurrogate = undefined;
+            this.leadByte = -1;
+            return res;
+        }
     }
 }
 
@@ -67,39 +256,25 @@ class Utf16BEDecoder {
 // Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).
 
 exports.utf16 = class Utf16Codec {
-    constructor(opts, iconv) {
-        this.iconv = iconv;
-    }
-    get encoder() { return Utf16Encoder; }
-    get decoder() { return Utf16Decoder; }
-}
-
-class Utf16Encoder {
-    constructor(options, codec) {
+    createEncoder(options, iconv) {
         options = options || {};
         if (options.addBOM === undefined)
             options.addBOM = true;
-        this.encoder = codec.iconv.getEncoder(options.use || 'utf-16le', options);
+        return iconv.getEncoder('utf-16le', options);
     }
-
-    // Pass-through to this.encoder
-    write(str) {
-        return this.encoder.write(str);
-    }
-    
-    end() {
-        return this.encoder.end();
+    createDecoder(options, iconv) {
+        return new Utf16Decoder(options, iconv);
     }
 }
 
 class Utf16Decoder {
-    constructor(options, codec) {
+    constructor(options, iconv) {
         this.decoder = null;
         this.initialBufs = [];
         this.initialBufsLen = 0;
 
         this.options = options || {};
-        this.iconv = codec.iconv;
+        this.iconv = iconv;
     }
 
     write(buf) {
 
@@ -105,8 +105,11 @@ iconv._canonicalizeEncoding = function(encoding) {
 }
 
 iconv.getEncoder = function getEncoder(encoding, options) {
-    var codec = iconv.getCodec(encoding),
-        encoder = new codec.encoder(options, codec, iconv.backend);
+    const codec = iconv.getCodec(encoding);
+
+    let encoder = codec.createEncoder
+        ? codec.createEncoder(options, iconv)
+        : new codec.encoder(options, codec, iconv.backend);
 
     if (codec.bomAware && options && options.addBOM)
         encoder = new bomHandling.PrependBOM(encoder, options);
@@ -115,8 +118,11 @@ iconv.getEncoder = function getEncoder(encoding, options) {
 }
 
 iconv.getDecoder = function getDecoder(encoding, options) {
-    var codec = iconv.getCodec(encoding),
-        decoder = new codec.decoder(options, codec, iconv.backend);
+    const codec = iconv.getCodec(encoding);
+
+    let decoder = codec.createDecoder
+        ? codec.createDecoder(options, iconv)
+        : new codec.decoder(options, codec, iconv.backend);
 
     if (codec.bomAware && !(options && options.stripBOM === false))
         decoder = new bomHandling.StripBOM(decoder, options);
 
@@ -37,7 +37,6 @@
         "iconv": "^2.3.5",
         "mocha": "^3.5.3",
         "request": "^2.88.2",
-        "semver": "^6.3.0",
         "unorm": "^1.6.0"
     },
     "dependencies": {