pillarjs · Meigyoku-Thmn · Jul 26, 2020 · Jul 29, 2020 · Jul 29, 2020 · Aug 1, 2020
diff --git a/README.md b/README.md
@@ -31,6 +31,30 @@ buf = iconv.encode("Sample input string", "win1251");
 
 // Check if encoding is supported
 iconv.encodingExists("us-ascii");
+
+// Calculate the actual length in bytes.
+len = iconv.byteLength("Hello, world! 😀", "utf16be");
+
+// Get a decoder and decode two separate buffers into a single string; the decoder keeps state between writes
+var utf8Decoder = iconv.getDecoder("utf8");
+var bytes1 = Buffer.from([0x20, 0x23, 0xe2]); // space, # and part of ☣
+var bytes2 = Buffer.from([0x98, 0xa3]); // the rest of ☣
+var str = utf8Decoder.write(bytes1);
+// You can check whether the decoder currently holds state
+var hasState = utf8Decoder.hasState; // true
+str += utf8Decoder.write(bytes2);
+hasState = utf8Decoder.hasState; // false
+
+// The same applies to encoders. You rarely need to care about an encoder's state, except for some special encoders and for surrogate pairs split across writes
+var utf8Encoder = iconv.getEncoder("utf8");
+var bytes = utf8Encoder.write("Hi \uD83D");
+var hasState = utf8Encoder.hasState; // true
+bytes = Buffer.concat([bytes, utf8Encoder.write("\uDE00")]);
+hasState = utf8Encoder.hasState; // false
+
+// Use the "end" method to get the remaining data in encoder/decoder's state and clear the state
+var bytes = encoder.end();
+var str = decoder.end();
 ```
 
 ### Streaming API
@@ -112,9 +136,9 @@ This library supports UTF-32LE, UTF-32BE and UTF-32 encodings. Like the UTF-16 e
 
 ## Other notes
 
-When decoding, be sure to supply a Buffer to decode() method, otherwise [bad things usually happen](https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding).  
-Untranslatable characters are set to � or ?. No transliteration is currently supported.  
-Node versions 0.10.31 and 0.11.13 are buggy, don't use them (see #65, #77).
+-   When decoding, be sure to supply a Buffer to decode() method, otherwise [bad things usually happen](https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding).
+-   Untranslatable characters are set to � or ?. No transliteration is currently supported.
+-   Node versions 0.10.31 and 0.11.13 are buggy, don't use them (see #65, #77).
 
 ## Testing
 

diff --git a/encodings/dbcs-codec.js b/encodings/dbcs-codec.js
diff --git a/encodings/internal.js b/encodings/internal.js
@@ -56,6 +56,13 @@ function InternalDecoder(options, codec) {
     this.decoder = new StringDecoder(codec.enc);
 }
 
+Object.defineProperty(InternalDecoder.prototype, "hasState", {
+    get() {
+        // Reads StringDecoder's `lastNeed` field (presumably the bytes still missing from a partial character). TODO: may change in newer Node.js versions.
+        return this.decoder.lastNeed !== 0;
+    },
+});
+
 InternalDecoder.prototype.write = function (buf) {
     if (!Buffer.isBuffer(buf)) {
         buf = Buffer.from(buf);
@@ -75,6 +82,16 @@ function InternalEncoder(options, codec) {
     this.enc = codec.enc;
 }
 
+Object.defineProperty(InternalEncoder.prototype, "hasState", {
+    get() {
+        return false; // this encoder never reports pending state
+    },
+});
+
+InternalEncoder.prototype.byteLength = function (str) {
+    return Buffer.byteLength(str, this.enc); // delegates to Node's Buffer with the same encoding name write() uses
+};
+
 InternalEncoder.prototype.write = function (str) {
     return Buffer.from(str, this.enc);
 };
@@ -88,6 +105,26 @@ function InternalEncoderBase64() {
     this.prevStr = "";
 }
 
+Object.defineProperty(InternalEncoderBase64.prototype, "hasState", {
+    get() {
+        return this.prevStr.length > 0; // prevStr is the text write() prepends to its next input
+    },
+});
+
+// Returns how many bytes the base64 text `str` decodes to, ignoring trailing "=" padding; this.prevStr is not consulted.
+InternalEncoderBase64.prototype.byteLength = function (str) {
+    var completeQuads = str.length - (str.length % 4);
+    var prevStr = str.slice(completeQuads); // trailing incomplete quad (0-3 characters)
+    str = str.slice(0, completeQuads);
+    var nonPaddedLength = str.search(/=*$/); // index where the trailing run of "=" starts
+    if (nonPaddedLength === -1) nonPaddedLength = str.length;
+    var byteLength = Math.floor((nonPaddedLength * 3) / 4);
+    nonPaddedLength = prevStr.search(/=*$/);
+    if (nonPaddedLength === -1) nonPaddedLength = prevStr.length; // fall back to the remainder's own length, not the quads' length
+    byteLength += Math.floor((nonPaddedLength * 3) / 4);
+    return byteLength;
+};
+
 InternalEncoderBase64.prototype.write = function (str) {
     str = this.prevStr + str;
     var completeQuads = str.length - (str.length % 4);
@@ -106,6 +143,23 @@ InternalEncoderBase64.prototype.end = function () {
 
 function InternalEncoderCesu8() {}
 
+Object.defineProperty(InternalEncoderCesu8.prototype, "hasState", {
+    get() {
+        return false; // this encoder never reports pending state
+    },
+});
+
+// Sums the encoded size of every UTF-16 code unit in `str`: 1, 2 or 3 bytes depending on its value.
+InternalEncoderCesu8.prototype.byteLength = function (str) {
+    let total = 0;
+    for (let idx = 0; idx < str.length; idx++) {
+        const unit = str.charCodeAt(idx);
+        // Surrogates are >= 0x800, so each half of a pair is counted as 3 bytes on its own.
+        total += unit < 0x80 ? 1 : unit < 0x800 ? 2 : 3;
+    }
+    return total;
+};
+
 InternalEncoderCesu8.prototype.write = function (str) {
     const buf = Buffer.alloc(str.length * 3);
     let bufIdx = 0;
@@ -140,6 +194,12 @@ function InternalDecoderCesu8(options, codec) {
     this.defaultCharUnicode = codec.defaultCharUnicode;
 }
 
+Object.defineProperty(InternalDecoderCesu8.prototype, "hasState", {
+    get() {
+        return this.contBytes > 0; // contBytes: presumably continuation bytes still expected - TODO confirm in write()
+    },
+});
+
 InternalDecoderCesu8.prototype.write = function (buf) {
     let acc = this.acc,
         contBytes = this.contBytes,

diff --git a/encodings/sbcs-codec.js b/encodings/sbcs-codec.js
@@ -58,6 +58,14 @@ class SBCSEncoder {
         this.encodeBuf = codec.encodeBuf;
     }
 
+    byteLength(str) {
+        return str.length; // same size write() requests from allocBytes(): one byte per UTF-16 code unit
+    }
+
+    get hasState() {
+        return false; // this encoder never reports pending state
+    }
+
     write(str) {
         const bytes = this.backend.allocBytes(str.length);
 
@@ -77,6 +85,10 @@ class SBCSDecoder {
         this.backend = backend;
     }
 
+    get hasState() {
+        return false; // this decoder never reports pending state
+    }
+
     write(buf) {
         // Strings are immutable in JS -> we use ucs2 buffer to speed up computations.
         const decodeBuf = this.decodeBuf;

diff --git a/encodings/utf16.js b/encodings/utf16.js
@@ -22,6 +22,14 @@ class Utf16LEEncoder {
         this.backend = backend;
     }
 
+    byteLength(str) {
+        return str.length * 2; // same size write() requests from allocBytes(): two bytes per UTF-16 code unit
+    }
+
+    get hasState() {
+        return false; // this encoder never reports pending state
+    }
+
     write(str) {
         const bytes = this.backend.allocBytes(str.length * 2);
         const chars = new Uint16Array(bytes.buffer, bytes.byteOffset, str.length);
@@ -42,6 +50,10 @@ class Utf16LEDecoder {
         this.leadSurrogate = undefined;
     }
 
+    get hasState() { // true while a lead surrogate or lead byte is pending (leadByte presumably -1 when none - TODO confirm)
+        return Boolean(this.leadSurrogate) || this.leadByte !== -1; // coerce so callers always get a boolean, never the surrogate value
+    }
+
     write(buf) {
         // NOTE: This function is mostly the same as Utf16BEDecoder.write() with bytes swapped.
         //   Please keep them in sync.
@@ -158,6 +170,14 @@ class Utf16BEEncoder {
         this.backend = backend;
     }
 
+    byteLength(str) {
+        return str.length * 2; // same size write() requests from allocBytes(): two bytes per UTF-16 code unit
+    }
+
+    get hasState() {
+        return false; // this encoder never reports pending state
+    }
+
     write(str) {
         const bytes = this.backend.allocBytes(str.length * 2);
         let bytesPos = 0;
@@ -180,6 +200,10 @@ class Utf16BEDecoder {
         this.leadSurrogate = undefined;
     }
 
+    get hasState() { // true while a lead surrogate or lead byte is pending (leadByte presumably -1 when none - TODO confirm)
+        return Boolean(this.leadSurrogate) || this.leadByte !== -1; // coerce so callers always get a boolean, never the surrogate value
+    }
+
     write(buf) {
         // NOTE: This function is mostly copy/paste from Utf16LEDecoder.write() with bytes swapped.
         // Please keep them in sync. Comments in that function apply here too.
@@ -292,6 +316,10 @@ class Utf16Decoder {
         this.iconv = iconv;
     }
 
+    get hasState() { // true while initial bytes are still buffered, or the chosen decoder reports pending state
+        return this.initialBufsLen !== 0 || Boolean(this.decoder != null && this.decoder.hasState); // Boolean(): the inner getter may return a non-boolean
+    }
+
     write(buf) {
         if (!this.decoder) {
             // Codec is not chosen yet. Accumulate initial bytes.

diff --git a/encodings/utf32.js b/encodings/utf32.js
@@ -29,6 +29,12 @@ function Utf32Encoder(options, codec) {
     this.highSurrogate = 0;
 }
 
+Object.defineProperty(Utf32Encoder.prototype, "hasState", {
+    get() {
+        return Boolean(this.highSurrogate); // highSurrogate starts at 0; end() handles a leftover one
+    },
+});
+
 Utf32Encoder.prototype.write = function (str) {
     var src = Buffer.from(str, "ucs2");
     var dst = Buffer.alloc(src.length * 2);
@@ -76,6 +82,40 @@ Utf32Encoder.prototype.write = function (str) {
     return dst;
 };
 
+Utf32Encoder.prototype.byteLength = function (str) { // adds 4 bytes per character of `str`, counting a high+low surrogate pair once
+    var byteLength = 0; // note: this.highSurrogate left over from earlier write() calls is not consulted here
+    var currentHighSurrogate = 0; // high surrogate seen but not yet counted (0 = none)
+
+    for (var i = 0; i < str.length; i++) {
+        var code = str.charCodeAt(i);
+        var isHighSurrogate = (0xd800 <= code && code < 0xdc00); // prettier-ignore
+        var isLowSurrogate = (0xdc00 <= code && code < 0xe000); // prettier-ignore
+
+        if (currentHighSurrogate) {
+            if (isHighSurrogate || !isLowSurrogate) {
+                byteLength += 4; // the pending high surrogate is unpaired: count it on its own, then handle `code` below
+            } else {
+                byteLength += 4; // high+low pair: 4 bytes for both code units together
+                currentHighSurrogate = 0;
+                continue;
+            }
+        }
+
+        if (isHighSurrogate) {
+            currentHighSurrogate = code; // defer counting until the next code unit is seen
+        } else {
+            byteLength += 4;
+            currentHighSurrogate = 0;
+        }
+    }
+
+    if (currentHighSurrogate) {
+        byteLength += 4; // trailing unpaired high surrogate
+    }
+
+    return byteLength;
+};
+
 Utf32Encoder.prototype.end = function () {
     // Treat any leftover high surrogate as a semi-valid independent character.
     if (!this.highSurrogate) {
@@ -100,6 +140,12 @@ function Utf32Decoder(options, codec) {
     this.overflow = [];
 }
 
+Object.defineProperty(Utf32Decoder.prototype, "hasState", {
+    get() {
+        return this.overflow.length > 0; // overflow starts out as an empty array in the constructor
+    },
+});
+
 Utf32Decoder.prototype.write = function (src) {
     if (src.length === 0) return "";
 
@@ -212,6 +258,16 @@ function Utf32AutoEncoder(options, codec) {
     this.encoder = codec.iconv.getEncoder(options.defaultEncoding || "utf-32le", options);
 }
 
+Object.defineProperty(Utf32AutoEncoder.prototype, "hasState", {
+    get() {
+        return this.encoder.hasState; // forwarded from the wrapped encoder obtained in the constructor
+    },
+});
+
+Utf32AutoEncoder.prototype.byteLength = function (str) {
+    return this.encoder.byteLength(str); // forwarded to the wrapped encoder, just like write()
+};
+
 Utf32AutoEncoder.prototype.write = function (str) {
     return this.encoder.write(str);
 };
@@ -230,6 +286,12 @@ function Utf32AutoDecoder(options, codec) {
     this.iconv = codec.iconv;
 }
 
+Object.defineProperty(Utf32AutoDecoder.prototype, "hasState", {
+    get() {
+        return this.initialBufsLen !== 0 || (this.decoder != null && this.decoder.hasState); // buffered initial bytes, or the chosen decoder's state
+    },
+});
+
 Utf32AutoDecoder.prototype.write = function (buf) {
     if (!this.decoder) {
         // Codec is not chosen yet. Accumulate initial bytes.