Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,30 @@ buf = iconv.encode("Sample input string", "win1251");

// Check if encoding is supported
iconv.encodingExists("us-ascii");

// Calculate the actual length in bytes.
len = iconv.byteLength("Hello, world! 😀", "utf16be");

// Get a decoder and decode two different buffers into a single string, the decoder keeps state between buffers
var utf8Decoder = iconv.getDecoder("utf8");
var bytes1 = Buffer.from([0x20, 0x23, 0xe2]); // space, # and part of ☣
var bytes2 = Buffer.from([0x98, 0xa3]); // the rest of ☣
var str = utf8Decoder.write(bytes1);
// You can check if the decoder has state currently
var hasState = utf8Decoder.hasState; // true;
str += utf8Decoder.write(bytes2);
var hasState = utf8Decoder.hasState; // false;

// The same goes for encoders. You rarely need to care about an encoder's state, except for some special encoders and surrogate pairs
var utf8Encoder = iconv.getEncoder("utf8");
var bytes = utf8Encoder.write("Hi \uD83D");
var hasState = utf8Encoder.hasState; // true
bytes = Buffer.concat([bytes, utf8Encoder.write("\uDE00")]);
hasState = utf8Encoder.hasState; // false

// Use the "end" method to get the remaining data in encoder/decoder's state and clear the state
var bytes = encoder.end();
var str = decoder.end();
```

### Streaming API
Expand Down Expand Up @@ -112,9 +136,9 @@ This library supports UTF-32LE, UTF-32BE and UTF-32 encodings. Like the UTF-16 e

## Other notes

When decoding, be sure to supply a Buffer to decode() method, otherwise [bad things usually happen](https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding).
Untranslatable characters are set to � or ?. No transliteration is currently supported.
Node versions 0.10.31 and 0.11.13 are buggy, don't use them (see #65, #77).
- When decoding, be sure to supply a Buffer to decode() method, otherwise [bad things usually happen](https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding).
- Untranslatable characters are set to � or ?. No transliteration is currently supported.
- Node versions 0.10.31 and 0.11.13 are buggy, don't use them (see #65, #77).

## Testing

Expand Down
1,139 changes: 644 additions & 495 deletions encodings/dbcs-codec.js

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions encodings/internal.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ function InternalDecoder(options, codec) {
this.decoder = new StringDecoder(codec.enc);
}

Object.defineProperty(InternalDecoder.prototype, "hasState", {
  /**
   * Whether the wrapped StringDecoder is still waiting for bytes to finish a character.
   *
   * @returns {boolean} true when the decoder's `lastNeed` counter is non-zero.
   */
  get() {
    // TODO: `lastNeed` belongs to Node's StringDecoder; hopefully it will not change in newer versions of Node.js.
    const missingBytes = this.decoder.lastNeed;
    return missingBytes !== 0;
  },
});

InternalDecoder.prototype.write = function (buf) {
if (!Buffer.isBuffer(buf)) {
buf = Buffer.from(buf);
Expand All @@ -75,6 +82,16 @@ function InternalEncoder(options, codec) {
this.enc = codec.enc;
}

Object.defineProperty(InternalEncoder.prototype, "hasState", {
  /**
   * This encoder reports no pending state.
   *
   * @returns {boolean} always false.
   */
  get() {
    return false;
  },
});

/**
 * Size in bytes of `str` once encoded with this encoder's Node-native encoding.
 *
 * @param {string} str - text to measure.
 * @returns {number} result of `Buffer.byteLength` for `this.enc`.
 */
InternalEncoder.prototype.byteLength = function (str) {
  const encoding = this.enc;
  return Buffer.byteLength(str, encoding);
};

/**
 * Encodes `str` in one step using Node's built-in support for `this.enc`.
 *
 * @param {string} str - text to encode.
 * @returns {Buffer} the encoded bytes.
 */
InternalEncoder.prototype.write = function (str) {
  const encoded = Buffer.from(str, this.enc);
  return encoded;
};
Expand All @@ -88,6 +105,26 @@ function InternalEncoderBase64() {
this.prevStr = "";
}

Object.defineProperty(InternalEncoderBase64.prototype, "hasState", {
  /**
   * Whether base64 characters are buffered in `prevStr`.
   *
   * @returns {boolean} true when `prevStr` is non-empty.
   */
  get() {
    const buffered = this.prevStr.length;
    return buffered > 0;
  },
});

/**
 * Computes how many bytes the base64 text `str` decodes to.
 *
 * The input is split the same way write() splits it: the leading run of
 * complete 4-character quads, and the 0-3 leftover characters. Each part
 * is measured separately and the results are summed. Any characters
 * already buffered in `this.prevStr` are not taken into account.
 *
 * @param {string} str - base64 text to measure.
 * @returns {number} number of decoded bytes.
 */
InternalEncoderBase64.prototype.byteLength = function (str) {
  // Decoded size of one chunk: every 4 base64 characters carry 3 bytes, and
  // trailing "=" padding carries none. `/=*$/` always matches (at worst the
  // empty string at the end), so search() never returns -1 and no fallback
  // is needed.
  function decodedLength(chunk) {
    var nonPaddedLength = chunk.search(/=*$/);
    return Math.floor((nonPaddedLength * 3) / 4);
  }

  var completeQuads = str.length - (str.length % 4);
  var quads = str.slice(0, completeQuads);
  var remainder = str.slice(completeQuads);
  return decodedLength(quads) + decodedLength(remainder);
};

InternalEncoderBase64.prototype.write = function (str) {
str = this.prevStr + str;
var completeQuads = str.length - (str.length % 4);
Expand All @@ -106,6 +143,23 @@ InternalEncoderBase64.prototype.end = function () {

function InternalEncoderCesu8() {}

Object.defineProperty(InternalEncoderCesu8.prototype, "hasState", {
  /**
   * The CESU-8 encoder reports no pending state.
   *
   * @returns {boolean} always false.
   */
  get() {
    return false;
  },
});

/**
 * Counts the bytes needed for `str`, sizing each UTF-16 code unit on its own:
 * 1 byte below 0x80, 2 bytes below 0x800, otherwise 3 bytes. Surrogate halves
 * are therefore counted as 3 bytes each.
 *
 * @param {string} str - text to measure.
 * @returns {number} total byte count.
 */
InternalEncoderCesu8.prototype.byteLength = function (str) {
  let total = 0;
  for (let pos = 0; pos < str.length; pos += 1) {
    const unit = str.charCodeAt(pos);
    total += unit < 0x80 ? 1 : unit < 0x800 ? 2 : 3;
  }
  return total;
};

InternalEncoderCesu8.prototype.write = function (str) {
const buf = Buffer.alloc(str.length * 3);
let bufIdx = 0;
Expand Down Expand Up @@ -140,6 +194,12 @@ function InternalDecoderCesu8(options, codec) {
this.defaultCharUnicode = codec.defaultCharUnicode;
}

Object.defineProperty(InternalDecoderCesu8.prototype, "hasState", {
  /**
   * Whether the decoder's `contBytes` counter is positive.
   *
   * @returns {boolean} true when `contBytes` is greater than zero.
   */
  get() {
    const outstanding = this.contBytes;
    return outstanding > 0;
  },
});

InternalDecoderCesu8.prototype.write = function (buf) {
let acc = this.acc,
contBytes = this.contBytes,
Expand Down
12 changes: 12 additions & 0 deletions encodings/sbcs-codec.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ class SBCSEncoder {
this.encodeBuf = codec.encodeBuf;
}

/**
 * Number of bytes needed to encode `str`: one per UTF-16 code unit,
 * the same size write() allocates for its output.
 *
 * @param {string} str - text to measure.
 * @returns {number} `str.length`.
 */
byteLength(str) {
return str.length;
}

/**
 * This encoder reports no pending state.
 *
 * @returns {boolean} always false.
 */
get hasState() {
return false;
}

write(str) {
const bytes = this.backend.allocBytes(str.length);

Expand All @@ -77,6 +85,10 @@ class SBCSDecoder {
this.backend = backend;
}

/**
 * This decoder reports no pending state.
 * NOTE(review): presumably because each input byte decodes independently — confirm against write().
 *
 * @returns {boolean} always false.
 */
get hasState() {
return false;
}

write(buf) {
// Strings are immutable in JS -> we use ucs2 buffer to speed up computations.
const decodeBuf = this.decodeBuf;
Expand Down
28 changes: 28 additions & 0 deletions encodings/utf16.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ class Utf16LEEncoder {
this.backend = backend;
}

/**
 * Number of bytes needed to encode `str`: two per UTF-16 code unit,
 * the same size write() allocates for its output.
 *
 * @param {string} str - text to measure.
 * @returns {number} `str.length * 2`.
 */
byteLength(str) {
return str.length * 2;
}

/**
 * This encoder reports no pending state.
 *
 * @returns {boolean} always false.
 */
get hasState() {
return false;
}

write(str) {
const bytes = this.backend.allocBytes(str.length * 2);
const chars = new Uint16Array(bytes.buffer, bytes.byteOffset, str.length);
Expand All @@ -42,6 +50,10 @@ class Utf16LEDecoder {
this.leadSurrogate = undefined;
}

get hasState() {
return this.leadSurrogate || this.leadByte !== -1;
}

write(buf) {
// NOTE: This function is mostly the same as Utf16BEDecoder.write() with bytes swapped.
// Please keep them in sync.
Expand Down Expand Up @@ -158,6 +170,14 @@ class Utf16BEEncoder {
this.backend = backend;
}

/**
 * Number of bytes needed to encode `str`: two per UTF-16 code unit,
 * the same size write() allocates for its output.
 *
 * @param {string} str - text to measure.
 * @returns {number} `str.length * 2`.
 */
byteLength(str) {
return str.length * 2;
}

/**
 * This encoder reports no pending state.
 *
 * @returns {boolean} always false.
 */
get hasState() {
return false;
}

write(str) {
const bytes = this.backend.allocBytes(str.length * 2);
let bytesPos = 0;
Expand All @@ -180,6 +200,10 @@ class Utf16BEDecoder {
this.leadSurrogate = undefined;
}

get hasState() {
return this.leadSurrogate || this.leadByte !== -1;
}

write(buf) {
// NOTE: This function is mostly copy/paste from Utf16LEDecoder.write() with bytes swapped.
// Please keep them in sync. Comments in that function apply here too.
Expand Down Expand Up @@ -292,6 +316,10 @@ class Utf16Decoder {
this.iconv = iconv;
}

get hasState() {
return this.initialBufsLen !== 0 || (this.decoder != null && this.decoder.hasState);
}

write(buf) {
if (!this.decoder) {
// Codec is not chosen yet. Accumulate initial bytes.
Expand Down
62 changes: 62 additions & 0 deletions encodings/utf32.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ function Utf32Encoder(options, codec) {
this.highSurrogate = 0;
}

Object.defineProperty(Utf32Encoder.prototype, "hasState", {
  // True while a high surrogate is stored in `highSurrogate`.
  get: function () {
    return Boolean(this.highSurrogate);
  },
});

Utf32Encoder.prototype.write = function (str) {
var src = Buffer.from(str, "ucs2");
var dst = Buffer.alloc(src.length * 2);
Expand Down Expand Up @@ -76,6 +82,40 @@ Utf32Encoder.prototype.write = function (str) {
return dst;
};

/**
 * Computes the UTF-32 size of `str` in bytes without encoding it.
 *
 * Every code unit costs 4 bytes, except that a high surrogate immediately
 * followed by a low surrogate costs 4 bytes for the pair. Unpaired
 * surrogates, including a high surrogate at the very end of the string,
 * are counted as 4-byte characters of their own. Only `str` is examined;
 * the encoder's own `highSurrogate` field is not read.
 *
 * @param {string} str - text to measure.
 * @returns {number} total byte count.
 */
Utf32Encoder.prototype.byteLength = function (str) {
  var total = 0;
  var pendingHigh = false;

  for (var pos = 0; pos < str.length; pos += 1) {
    var unit = str.charCodeAt(pos);
    var high = unit >= 0xd800 && unit < 0xdc00;
    var low = unit >= 0xdc00 && unit < 0xe000;

    if (pendingHigh && low) {
      // The low surrogate completes the pair: one 4-byte code point.
      total += 4;
      pendingHigh = false;
      continue;
    }

    if (pendingHigh) {
      // The pending high surrogate was not followed by a low one; count it alone.
      total += 4;
    }

    if (high) {
      pendingHigh = true;
    } else {
      total += 4;
      pendingHigh = false;
    }
  }

  // A trailing high surrogate still takes 4 bytes.
  return pendingHigh ? total + 4 : total;
};

Utf32Encoder.prototype.end = function () {
// Treat any leftover high surrogate as a semi-valid independent character.
if (!this.highSurrogate) {
Expand All @@ -100,6 +140,12 @@ function Utf32Decoder(options, codec) {
this.overflow = [];
}

Object.defineProperty(Utf32Decoder.prototype, "hasState", {
  // True while `overflow` holds at least one entry.
  get: function () {
    var pending = this.overflow.length;
    return pending > 0;
  },
});

Utf32Decoder.prototype.write = function (src) {
if (src.length === 0) return "";

Expand Down Expand Up @@ -212,6 +258,16 @@ function Utf32AutoEncoder(options, codec) {
this.encoder = codec.iconv.getEncoder(options.defaultEncoding || "utf-32le", options);
}

Object.defineProperty(Utf32AutoEncoder.prototype, "hasState", {
  // Reports the state of the concrete encoder this wrapper delegates to.
  get: function () {
    var delegate = this.encoder;
    return delegate.hasState;
  },
});

/**
 * Measures `str` by delegating to the wrapped encoder.
 *
 * @param {string} str - text to measure.
 * @returns {number} whatever the wrapped encoder's byteLength() reports.
 */
Utf32AutoEncoder.prototype.byteLength = function (str) {
  var delegate = this.encoder;
  return delegate.byteLength(str);
};

/**
 * Encodes `str` by delegating to the wrapped encoder.
 *
 * @param {string} str - text to encode.
 * @returns {*} whatever the wrapped encoder's write() returns.
 */
Utf32AutoEncoder.prototype.write = function (str) {
  var delegate = this.encoder;
  return delegate.write(str);
};
Expand All @@ -230,6 +286,12 @@ function Utf32AutoDecoder(options, codec) {
this.iconv = codec.iconv;
}

Object.defineProperty(Utf32AutoDecoder.prototype, "hasState", {
  // Pending input exists when initial bytes are still being accumulated
  // (no decoder chosen yet) or when the chosen decoder itself holds state.
  get: function () {
    if (this.initialBufsLen !== 0) {
      return true;
    }
    if (this.decoder == null) {
      return false;
    }
    return this.decoder.hasState;
  },
});

Utf32AutoDecoder.prototype.write = function (buf) {
if (!this.decoder) {
// Codec is not chosen yet. Accumulate initial bytes.
Expand Down
Loading