Skip to content

Commit ed88711

Browse files
committed
Fix Big5-HKSCS encoding to prefer non-HKSCS codes in case of multiple options (fixes #264)
1 parent 9627ecf commit ed88711

3 files changed

Lines changed: 47 additions & 3 deletions

File tree

encodings/dbcs-data.js

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,19 @@ module.exports = {
167167
'big5hkscs': {
168168
type: '_dbcs',
169169
table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) },
170-
encodeSkipVals: [0xa2cc],
170+
encodeSkipVals: [
171+
// Although the Encoding Standard says we should avoid encoding to the HKSCS area (see Step 1 of
172+
// https://encoding.spec.whatwg.org/#index-big5-pointer), we still do it to increase compatibility with ICU.
173+
// But if a single Unicode code point can be encoded both as HKSCS and as regular Big5, we prefer the latter.
174+
0x8e69, 0x8e6f, 0x8e7e, 0x8eab, 0x8eb4, 0x8ecd, 0x8ed0, 0x8f57, 0x8f69, 0x8f6e, 0x8fcb, 0x8ffe,
175+
0x906d, 0x907a, 0x90c4, 0x90dc, 0x90f1, 0x91bf, 0x92af, 0x92b0, 0x92b1, 0x92b2, 0x92d1, 0x9447, 0x94ca,
176+
0x95d9, 0x96fc, 0x9975, 0x9b76, 0x9b78, 0x9b7b, 0x9bc6, 0x9bde, 0x9bec, 0x9bf6, 0x9c42, 0x9c53, 0x9c62,
177+
0x9c68, 0x9c6b, 0x9c77, 0x9cbc, 0x9cbd, 0x9cd0, 0x9d57, 0x9d5a, 0x9dc4, 0x9def, 0x9dfb, 0x9ea9, 0x9eef,
178+
0x9efd, 0x9f60, 0x9fcb, 0xa077, 0xa0dc, 0xa0df, 0x8fcc, 0x92c8, 0x9644, 0x96ed,
179+
180+
// Step 2 of https://encoding.spec.whatwg.org/#index-big5-pointer: Use last pointer for U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345
181+
0xa2a4, 0xa2a5, 0xa2a7, 0xa2a6, 0xa2cc, 0xa2ce,
182+
],
171183
},
172184

173185
'cnbig5': 'big5hkscs',

generation/gen-dbcs.js

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ async.parallel({
2727
}
2828

2929
// Calculate difference between big5 and cp950, and write it to a file.
30-
// See http://encoding.spec.whatwg.org/#big5-encoder
30+
// See http://encoding.spec.whatwg.org/#big5
3131
var big5add = {}
3232
for (var i = 0x8100; i < 0x10000; i++) { // Lead byte is 0x81 .. 0xFE
3333
var trail = i & 0xFF;
@@ -41,7 +41,35 @@ async.parallel({
4141
big5add[i] = big5Char;
4242
}
4343

44-
// Add char sequences that are not in the index file (as given in http://encoding.spec.whatwg.org/#big5-encoder)
44+
// Calculate HKSCS codes that are duplicates of big5 codes and need to be skipped when encoding.
45+
console.log("Duplicate HKSCS codes that need to be skipped when encoded (see encodeSkipVals in big5hkscs): ")
46+
var big5codes = {};
47+
for (var i = 0xA100; i < 0x10000; i++) {
48+
var uCharCode = (big5add[i] !== undefined) ? big5add[i] : data.cp950[i];
49+
if (uCharCode !== undefined) {
50+
big5codes[uCharCode] = true;
51+
}
52+
}
53+
for (var i = 0x8100; i < 0xA100; i++) {
54+
var uCharCode = (big5add[i] !== undefined) ? big5add[i] : data.cp950[i];
55+
if (uCharCode !== undefined && big5codes[uCharCode]) {
56+
console.log("0x"+i.toString(16));
57+
}
58+
}
59+
60+
if (big5Char !== undefined) {
61+
if (lead < 0xA1) {
62+
if (d[big5Char] !== undefined) {
63+
console.log("duplicate in first: "+ pointer + " char " + big5Char);
64+
}
65+
d[big5Char] = i;
66+
} else if (d[big5Char] !== undefined) {
67+
console.log("dup 0x"+d[big5Char].toString(16) + " -> " + i.toString(16))
68+
}
69+
70+
}
71+
72+
// Add char sequences that are not in the index file (as given in http://encoding.spec.whatwg.org/#big5-decoder)
4573
// Convert a Big5 index pointer into a two-byte code packed into one number:
// lead byte in the high 8 bits, trail byte in the low 8 bits.
// Each lead byte covers 157 consecutive pointers, starting at lead 0x81.
function toIdx(pointer) {
    var leadByte = 0x81 + Math.floor(pointer / 157);
    var trailIndex = pointer % 157;

    // Trail bytes come in two runs: the first 0x3F positions start at 0x40,
    // the remaining ones are shifted by 0x62 (so position 0x3F lands on 0xA1).
    var trailOffset = 0x62;
    if (trailIndex < 0x3F) {
        trailOffset = 0x40;
    }

    return (leadByte << 8) + (trailIndex + trailOffset);
}
4674
big5add[toIdx(1133)] = [0x00CA, 0x0304];
4775
big5add[toIdx(1135)] = [0x00CA, 0x030C];

test/big5-test.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,8 @@ describe("Big5 tests", function() {
5454
it("Big5 correctly encodes 十", function() {
5555
assert.strictEqual(iconv.encode("十", "big5").toString('hex'), "a451");
5656
});
57+
58+
it("Big5 correctly encodes 起 (issue #264)", function() {
59+
assert.strictEqual(iconv.encode("起", "big5").toString('hex'), "b05f");
60+
});
5761
});

0 commit comments

Comments
 (0)