Skip to content

Commit 47e4c18

Browse files
committed
[Change] Overhaul to pass as many test cases as possible on both Linux and Windows, except:
* On Linux, UTF-16BE without a BOM still fails. * On Windows, no way has been found to make the UTF-32 cases pass.
1 parent 77ea0e8 commit 47e4c18

File tree

3 files changed

+21
-18
lines changed

3 files changed

+21
-18
lines changed

include/bux/UnicodeCvt.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,15 +154,15 @@ std::basic_string<T> BOM(std::basic_string_view<T> sv)
154154
if constexpr (sizeof(T) > 1)
155155
return T(0xFEFF) + std::basic_string<T>(sv);
156156
else
157-
return std::basic_string<T>{(const T*)"\xef\xbb\xbf"}.append(sv);
157+
return std::basic_string<T>{(const T*)u8"\uFEFF"}.append(sv);
158158
}
159159
template<typename T>
160160
std::basic_string<T> BOM(const T *p)
161161
{
162162
if constexpr (sizeof(T) > 1)
163163
return T(0xFEFF) + std::basic_string<T>(p);
164164
else
165-
return std::basic_string<T>{(const T*)"\xef\xbb\xbf"} += p;
165+
return std::basic_string<T>{(const T*)u8"\uFEFF"} += p;
166166
}
167167

168168
} // namespace bux

src/UnicodeCvt.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ constinit const char *const CHSETS_KSC[] = {"CP949", "EUC-KR", "JOHAB", 0};
5454
constinit const char *const CHSETS_BIG5[] = {"CP950", "EUC-TW", "BIG5-HKSCS", "BIG5HKSCS", "BIG-5", "BIG5", 0};
5555
constinit const char *const CHSETS_UTF8[] = {"UTF-8", "UTF8", 0};
5656
constinit const char *const CHSETS_UTF7[] = {"UTF-7", "UTF7", 0};
57-
constinit const char *const CHSETS_UTF16LE[] = {"UCS-2LE", "UTF-16LE", "USC2LE", "UTF16LE", 0};
58-
constinit const char *const CHSETS_UTF16BE[] = {"UCS-2BE", "UTF-16BE", "USC2BE", "UTF16BE", 0};
59-
constinit const char *const CHSETS_UTF32LE[] = {"UCS-4LE", "UTF-32LE", "USC4LE", "UTF32LE", 0};
60-
constinit const char *const CHSETS_UTF32BE[] = {"UCS-4BE", "UTF-32BE", "USC4BE", "UTF32BE", 0};
57+
constinit const char *const CHSETS_UTF16LE[] = {"UTF-16LE", "UTF16LE", "UCS-2LE", "USC2LE", 0};
58+
constinit const char *const CHSETS_UTF16BE[] = {"UTF-16BE", "UTF16BE", "UCS-2BE", "USC2BE", 0};
59+
constinit const char *const CHSETS_UTF32LE[] = {"UTF-32LE", "UTF32LE", "UCS-4LE", "USC4LE", 0};
60+
constinit const char *const CHSETS_UTF32BE[] = {"UTF-32BE", "UTF32BE", "UCS-4BE", "USC4BE", 0};
6161
#endif
6262

6363
//
@@ -341,7 +341,7 @@ void C_UnicodeIn::init()
341341
m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
342342
return;
343343
default:
344-
if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(), "\xef\xbb\xbf", 3))
344+
if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(), u8"\uFEFF", 3))
345345
// UTF-8 with BOM
346346
{
347347
m_Src.pop(3);
@@ -357,13 +357,13 @@ void C_UnicodeIn::init()
357357
#ifdef _WIN32
358358
const auto size = m_Src.size();
359359
int mask = IS_TEXT_UNICODE_UNICODE_MASK;
360-
if (IsTextUnicode(m_Src.buffer(), int(size), &mask))
360+
if (IsTextUnicode(m_Src.buffer(), int(size), &mask) || mask)
361361
{
362362
m_ReadMethod = &C_UnicodeIn::readUTF16;
363363
return;
364364
}
365365
mask = IS_TEXT_UNICODE_REVERSE_MASK;
366-
if (IsTextUnicode(m_Src.buffer(), int(size), &mask))
366+
if (IsTextUnicode(m_Src.buffer(), int(size), &mask) || mask)
367367
{
368368
m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
369369
return;
@@ -429,17 +429,17 @@ bool C_UnicodeIn::guessCodePage()
429429
{
430430
static constinit const T_Encoding MBCS_CODEPAGES[] ={
431431
#ifdef _WIN32
432-
CP_ACP, CP_UTF8,
432+
CP_UTF8, CP_ACP,
433433
932, 936, 949, 950, 951, // from https://en.wikipedia.org/wiki/Windows_code_page#East_Asian_multi-byte_code_pages
434434
CP_UTF7
435435
#elif defined(__unix__)
436436
CHSETS_UTF32LE, CHSETS_UTF32BE, CHSETS_UTF8, CHSETS_SJIS, CHSETS_GB, CHSETS_KSC, CHSETS_BIG5, CHSETS_UTF7, CHSETS_UTF16LE, CHSETS_UTF16BE
437437
#endif
438438
};
439-
for (size_t i = 0; i < std::size(MBCS_CODEPAGES); ++i)
439+
for (auto i: MBCS_CODEPAGES)
440440
{
441441
m_ErrCode = UIE_EOF; // reset error code
442-
setCodePage(MBCS_CODEPAGES[i]);
442+
setCodePage(i);
443443
ingestMBCS();
444444
if (m_ErrCode != UIE_NO_UNICODE_TRANSLATION)
445445
{

test/test_unicodecvt.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Test cases are organized according to ZOMBIES rules
33
http://blog.wingman-sw.com/tdd-guided-by-zombies
44
*/
5-
#include <bux/LexBase.h> // bux::asciiLiteral()
5+
//#include <bux/LexBase.h> // bux::asciiLiteral()
66
#include <bux/UnicodeCvt.h> // bux::to_utf8(), bux::BOM()
77
#include <catch2/catch_test_macros.hpp>
88

@@ -11,7 +11,7 @@ TEST_CASE("Empty string to BOM", "[Z]")
1111
CHECK(bux::to_utf8(bux::BOM(L"")).empty());
1212
CHECK(bux::to_utf8(bux::BOM(U"")).empty());
1313
CHECK(bux::to_utf8(bux::BOM(u"")).empty());
14-
CHECK(bux::BOM(u8"") == u8"\xef\xbb\xbf");
14+
CHECK(bux::BOM(u8"") == u8"\uFEFF");
1515
}
1616

1717
TEST_CASE("String to utf-8 vs stringview to utf-8", "[S]")
@@ -29,17 +29,20 @@ TEST_CASE("String to utf-8 vs stringview to utf-8", "[S]")
2929
ch = std::byteswap(ch);
3030
CHECK(bux::to_utf8(u32str) == (const char*)u8"一律轉成 utf-8");
3131

32-
static constinit const char *const CHSETS_UTF16LE[] = {"UCS-2LE", "UTF-16LE", "USC2LE", "UTF16LE", 0};
3332
char16_t u16str[] = u"一律轉成 utf-8";
33+
#ifdef __unix__
34+
static constinit const char *const CHSETS_UTF16LE[] = {"UTF-16LE", "UTF16LE", "UCS-2LE", "USC2LE", 0};
3435
CHECK(bux::to_utf8(u16str, 0, CHSETS_UTF16LE) == (const char*)u8"一律轉成 utf-8");
3536

3637
static constinit const char *const CHSETS_UTF16[] = {"UCS-2", "UTF-16", "USC2", "UTF16", 0};
3738
CHECK(bux::to_utf8(u16str, 0, CHSETS_UTF16) == (const char*)u8"一律轉成 utf-8");
38-
39-
CHECK(bux::to_utf8(u16str) == (const char*)u8"一律轉成 utf-8"); // %%%%% fail
39+
#endif
40+
CHECK(bux::to_utf8(u16str) == (const char*)u8"一律轉成 utf-8");
4041
for (auto &ch: u16str)
4142
ch = std::byteswap(ch);
42-
CHECK(bux::to_utf8(u16str) == (const char*)u8"一律轉成 utf-8"); // %%%%% fail
43+
static constinit const char *const CHSETS_UTF16BE[] = {"UTF-16BE", "UTF16BE", "UCS-2BE", "USC2BE", 0};
44+
CHECK(bux::to_utf8(u16str, 0, CHSETS_UTF16BE) == (const char*)u8"一律轉成 utf-8");
45+
CHECK(bux::to_utf8(u16str) == (const char*)u8"一律轉成 utf-8");
4346

4447
CHECK(bux::to_utf8(u8"一律轉成 utf-8", 0, bux::ENCODING_UTF8) == (const char*)u8"一律轉成 utf-8");
4548
CHECK(bux::to_utf8(bux::BOM(u8"一律轉成 utf-8")) == (const char*)u8"一律轉成 utf-8");

0 commit comments

Comments
 (0)