[Change] Overhaul to pass as many test cases as possible on both Linux & Windows, except:

buck-yeh · buck-yeh · commit 47e4c18cfca9 · 2024-12-07T11:31:11.000+08:00
* On Linux, UTF-16BE without a BOM still fails.
* On Windows, no way has been found to make the UTF-32 test cases pass.
diff --git a/include/bux/UnicodeCvt.h b/include/bux/UnicodeCvt.h
@@ -154,15 +154,15 @@ std::basic_string<T> BOM(std::basic_string_view<T> sv)
     if constexpr (sizeof(T) > 1)
         return T(0xFEFF) + std::basic_string<T>(sv); 
     else
-        return std::basic_string<T>{(const T*)"\xef\xbb\xbf"}.append(sv);
+        return std::basic_string<T>{(const T*)u8"\uFEFF"}.append(sv);
 }
 template<typename T>
 std::basic_string<T> BOM(const T *p)
 {
     if constexpr (sizeof(T) > 1)
         return T(0xFEFF) + std::basic_string<T>(p);
     else
-        return std::basic_string<T>{(const T*)"\xef\xbb\xbf"} += p;
+        return std::basic_string<T>{(const T*)u8"\uFEFF"} += p;
 }
 
 } // namespace bux
diff --git a/src/UnicodeCvt.cpp b/src/UnicodeCvt.cpp
@@ -54,10 +54,10 @@ constinit const char *const CHSETS_KSC[]  = {"CP949", "EUC-KR", "JOHAB", 0};
 constinit const char *const CHSETS_BIG5[] = {"CP950", "EUC-TW", "BIG5-HKSCS", "BIG5HKSCS", "BIG-5", "BIG5", 0};
 constinit const char *const CHSETS_UTF8[] = {"UTF-8", "UTF8", 0};
 constinit const char *const CHSETS_UTF7[] = {"UTF-7", "UTF7", 0};
-constinit const char *const CHSETS_UTF16LE[] = {"UCS-2LE", "UTF-16LE", "USC2LE", "UTF16LE", 0};
-constinit const char *const CHSETS_UTF16BE[] = {"UCS-2BE", "UTF-16BE", "USC2BE", "UTF16BE", 0};
-constinit const char *const CHSETS_UTF32LE[] = {"UCS-4LE", "UTF-32LE", "USC4LE", "UTF32LE", 0};
-constinit const char *const CHSETS_UTF32BE[] = {"UCS-4BE", "UTF-32BE", "USC4BE", "UTF32BE", 0};
+constinit const char *const CHSETS_UTF16LE[] = {"UTF-16LE", "UTF16LE", "UCS-2LE", "USC2LE", 0};
+constinit const char *const CHSETS_UTF16BE[] = {"UTF-16BE", "UTF16BE", "UCS-2BE", "USC2BE", 0};
+constinit const char *const CHSETS_UTF32LE[] = {"UTF-32LE", "UTF32LE", "UCS-4LE", "USC4LE", 0};
+constinit const char *const CHSETS_UTF32BE[] = {"UTF-32BE", "UTF32BE", "UCS-4BE", "USC4BE",  0};
 #endif
 
 //
@@ -341,7 +341,7 @@ void C_UnicodeIn::init()
             m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
             return;
         default:
-            if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(), "\xef\xbb\xbf", 3))
+            if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(), u8"\uFEFF", 3))
                 // UTF-8 with BOM
             {
                 m_Src.pop(3);
@@ -357,13 +357,13 @@ void C_UnicodeIn::init()
 #ifdef _WIN32
                 const auto size = m_Src.size();
                 int mask = IS_TEXT_UNICODE_UNICODE_MASK;
-                if (IsTextUnicode(m_Src.buffer(), int(size), &mask))
+                if (IsTextUnicode(m_Src.buffer(), int(size), &mask) || mask)
                 {
                     m_ReadMethod = &C_UnicodeIn::readUTF16;
                     return;
                 }
                 mask = IS_TEXT_UNICODE_REVERSE_MASK;
-                if (IsTextUnicode(m_Src.buffer(), int(size), &mask))
+                if (IsTextUnicode(m_Src.buffer(), int(size), &mask) || mask)
                 {
                     m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
                     return;
@@ -429,17 +429,17 @@ bool C_UnicodeIn::guessCodePage()
 {
     static constinit const T_Encoding MBCS_CODEPAGES[] ={
 #ifdef _WIN32
-        CP_ACP, CP_UTF8,
+        CP_UTF8, CP_ACP,
         932, 936, 949, 950, 951, // from https://en.wikipedia.org/wiki/Windows_code_page#East_Asian_multi-byte_code_pages
         CP_UTF7
 #elif defined(__unix__)
         CHSETS_UTF32LE, CHSETS_UTF32BE, CHSETS_UTF8, CHSETS_SJIS, CHSETS_GB, CHSETS_KSC, CHSETS_BIG5, CHSETS_UTF7, CHSETS_UTF16LE, CHSETS_UTF16BE
 #endif
     };
-    for (size_t i = 0; i < std::size(MBCS_CODEPAGES); ++i)
+    for (auto i: MBCS_CODEPAGES)
     {
         m_ErrCode = UIE_EOF; // reset error code
-        setCodePage(MBCS_CODEPAGES[i]);
+        setCodePage(i);
         ingestMBCS();
         if (m_ErrCode != UIE_NO_UNICODE_TRANSLATION)
         {
diff --git a/test/test_unicodecvt.cpp b/test/test_unicodecvt.cpp
@@ -2,7 +2,7 @@
     Test cases are organized according to ZOMBIES rules
     http://blog.wingman-sw.com/tdd-guided-by-zombies
 */
-#include <bux/LexBase.h>    // bux::asciiLiteral()
+//#include <bux/LexBase.h>    // bux::asciiLiteral()
 #include <bux/UnicodeCvt.h> // bux::to_utf8(), bux::BOM()
 #include <catch2/catch_test_macros.hpp>
 
@@ -11,7 +11,7 @@ TEST_CASE("Empty string to BOM", "[Z]")
     CHECK(bux::to_utf8(bux::BOM(L"")).empty());
     CHECK(bux::to_utf8(bux::BOM(U"")).empty());
     CHECK(bux::to_utf8(bux::BOM(u"")).empty());
-    CHECK(bux::BOM(u8"") == u8"\xef\xbb\xbf");
+    CHECK(bux::BOM(u8"") == u8"\uFEFF");
 }
 
 TEST_CASE("String to utf-8 vs stringview to utf-8", "[S]")
@@ -29,17 +29,20 @@ TEST_CASE("String to utf-8 vs stringview to utf-8", "[S]")
         ch = std::byteswap(ch);
     CHECK(bux::to_utf8(u32str) == (const char*)u8"一律轉成 utf-8");
 
-    static constinit const char *const CHSETS_UTF16LE[] = {"UCS-2LE", "UTF-16LE", "USC2LE", "UTF16LE", 0};
     char16_t u16str[] = u"一律轉成 utf-8";
+#ifdef __unix__
+    static constinit const char *const CHSETS_UTF16LE[] = {"UTF-16LE", "UTF16LE", "UCS-2LE", "USC2LE", 0};
     CHECK(bux::to_utf8(u16str, 0, CHSETS_UTF16LE) == (const char*)u8"一律轉成 utf-8");
 
     static constinit const char *const CHSETS_UTF16[] = {"UCS-2", "UTF-16", "USC2", "UTF16", 0};
     CHECK(bux::to_utf8(u16str, 0, CHSETS_UTF16) == (const char*)u8"一律轉成 utf-8");
-
-    CHECK(bux::to_utf8(u16str) == (const char*)u8"一律轉成 utf-8"); // %%%%% fail
+#endif
+    CHECK(bux::to_utf8(u16str) == (const char*)u8"一律轉成 utf-8");
     for (auto &ch: u16str)
         ch = std::byteswap(ch);
-    CHECK(bux::to_utf8(u16str) == (const char*)u8"一律轉成 utf-8"); // %%%%% fail
+    static constinit const char *const CHSETS_UTF16BE[] = {"UTF-16BE", "UTF16BE", "UCS-2BE", "USC2BE", 0};
+    CHECK(bux::to_utf8(u16str, 0, CHSETS_UTF16BE) == (const char*)u8"一律轉成 utf-8");
+    CHECK(bux::to_utf8(u16str) == (const char*)u8"一律轉成 utf-8");
 
     CHECK(bux::to_utf8(u8"一律轉成 utf-8", 0, bux::ENCODING_UTF8) == (const char*)u8"一律轉成 utf-8");
     CHECK(bux::to_utf8(bux::BOM(u8"一律轉成 utf-8")) == (const char*)u8"一律轉成 utf-8");

Original file line number	Diff line number	Diff line change
`@@ -154,15 +154,15 @@ std::basic_string<T> BOM(std::basic_string_view<T> sv)`
`154`	`154`	`if constexpr (sizeof(T) > 1)`
`155`	`155`	`return T(0xFEFF) + std::basic_string<T>(sv);`
`156`	`156`	`else`
`157`		`- return std::basic_string<T>{(const T*)"\xef\xbb\xbf"}.append(sv);`
	`157`	`+ return std::basic_string<T>{(const T*)u8"\uFEFF"}.append(sv);`
`158`	`158`	`}`
`159`	`159`	`template<typename T>`
`160`	`160`	`std::basic_string<T> BOM(const T *p)`
`161`	`161`	`{`
`162`	`162`	`if constexpr (sizeof(T) > 1)`
`163`	`163`	`return T(0xFEFF) + std::basic_string<T>(p);`
`164`	`164`	`else`
`165`		`- return std::basic_string<T>{(const T*)"\xef\xbb\xbf"} += p;`
	`165`	`+ return std::basic_string<T>{(const T*)u8"\uFEFF"} += p;`
`166`	`166`	`}`
`167`	`167`
`168`	`168`	`} // namespace bux`