unicode transform

malytomas · malytomas · commit 45a546fc986a · 2024-11-19T07:31:10.000+01:00
diff --git a/sources/asset-processor/texts.cpp b/sources/asset-processor/texts.cpp
@@ -1,6 +1,7 @@
 #include "processor.h"
 
 #include <cage-core/texts.h>
+#include <cage-core/unicode.h>
 
 namespace
 {
@@ -52,6 +53,7 @@ namespace
 
 		while (f->readLine(l))
 		{
+			l = unicodeTransformString(l, { UnicodeTransformEnum::CanonicalComposition });
 			l = trim(l);
 			if (l.empty())
 			{
diff --git a/sources/include/cage-core/unicode.h b/sources/include/cage-core/unicode.h
@@ -1,11 +1,57 @@
 #ifndef guard_unicode_h_fg45jhftg45zj4sdas
 #define guard_unicode_h_fg45jhftg45zj4sdas
 
-#include <cage-core/utf.h>
+#include <cage-core/core.h>
 
 namespace cage
 {
-	// todo
+	CAGE_CORE_API bool utfValid(PointerRange<const char> buffer); // checks whether the text is valid utf8 (eg. a truncated sequence is reported as invalid)
+	CAGE_CORE_API bool utfValid(const String &str);
+	CAGE_CORE_API bool utfValid(const char *str);
+
+	// returns number of utf32 characters the string would have after conversion
+	CAGE_CORE_API uint32 utf32Length(PointerRange<const char> buffer);
+	CAGE_CORE_API uint32 utf32Length(const String &str);
+	CAGE_CORE_API uint32 utf32Length(const char *str);
+
+	// returns number of bytes the string would have after converting to utf8
+	CAGE_CORE_API uint32 utf8Length(PointerRange<const uint32> buffer);
+
+	CAGE_CORE_API Holder<PointerRange<uint32>> utf8to32(PointerRange<const char> buffer); // converts utf8 text to utf32 characters, returned in a Holder
+	CAGE_CORE_API Holder<PointerRange<uint32>> utf8to32(const String &str);
+	CAGE_CORE_API Holder<PointerRange<uint32>> utf8to32(const char *str);
+	CAGE_CORE_API void utf8to32(PointerRange<uint32> &outBuffer, PointerRange<const char> inBuffer); // NOTE(review): presumably writes into outBuffer and shrinks it to the converted length - confirm against the implementation
+	CAGE_CORE_API void utf8to32(PointerRange<uint32> &outBuffer, const String &inStr);
+	CAGE_CORE_API void utf8to32(PointerRange<uint32> &outBuffer, const char *inStr);
+
+	CAGE_CORE_API Holder<PointerRange<char>> utf32to8(PointerRange<const uint32> buffer); // converts utf32 characters to utf8 text, returned in a Holder
+	CAGE_CORE_API void utf32to8(PointerRange<char> &outBuffer, PointerRange<const uint32> inBuffer); // NOTE(review): presumably writes into outBuffer and shrinks it to the converted length - confirm against the implementation
+	CAGE_CORE_API String utf32to8string(PointerRange<const uint32> str); // converts utf32 characters to utf8 text, returned as a String
+
+	enum class UnicodeTransformEnum : uint32 // selects the operation performed by unicodeTransform
+	{
+		None = 0, // the output is a copy of the input
+		Validate, // replaces invalid sequences (with the replacement character U+FFFD)
+		CanonicalComposition, // NFC (canonical preserves exact meaning of the text, but removes distinct encodings)
+		CanonicalDecomposition, // NFD
+		CompatibilityComposition, // NFKC (compatibility removes small visual distinctions, possibly changing meaning)
+		CompatibilityDecomposition, // NFKD
+		Lowercase, // uses the locale from the config
+		Uppercase, // uses the locale from the config
+		Titlecase, // uses the locale from the config
+		Casefold, // used for caseless string matching; the locale from the config is not used
+		Unaccent, // the locale from the config is not used
+		FuzzyMatching, // casefold, unaccent, removes duplicates (adjacent repeats, eg. "hello" becomes "helo")
+	};
+
+	struct CAGE_CORE_API UnicodeTransformConfig // parameters for unicodeTransform and unicodeTransformString
+	{
+		UnicodeTransformEnum transform = UnicodeTransformEnum::Validate; // the operation to apply
+		const char *locale = nullptr; // optional, eg. en_US; nullptr is treated as an empty locale name; only the Lowercase, Uppercase and Titlecase transforms use it
+	};
+
+	CAGE_CORE_API Holder<PointerRange<char>> unicodeTransform(PointerRange<const char> buffer, const UnicodeTransformConfig &cfg); // applies cfg.transform to the utf8 text in buffer and returns the transformed text; raises a critical Exception for an unknown transform value
+	CAGE_CORE_API String unicodeTransformString(PointerRange<const char> buffer, const UnicodeTransformConfig &cfg); // same as unicodeTransform, with the result returned as a String
 }
 
 #endif // guard_unicode_h_fg45jhftg45zj4sdas
diff --git a/sources/include/cage-core/utf.h b/sources/include/cage-core/utf.h
diff --git a/sources/libcore/unicode.cpp b/sources/libcore/unicode.cpp
diff --git a/sources/libcore/unicode/unicode.cpp b/sources/libcore/unicode/unicode.cpp
@@ -0,0 +1,94 @@
+#include <algorithm>
+
+#include "uni_algo/case.h"
+#include "uni_algo/conv.h"
+#include "uni_algo/norm.h"
+
+#include <cage-core/pointerRangeHolder.h>
+#include <cage-core/unicode.h>
+
+namespace cage
+{
+	namespace
+	{
+		// exposes the byte range as a non-owning std::string_view, as expected by the uni_algo functions
+		std::string_view view(PointerRange<const char> buffer)
+		{
+			const auto first = buffer.begin();
+			const auto last = buffer.end();
+			return std::string_view(first, last);
+		}
+	}
+
+	// Applies the transformation selected by cfg.transform to utf8 text.
+	// buffer: the utf8 encoded input text
+	// cfg: the transformation to apply; cfg.locale (optional, nullptr is treated as an empty name) is used by the Lowercase, Uppercase and Titlecase transforms only
+	// returns: the transformed utf8 text
+	// throws: critical Exception when cfg.transform is not one of the UnicodeTransformEnum values
+	Holder<PointerRange<char>> unicodeTransform(PointerRange<const char> buffer, const UnicodeTransformConfig &cfg)
+	{
+		const std::string_view v = view(buffer);
+		const una::locale l = una::locale(cfg.locale ? cfg.locale : "");
+		switch (cfg.transform)
+		{
+			case UnicodeTransformEnum::None:
+				// no processing, the result is a copy of the input
+				return PointerRangeHolder<char>(buffer);
+			case UnicodeTransformEnum::Validate:
+			{
+				// round trip through utf32; the conversion substitutes invalid sequences with the replacement character
+				const auto t = una::utf32to8u(una::utf8to32u(v));
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::CanonicalComposition:
+			{
+				const auto t = una::norm::to_nfc_utf8(v);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::CanonicalDecomposition:
+			{
+				const auto t = una::norm::to_nfd_utf8(v);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::CompatibilityComposition:
+			{
+				const auto t = una::norm::to_nfkc_utf8(v);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::CompatibilityDecomposition:
+			{
+				const auto t = una::norm::to_nfkd_utf8(v);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::Lowercase:
+			{
+				const auto t = una::cases::to_lowercase_utf8(v, l);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::Uppercase:
+			{
+				const auto t = una::cases::to_uppercase_utf8(v, l);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::Titlecase:
+			{
+				const auto t = una::cases::to_titlecase_utf8(v, l);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::Casefold:
+			{
+				const auto t = una::cases::to_casefold_utf8(v);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::Unaccent:
+			{
+				const auto t = una::norm::to_unaccent_utf8(v);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			case UnicodeTransformEnum::FuzzyMatching:
+			{
+				const auto folded = una::norm::to_unaccent_utf8(una::cases::to_casefold_utf8(v));
+				// adjacent duplicates must be removed per code point, not per byte:
+				// a multi-byte utf8 sequence may contain equal adjacent bytes (eg. U+5555 is E5 95 95)
+				// and byte-wise deduplication would truncate it into invalid utf8
+				auto codepoints = una::utf8to32u(std::string_view(folded));
+				codepoints.erase(std::unique(codepoints.begin(), codepoints.end()), codepoints.end());
+				const auto t = una::utf32to8u(codepoints);
+				return PointerRangeHolder<char>(t.begin(), t.end());
+			}
+			default:
+				CAGE_THROW_CRITICAL(Exception, "invalid UnicodeTransformEnum value");
+		}
+	}
+
+	// Convenience wrapper: runs unicodeTransform and returns the transformed text as a String.
+	String unicodeTransformString(PointerRange<const char> buffer, const UnicodeTransformConfig &cfg)
+	{
+		const Holder<PointerRange<char>> transformed = unicodeTransform(buffer, cfg);
+		const char *const first = static_cast<const char *>(transformed.data());
+		const char *const last = first + transformed.size();
+		return String(PointerRange<const char>(first, last));
+	}
+}
diff --git a/sources/libcore/unicode/utf.cpp b/sources/libcore/unicode/utf.cpp
@@ -4,7 +4,7 @@
 #include "uni_algo/conv.h"
 
 #include <cage-core/pointerRangeHolder.h>
-#include <cage-core/utf.h>
+#include <cage-core/unicode.h>
 
 namespace cage
 {
diff --git a/sources/libengine/graphics/font.cpp b/sources/libengine/graphics/font.cpp
@@ -1,7 +1,7 @@
 #include <cstring>
 #include <vector>
 
-#include <cage-core/utf.h>
+#include <cage-core/unicode.h>
 #include <cage-engine/assetStructs.h>
 #include <cage-engine/font.h>
 #include <cage-engine/opengl.h>
diff --git a/sources/libengine/gui/widgets/input.cpp b/sources/libengine/gui/widgets/input.cpp
@@ -2,7 +2,7 @@
 
 #include <cage-core/debug.h>
 #include <cage-core/string.h>
-#include <cage-core/utf.h>
+#include <cage-core/unicode.h>
 
 namespace cage
 {
diff --git a/sources/libengine/gui/widgets/textArea.cpp b/sources/libengine/gui/widgets/textArea.cpp
@@ -1,7 +1,7 @@
 #include "../private.h"
 
 #include <cage-core/memoryBuffer.h>
-#include <cage-core/utf.h>
+#include <cage-core/unicode.h>
 
 namespace cage
 {
diff --git a/sources/test-core/unicode.cpp b/sources/test-core/unicode.cpp
@@ -7,17 +7,59 @@ namespace
 	void testUtf32() // round trip utf8 -> utf32 -> utf8, and consistency of the length queries
 	{
 		const String a = "hello there Straße";
+		CAGE_TEST(utfValid(a));
 		const auto b = utf8to32(a);
 		const String c = utf32to8string(b);
 		CAGE_TEST(a == c); // the round trip must reproduce the input
 		CAGE_TEST(utf32Length(a) == b->size()); // the predicted utf32 length matches the actual conversion
 		CAGE_TEST(utf8Length(b) == c.size()); // the predicted utf8 length matches the actual conversion
 	}
+
+	void testValidation() // detection of invalid utf8 and its repair with the Validate transform
+	{
+		CAGE_TEST(utfValid("hello there"));
+		CAGE_TEST(!utfValid("Te\xC2st")); // "\xC2" is a truncated sequence in UTF-8
+		CAGE_TEST(unicodeTransformString("Te\xC2st", { UnicodeTransformEnum::Validate }) == "Te\xEF\xBF\xBDst"); // the invalid byte is substituted; "\xEF\xBF\xBD" is the replacement character U+FFFD in UTF-8
+	}
+
+	void testCaseConversions() // lowercase, uppercase, titlecase and casefold; on ascii, on accented text, and with explicit locales
+	{
+		CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::Lowercase }) == "hello there");
+		CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::Uppercase }) == "HELLO THERE");
+		CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::Titlecase }) == "Hello There");
+		CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::Casefold }) == "hello there");
+
+		CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { UnicodeTransformEnum::Lowercase }) == "příliš žluťoučký kůň úpěl ďábelské ódy");
+		CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { UnicodeTransformEnum::Uppercase }) == "PŘÍLIŠ ŽLUŤOUČKÝ KŮŇ ÚPĚL ĎÁBELSKÉ ÓDY");
+		CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { UnicodeTransformEnum::Titlecase }) == "Příliš Žluťoučký Kůň Úpěl Ďábelské Ódy");
+		CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { UnicodeTransformEnum::Casefold }) == "příliš žluťoučký kůň úpěl ďábelské ódy");
+		CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { .transform = UnicodeTransformEnum::Lowercase, .locale = "cs_cz" }) == "příliš žluťoučký kůň úpěl ďábelské ódy"); // an explicit locale gives the same result as the default here
+
+		CAGE_TEST(unicodeTransformString("ijslAnd", { UnicodeTransformEnum::Titlecase }) == "Ijsland"); // without a locale only the first letter is capitalized
+		CAGE_TEST(unicodeTransformString("ijslAnd", { .transform = UnicodeTransformEnum::Lowercase, .locale = "nl" }) == "ijsland");
+		CAGE_TEST(unicodeTransformString("ijslAnd", { .transform = UnicodeTransformEnum::Uppercase, .locale = "nl" }) == "IJSLAND");
+		CAGE_TEST(unicodeTransformString("ijslAnd", { .transform = UnicodeTransformEnum::Titlecase, .locale = "nl" }) == "IJsland"); // with the "nl" locale the leading "ij" is capitalized as a unit
+	}
+
+	void testNormalizations() // canonical composition and decomposition
+	{
+		CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::CanonicalComposition }) == "heLLo There"); // plain ascii text is not changed
+		CAGE_TEST(unicodeTransformString("Ŵ W\u0302", { UnicodeTransformEnum::CanonicalComposition }) == "Ŵ Ŵ"); // W followed by the combining circumflex U+0302 is composed into a single character
+		CAGE_TEST(unicodeTransformString("Ŵ W\u0302", { UnicodeTransformEnum::CanonicalDecomposition }) == "W\u0302 W\u0302"); // the composed character is split into W followed by U+0302
+	}
+
+	void testFuzzyMatching() // the expected output is lowercase, without accents, and with adjacent repeated letters collapsed ("Hello" becomes "helo")
+	{
+		CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy. Hello there.", { UnicodeTransformEnum::FuzzyMatching }) == "prilis zlutoucky kun upel dabelske ody. helo there.");
+	}
 }
 
 void testUnicode() // entry point of the unicode test case; runs the individual tests in order
 {
 	CAGE_TESTCASE("unicode");
 	testUtf32();
-	// todo
+	testValidation();
+	testCaseConversions();
+	testNormalizations();
+	testFuzzyMatching();
 }

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`#include "processor.h"`
`2`	`2`
`3`	`3`	`#include <cage-core/texts.h>`
	`4`	`+#include <cage-core/unicode.h>`
`4`	`5`
`5`	`6`	`namespace`
`6`	`7`	`{`
`@@ -52,6 +53,7 @@ namespace`
`52`	`53`
`53`	`54`	`while (f->readLine(l))`
`54`	`55`	`{`
	`56`	`+ l = unicodeTransformString(l, { UnicodeTransformEnum::CanonicalComposition });`
`55`	`57`	`l = trim(l);`
`56`	`58`	`if (l.empty())`
`57`	`59`	`{`
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@`
`4`	`4`	`#include "uni_algo/conv.h"`
`5`	`5`
`6`	`6`	`#include <cage-core/pointerRangeHolder.h>`
`7`		`-#include <cage-core/utf.h>`
	`7`	`+#include <cage-core/unicode.h>`
`8`	`8`
`9`	`9`	`namespace cage`
`10`	`10`	`{`
Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`
`3`	`3`	`#include <cage-core/debug.h>`
`4`	`4`	`#include <cage-core/string.h>`
`5`		`-#include <cage-core/utf.h>`
	`5`	`+#include <cage-core/unicode.h>`
`6`	`6`
`7`	`7`	`namespace cage`
`8`	`8`	`{`
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`#include "../private.h"`
`2`	`2`
`3`	`3`	`#include <cage-core/memoryBuffer.h>`
`4`		`-#include <cage-core/utf.h>`
	`4`	`+#include <cage-core/unicode.h>`
`5`	`5`
`6`	`6`	`namespace cage`
`7`	`7`	`{`