Skip to content

Commit 45a546f

Browse files
committed
unicode transform
1 parent 11bf3f7 commit 45a546f

File tree

10 files changed

+191
-47
lines changed

10 files changed

+191
-47
lines changed

sources/asset-processor/texts.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "processor.h"
22

33
#include <cage-core/texts.h>
4+
#include <cage-core/unicode.h>
45

56
namespace
67
{
@@ -52,6 +53,7 @@ namespace
5253

5354
while (f->readLine(l))
5455
{
56+
l = unicodeTransformString(l, { UnicodeTransformEnum::CanonicalComposition });
5557
l = trim(l);
5658
if (l.empty())
5759
{
Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,57 @@
11
#ifndef guard_unicode_h_fg45jhftg45zj4sdas
22
#define guard_unicode_h_fg45jhftg45zj4sdas
33

4-
#include <cage-core/utf.h>
4+
#include <cage-core/core.h>
55

66
namespace cage
77
{
8-
// todo
8+
CAGE_CORE_API bool utfValid(PointerRange<const char> buffer);
9+
CAGE_CORE_API bool utfValid(const String &str);
10+
CAGE_CORE_API bool utfValid(const char *str);
11+
12+
// returns number of utf32 characters the string would have after conversion
13+
CAGE_CORE_API uint32 utf32Length(PointerRange<const char> buffer);
14+
CAGE_CORE_API uint32 utf32Length(const String &str);
15+
CAGE_CORE_API uint32 utf32Length(const char *str);
16+
17+
// returns number of bytes the string would have after converting to utf8
18+
CAGE_CORE_API uint32 utf8Length(PointerRange<const uint32> buffer);
19+
20+
CAGE_CORE_API Holder<PointerRange<uint32>> utf8to32(PointerRange<const char> buffer);
21+
CAGE_CORE_API Holder<PointerRange<uint32>> utf8to32(const String &str);
22+
CAGE_CORE_API Holder<PointerRange<uint32>> utf8to32(const char *str);
23+
CAGE_CORE_API void utf8to32(PointerRange<uint32> &outBuffer, PointerRange<const char> inBuffer);
24+
CAGE_CORE_API void utf8to32(PointerRange<uint32> &outBuffer, const String &inStr);
25+
CAGE_CORE_API void utf8to32(PointerRange<uint32> &outBuffer, const char *inStr);
26+
27+
CAGE_CORE_API Holder<PointerRange<char>> utf32to8(PointerRange<const uint32> buffer);
28+
CAGE_CORE_API void utf32to8(PointerRange<char> &outBuffer, PointerRange<const uint32> inBuffer);
29+
CAGE_CORE_API String utf32to8string(PointerRange<const uint32> str);
30+
31+
// Selects the operation performed by unicodeTransform / unicodeTransformString.
enum class UnicodeTransformEnum : uint32
32+
{
33+
None = 0, // returns a copy of the input without any modification
34+
Validate, // replaces invalid sequences (the replacement character U+FFFD is substituted)
35+
CanonicalComposition, // NFC (canonical preserves exact meaning of the text, but removes distinct encodings)
36+
CanonicalDecomposition, // NFD
37+
CompatibilityComposition, // NFKC (compatibility removes small visual distinctions, possibly changing meaning)
38+
CompatibilityDecomposition, // NFKD
39+
Lowercase, // honors UnicodeTransformConfig::locale
40+
Uppercase, // honors UnicodeTransformConfig::locale
41+
Titlecase, // honors UnicodeTransformConfig::locale
42+
Casefold, // used for caseless string matching; the locale is not consulted
43+
Unaccent, // removes accents; the locale is not consulted
44+
FuzzyMatching, // casefold, unaccent, then collapses consecutive duplicates (eg. "Hello" -> "helo")
45+
};
46+
47+
// Options passed to unicodeTransform / unicodeTransformString.
struct CAGE_CORE_API UnicodeTransformConfig
48+
{
49+
UnicodeTransformEnum transform = UnicodeTransformEnum::Validate; // which transformation to apply
50+
const char *locale = nullptr; // optional, eg. en_US; nullptr is treated as an empty locale string
51+
};
52+
53+
CAGE_CORE_API Holder<PointerRange<char>> unicodeTransform(PointerRange<const char> buffer, const UnicodeTransformConfig &cfg);
54+
CAGE_CORE_API String unicodeTransformString(PointerRange<const char> buffer, const UnicodeTransformConfig &cfg);
955
}
1056

1157
#endif // guard_unicode_h_fg45jhftg45zj4sdas

sources/include/cage-core/utf.h

Lines changed: 0 additions & 32 deletions
This file was deleted.

sources/libcore/unicode.cpp

Lines changed: 0 additions & 8 deletions
This file was deleted.
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#include <algorithm>
2+
3+
#include "uni_algo/case.h"
4+
#include "uni_algo/conv.h"
5+
#include "uni_algo/norm.h"
6+
7+
#include <cage-core/pointerRangeHolder.h>
8+
#include <cage-core/unicode.h>
9+
10+
namespace cage
11+
{
12+
namespace
13+
{
14+
std::string_view view(PointerRange<const char> buffer)
15+
{
16+
return std::string_view(buffer.begin(), buffer.end());
17+
}
18+
}
19+
20+
// Applies the transformation selected by cfg.transform to the UTF-8 encoded buffer
// and returns the result as a newly allocated UTF-8 buffer.
//
// buffer: input bytes, interpreted as UTF-8.
// cfg.transform: operation to perform (see UnicodeTransformEnum).
// cfg.locale: optional locale name; only the Lowercase, Uppercase and Titlecase
//             transforms consult it. nullptr is treated as an empty locale string.
//
// Throws a critical Exception when cfg.transform is not a known enum value.
Holder<PointerRange<char>> unicodeTransform(PointerRange<const char> buffer, const UnicodeTransformConfig &cfg)
{
	const std::string_view v = view(buffer);
	const una::locale l = una::locale(cfg.locale ? cfg.locale : "");
	switch (cfg.transform)
	{
		case UnicodeTransformEnum::None:
			return PointerRangeHolder<char>(buffer);
		case UnicodeTransformEnum::Validate:
		{
			// round trip through utf32 substitutes invalid sequences
			const auto t = una::utf32to8u(una::utf8to32u(v));
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::CanonicalComposition:
		{
			const auto t = una::norm::to_nfc_utf8(v);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::CanonicalDecomposition:
		{
			const auto t = una::norm::to_nfd_utf8(v);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::CompatibilityComposition:
		{
			const auto t = una::norm::to_nfkc_utf8(v);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::CompatibilityDecomposition:
		{
			const auto t = una::norm::to_nfkd_utf8(v);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::Lowercase:
		{
			const auto t = una::cases::to_lowercase_utf8(v, l);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::Uppercase:
		{
			const auto t = una::cases::to_uppercase_utf8(v, l);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::Titlecase:
		{
			const auto t = una::cases::to_titlecase_utf8(v, l);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::Casefold:
		{
			const auto t = una::cases::to_casefold_utf8(v);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::Unaccent:
		{
			const auto t = una::norm::to_unaccent_utf8(v);
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		case UnicodeTransformEnum::FuzzyMatching:
		{
			const auto folded = una::norm::to_unaccent_utf8(una::cases::to_casefold_utf8(v));
			// Collapse runs of identical characters on whole code points, not on bytes:
			// a single multi-byte UTF-8 sequence may contain equal adjacent continuation
			// bytes (eg. U+2000 is E2 80 80), and removing one of them would corrupt the text.
			auto codepoints = una::utf8to32u(std::string_view(folded));
			codepoints.erase(std::unique(codepoints.begin(), codepoints.end()), codepoints.end());
			const auto t = una::utf32to8u(std::u32string_view(codepoints));
			return PointerRangeHolder<char>(t.begin(), t.end());
		}
		default:
			CAGE_THROW_CRITICAL(Exception, "invalid UnicodeTransformEnum value");
	}
}
88+
89+
// Convenience wrapper: runs unicodeTransform with the same arguments and
// builds a String from the resulting bytes.
String unicodeTransformString(PointerRange<const char> buffer, const UnicodeTransformConfig &cfg)
{
	const auto transformed = unicodeTransform(buffer, cfg);
	const char *const first = transformed.data();
	const char *const last = first + transformed.size();
	return String(PointerRange<const char>(first, last));
}
94+
}

sources/libcore/utf.cpp renamed to sources/libcore/unicode/utf.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#include "uni_algo/conv.h"
55

66
#include <cage-core/pointerRangeHolder.h>
7-
#include <cage-core/utf.h>
7+
#include <cage-core/unicode.h>
88

99
namespace cage
1010
{

sources/libengine/graphics/font.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#include <cstring>
22
#include <vector>
33

4-
#include <cage-core/utf.h>
4+
#include <cage-core/unicode.h>
55
#include <cage-engine/assetStructs.h>
66
#include <cage-engine/font.h>
77
#include <cage-engine/opengl.h>

sources/libengine/gui/widgets/input.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
#include <cage-core/debug.h>
44
#include <cage-core/string.h>
5-
#include <cage-core/utf.h>
5+
#include <cage-core/unicode.h>
66

77
namespace cage
88
{

sources/libengine/gui/widgets/textArea.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#include "../private.h"
22

33
#include <cage-core/memoryBuffer.h>
4-
#include <cage-core/utf.h>
4+
#include <cage-core/unicode.h>
55

66
namespace cage
77
{

sources/test-core/unicode.cpp

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,59 @@ namespace
77
// Round-trips a UTF-8 string through UTF-32 and checks that the validity and length helpers agree with the conversions.
void testUtf32()
88
{
99
const String a = "hello there Straße";
10+
CAGE_TEST(utfValid(a));
1011
const auto b = utf8to32(a);
1112
const String c = utf32to8string(b);
1213
CAGE_TEST(a == c);
1314
CAGE_TEST(utf32Length(a) == b->size());
1415
CAGE_TEST(utf8Length(b) == c.size());
1516
}
17+
18+
// Checks utfValid on valid and invalid input, and that the Validate transform substitutes the replacement character.
void testValidation()
19+
{
20+
CAGE_TEST(utfValid("hello there"));
21+
CAGE_TEST(!utfValid("Te\xC2st")); // "\xC2" is a truncated sequence in UTF-8
22+
CAGE_TEST(unicodeTransformString("Te\xC2st", { UnicodeTransformEnum::Validate }) == "Te\xEF\xBF\xBDst"); // "\xEF\xBF\xBD" is the replacement character U+FFFD in UTF-8
23+
}
24+
25+
// Exercises the Lowercase, Uppercase, Titlecase and Casefold transforms on ASCII and Czech text, with and without an explicit locale.
void testCaseConversions()
26+
{
27+
CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::Lowercase }) == "hello there");
28+
CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::Uppercase }) == "HELLO THERE");
29+
CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::Titlecase }) == "Hello There");
30+
CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::Casefold }) == "hello there");
31+
32+
CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { UnicodeTransformEnum::Lowercase }) == "příliš žluťoučký kůň úpěl ďábelské ódy");
33+
CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { UnicodeTransformEnum::Uppercase }) == "PŘÍLIŠ ŽLUŤOUČKÝ KŮŇ ÚPĚL ĎÁBELSKÉ ÓDY");
34+
CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { UnicodeTransformEnum::Titlecase }) == "Příliš Žluťoučký Kůň Úpěl Ďábelské Ódy");
35+
CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { UnicodeTransformEnum::Casefold }) == "příliš žluťoučký kůň úpěl ďábelské ódy");
36+
CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy", { .transform = UnicodeTransformEnum::Lowercase, .locale = "cs_cz" }) == "příliš žluťoučký kůň úpěl ďábelské ódy");
37+
38+
CAGE_TEST(unicodeTransformString("ijslAnd", { UnicodeTransformEnum::Titlecase }) == "Ijsland");
39+
CAGE_TEST(unicodeTransformString("ijslAnd", { .transform = UnicodeTransformEnum::Lowercase, .locale = "nl" }) == "ijsland");
40+
CAGE_TEST(unicodeTransformString("ijslAnd", { .transform = UnicodeTransformEnum::Uppercase, .locale = "nl" }) == "IJSLAND");
41+
CAGE_TEST(unicodeTransformString("ijslAnd", { .transform = UnicodeTransformEnum::Titlecase, .locale = "nl" }) == "IJsland"); // the Dutch locale titlecases the "ij" digraph as "IJ"
42+
}
43+
44+
// Checks canonical composition (NFC) and decomposition (NFD) on a precomposed character and its base + combining mark equivalent.
void testNormalizations()
45+
{
46+
CAGE_TEST(unicodeTransformString("heLLo There", { UnicodeTransformEnum::CanonicalComposition }) == "heLLo There");
47+
CAGE_TEST(unicodeTransformString("Ŵ W\u0302", { UnicodeTransformEnum::CanonicalComposition }) == "Ŵ Ŵ");
48+
CAGE_TEST(unicodeTransformString("Ŵ W\u0302", { UnicodeTransformEnum::CanonicalDecomposition }) == "W\u0302 W\u0302");
49+
}
50+
51+
// Checks that FuzzyMatching lowercases, strips accents and collapses repeated characters ("Hello" becomes "helo").
void testFuzzyMatching()
52+
{
53+
CAGE_TEST(unicodeTransformString("Příliš Žluťoučký kůň úpĚl ďÁbelské ódy. Hello there.", { UnicodeTransformEnum::FuzzyMatching }) == "prilis zlutoucky kun upel dabelske ody. helo there.");
54+
}
1655
}
1756

1857
// Entry point of the unicode test case; runs every unicode sub-test in this file.
void testUnicode()
1958
{
2059
CAGE_TESTCASE("unicode");
2160
testUtf32();
22-
// todo
61+
testValidation();
62+
testCaseConversions();
63+
testNormalizations();
64+
testFuzzyMatching();
2365
}

0 commit comments

Comments
 (0)