Skip to content

Commit 18f4411

Browse files
Merge pull request #107 from contour-terminal/unicode-v16
Update supported Unicode to version 16.0.0
2 parents 42494f9 + 48e78ba commit 18f4411

9 files changed

+65
-22
lines changed

.gitignore

+3-3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,6 @@ src/libunicode/ucd.h
1313
src/libunicode/ucd_enums.h
1414
src/libunicode/ucd_fmt.h
1515
src/libunicode/ucd_ostream.h
16-
/src/libunicode/codepoint_properties_data.cpp
17-
/src/libunicode/codepoint_properties_data.h
18-
/src/libunicode/codepoint_properties_names.cpp
16+
src/libunicode/codepoint_properties_data.cpp
17+
src/libunicode/codepoint_properties_data.h
18+
src/libunicode/codepoint_properties_names.cpp

.vimspector.json

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"$schema": "https://puremourning.github.io/vimspector/schema/vimspector.schema.json#",
3+
"configurations": {
4+
"ModelTest": {
5+
"adapter": "vscode-cpptools",
6+
"configuration": {
7+
"request": "launch",
8+
"program": "${workspaceRoot}/build/linux-clang-debug/src/libunicode/unicode_test",
9+
"args": [
10+
],
11+
"cwd": "${workspaceRoot}",
12+
"externalConsole": true,
13+
"stopAtEntry": false,
14+
"MIMode": "gdb"
15+
},
16+
"breakpoints": {
17+
"exception": {
18+
"caught": "Y",
19+
"uncaught": "Y"
20+
}
21+
}
22+
}
23+
}
24+
}

CMakeLists.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,11 @@ if(LIBUNICODE_TESTING)
6161
endif()
6262

6363
# ----------------------------------------------------------------------------
64-
set(LIBUNICODE_UCD_VERSION "15.0.0" CACHE STRING "libunicode: Unicode version")
64+
set(LIBUNICODE_UCD_VERSION "16.0.0" CACHE STRING "libunicode: Unicode version")
6565
set(LIBUNICODE_UCD_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/_ucd" CACHE PATH "Path to directory for downloaded files & extracted directories.")
6666

6767
set(LIBUNICODE_UCD_ZIP_DOWNLOAD_URL "https://www.unicode.org/Public/${LIBUNICODE_UCD_VERSION}/ucd/UCD.zip")
68-
set(LIBUNICODE_UCD_MD5 "8c66407dd8ce2d84278868a69ea83280")
68+
set(LIBUNICODE_UCD_MD5 "bdd823cbd37c376633d6737a12281233")
6969
set(LIBUNICODE_UCD_ZIP_FILE "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}.zip")
7070
set(LIBUNICODE_UCD_DIR "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}" CACHE PATH "Path to UCD directory.")
7171

src/libunicode/codepoint_properties_loader.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ namespace
4646
pair { "13.0"sv, Age::V13_0 },
4747
pair { "14.0"sv, Age::V14_0 },
4848
pair { "15.0"sv, Age::V15_0 },
49+
pair { "16.0"sv, Age::V16_0 },
4950
pair { "1.1"sv, Age::V1_1 },
5051
pair { "2.0"sv, Age::V2_0 },
5152
pair { "2.1"sv, Age::V2_1 },

src/libunicode/run_segmenter_test.cpp

+2-4
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717

1818
#include <catch2/catch_test_macros.hpp>
1919

20-
#include <array>
21-
#include <format>
2220
#include <ostream>
2321
#include <sstream>
2422
#include <string>
@@ -144,9 +142,9 @@ TEST_CASE("run_segmenter.JapaneseHindiEmojiMix", "[run_segmenter]")
144142
{ U"🌱🌲", Script::Han, PresentationStyle::Emoji } });
145143
}
146144

147-
TEST_CASE("run_segmenter.CombiningCirlce", "[run_segmenter]")
145+
TEST_CASE("run_segmenter.CombiningCircle", "[run_segmenter]")
148146
{
149-
test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Common, PresentationStyle::Text } });
147+
test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Latin, PresentationStyle::Text } });
150148
}
151149

152150
TEST_CASE("run_segmenter.Arab_Hangul", "[run_segmenter]")

src/libunicode/script_segmenter.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ optional<script_segmenter::result> script_segmenter::consume()
6060
return res;
6161
}
6262

63-
bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet)
63+
bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept
6464
{
6565
if (nextSet.empty() || currentSet.empty())
6666
return false;
@@ -122,7 +122,7 @@ bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet
122122
return true;
123123
}
124124

125-
script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint)
125+
script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint) noexcept
126126
{
127127
ScriptSet scriptSet;
128128

src/libunicode/script_segmenter.h

+2-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
#include <optional>
2020
#include <string_view>
21-
#include <vector>
2221

2322
namespace unicode
2423
{
@@ -81,13 +80,13 @@ class script_segmenter
8180
}
8281

8382
/// Returns all scripts that this @p codepoint is associated with.
84-
ScriptSet getScriptsFor(char32_t codepoint);
83+
ScriptSet getScriptsFor(char32_t codepoint) noexcept;
8584

8685
/// Intersects @p nextSet into @p currentSet.
8786
///
8887
/// @retval true Intersection succeeded, meaning that no boundary was found.
8988
/// @retval false The resulting intersection is empty, meaning, a script boundary was found.
90-
bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet);
89+
bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept;
9190

9291
/// Returns the resolved script.
9392
///

src/libunicode/script_segmenter_test.cpp

+20-7
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,9 @@
1515

1616
#include <catch2/catch_test_macros.hpp>
1717

18-
#include <string>
1918
#include <string_view>
2019

2120
using namespace std::string_view_literals;
22-
using namespace std::string_view_literals;
23-
using std::optional;
24-
using unicode::script_segmenter;
2521

2622
TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]")
2723
{
@@ -35,15 +31,32 @@ TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]")
3531
CHECK(res1.script == unicode::Script::Unknown);
3632
}
3733

34+
TEST_CASE("script_segmenter.common_to_specific", "[script_segmenter]")
35+
{
36+
// '1' has the script property Common and 'a' has the script property Latin, so the whole string is Latin.
37+
38+
auto constexpr str = U"1a"sv;
39+
auto seg = unicode::script_segmenter { str.data(), str.size() };
40+
41+
std::optional<unicode::script_segmenter::result> const r1 = seg.consume();
42+
REQUIRE(r1.has_value());
43+
auto const res1 = r1.value();
44+
CHECK(res1.size == str.size());
45+
CHECK(res1.script == unicode::Script::Latin);
46+
47+
auto const r2 = seg.consume();
48+
REQUIRE_FALSE(r2.has_value());
49+
}
50+
3851
TEST_CASE("script_segmenter.greek_kanji_greek", "[script_segmenter]")
3952
{
4053
char32_t const* str = U"λ 合気道 λ;";
41-
auto seg = script_segmenter { str };
54+
auto seg = unicode::script_segmenter { str };
4255

4356
// greek text
44-
optional<script_segmenter::result> const r1 = seg.consume();
57+
std::optional<unicode::script_segmenter::result> const r1 = seg.consume();
4558
REQUIRE(r1.has_value());
46-
script_segmenter::result const res1 = r1.value();
59+
unicode::script_segmenter::result const res1 = r1.value();
4760
CHECK(res1.size == 2);
4861
CHECK(res1.script == unicode::Script::Greek);
4962

src/libunicode/ucd_private.h

+9-1
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,20 @@ constexpr std::optional<T> search(std::array<Prop<T>, N> const& ranges, char32_t
6363

6464
while (a < b)
6565
{
66-
auto const i = static_cast<size_t>((b + a) / 2);
66+
auto const i = a + static_cast<size_t>((b - a) / 2);
6767
auto const& I = ranges[i];
6868
if (I.interval.to < codepoint)
69+
{
70+
if (i == b)
71+
return std::nullopt;
6972
a = i + 1;
73+
}
7074
else if (I.interval.from > codepoint)
75+
{
76+
if (i == 0)
77+
return std::nullopt;
7178
b = i - 1;
79+
}
7280
else
7381
return I.property;
7482
}

0 commit comments

Comments
 (0)