Merge pull request #107 from contour-terminal/unicode-v16

christianparpart · web-flow · commit 18f44117fe62 · 2024-09-30T15:35:42.000+02:00
Update supported Unicode to version 16.0.0
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,6 @@ src/libunicode/ucd.h
 src/libunicode/ucd_enums.h
 src/libunicode/ucd_fmt.h
 src/libunicode/ucd_ostream.h
-/src/libunicode/codepoint_properties_data.cpp
-/src/libunicode/codepoint_properties_data.h
-/src/libunicode/codepoint_properties_names.cpp
+src/libunicode/codepoint_properties_data.cpp
+src/libunicode/codepoint_properties_data.h
+src/libunicode/codepoint_properties_names.cpp
diff --git a/.vimspector.json b/.vimspector.json
@@ -0,0 +1,24 @@
+{
+    "$schema": "https://puremourning.github.io/vimspector/schema/vimspector.schema.json#",
+    "configurations": {
+        "ModelTest": {
+            "adapter": "vscode-cpptools",
+            "configuration": {
+                "request": "launch",
+                "program": "${workspaceRoot}/build/linux-clang-debug/src/libunicode/unicode_test",
+                "args": [
+                ],
+                "cwd": "${workspaceRoot}",
+                "externalConsole": true,
+                "stopAtEntry": false,
+                "MIMode": "gdb"
+            },
+            "breakpoints": {
+                "exception": {
+                    "caught": "Y",
+                    "uncaught": "Y"
+                }
+            }
+        }
+    }
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -61,11 +61,11 @@ if(LIBUNICODE_TESTING)
 endif()
 
 # ----------------------------------------------------------------------------
-set(LIBUNICODE_UCD_VERSION "15.0.0" CACHE STRING "libunicode: Unicode version")
+set(LIBUNICODE_UCD_VERSION "16.0.0" CACHE STRING "libunicode: Unicode version")
 set(LIBUNICODE_UCD_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/_ucd" CACHE PATH "Path to directory for downloaded files & extracted directories.")
 
 set(LIBUNICODE_UCD_ZIP_DOWNLOAD_URL "https://www.unicode.org/Public/${LIBUNICODE_UCD_VERSION}/ucd/UCD.zip")
-set(LIBUNICODE_UCD_MD5 "8c66407dd8ce2d84278868a69ea83280")
+set(LIBUNICODE_UCD_MD5 "bdd823cbd37c376633d6737a12281233")
 set(LIBUNICODE_UCD_ZIP_FILE "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}.zip")
 set(LIBUNICODE_UCD_DIR "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}" CACHE PATH "Path to UCD directory.")
 
diff --git a/src/libunicode/codepoint_properties_loader.cpp b/src/libunicode/codepoint_properties_loader.cpp
@@ -46,6 +46,7 @@ namespace
             pair { "13.0"sv, Age::V13_0 },
             pair { "14.0"sv, Age::V14_0 },
             pair { "15.0"sv, Age::V15_0 },
+            pair { "16.0"sv, Age::V16_0 },
             pair { "1.1"sv, Age::V1_1 },
             pair { "2.0"sv, Age::V2_0 },
             pair { "2.1"sv, Age::V2_1 },
diff --git a/src/libunicode/run_segmenter_test.cpp b/src/libunicode/run_segmenter_test.cpp
@@ -17,8 +17,6 @@
 
 #include <catch2/catch_test_macros.hpp>
 
-#include <array>
-#include <format>
 #include <ostream>
 #include <sstream>
 #include <string>
@@ -144,9 +142,9 @@ TEST_CASE("run_segmenter.JapaneseHindiEmojiMix", "[run_segmenter]")
                             { U"🌱🌲", Script::Han, PresentationStyle::Emoji } });
 }
 
-TEST_CASE("run_segmenter.CombiningCirlce", "[run_segmenter]")
+TEST_CASE("run_segmenter.CombiningCircle", "[run_segmenter]")
 {
-    test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Common, PresentationStyle::Text } });
+    test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Latin, PresentationStyle::Text } });
 }
 
 TEST_CASE("run_segmenter.Arab_Hangul", "[run_segmenter]")
diff --git a/src/libunicode/script_segmenter.cpp b/src/libunicode/script_segmenter.cpp
@@ -60,7 +60,7 @@ optional<script_segmenter::result> script_segmenter::consume()
     return res;
 }
 
-bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet)
+bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept
 {
     if (nextSet.empty() || currentSet.empty())
         return false;
@@ -122,7 +122,7 @@ bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet
     return true;
 }
 
-script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint)
+script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint) noexcept
 {
     ScriptSet scriptSet;
 
diff --git a/src/libunicode/script_segmenter.h b/src/libunicode/script_segmenter.h
@@ -18,7 +18,6 @@
 
 #include <optional>
 #include <string_view>
-#include <vector>
 
 namespace unicode
 {
@@ -81,13 +80,13 @@ class script_segmenter
     }
 
     /// Returns all scripts that this @p codepoint is associated with.
-    ScriptSet getScriptsFor(char32_t codepoint);
+    ScriptSet getScriptsFor(char32_t codepoint) noexcept;
 
     /// Intersects @p nextSet into @p currentSet.
     ///
     /// @retval true Intersection succeeded, meaning that no boundary was found.
     /// @retval false The resulting intersection is empty, meaning, a script boundary was found.
-    bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet);
+    bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept;
 
     /// Returns the resolved script.
     ///
diff --git a/src/libunicode/script_segmenter_test.cpp b/src/libunicode/script_segmenter_test.cpp
@@ -15,13 +15,9 @@
 
 #include <catch2/catch_test_macros.hpp>
 
-#include <string>
 #include <string_view>
 
 using namespace std::string_view_literals;
-using namespace std::string_view_literals;
-using std::optional;
-using unicode::script_segmenter;
 
 TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]")
 {
@@ -35,15 +31,32 @@ TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]")
     CHECK(res1.script == unicode::Script::Unknown);
 }
 
+TEST_CASE("script_segmenter.common_to_specific", "[script_segmenter]")
+{
+    // '1' is script property Common, 'a' is script property Latin, so the whole string is Latin.
+
+    auto constexpr str = U"1a"sv;
+    auto seg = unicode::script_segmenter { str.data(), str.size() };
+
+    std::optional<unicode::script_segmenter::result> const r1 = seg.consume();
+    REQUIRE(r1.has_value());
+    auto const res1 = r1.value();
+    CHECK(res1.size == str.size());
+    CHECK(res1.script == unicode::Script::Latin);
+
+    auto const r2 = seg.consume();
+    REQUIRE_FALSE(r2.has_value());
+}
+
 TEST_CASE("script_segmenter.greek_kanji_greek", "[script_segmenter]")
 {
     char32_t const* str = U"λ 合気道 λ;";
-    auto seg = script_segmenter { str };
+    auto seg = unicode::script_segmenter { str };
 
     // greek text
-    optional<script_segmenter::result> const r1 = seg.consume();
+    std::optional<unicode::script_segmenter::result> const r1 = seg.consume();
     REQUIRE(r1.has_value());
-    script_segmenter::result const res1 = r1.value();
+    unicode::script_segmenter::result const res1 = r1.value();
     CHECK(res1.size == 2);
     CHECK(res1.script == unicode::Script::Greek);
 
diff --git a/src/libunicode/ucd_private.h b/src/libunicode/ucd_private.h
@@ -63,12 +63,20 @@ constexpr std::optional<T> search(std::array<Prop<T>, N> const& ranges, char32_t
 
     while (a < b)
     {
-        auto const i = static_cast<size_t>((b + a) / 2);
+        auto const i = a + static_cast<size_t>((b - a) / 2);
         auto const& I = ranges[i];
         if (I.interval.to < codepoint)
+        {
+            if (i == b)
+                return std::nullopt;
             a = i + 1;
+        }
         else if (I.interval.from > codepoint)
+        {
+            if (i == 0)
+                return std::nullopt;
             b = i - 1;
+        }
         else
             return I.property;
     }

Original file line number	Diff line number	Diff line change
`@@ -17,8 +17,6 @@`
`17`	`17`
`18`	`18`	`#include <catch2/catch_test_macros.hpp>`
`19`	`19`
`20`		`-#include <array>`
`21`		`-#include <format>`
`22`	`20`	`#include <ostream>`
`23`	`21`	`#include <sstream>`
`24`	`22`	`#include <string>`
`@@ -144,9 +142,9 @@ TEST_CASE("run_segmenter.JapaneseHindiEmojiMix", "[run_segmenter]")`
`144`	`142`	`{ U"🌱🌲", Script::Han, PresentationStyle::Emoji } });`
`145`	`143`	`}`
`146`	`144`
`147`		`-TEST_CASE("run_segmenter.CombiningCirlce", "[run_segmenter]")`
	`145`	`+TEST_CASE("run_segmenter.CombiningCircle", "[run_segmenter]")`
`148`	`146`	`{`
`149`		`- test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Common, PresentationStyle::Text } });`
	`147`	`+ test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Latin, PresentationStyle::Text } });`
`150`	`148`	`}`
`151`	`149`
`152`	`150`	`TEST_CASE("run_segmenter.Arab_Hangul", "[run_segmenter]")`
Original file line number	Diff line number	Diff line change
`@@ -60,7 +60,7 @@ optional<script_segmenter::result> script_segmenter::consume()`
`60`	`60`	`return res;`
`61`	`61`	`}`
`62`	`62`
`63`		`-bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet)`
	`63`	`+bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept`
`64`	`64`	`{`
`65`	`65`	`if (nextSet.empty() \|\| currentSet.empty())`
`66`	`66`	`return false;`
`@@ -122,7 +122,7 @@ bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet`
`122`	`122`	`return true;`
`123`	`123`	`}`
`124`	`124`
`125`		`-script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint)`
	`125`	`+script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint) noexcept`
`126`	`126`	`{`
`127`	`127`	`ScriptSet scriptSet;`
`128`	`128`