Add support for ISO-15924 abbreviation script code to locale_data

Flamefire · Flamefire · commit 08cd6f744a34 · 2024-12-01T18:31:34.000+01:00
The value is (currently) ignored, but this allows parsing the locale names
returned by ICU.
diff --git a/include/boost/locale/util/locale_data.hpp b/include/boost/locale/util/locale_data.hpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
-// Copyright (c) 2023 Alexander Grund
+// Copyright (c) 2023-2024 Alexander Grund
 //
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
@@ -21,6 +21,7 @@ namespace boost { namespace locale { namespace util {
     /// Holder and parser for locale names/identifiers
     class BOOST_LOCALE_DECL locale_data {
         std::string language_;
+        std::string script_;
         std::string country_;
         std::string encoding_;
         std::string variant_;
@@ -36,6 +37,8 @@ namespace boost { namespace locale { namespace util {
 
         /// Return language (usually 2 lowercase letters, i.e. ISO-639 or 'C')
         const std::string& language() const { return language_; }
+        /// Return the ISO-15924 script code as parsed (4 letters, capitalized, e.g. `Arab`), empty if not present
+        const std::string& script() const { return script_; }
         /// Return country (usually 2 uppercase letters, i.e. ISO-3166)
         const std::string& country() const { return country_; }
         /// Return encoding/codeset, e.g. ISO8859-1 or UTF-8
@@ -48,12 +51,13 @@ namespace boost { namespace locale { namespace util {
         /// Return iff the encoding is UTF-8
         bool is_utf8() const { return utf8_; }
 
-        /// Parse a locale identifier of the form `[language[_territory][.codeset][@modifier]]`
+        /// Parse a locale identifier of the form `[language[_script][_territory][.codeset][@modifier]]`
         ///
         /// Allows a dash as the delimiter: `[language-territory]`
         /// Return true if the identifier is valid:
         ///   - `language` is given and consists of ASCII letters
-        ///   - `territory`, if given, consists of ASCII letters
+        ///   - `script` is only considered if it consists of exactly 4 ASCII letters
+        ///   - `territory`, if given, consists of ASCII letters (usually ISO-3166)
         ///   - Any field started by a delimiter (`_`, `-`, `.`, `@`) is not empty
         /// Otherwise parsing is aborted. Valid values already parsed stay set, other are defaulted.
         bool parse(const std::string& locale_name);
@@ -65,6 +69,7 @@ namespace boost { namespace locale { namespace util {
     private:
         void reset();
         bool parse_from_lang(const std::string& input);
+        bool parse_from_script(const std::string& input);
         bool parse_from_country(const std::string& input);
         bool parse_from_encoding(const std::string& input);
         bool parse_from_variant(const std::string& input);
diff --git a/src/boost/locale/util/locale_data.cpp b/src/boost/locale/util/locale_data.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
-// Copyright (c) 2022-2023 Alexander Grund
+// Copyright (c) 2022-2024 Alexander Grund
 //
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
@@ -14,6 +14,26 @@
 #include <string>
 
 namespace boost { namespace locale { namespace util {
+    /// Convert uppercase ASCII to lower case in place, return true iff `c` was converted.
+    /// Not `constexpr`: a multi-statement body mutating its argument is only allowed from C++14 on.
+    static bool make_lower(char& c)
+    {
+        if(!is_upper_ascii(c))
+            return false;
+        c += 'a' - 'A';
+        return true;
+    }
+
+    /// Convert lowercase ASCII to upper case in place, return true iff `c` was converted.
+    /// Not `constexpr`: a multi-statement body mutating its argument is only allowed from C++14 on.
+    static bool make_upper(char& c)
+    {
+        if(!is_lower_ascii(c))
+            return false;
+        c += 'A' - 'a';
+        return true;
+    }
+
     locale_data::locale_data()
     {
         reset();
@@ -28,6 +48,7 @@ namespace boost { namespace locale { namespace util {
     void locale_data::reset()
     {
         language_ = "C";
+        script_.clear();
         country_.clear();
         encoding_ = "US-ASCII";
         variant_.clear();
@@ -37,6 +58,8 @@ namespace boost { namespace locale { namespace util {
     std::string locale_data::to_string() const
     {
         std::string result = language_;
+        if(!script_.empty())
+            (result += '_') += script_;
         if(!country_.empty())
             (result += '_') += country_;
         if(!encoding_.empty() && !util::are_encodings_equal(encoding_, "US-ASCII"))
@@ -60,14 +83,39 @@ namespace boost { namespace locale { namespace util {
             return false;
         // lowercase ASCII
         for(char& c : tmp) {
-            if(is_upper_ascii(c))
-                c += 'a' - 'A';
-            else if(!is_lower_ascii(c))
+            if(!is_lower_ascii(c) && !make_lower(c))
                 return false;
         }
         if(tmp != "c" && tmp != "posix") // Keep default
             language_ = tmp;
 
+        if(end >= input.size())
+            return true;
+        else if(input[end] == '-' || input[end] == '_')
+            return parse_from_script(input.substr(end + 1));
+        else if(input[end] == '.')
+            return parse_from_encoding(input.substr(end + 1));
+        else {
+            BOOST_ASSERT_MSG(input[end] == '@', "Unexpected delimiter");
+            return parse_from_variant(input.substr(end + 1));
+        }
+    }
+
+    bool locale_data::parse_from_script(const std::string& input)
+    {
+        const auto end = input.find_first_of("-_@.");
+        std::string tmp = input.substr(0, end);
+        // Script is exactly 4 ASCII letters, otherwise it is not present
+        if(tmp.length() != 4)
+            return parse_from_country(input);
+
+        for(char& c : tmp) {
+            if(!is_lower_ascii(c) && !make_lower(c))
+                return parse_from_country(input);
+        }
+        make_upper(tmp[0]); // Capitalize first letter only
+        script_ = tmp;
+
         if(end >= input.size())
             return true;
         else if(input[end] == '-' || input[end] == '_')
@@ -91,10 +139,9 @@ namespace boost { namespace locale { namespace util {
             return false;
 
         // Make uppercase
-        for(char& c : tmp) {
-            if(util::is_lower_ascii(c))
-                c += 'A' - 'a';
-        }
+        for(char& c : tmp)
+            make_upper(c);
+
         // If it's ALL uppercase ASCII, assume ISO 3166 country id
         if(std::find_if_not(tmp.begin(), tmp.end(), util::is_upper_ascii) != tmp.end()) {
             // else handle special cases:
@@ -142,20 +189,16 @@ namespace boost { namespace locale { namespace util {
             return false;
         variant_ = input;
         // No assumptions, just make it lowercase
-        for(char& c : variant_) {
-            if(util::is_upper_ascii(c))
-                c += 'a' - 'A';
-        }
+        for(char& c : variant_)
+            make_lower(c);
         return true;
     }
 
     locale_data& locale_data::encoding(std::string new_encoding, const bool uppercase)
     {
         if(uppercase) {
-            for(char& c : new_encoding) {
-                if(util::is_lower_ascii(c))
-                    c += 'A' - 'a';
-            }
+            for(char& c : new_encoding)
+                make_upper(c);
         }
         encoding_ = std::move(new_encoding);
         utf8_ = util::normalize_encoding(encoding_) == "utf8";
diff --git a/test/test_util.cpp b/test/test_util.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2022-2023 Alexander Grund
+// Copyright (c) 2022-2024 Alexander Grund
 //
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
@@ -112,6 +112,34 @@ void test_get_system_locale()
     TEST_EQ(get_system_locale(true), "barlang.bar");
 }
 
+#ifndef BOOST_LOCALE_WITH_ICU
+void verify_against_icu() {} // Built without ICU: nothing to compare against
+#else
+#    include <unicode/locid.h>
+void verify_against_icu()
+{
+    int32_t count;
+    auto* cur_locale = icu::Locale::getAvailableLocales(count);
+    boost::locale::util::locale_data data;
+    for(int i = 0; i < count; ++i, ++cur_locale) {
+        const std::string loc_name = cur_locale->getName();
+        if(loc_name == "en_US_POSIX")
+            continue; // Parsed as "C", tested elsewhere
+        for(const bool add_utf8 : {false, true}) {
+            // Also test with added encoding to verify input is fully parsed
+            const std::string curName = add_utf8 ? loc_name + ".UTF-8" : loc_name;
+            TEST_CONTEXT(curName);
+            TEST(data.parse(curName));
+            TEST_EQ(data.language(), cur_locale->getLanguage());
+            TEST_EQ(data.country(), cur_locale->getCountry());
+            TEST_EQ(data.encoding(), add_utf8 ? "UTF-8" : "US-ASCII");
+            TEST_EQ(data.variant(), cur_locale->getVariant());
+            TEST_EQ(data.to_string(), curName); // Round-trip also covers the script part of the name
+        }
+    }
+}
+#endif
+
 void test_locale_data()
 {
     boost::locale::util::locale_data data;
@@ -131,13 +159,15 @@ void test_locale_data()
 
     TEST(data.parse("C"));
     TEST_EQ(data.language(), "C");
+    TEST_EQ(data.script(), "");
     TEST_EQ(data.country(), "");
     TEST_EQ(data.encoding(), "US-ASCII");
     TEST(!data.is_utf8());
     TEST_EQ(data.variant(), "");
 
     TEST(data.parse("ku_TR.UTF-8@sorani"));
     TEST_EQ(data.language(), "ku");
+    TEST_EQ(data.script(), "");
     TEST_EQ(data.country(), "TR");
     TEST_EQ(data.encoding(), "UTF-8");
     TEST(data.is_utf8());
@@ -200,6 +230,17 @@ void test_locale_data()
     TEST(data.is_utf8());
     TEST_EQ(data.variant(), "");
 
+    // Script used, optionally with dashes instead of underscores: each spelling must parse to the same fields
+    for(const std::string name : {"pa_Arab_PK.UTF-8", "pa-Arab_PK.UTF-8", "pa_Arab-PK.UTF-8"}) {
+        TEST(data.parse(name));
+        TEST_EQ(data.language(), "pa");
+        TEST_EQ(data.script(), "Arab");
+        TEST_EQ(data.country(), "PK");
+        TEST_EQ(data.encoding(), "UTF-8");
+        TEST(data.is_utf8());
+        TEST_EQ(data.variant(), "");
+    }
+
     // to_string yields the input (if format is correct already)
     for(const std::string name : {"C",
                                   "en_US.UTF-8",
@@ -211,8 +252,18 @@ void test_locale_data()
                                   "th_TH.TIS620",
                                   "zh_TW.UTF-8@radical",
                                   "en_001",
-                                  "en_150.UTF-8"})
+                                  "en_150.UTF-8",
+                                  // Different variations with parts missing
+                                  "pa_Arab_PK.UTF-8",
+                                  "pa_Arab_PK@euro",
+                                  "pa_Arab.UTF-8",
+                                  "pa_Arab@euro",
+                                  "pa.UTF-8",
+                                  "pa@euro",
+                                  "pa_PK.UTF-8",
+                                  "pa_PK@euro"})
     {
+        TEST_CONTEXT(name);
         TEST(data.parse(name));
         TEST_EQ(data.to_string(), name);
     }
@@ -224,16 +275,18 @@ void test_locale_data()
 
     // Unify casing:
     // - language: lowercase
+    // - script: Capitalized
     // - region: uppercase
     // - encoding: uppercase
     // - variant: lowercase
-    TEST(data.parse("EN_us.utf-8@EUro"));
+    TEST(data.parse("EN_sCrI_us.utf-8@EUro"));
     TEST_EQ(data.language(), "en");
+    TEST_EQ(data.script(), "Scri");
     TEST_EQ(data.country(), "US");
     TEST_EQ(data.encoding(), "UTF-8");
     TEST(data.is_utf8());
     TEST_EQ(data.variant(), "euro");
-    TEST_EQ(data.to_string(), "en_US.UTF-8@euro");
+    TEST_EQ(data.to_string(), "en_Scri_US.UTF-8@euro");
     TEST(data.parse("lAnGUagE_cOunTRy.eNCo-d123inG@Va-r1_Ant"));
     TEST_EQ(data.to_string(), "language_COUNTRY.ENCO-D123ING@va-r1_ant");
 
@@ -313,6 +366,8 @@ void test_locale_data()
     // Construct from string
     TEST_EQ(boost::locale::util::locale_data("en_US.UTF-8").to_string(), "en_US.UTF-8");
     TEST_THROWS(boost::locale::util::locale_data invalid("en_UÖ.UTF-8"), std::invalid_argument);
+
+    verify_against_icu();
 }
 
 #include "../src/boost/locale/util/numeric.hpp"