Skip to content

Commit 08cd6f7

Browse files
committed
Add support for ISO-15924 abbreviation script code to locale_data
The value is (currently) ignored, but this allows parsing the locale names returned by ICU.
1 parent 5159cb6 commit 08cd6f7

File tree

3 files changed

+126
-23
lines changed

3 files changed

+126
-23
lines changed

include/boost/locale/util/locale_data.hpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//
22
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3-
// Copyright (c) 2023 Alexander Grund
3+
// Copyright (c) 2023-2024 Alexander Grund
44
//
55
// Distributed under the Boost Software License, Version 1.0.
66
// https://www.boost.org/LICENSE_1_0.txt
@@ -21,6 +21,7 @@ namespace boost { namespace locale { namespace util {
2121
/// Holder and parser for locale names/identifiers
2222
class BOOST_LOCALE_DECL locale_data {
2323
std::string language_;
24+
std::string script_;
2425
std::string country_;
2526
std::string encoding_;
2627
std::string variant_;
@@ -36,6 +37,8 @@ namespace boost { namespace locale { namespace util {
3637

3738
/// Return language (usually 2 lowercase letters, i.e. ISO-639 or 'C')
3839
const std::string& language() const { return language_; }
40+
/// Return the ISO-15924 abbreviation script code if present
41+
const std::string& script() const { return script_; }
3942
/// Return country (usually 2 uppercase letters, i.e. ISO-3166)
4043
const std::string& country() const { return country_; }
4144
/// Return encoding/codeset, e.g. ISO8859-1 or UTF-8
@@ -48,12 +51,13 @@ namespace boost { namespace locale { namespace util {
4851
/// Return true iff the encoding is UTF-8
4952
bool is_utf8() const { return utf8_; }
5053

51-
/// Parse a locale identifier of the form `[language[_territory][.codeset][@modifier]]`
54+
/// Parse a locale identifier of the form `[language[_script][_territory][.codeset][@modifier]]`
5255
///
5356
/// Allows a dash instead of an underscore as the delimiter: `[language[-script][-territory]]`
5457
/// Return true if the identifier is valid:
5558
/// - `language` is given and consists of ASCII letters
56-
/// - `territory`, if given, consists of ASCII letters
59+
/// - `script` is only considered if it consists of exactly 4 ASCII letters
60+
/// - `territory`, if given, consists of ASCII letters (usually ISO-3166)
5761
/// - Any field started by a delimiter (`_`, `-`, `.`, `@`) is not empty
5862
/// Otherwise parsing is aborted. Valid values already parsed stay set, others are defaulted.
5963
bool parse(const std::string& locale_name);
@@ -65,6 +69,7 @@ namespace boost { namespace locale { namespace util {
6569
private:
6670
void reset();
6771
bool parse_from_lang(const std::string& input);
72+
bool parse_from_script(const std::string& input);
6873
bool parse_from_country(const std::string& input);
6974
bool parse_from_encoding(const std::string& input);
7075
bool parse_from_variant(const std::string& input);

src/boost/locale/util/locale_data.cpp

Lines changed: 59 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//
22
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3-
// Copyright (c) 2022-2023 Alexander Grund
3+
// Copyright (c) 2022-2024 Alexander Grund
44
//
55
// Distributed under the Boost Software License, Version 1.0.
66
// https://www.boost.org/LICENSE_1_0.txt
@@ -14,6 +14,26 @@
1414
#include <string>
1515

1616
namespace boost { namespace locale { namespace util {
17+
/// Convert uppercase ASCII to lower case, return true if converted
18+
static constexpr bool make_lower(char& c)
19+
{
20+
if(is_upper_ascii(c)) {
21+
c += 'a' - 'A';
22+
return true;
23+
} else
24+
return false;
25+
}
26+
27+
/// Convert lowercase ASCII to upper case, return true if converted
28+
static constexpr bool make_upper(char& c)
29+
{
30+
if(is_lower_ascii(c)) {
31+
c += 'A' - 'a';
32+
return true;
33+
} else
34+
return false;
35+
}
36+
1737
locale_data::locale_data()
1838
{
1939
reset();
@@ -28,6 +48,7 @@ namespace boost { namespace locale { namespace util {
2848
void locale_data::reset()
2949
{
3050
language_ = "C";
51+
script_.clear();
3152
country_.clear();
3253
encoding_ = "US-ASCII";
3354
variant_.clear();
@@ -37,6 +58,8 @@ namespace boost { namespace locale { namespace util {
3758
std::string locale_data::to_string() const
3859
{
3960
std::string result = language_;
61+
if(!script_.empty())
62+
(result += '_') += script_;
4063
if(!country_.empty())
4164
(result += '_') += country_;
4265
if(!encoding_.empty() && !util::are_encodings_equal(encoding_, "US-ASCII"))
@@ -60,14 +83,39 @@ namespace boost { namespace locale { namespace util {
6083
return false;
6184
// lowercase ASCII
6285
for(char& c : tmp) {
63-
if(is_upper_ascii(c))
64-
c += 'a' - 'A';
65-
else if(!is_lower_ascii(c))
86+
if(!is_lower_ascii(c) && !make_lower(c))
6687
return false;
6788
}
6889
if(tmp != "c" && tmp != "posix") // Keep default
6990
language_ = tmp;
7091

92+
if(end >= input.size())
93+
return true;
94+
else if(input[end] == '-' || input[end] == '_')
95+
return parse_from_script(input.substr(end + 1));
96+
else if(input[end] == '.')
97+
return parse_from_encoding(input.substr(end + 1));
98+
else {
99+
BOOST_ASSERT_MSG(input[end] == '@', "Unexpected delimiter");
100+
return parse_from_variant(input.substr(end + 1));
101+
}
102+
}
103+
104+
bool locale_data::parse_from_script(const std::string& input)
105+
{
106+
const auto end = input.find_first_of("-_@.");
107+
std::string tmp = input.substr(0, end);
108+
// Script is exactly 4 ASCII letters, otherwise it is treated as not present
109+
if(tmp.length() != 4)
110+
return parse_from_country(input);
111+
112+
for(char& c : tmp) {
113+
if(!is_lower_ascii(c) && !make_lower(c))
114+
return parse_from_country(input);
115+
}
116+
make_upper(tmp[0]); // Capitalize first letter only
117+
script_ = tmp;
118+
71119
if(end >= input.size())
72120
return true;
73121
else if(input[end] == '-' || input[end] == '_')
@@ -91,10 +139,9 @@ namespace boost { namespace locale { namespace util {
91139
return false;
92140

93141
// Make uppercase
94-
for(char& c : tmp) {
95-
if(util::is_lower_ascii(c))
96-
c += 'A' - 'a';
97-
}
142+
for(char& c : tmp)
143+
make_upper(c);
144+
98145
// If it's ALL uppercase ASCII, assume ISO 3166 country id
99146
if(std::find_if_not(tmp.begin(), tmp.end(), util::is_upper_ascii) != tmp.end()) {
100147
// else handle special cases:
@@ -142,20 +189,16 @@ namespace boost { namespace locale { namespace util {
142189
return false;
143190
variant_ = input;
144191
// No assumptions, just make it lowercase
145-
for(char& c : variant_) {
146-
if(util::is_upper_ascii(c))
147-
c += 'a' - 'A';
148-
}
192+
for(char& c : variant_)
193+
make_lower(c);
149194
return true;
150195
}
151196

152197
locale_data& locale_data::encoding(std::string new_encoding, const bool uppercase)
153198
{
154199
if(uppercase) {
155-
for(char& c : new_encoding) {
156-
if(util::is_lower_ascii(c))
157-
c += 'A' - 'a';
158-
}
200+
for(char& c : new_encoding)
201+
make_upper(c);
159202
}
160203
encoding_ = std::move(new_encoding);
161204
utf8_ = util::normalize_encoding(encoding_) == "utf8";

test/test_util.cpp

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright (c) 2022-2023 Alexander Grund
2+
// Copyright (c) 2022-2024 Alexander Grund
33
//
44
// Distributed under the Boost Software License, Version 1.0.
55
// https://www.boost.org/LICENSE_1_0.txt
@@ -112,6 +112,34 @@ void test_get_system_locale()
112112
TEST_EQ(get_system_locale(true), "barlang.bar");
113113
}
114114

115+
#ifndef BOOST_LOCALE_WITH_ICU
116+
// No-op stand-in used when BOOST_LOCALE_WITH_ICU is not defined
void verify_against_icu() {}
117+
#else
118+
# include <unicode/locid.h>
119+
void verify_against_icu()
120+
{
121+
int32_t count;
122+
auto* cur_locale = icu::Locale::getAvailableLocales(count);
123+
boost::locale::util::locale_data data;
124+
for(int i = 0; i < count; i++, cur_locale++) {
125+
const std::string loc_name = cur_locale->getName();
126+
if(loc_name == "en_US_POSIX")
127+
continue; // Parsed as "C", tested elsewhere
128+
for(const bool add_utf8 : {false, true}) {
129+
// Also test with added encoding to verify input is fully parsed
130+
const std::string curName = add_utf8 ? loc_name + ".UTF-8" : loc_name;
131+
TEST_CONTEXT(curName);
132+
TEST(data.parse(curName));
133+
TEST_EQ(data.language(), cur_locale->getLanguage());
134+
TEST_EQ(data.country(), cur_locale->getCountry());
135+
TEST_EQ(data.encoding(), add_utf8 ? "UTF-8" : "US-ASCII");
136+
TEST_EQ(data.variant(), cur_locale->getVariant());
137+
TEST_EQ(data.to_string(), curName);
138+
}
139+
}
140+
}
141+
#endif
142+
115143
void test_locale_data()
116144
{
117145
boost::locale::util::locale_data data;
@@ -131,13 +159,15 @@ void test_locale_data()
131159

132160
TEST(data.parse("C"));
133161
TEST_EQ(data.language(), "C");
162+
TEST_EQ(data.script(), "");
134163
TEST_EQ(data.country(), "");
135164
TEST_EQ(data.encoding(), "US-ASCII");
136165
TEST(!data.is_utf8());
137166
TEST_EQ(data.variant(), "");
138167

139168
TEST(data.parse("ku_TR.UTF-8@sorani"));
140169
TEST_EQ(data.language(), "ku");
170+
TEST_EQ(data.script(), "");
141171
TEST_EQ(data.country(), "TR");
142172
TEST_EQ(data.encoding(), "UTF-8");
143173
TEST(data.is_utf8());
@@ -200,6 +230,17 @@ void test_locale_data()
200230
TEST(data.is_utf8());
201231
TEST_EQ(data.variant(), "");
202232

233+
// Script used, optionally with dashes instead of underscores
234+
for(const std::string name : {"pa_Arab_PK.UTF-8", "pa-Arab_PK.UTF-8", "pa_Arab-PK.UTF-8"}) {
235+
// Parse the current loop entry so every delimiter spelling is actually exercised
TEST(data.parse(name));
236+
TEST_EQ(data.language(), "pa");
237+
TEST_EQ(data.script(), "Arab");
238+
TEST_EQ(data.country(), "PK");
239+
TEST_EQ(data.encoding(), "UTF-8");
240+
TEST(data.is_utf8());
241+
TEST_EQ(data.variant(), "");
242+
}
243+
203244
// to_string yields the input (if format is correct already)
204245
for(const std::string name : {"C",
205246
"en_US.UTF-8",
@@ -211,8 +252,18 @@ void test_locale_data()
211252
"th_TH.TIS620",
212253
"zh_TW.UTF-8@radical",
213254
"en_001",
214-
"en_150.UTF-8"})
255+
"en_150.UTF-8",
256+
// Different variations with parts missing
257+
"pa_Arab_PK.UTF-8",
258+
"pa_Arab_PK@euro",
259+
"pa_Arab.UTF-8",
260+
"pa_Arab@euro",
261+
"pa.UTF-8",
262+
"pa@euro",
263+
"pa_PK.UTF-8",
264+
"pa_PK@euro"})
215265
{
266+
TEST_CONTEXT(name);
216267
TEST(data.parse(name));
217268
TEST_EQ(data.to_string(), name);
218269
}
@@ -224,16 +275,18 @@ void test_locale_data()
224275

225276
// Unify casing:
226277
// - language: lowercase
278+
// - script: Capitalized
227279
// - region: uppercase
228280
// - encoding: uppercase
229281
// - variant: lowercase
230-
TEST(data.parse("EN_us.utf-8@EUro"));
282+
TEST(data.parse("EN_sCrI_us.utf-8@EUro"));
231283
TEST_EQ(data.language(), "en");
284+
TEST_EQ(data.script(), "Scri");
232285
TEST_EQ(data.country(), "US");
233286
TEST_EQ(data.encoding(), "UTF-8");
234287
TEST(data.is_utf8());
235288
TEST_EQ(data.variant(), "euro");
236-
TEST_EQ(data.to_string(), "en_US.UTF-8@euro");
289+
TEST_EQ(data.to_string(), "en_Scri_US.UTF-8@euro");
237290
TEST(data.parse("lAnGUagE_cOunTRy.eNCo-d123inG@Va-r1_Ant"));
238291
TEST_EQ(data.to_string(), "language_COUNTRY.ENCO-D123ING@va-r1_ant");
239292

@@ -313,6 +366,8 @@ void test_locale_data()
313366
// Construct from string
314367
TEST_EQ(boost::locale::util::locale_data("en_US.UTF-8").to_string(), "en_US.UTF-8");
315368
TEST_THROWS(boost::locale::util::locale_data invalid("en_UÖ.UTF-8"), std::invalid_argument);
369+
370+
verify_against_icu();
316371
}
317372

318373
#include "../src/boost/locale/util/numeric.hpp"

0 commit comments

Comments
 (0)