Skip to content

Commit 0a24434

Browse files
authored
Merge pull request #10382 from dgud/dgud/stdlib/unicode-17/OTP-19853
Unicode 17 and some more improvements
2 parents 891eab8 + 2ea3f5e commit 0a24434

18 files changed

+34627
-17978
lines changed

lib/stdlib/test/unicode_util_SUITE.erl

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
nfd/1, nfc/1, nfkd/1, nfkc/1,
3030
whitespace/1,
3131
get/1,
32-
lookup/1,
32+
lookup/1, category/1, is_id_func/1,
3333
count/1]).
3434

3535
-export([debug/0, id/1, bin_split/1, uc_loaded_size/0,
@@ -47,6 +47,8 @@ all() ->
4747
cp, gc,
4848
nfd, nfc, nfkd, nfkc,
4949
whitespace,
50+
category,
51+
is_id_func,
5052
get,
5153
lookup,
5254
count
@@ -91,7 +93,8 @@ casefold(_) ->
9193
[[$s,$s]] = unicode_util:casefold([$ẞ]),
9294
ok.
9395

94-
whitespace(_) ->
96+
whitespace(_Config) ->
97+
%% Pattern whitespace
9598
WS = unicode_util:whitespace(),
9699
WS = lists:filter(fun unicode_util:is_whitespace/1, WS),
97100
false = unicode_util:is_whitespace($A),
@@ -368,6 +371,28 @@ check_category(Id, [{Next,_}|_] = Rest, Es) ->
368371
check_category(_Id, [], Es) ->
369372
Es.
370373

374+
%% Test case: unicode_util:category/1 must return the same value as the
%% `category' key of the map returned by unicode_util:lookup/1 for every
%% code point in 1..200000, and must raise for arguments outside the
%% accepted domain (negative, too large, non-integer).
category(_Config) ->
    Differs = fun(CodePoint) ->
                      FromLookup = maps:get(category, unicode_util:lookup(CodePoint)),
                      FromLookup /= unicode_util:category(CodePoint)
              end,
    %% No code point in the range may disagree; on failure the badmatch
    %% carries the list of offending code points.
    [] = lists:filter(Differs, lists:seq(1, 200000)),
    %% Each invalid argument must make the call exit with an exception.
    lists:foreach(fun(Invalid) ->
                          {'EXIT', _} = (catch unicode_util:category(Invalid))
                  end,
                  [-1, 5000000, foobar]),
    ok.
384+
385+
%% Test case: spot checks of the identifier-related predicates in
%% unicode_util, one negative and one positive sample for each.
is_id_func(_Config) ->
    %% Basic tests only; more thorough tests are in the unicode tests.
    Samples =
        [{false, fun unicode_util:is_other_id_start/1, $a},
         {true,  fun unicode_util:is_other_id_start/1, 16#1885},    % 6277

         {false, fun unicode_util:is_other_id_continue/1, $a},
         {true,  fun unicode_util:is_other_id_continue/1, 16#B7},   % 183

         {false, fun unicode_util:is_letter_not_pattern_syntax/1, 16#2E2F}, % 11823
         {true,  fun unicode_util:is_letter_not_pattern_syntax/1, $a}],
    %% Expected is already bound, so the match asserts the predicate result.
    lists:foreach(fun({Expected, Predicate, CodePoint}) ->
                          Expected = Predicate(CodePoint)
                  end,
                  Samples),
    ok.
371396

372397
count(Config) ->
373398
Parent = self(),

lib/stdlib/test/unicode_util_SUITE_data/GraphemeBreakTest.txt

Lines changed: 706 additions & 1031 deletions
Large diffs are not rendered by default.

lib/stdlib/test/unicode_util_SUITE_data/LineBreakTest.txt

Lines changed: 19329 additions & 16661 deletions
Large diffs are not rendered by default.

lib/stdlib/test/unicode_util_SUITE_data/NormalizationTest.txt

Lines changed: 72 additions & 3 deletions
Large diffs are not rendered by default.
12.7 KB
Binary file not shown.

lib/stdlib/uc_spec/CaseFolding.txt

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# CaseFolding-16.0.0.txt
2-
# Date: 2024-04-30, 21:48:11 GMT
3-
# © 2024 Unicode®, Inc.
1+
# CaseFolding-17.0.0.txt
2+
# Date: 2025-07-30, 23:54:36 GMT
3+
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
66
#
@@ -18,15 +18,15 @@
1818
# The data supports both implementations that require simple case foldings
1919
# (where string lengths don't change), and implementations that allow full case folding
2020
# (where string lengths may grow). Note that where they can be supported, the
21-
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
21+
# full case foldings are superior: for example, they allow "FUSS" and "Fuß" to match.
2222
#
2323
# All code points not listed in this file map to themselves.
2424
#
2525
# NOTE: case folding does not preserve normalization formats!
2626
#
2727
# For information on case folding, including how to have case folding
28-
# preserve normalization formats, see Section 3.13 Default Case Algorithms in
29-
# The Unicode Standard.
28+
# preserve normalization formats, see the
29+
# "Conformance" / "Default Case Algorithms" section of the core specification.
3030
#
3131
# ================================================================================
3232
# Format
@@ -1243,7 +1243,10 @@ A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
12431243
A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
12441244
A7CB; C; 0264; # LATIN CAPITAL LETTER RAMS HORN
12451245
A7CC; C; A7CD; # LATIN CAPITAL LETTER S WITH DIAGONAL STROKE
1246+
A7CE; C; A7CF; # LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE
12461247
A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G
1248+
A7D2; C; A7D3; # LATIN CAPITAL LETTER DOUBLE THORN
1249+
A7D4; C; A7D5; # LATIN CAPITAL LETTER DOUBLE WYNN
12471250
A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S
12481251
A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S
12491252
A7DA; C; A7DB; # LATIN CAPITAL LETTER LAMBDA
@@ -1616,6 +1619,31 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
16161619
16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O
16171620
16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI
16181621
16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y
1622+
16EA0; C; 16EBB; # BERIA ERFE CAPITAL LETTER ARKAB
1623+
16EA1; C; 16EBC; # BERIA ERFE CAPITAL LETTER BASIGNA
1624+
16EA2; C; 16EBD; # BERIA ERFE CAPITAL LETTER DARBAI
1625+
16EA3; C; 16EBE; # BERIA ERFE CAPITAL LETTER EH
1626+
16EA4; C; 16EBF; # BERIA ERFE CAPITAL LETTER FITKO
1627+
16EA5; C; 16EC0; # BERIA ERFE CAPITAL LETTER GOWAY
1628+
16EA6; C; 16EC1; # BERIA ERFE CAPITAL LETTER HIRDEABO
1629+
16EA7; C; 16EC2; # BERIA ERFE CAPITAL LETTER I
1630+
16EA8; C; 16EC3; # BERIA ERFE CAPITAL LETTER DJAI
1631+
16EA9; C; 16EC4; # BERIA ERFE CAPITAL LETTER KOBO
1632+
16EAA; C; 16EC5; # BERIA ERFE CAPITAL LETTER LAKKO
1633+
16EAB; C; 16EC6; # BERIA ERFE CAPITAL LETTER MERI
1634+
16EAC; C; 16EC7; # BERIA ERFE CAPITAL LETTER NINI
1635+
16EAD; C; 16EC8; # BERIA ERFE CAPITAL LETTER GNA
1636+
16EAE; C; 16EC9; # BERIA ERFE CAPITAL LETTER NGAY
1637+
16EAF; C; 16ECA; # BERIA ERFE CAPITAL LETTER OI
1638+
16EB0; C; 16ECB; # BERIA ERFE CAPITAL LETTER PI
1639+
16EB1; C; 16ECC; # BERIA ERFE CAPITAL LETTER ERIGO
1640+
16EB2; C; 16ECD; # BERIA ERFE CAPITAL LETTER ERIGO TAMURA
1641+
16EB3; C; 16ECE; # BERIA ERFE CAPITAL LETTER SERI
1642+
16EB4; C; 16ECF; # BERIA ERFE CAPITAL LETTER SHEP
1643+
16EB5; C; 16ED0; # BERIA ERFE CAPITAL LETTER TATASOUE
1644+
16EB6; C; 16ED1; # BERIA ERFE CAPITAL LETTER UI
1645+
16EB7; C; 16ED2; # BERIA ERFE CAPITAL LETTER WASSE
1646+
16EB8; C; 16ED3; # BERIA ERFE CAPITAL LETTER AY
16191647
1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF
16201648
1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI
16211649
1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM

lib/stdlib/uc_spec/CompositionExclusions.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# CompositionExclusions-16.0.0.txt
2-
# Date: 2024-02-02
3-
# © 2024 Unicode®, Inc.
1+
# CompositionExclusions-17.0.0.txt
2+
# Date: 2025-08-01
3+
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
66
#

0 commit comments

Comments
 (0)