Skip to content

Commit ca7e19b

Browse files
committed
stdlib: Add classifying functions
When making scanners there might be a need to classify codepoints if the specification follows the Unicode standard. The following: is_whitespace/1, is_ID_start/1, is_ID_continue/1, category/1 is (at least) needed to scan and parse for example markdown. This commit is backward incompatible if the *undocumented* unicode_util:[is_]whitespace(..) functions have been used. unicode_util:whitespace() have been changed to unicode_util:pattern_whitespace() string module uses it and documents that is uses the `pattern_whitespace` property. So the name of the function have been changed, this is because `pattern_whitespace` is not a subset of `whitespace` characters, as was discovered during testing. is_whitespace(Char) have been changed to use the Unicode `whitespace` property. See Unicode Standard Annex #31 and #44.
1 parent 0a24434 commit ca7e19b

File tree

7 files changed

+333
-21
lines changed

7 files changed

+333
-21
lines changed

lib/stdlib/src/erl_stdlib_errors.erl

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,15 @@ format_unicode_error(characters_to_nfkc_list, [_]) ->
428428
format_unicode_error(characters_to_nfkd_binary, [_]) ->
429429
[bad_char_data];
430430
format_unicode_error(characters_to_nfkd_list, [_]) ->
431-
[bad_char_data].
431+
[bad_char_data];
432+
format_unicode_error(category, [_]) ->
433+
[bad_char];
434+
format_unicode_error(is_whitespace, [_]) ->
435+
[bad_char];
436+
format_unicode_error(is_id_start, [_]) ->
437+
[bad_char];
438+
format_unicode_error(is_id_continue, [_]) ->
439+
[bad_char].
432440

433441
unicode_char_data(Chars) ->
434442
try unicode:characters_to_binary(Chars) of
@@ -1121,6 +1129,8 @@ expand_error(bad_boolean) ->
11211129
<<"not a boolean value">>;
11221130
expand_error(bad_binary_list) ->
11231131
<<"not a flat list of binaries">>;
1132+
expand_error(bad_char) ->
1133+
<<"not a valid character">>;
11241134
expand_error(bad_char_data) ->
11251135
<<"not valid character data (an iodata term)">>;
11261136
expand_error(bad_binary_pattern) ->

lib/stdlib/src/string.erl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ pad(CD, Length, both, Char) when is_integer(Length) ->
477477
-spec trim(String) -> unicode:chardata() when
478478
String :: unicode:chardata().
479479
trim(Str) ->
480-
trim(Str, both, unicode_util:whitespace()).
480+
trim(Str, both, unicode_util:pattern_whitespace()).
481481

482482
-doc """
483483
Equivalent to [`trim(String, Dir, Whitespace})`](`trim/3`) where
@@ -490,7 +490,7 @@ as Pattern_White_Space in
490490
String :: unicode:chardata(),
491491
Dir :: direction() | 'both'.
492492
trim(Str, Dir) ->
493-
trim(Str, Dir, unicode_util:whitespace()).
493+
trim(Str, Dir, unicode_util:pattern_whitespace()).
494494

495495
-doc """
496496
Returns a string, where leading or trailing, or both, `Characters` have been

lib/stdlib/src/unicode.erl

Lines changed: 154 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
%%
2222
-module(unicode).
2323
-moduledoc """
24-
Functions for converting Unicode characters.
24+
Functions for converting and classifying Unicode characters.
2525

2626
This module contains functions for converting between different character
2727
representations. It converts between ISO Latin-1 characters and Unicode
@@ -71,9 +71,12 @@ normalization can be found in the
7171
characters_to_nfkc_list/1, characters_to_nfkc_binary/1
7272
]).
7373

74+
-export([is_whitespace/1, is_id_start/1, is_id_continue/1, category/1]).
75+
7476
-export_type([chardata/0, charlist/0, encoding/0, external_chardata/0,
7577
external_charlist/0, latin1_char/0, latin1_chardata/0,
76-
latin1_charlist/0, latin1_binary/0, unicode_binary/0]).
78+
latin1_charlist/0, latin1_binary/0, unicode_binary/0,
79+
category/0]).
7780

7881
-type encoding() :: 'latin1' | 'unicode' | 'utf8'
7982
| 'utf16' | {'utf16', endian()}
@@ -108,6 +111,15 @@ than UTF-8 (that is, UTF-16 or UTF-32).
108111
latin1_binary() |
109112
latin1_charlist(),
110113
latin1_binary() | nil()).
114+
-doc "Character category".
115+
-type category() ::
116+
{letter, uppercase | lowercase | titlecase | modifier | other} |
117+
{mark, non_spacing | spacing_combining | enclosing} |
118+
{number, decimal | letter | other} |
119+
{separator, space | line | paragraph} |
120+
{other, control | format | surrogate | private | not_assigned} |
121+
{punctuation, connector | dash | open | close | initial | final | other} |
122+
{symbol, math | currency | modifier | other}.
111123

112124
%% We must inline these functions so that the stacktrace points to
113125
%% the correct function.
@@ -122,6 +134,8 @@ than UTF-8 (that is, UTF-16 or UTF-32).
122134

123135
-export([bin_is_7bit/1, characters_to_binary/2, characters_to_list/2]).
124136

137+
-define(IS_CP(CP), is_integer(CP, 0, 16#10FFFF)).
138+
125139
-doc false.
126140
-spec bin_is_7bit(Binary) -> boolean() when
127141
Binary :: binary().
@@ -681,13 +695,149 @@ characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
681695
characters_to_nfkc_binary(CD, _, Row, Acc) ->
682696
characters_to_nfkc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
683697

698+
-doc """
699+
Returns true if `Char` is a whitespace.
700+
701+
Whitespace is defined in
702+
[Unicode Standard Annex #44](http://unicode.org/reports/tr44/).
703+
704+
```erlang
705+
1> unicode:is_whitespace($\s).
706+
true
707+
2> unicode:is_whitespace($😊).
708+
false
709+
```
710+
""".
711+
-doc(#{since => ~"@OTP-19858@"}).
712+
-spec is_whitespace(char()) -> boolean().
713+
is_whitespace(X) %% ASCII (and low number) Optimizations
714+
when X =:= 9; X =:= 10; X =:= 11; X =:= 12; X =:= 13; X =:= 32;
715+
X =:= 133; X =:= 160 ->
716+
true;
717+
is_whitespace(Char) when is_integer(Char, 0, 5000) -> %% Arbitrary limit without whitespace
718+
false;
719+
is_whitespace(Char) when ?IS_CP(Char) ->
720+
unicode_util:is_whitespace(Char);
721+
is_whitespace(Term) ->
722+
badarg_with_info([Term]).
723+
724+
725+
-doc """
726+
Returns true if `Char` is an identifier start.
727+
728+
Identifier start is defined by the ID_Start property in
729+
[Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1).
730+
731+
```erlang
732+
1> unicode:is_id_start($a).
733+
true
734+
2> unicode:is_id_start($_).
735+
false
736+
3> unicode:is_id_start($-).
737+
false
738+
```
739+
""".
740+
-doc(#{since => ~"@OTP-19858@"}).
741+
-spec is_id_start(char()) -> boolean().
742+
is_id_start(X) %% ASCII optimizations
743+
when X =:= 65; X =:= 66; X =:= 67; X =:= 68; X =:= 69; X =:= 70; X =:= 71;
744+
X =:= 72; X =:= 73; X =:= 74; X =:= 75; X =:= 76; X =:= 77; X =:= 78;
745+
X =:= 79; X =:= 80; X =:= 81; X =:= 82; X =:= 83; X =:= 84; X =:= 85;
746+
X =:= 86; X =:= 87; X =:= 88; X =:= 89; X =:= 90; X =:= 97; X =:= 98;
747+
X =:= 99; X =:= 100; X =:= 101; X =:= 102; X =:= 103; X =:= 104; X =:= 105;
748+
X =:= 106; X =:= 107; X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112;
749+
X =:= 113; X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119;
750+
X =:= 120; X =:= 121; X =:= 122 ->
751+
true;
752+
is_id_start(Char) when is_integer(Char, 0, 127) ->
753+
false;
754+
is_id_start(Char) when ?IS_CP(Char) ->
755+
case unicode_util:category(Char) of
756+
{number,letter} -> true;
757+
{letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char);
758+
{letter,_} -> true;
759+
{_,_} -> unicode_util:is_other_id_start(Char)
760+
end;
761+
is_id_start(Term) ->
762+
badarg_with_info([Term]).
763+
764+
765+
-doc """
766+
Returns true if `Char` is an identifier continuation.
767+
768+
Identifier continuation is defined by the ID_Continue property in
769+
[Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1).
770+
771+
```erlang
772+
1> unicode:is_id_continue($a).
773+
true
774+
2> unicode:is_id_continue($_).
775+
true
776+
3> unicode:is_id_continue($-).
777+
false
778+
```
779+
""".
780+
-doc(#{since => ~"@OTP-19858@"}).
781+
-spec is_id_continue(char()) -> boolean().
782+
is_id_continue(X)
783+
when X =:= 48; X =:= 49; X =:= 50; X =:= 51; X =:= 52; X =:= 53; X =:= 54;
784+
X =:= 55; X =:= 56; X =:= 57; X =:= 65; X =:= 66; X =:= 67; X =:= 68;
785+
X =:= 69; X =:= 70; X =:= 71; X =:= 72; X =:= 73; X =:= 74; X =:= 75;
786+
X =:= 76; X =:= 77; X =:= 78; X =:= 79; X =:= 80; X =:= 81; X =:= 82;
787+
X =:= 83; X =:= 84; X =:= 85; X =:= 86; X =:= 87; X =:= 88; X =:= 89;
788+
X =:= 90; X =:= 95; X =:= 97; X =:= 98; X =:= 99; X =:= 100; X =:= 101;
789+
X =:= 102; X =:= 103; X =:= 104; X =:= 105; X =:= 106; X =:= 107;
790+
X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112; X =:= 113;
791+
X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119;
792+
X =:= 120; X =:= 121; X =:= 122 ->
793+
true;
794+
is_id_continue(Char) when is_integer(Char, 0, 127) ->
795+
false;
796+
is_id_continue(Char) when ?IS_CP(Char) ->
797+
case unicode_util:category(Char) of
798+
{punctuation, connector} -> true;
799+
{mark,non_spacing} -> true;
800+
{mark,spacing_combining} -> true;
801+
{number,other} -> unicode_util:is_other_id_continue(Char);
802+
{number,_} -> true;
803+
{letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char);
804+
{letter,_} -> true;
805+
{_,_} -> unicode_util:is_other_id_start(Char) orelse
806+
unicode_util:is_other_id_continue(Char)
807+
end;
808+
is_id_continue(Term) ->
809+
badarg_with_info([Term]).
810+
811+
-doc """
812+
Returns the `Char` category.
813+
814+
```erlang
815+
1> unicode:category($a).
816+
{letter,lowercase}
817+
2> unicode:category($Ä).
818+
{letter,uppercase}
819+
3> unicode:category($😊).
820+
{symbol,other}
821+
4> unicode:category($€).
822+
{symbol,currency}
823+
5> unicode:category($[).
824+
{punctuation,open}
825+
```
826+
""".
827+
-doc(#{since => ~"@OTP-19858@"}).
828+
-spec category(char()) -> category().
829+
category(Char) when ?IS_CP(Char) ->
830+
unicode_util:category(Char);
831+
category(Term) ->
832+
badarg_with_info([Term]).
833+
834+
%% internals
835+
684836
acc_to_binary(Acc) ->
685837
list_to_binary(lists:reverse(Acc)).
686838
prepend_row_to_acc(Row, Acc) ->
687839
[characters_to_binary(lists:reverse(Row))|Acc].
688840

689-
%% internals
690-
691841
-doc false.
692842
characters_to_list_int(ML, Encoding) ->
693843
try

lib/stdlib/test/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,9 @@ release_tests_spec: make_emakefile
197197
$(ERL_FILES) $(COVERFILE) $(EXTRA_FILES) "$(RELSYSDIR)"
198198
chmod -R u+w "$(RELSYSDIR)"
199199
@tar cf - *_SUITE_data property_test | (cd "$(RELSYSDIR)"; tar xf -)
200+
$(INSTALL_DIR) "$(RELSYSDIR)/unicode_SUITE_data"
201+
$(INSTALL_DATA) ../uc_spec/PropList.txt ../uc_spec/DerivedCoreProperties.txt \
202+
"$(RELSYSDIR)/unicode_SUITE_data"
200203
$(INSTALL_DIR) "$(RELSYSDIR)/stdlib_SUITE_data"
201204
$(INSTALL_DATA) $(ERL_TOP)/make/otp_version_tickets "$(RELSYSDIR)/stdlib_SUITE_data"
202205

0 commit comments

Comments
 (0)