diff --git a/lib/stdlib/src/erl_stdlib_errors.erl b/lib/stdlib/src/erl_stdlib_errors.erl index 49db693d57e2..31503da2149e 100644 --- a/lib/stdlib/src/erl_stdlib_errors.erl +++ b/lib/stdlib/src/erl_stdlib_errors.erl @@ -428,7 +428,15 @@ format_unicode_error(characters_to_nfkc_list, [_]) -> format_unicode_error(characters_to_nfkd_binary, [_]) -> [bad_char_data]; format_unicode_error(characters_to_nfkd_list, [_]) -> - [bad_char_data]. + [bad_char_data]; +format_unicode_error(category, [_]) -> + [bad_char]; +format_unicode_error(is_whitespace, [_]) -> + [bad_char]; +format_unicode_error(is_id_start, [_]) -> + [bad_char]; +format_unicode_error(is_id_continue, [_]) -> + [bad_char]. unicode_char_data(Chars) -> try unicode:characters_to_binary(Chars) of @@ -1121,6 +1129,8 @@ expand_error(bad_boolean) -> <<"not a boolean value">>; expand_error(bad_binary_list) -> <<"not a flat list of binaries">>; +expand_error(bad_char) -> + <<"not a valid character">>; expand_error(bad_char_data) -> <<"not valid character data (an iodata term)">>; expand_error(bad_binary_pattern) -> diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl index bfcec6e6ca31..ad61855ba092 100644 --- a/lib/stdlib/src/string.erl +++ b/lib/stdlib/src/string.erl @@ -477,7 +477,7 @@ pad(CD, Length, both, Char) when is_integer(Length) -> -spec trim(String) -> unicode:chardata() when String :: unicode:chardata(). trim(Str) -> - trim(Str, both, unicode_util:whitespace()). + trim(Str, both, unicode_util:pattern_whitespace()). -doc """ Equivalent to [`trim(String, Dir, Whitespace})`](`trim/3`) where @@ -490,7 +490,7 @@ as Pattern_White_Space in String :: unicode:chardata(), Dir :: direction() | 'both'. trim(Str, Dir) -> - trim(Str, Dir, unicode_util:whitespace()). + trim(Str, Dir, unicode_util:pattern_whitespace()). -doc """ Returns a string, where leading or trailing, or both, `Characters` have been diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl index 376444d930d1..7186770a6054 100644 --- a/lib/stdlib/src/unicode.erl +++ b/lib/stdlib/src/unicode.erl @@ -21,7 +21,7 @@ %% -module(unicode). -moduledoc """ -Functions for converting Unicode characters. +Functions for converting and classifying Unicode characters. This module contains functions for converting between different character representations. It converts between ISO Latin-1 characters and Unicode @@ -71,9 +71,12 @@ normalization can be found in the characters_to_nfkc_list/1, characters_to_nfkc_binary/1 ]). +-export([is_whitespace/1, is_id_start/1, is_id_continue/1, category/1]). + -export_type([chardata/0, charlist/0, encoding/0, external_chardata/0, external_charlist/0, latin1_char/0, latin1_chardata/0, - latin1_charlist/0, latin1_binary/0, unicode_binary/0]). + latin1_charlist/0, latin1_binary/0, unicode_binary/0, + category/0]). -type encoding() :: 'latin1' | 'unicode' | 'utf8' | 'utf16' | {'utf16', endian()} @@ -108,6 +111,15 @@ than UTF-8 (that is, UTF-16 or UTF-32). latin1_binary() | latin1_charlist(), latin1_binary() | nil()). +-doc "Character category". +-type category() :: + {letter, uppercase | lowercase | titlecase | modifier | other} | + {mark, non_spacing | spacing_combining | enclosing} | + {number, decimal | letter | other} | + {separator, space | line | paragraph} | + {other, control | format | surrogate | private | not_assigned} | + {punctuation, connector | dash | open | close | initial | final | other} | + {symbol, math | currency | modifier | other}. %% We must inline these functions so that the stacktrace points to %% the correct function. @@ -122,6 +134,8 @@ than UTF-8 (that is, UTF-16 or UTF-32). -export([bin_is_7bit/1, characters_to_binary/2, characters_to_list/2]). +-define(IS_CP(CP), is_integer(CP, 0, 16#10FFFF)). + -doc false. -spec bin_is_7bit(Binary) -> boolean() when Binary :: binary(). @@ -681,13 +695,149 @@ characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 -> characters_to_nfkc_binary(CD, _, Row, Acc) -> characters_to_nfkc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)). +-doc """ +Returns true if `Char` is a whitespace. + +Whitespace is defined in +[Unicode Standard Annex #44](http://unicode.org/reports/tr44/). + +```erlang +1> unicode:is_whitespace($\s). +true +2> unicode:is_whitespace($😊). +false +``` +""". +-doc(#{since => ~"@OTP-19858@"}). +-spec is_whitespace(char()) -> boolean(). +is_whitespace(X) %% ASCII (and low number) Optimizations + when X =:= 9; X =:= 10; X =:= 11; X =:= 12; X =:= 13; X =:= 32; + X =:= 133; X =:= 160 -> + true; +is_whitespace(Char) when is_integer(Char, 0, 5000) -> %% Arbitrary limit without whitespace + false; +is_whitespace(Char) when ?IS_CP(Char) -> + unicode_util:is_whitespace(Char); +is_whitespace(Term) -> + badarg_with_info([Term]). + + +-doc """ +Returns true if `Char` is an identifier start. + +Identifier start is defined by the ID_Start property in +[Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1). + +```erlang +1> unicode:is_id_start($a). +true +2> unicode:is_id_start($_). +false +3> unicode:is_id_start($-). +false +``` +""". +-doc(#{since => ~"@OTP-19858@"}). +-spec is_id_start(char()) -> boolean(). +is_id_start(X) %% ASCII optimizations + when X =:= 65; X =:= 66; X =:= 67; X =:= 68; X =:= 69; X =:= 70; X =:= 71; + X =:= 72; X =:= 73; X =:= 74; X =:= 75; X =:= 76; X =:= 77; X =:= 78; + X =:= 79; X =:= 80; X =:= 81; X =:= 82; X =:= 83; X =:= 84; X =:= 85; + X =:= 86; X =:= 87; X =:= 88; X =:= 89; X =:= 90; X =:= 97; X =:= 98; + X =:= 99; X =:= 100; X =:= 101; X =:= 102; X =:= 103; X =:= 104; X =:= 105; + X =:= 106; X =:= 107; X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112; + X =:= 113; X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119; + X =:= 120; X =:= 121; X =:= 122 -> + true; +is_id_start(Char) when is_integer(Char, 0, 127) -> + false; +is_id_start(Char) when ?IS_CP(Char) -> + case unicode_util:category(Char) of + {number,letter} -> true; + {letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char); + {letter,_} -> true; + {_,_} -> unicode_util:is_other_id_start(Char) + end; +is_id_start(Term) -> + badarg_with_info([Term]). + + +-doc """ +Returns true if `Char` is an identifier continuation. + +Identifier continuation is defined by the ID_Continue property in +[Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1). + +```erlang +1> unicode:is_id_continue($a). +true +2> unicode:is_id_continue($_). +true +3> unicode:is_id_continue($-). +false +``` +""". +-doc(#{since => ~"@OTP-19858@"}). +-spec is_id_continue(char()) -> boolean(). +is_id_continue(X) + when X =:= 48; X =:= 49; X =:= 50; X =:= 51; X =:= 52; X =:= 53; X =:= 54; + X =:= 55; X =:= 56; X =:= 57; X =:= 65; X =:= 66; X =:= 67; X =:= 68; + X =:= 69; X =:= 70; X =:= 71; X =:= 72; X =:= 73; X =:= 74; X =:= 75; + X =:= 76; X =:= 77; X =:= 78; X =:= 79; X =:= 80; X =:= 81; X =:= 82; + X =:= 83; X =:= 84; X =:= 85; X =:= 86; X =:= 87; X =:= 88; X =:= 89; + X =:= 90; X =:= 95; X =:= 97; X =:= 98; X =:= 99; X =:= 100; X =:= 101; + X =:= 102; X =:= 103; X =:= 104; X =:= 105; X =:= 106; X =:= 107; + X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112; X =:= 113; + X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119; + X =:= 120; X =:= 121; X =:= 122 -> + true; +is_id_continue(Char) when is_integer(Char, 0, 127) -> + false; +is_id_continue(Char) when ?IS_CP(Char) -> + case unicode_util:category(Char) of + {punctuation, connector} -> true; + {mark,non_spacing} -> true; + {mark,spacing_combining} -> true; + {number,other} -> unicode_util:is_other_id_continue(Char); + {number,_} -> true; + {letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char); + {letter,_} -> true; + {_,_} -> unicode_util:is_other_id_start(Char) orelse + unicode_util:is_other_id_continue(Char) + end; +is_id_continue(Term) -> + badarg_with_info([Term]). + +-doc """ +Returns the `Char` category. + +```erlang +1> unicode:category($a). +{letter,lowercase} +2> unicode:category($Ä). +{letter,uppercase} +3> unicode:category($😊). +{symbol,other} +4> unicode:category($€). +{symbol,currency} +5> unicode:category($[). +{punctuation,open} +``` +""". +-doc(#{since => ~"@OTP-19858@"}). +-spec category(char()) -> category(). +category(Char) when ?IS_CP(Char) -> + unicode_util:category(Char); +category(Term) -> + badarg_with_info([Term]). + +%% internals + acc_to_binary(Acc) -> list_to_binary(lists:reverse(Acc)). prepend_row_to_acc(Row, Acc) -> [characters_to_binary(lists:reverse(Row))|Acc]. -%% internals - -doc false. characters_to_list_int(ML, Encoding) -> try diff --git a/lib/stdlib/test/Makefile b/lib/stdlib/test/Makefile index 1ce324b46c87..7df2b43eac48 100644 --- a/lib/stdlib/test/Makefile +++ b/lib/stdlib/test/Makefile @@ -197,6 +197,9 @@ release_tests_spec: make_emakefile $(ERL_FILES) $(COVERFILE) $(EXTRA_FILES) "$(RELSYSDIR)" chmod -R u+w "$(RELSYSDIR)" @tar cf - *_SUITE_data property_test | (cd "$(RELSYSDIR)"; tar xf -) + $(INSTALL_DIR) "$(RELSYSDIR)/unicode_SUITE_data" + $(INSTALL_DATA) ../uc_spec/PropList.txt ../uc_spec/DerivedCoreProperties.txt \ + "$(RELSYSDIR)/unicode_SUITE_data" $(INSTALL_DIR) "$(RELSYSDIR)/stdlib_SUITE_data" $(INSTALL_DATA) $(ERL_TOP)/make/otp_version_tickets "$(RELSYSDIR)/stdlib_SUITE_data" diff --git a/lib/stdlib/test/unicode_SUITE.erl b/lib/stdlib/test/unicode_SUITE.erl index 62306214a037..886b0738546f 100644 --- a/lib/stdlib/test/unicode_SUITE.erl +++ b/lib/stdlib/test/unicode_SUITE.erl @@ -39,14 +39,17 @@ normalize/1, huge_illegal_code_points/1, bin_is_7bit/1, - error_info/1 + error_info/1, + is_whitespace/1, + category/1, + is_id/1 ]). suite() -> [{ct_hooks,[ts_install_cth]}, {timetrap,{minutes,20}}]. -all() -> +all() -> [utf8_illegal_sequences_bif, utf16_illegal_sequences_bif, random_lists, roundtrips, latin1, exceptions, @@ -55,6 +58,7 @@ all() -> {group,binaries_errors}, huge_illegal_code_points, bin_is_7bit, + {group, classify}, error_info]. groups() -> @@ -63,7 +67,12 @@ groups() -> ex_binaries_errors_utf16_little, ex_binaries_errors_utf16_big, ex_binaries_errors_utf32_little, - ex_binaries_errors_utf32_big]}]. + ex_binaries_errors_utf32_big]}, + {classify, [parallel], + [is_whitespace, + category, + is_id]} + ]. binaries_errors_limit(Config) when is_list(Config) -> setlimit(10), @@ -1458,6 +1467,15 @@ error_info(_Config) -> {characters_to_nfkd_list, [abc]}, {characters_to_nfkd_list, [<<1:11>>]}, + {category, [-1]}, + {category, [foobar]}, + + {is_whitespace, [-1]}, + {is_whitespace, [foobar]}, + + {is_id_start, [-1]}, + {is_id_continue, [foobar]}, + %% Not BIFs (they don't throw badarg when they fail). {bom_to_encoding, 1}, %Not BIF. {encoding_to_bom, 1}, %Not BIF. @@ -1469,13 +1487,139 @@ error_info(_Config) -> ], error_info_lib:test_error_info(unicode, L). + +-define(MAX_CHAR, 16#10FFFF). +category(_Config) -> + Check = fun(Id) -> + LC = maps:get(category, unicode_util:lookup(Id)), + LC == unicode:category(Id) + end, + [] = [Id || Id <- lists:seq(1, ?MAX_CHAR), not Check(Id)], + {'EXIT', _} = catch unicode:category(-1), + {'EXIT', _} = catch unicode:category(5000000), + {'EXIT', _} = catch unicode:category(foobar), + ok. + +is_whitespace(Config) -> + Props = parse_properties(filename:join(proplists:get_value(data_dir, Config), "PropList.txt")), + WhiteSpaces = maps:get(white_space, Props), + Set = make_set(WhiteSpaces), + Test = fun(Char) -> + case {unicode:is_whitespace(Char), sets:is_element(Char, Set)} of + {X,X} -> false; + _ -> true + end + end, + [] = [{Char, integer_to_list(Char, 16), unicode_util:lookup(Char)} + || Char <- lists:seq(1, ?MAX_CHAR), Test(Char)], + ok. + +is_id(Config) -> + Props = parse_properties(filename:join(proplists:get_value(data_dir, Config), + "DerivedCoreProperties.txt")), + [] = id_start(Props), + [] = id_cont(Props), + ok. + +id_start(Props) -> + ID_Start = maps:get(id_start, Props), + Set = make_set(ID_Start), + + TestStart = fun(Char) -> + case {unicode:is_id_start(Char), sets:is_element(Char, Set)} of + {X,X} -> false; + _ -> true + end + end, + + [{Char, integer_to_list(Char, 16), unicode_util:lookup(Char)} + || Char <- lists:seq(1, ?MAX_CHAR), TestStart(Char)]. + +id_cont(Props) -> + ID_Cont = maps:get(id_continue, Props), + Set = make_set(ID_Cont), + TestCont = fun(Char) -> + case {unicode:is_id_continue(Char), sets:is_element(Char, Set)} of + {X,X} -> false; + _ -> true + end + end, + [{Char, integer_to_list(Char, 16), unicode_util:lookup(Char)} + || Char <- lists:seq(1, ?MAX_CHAR), TestCont(Char)]. + + %%% %%% Utilities. %%% -id(I) -> I. - setlimit(X) -> erts_debug:set_internal_state(available_internal_state,true), io:format("Setting loop limit, old: ~p, now set to ~p~n", [erts_debug:set_internal_state(unicode_loop_limit,X),X]). + +make_set(ListOfRanges) -> + List = lists:foldl(fun add_range/2, [], ListOfRanges), + sets:from_list(List). + +add_range({A,undefined}, Acc) -> + [A|Acc]; +add_range({A,B}, Acc) -> + lists:seq(A,B) ++ Acc. + +parse_properties(File) -> + {ok, Fd} = file:open(File, [read, raw, {read_ahead, 1000000}]), + Props0 = foldl(fun parse_properties/2, [], Fd), + file:close(Fd), + Props1 = sofs:to_external(sofs:relation_to_family(sofs:relation(Props0))), + maps:from_list(Props1). + +parse_properties(Line0, Acc) -> + [Line|_Comments] = tokens(Line0, "#"), + [CodePoints, Class | _] = tokens(Line, ";"), + case tokens(CodePoints, ".") of + [CodePoint] -> + [{to_atom(Class), {hex_to_int(CodePoint), undefined}}|Acc]; + [CodePoint1,"",CodePoint2] -> + [{to_atom(Class), {hex_to_int(CodePoint1), hex_to_int(CodePoint2)}}|Acc] + end. + +hex_to_int([]) -> []; +hex_to_int(HexStr) -> + list_to_integer(string:trim(HexStr, both), 16). + +to_atom(Str) -> + list_to_atom(string:lowercase(string:trim(Str, both))). + +foldl(Fun, Acc, Fd) -> + Get = fun() -> file:read_line(Fd) end, + foldl_1(Fun, Acc, Get). + +foldl_1(_Fun, {done, Acc}, _Get) -> Acc; +foldl_1(Fun, Acc, Get) -> + case Get() of + eof -> Acc; + {ok, "#" ++ _} -> %% Ignore comments + foldl_1(Fun, Acc, Get); + {ok, "\n"} -> %% Ignore empty lines + foldl_1(Fun, Acc, Get); + {ok, Line} -> + foldl_1(Fun, Fun(Line, Acc), Get) + end. + +%% Differs from string:lexemes, it returns empty string as token between two delimiters +tokens(S, [C]) -> + tokens(lists:reverse(S), C, []). + +tokens([Sep|S], Sep, Toks) -> + tokens(S, Sep, [[]|Toks]); +tokens([C|S], Sep, Toks) -> + tokens_2(S, Sep, Toks, [C]); +tokens([], _, Toks) -> + Toks. + +tokens_2([Sep|S], Sep, Toks, Tok) -> + tokens(S, Sep, [Tok|Toks]); +tokens_2([C|S], Sep, Toks, Tok) -> + tokens_2(S, Sep, Toks, [C|Tok]); +tokens_2([], _Sep, Toks, Tok) -> + [Tok|Toks]. diff --git a/lib/stdlib/test/unicode_util_SUITE.erl b/lib/stdlib/test/unicode_util_SUITE.erl index dbc7e21d1110..6832196c13a5 100644 --- a/lib/stdlib/test/unicode_util_SUITE.erl +++ b/lib/stdlib/test/unicode_util_SUITE.erl @@ -95,8 +95,10 @@ casefold(_) -> whitespace(_Config) -> %% Pattern whitespace - WS = unicode_util:whitespace(), - WS = lists:filter(fun unicode_util:is_whitespace/1, WS), + WS = lists:sort(unicode_util:pattern_whitespace()), + %% is_whitespace are an extended subset of pattern_whitespace + %% (more tested in the unicode module) + WS = lists:sort(lists:filter(fun unicode_util:is_whitespace/1, WS) ++ [8206,8207]), false = unicode_util:is_whitespace($A), ok. diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript index 35c3f05ce261..1d6507bf697c 100644 --- a/lib/stdlib/uc_spec/gen_unicode_mod.escript +++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript @@ -278,7 +278,7 @@ gen_header(Fd) -> -moduledoc false. -export([cp/1, gc/1]). -export([nfd/1, nfc/1, nfkd/1, nfkc/1]). --export([whitespace/0, is_whitespace/1]). +-export([pattern_whitespace/0, is_whitespace/1]). -export([uppercase/1, lowercase/1, titlecase/1, casefold/1]). -export([spec_version/0, lookup/1, category/1, get_case/1]). @@ -320,7 +320,7 @@ gen_header(Fd) -> {symbol,modifier} | {symbol,other}. --define(IS_CP(CP), is_integer(CP, 0, 16#110000)). +-define(IS_CP(CP), is_integer(CP, 0, 16#10FFFF)). -define(IS_ASCII(CP), is_integer(CP, 0, 127)). -define(IS_LATIN1(CP), is_integer(CP, 0, 255)). @@ -657,13 +657,16 @@ gen_norm(Fd) -> ok. gen_props(Fd, Props, Data) -> - WS0 = maps:get(pattern_white_space, Props), - WS = merge_ranges(WS0, split), + PWS0 = maps:get(pattern_white_space, Props), + PWS = merge_ranges(PWS0, split), io:put_chars(Fd, "%% Useful non-breakable whitespace chars\n" "%% defined as Pattern White Space in Unicode Standard Annex #31\n"), - io:put_chars(Fd, "-spec whitespace() -> [gc()].\n"), - WsChars = [CP || {CP, undefined} <- WS], - io:format(Fd, "whitespace() -> ~w.\n\n", [[[$\r,$\n]|WsChars]]), + io:put_chars(Fd, "-spec pattern_whitespace() -> [gc()].\n"), + WsChars = [CP || {CP, undefined} <- PWS], + io:format(Fd, "pattern_whitespace() -> ~w.\n\n", [[[$\r,$\n]|WsChars]]), + + WS0 = maps:get(white_space, Props), + WS = merge_ranges(WS0, split), io:put_chars(Fd, "-spec is_whitespace(gc()) -> boolean().\n"), IsWS = fun(Range) -> io:format(Fd, "is_whitespace~s true;\n", [gen_single_clause(Range)]) end,