erlang · dgud · Nov 18, 2025
diff --git a/lib/stdlib/src/erl_stdlib_errors.erl b/lib/stdlib/src/erl_stdlib_errors.erl
@@ -428,7 +428,15 @@ format_unicode_error(characters_to_nfkc_list, [_]) ->
 format_unicode_error(characters_to_nfkd_binary, [_]) ->
     [bad_char_data];
 format_unicode_error(characters_to_nfkd_list, [_]) ->
-    [bad_char_data].
+    [bad_char_data];
+format_unicode_error(category, [_]) ->
+    [bad_char];
+format_unicode_error(is_whitespace, [_]) ->
+    [bad_char];
+format_unicode_error(is_id_start, [_]) ->
+    [bad_char];
+format_unicode_error(is_id_continue, [_]) ->
+    [bad_char].
 
 unicode_char_data(Chars) ->
     try unicode:characters_to_binary(Chars) of
@@ -1121,6 +1129,8 @@ expand_error(bad_boolean) ->
     <<"not a boolean value">>;
 expand_error(bad_binary_list) ->
     <<"not a flat list of binaries">>;
+expand_error(bad_char) ->
+    <<"not a valid character">>;
 expand_error(bad_char_data) ->
     <<"not valid character data (an iodata term)">>;
 expand_error(bad_binary_pattern) ->

diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
@@ -477,7 +477,7 @@ pad(CD, Length, both, Char) when is_integer(Length) ->
 -spec trim(String) -> unicode:chardata() when
       String :: unicode:chardata().
 trim(Str) ->
-    trim(Str, both, unicode_util:whitespace()).
+    trim(Str, both, unicode_util:pattern_whitespace()).
 
 -doc """
 Equivalent to [`trim(String, Dir, Whitespace})`](`trim/3`) where 
@@ -490,7 +490,7 @@ as Pattern_White_Space in
       String :: unicode:chardata(),
       Dir :: direction() | 'both'.
 trim(Str, Dir) ->
-    trim(Str, Dir, unicode_util:whitespace()).
+    trim(Str, Dir, unicode_util:pattern_whitespace()).
 
 -doc """
 Returns a string, where leading or trailing, or both, `Characters` have been

diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl
@@ -21,7 +21,7 @@
 %%
 -module(unicode).
 -moduledoc """
-Functions for converting Unicode characters.
+Functions for converting and classifying Unicode characters.
 
 This module contains functions for converting between different character
 representations. It converts between ISO Latin-1 characters and Unicode
@@ -71,9 +71,12 @@ normalization can be found in the
          characters_to_nfkc_list/1, characters_to_nfkc_binary/1
         ]).
 
+-export([is_whitespace/1, is_id_start/1, is_id_continue/1, category/1]).
+
 -export_type([chardata/0, charlist/0, encoding/0, external_chardata/0,
               external_charlist/0, latin1_char/0, latin1_chardata/0,
-              latin1_charlist/0, latin1_binary/0, unicode_binary/0]).
+              latin1_charlist/0, latin1_binary/0, unicode_binary/0,
+              category/0]).
 
 -type encoding()  :: 'latin1' | 'unicode' | 'utf8'
                    | 'utf16' | {'utf16', endian()}
@@ -108,6 +111,15 @@ than UTF-8 (that is, UTF-16 or UTF-32).
                               latin1_binary() |
                               latin1_charlist(),
                             latin1_binary() | nil()).
+-doc "Character category".
+-type category() ::
+        {letter, uppercase | lowercase | titlecase | modifier | other} |
+        {mark, non_spacing | spacing_combining | enclosing} |
+        {number, decimal | letter | other} |
+        {separator, space | line | paragraph} |
+        {other, control | format | surrogate | private | not_assigned} |
+        {punctuation, connector | dash | open | close | initial | final | other} |
+        {symbol, math | currency | modifier | other}.
 
 %% We must inline these functions so that the stacktrace points to
 %% the correct function.
@@ -122,6 +134,8 @@ than UTF-8 (that is, UTF-16 or UTF-32).
 
 -export([bin_is_7bit/1, characters_to_binary/2, characters_to_list/2]).
 
+-define(IS_CP(CP), is_integer(CP, 0, 16#10FFFF)).
+
 -doc false.
 -spec bin_is_7bit(Binary) -> boolean() when
       Binary :: binary().
@@ -681,13 +695,149 @@ characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
 characters_to_nfkc_binary(CD, _, Row, Acc) ->
     characters_to_nfkc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
 
+-doc """
+Returns true if `Char` is a whitespace.
+
+Whitespace is defined in
+[Unicode Standard Annex #44](http://unicode.org/reports/tr44/).
+
+```erlang
+1> unicode:is_whitespace($\s).
+true
+2> unicode:is_whitespace($😊).
+false
+```
+""".
+-doc(#{since => ~"@OTP-19858@"}).
+-spec is_whitespace(char()) -> boolean().
+is_whitespace(X) %% ASCII (and low number) Optimizations
+  when X =:= 9; X =:= 10; X =:= 11; X =:= 12; X =:= 13; X =:= 32;
+       X =:= 133; X =:= 160 ->
+    true;
+is_whitespace(Char) when is_integer(Char, 0, 5000) -> %% Arbitrary limit without whitespace
+    false;
+is_whitespace(Char) when ?IS_CP(Char) ->
+    unicode_util:is_whitespace(Char);
+is_whitespace(Term) ->
+    badarg_with_info([Term]).
+
+
+-doc """
+Returns true if `Char` is an identifier start.
+
+Identifier start is defined by the ID_Start property in
+[Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1).
+
+```erlang
+1> unicode:is_id_start($a).
+true
+2> unicode:is_id_start($_).
+false
+3> unicode:is_id_start($-).
+false
+```
+""".
+-doc(#{since => ~"@OTP-19858@"}).
+-spec is_id_start(char()) -> boolean().
+is_id_start(X)  %% ASCII optimizations
+  when X =:= 65; X =:= 66; X =:= 67; X =:= 68; X =:= 69; X =:= 70; X =:= 71;
+       X =:= 72; X =:= 73; X =:= 74; X =:= 75; X =:= 76; X =:= 77; X =:= 78;
+       X =:= 79; X =:= 80; X =:= 81; X =:= 82; X =:= 83; X =:= 84; X =:= 85;
+       X =:= 86; X =:= 87; X =:= 88; X =:= 89; X =:= 90; X =:= 97; X =:= 98;
+       X =:= 99; X =:= 100; X =:= 101; X =:= 102; X =:= 103; X =:= 104; X =:= 105;
+       X =:= 106; X =:= 107; X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112;
+       X =:= 113; X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119;
+       X =:= 120; X =:= 121; X =:= 122 ->
+    true;
+is_id_start(Char) when is_integer(Char, 0, 127) ->
+    false;
+is_id_start(Char) when ?IS_CP(Char) ->
+    case unicode_util:category(Char) of
+        {number,letter} -> true;
+        {letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char);
+        {letter,_} -> true;
+        {_,_} -> unicode_util:is_other_id_start(Char)
+    end;
+is_id_start(Term) ->
+    badarg_with_info([Term]).
+
+
+-doc """
+Returns true if `Char` is an identifier continuation.
+
+Identifier continuation is defined by the ID_Continue property in
+[Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1).
+
+```erlang
+1> unicode:is_id_continue($a).
+true
+2> unicode:is_id_continue($_).
+true
+3> unicode:is_id_continue($-).
+false
+```
+""".
+-doc(#{since => ~"@OTP-19858@"}).
+-spec is_id_continue(char()) -> boolean().
+is_id_continue(X)
+  when X =:= 48; X =:= 49; X =:= 50; X =:= 51; X =:= 52; X =:= 53; X =:= 54;
+       X =:= 55; X =:= 56; X =:= 57; X =:= 65; X =:= 66; X =:= 67; X =:= 68;
+       X =:= 69; X =:= 70; X =:= 71; X =:= 72; X =:= 73; X =:= 74; X =:= 75;
+       X =:= 76; X =:= 77; X =:= 78; X =:= 79; X =:= 80; X =:= 81; X =:= 82;
+       X =:= 83; X =:= 84; X =:= 85; X =:= 86; X =:= 87; X =:= 88; X =:= 89;
+       X =:= 90; X =:= 95; X =:= 97; X =:= 98; X =:= 99; X =:= 100; X =:= 101;
+       X =:= 102; X =:= 103; X =:= 104; X =:= 105; X =:= 106; X =:= 107;
+       X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112; X =:= 113;
+       X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119;
+       X =:= 120; X =:= 121; X =:= 122 ->
+    true;
+is_id_continue(Char) when is_integer(Char, 0, 127) ->
+    false;
+is_id_continue(Char) when ?IS_CP(Char) ->
+    case unicode_util:category(Char) of
+        {punctuation, connector} -> true;
+        {mark,non_spacing} -> true;
+        {mark,spacing_combining} -> true;
+        {number,other} -> unicode_util:is_other_id_continue(Char);
+        {number,_} -> true;
+        {letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char);
+        {letter,_} -> true;
+        {_,_} -> unicode_util:is_other_id_start(Char) orelse
+                     unicode_util:is_other_id_continue(Char)
+    end;
+is_id_continue(Term) ->
+    badarg_with_info([Term]).
+
+-doc """
+Returns the `Char` category.
+
+```erlang
+1> unicode:category($a).
+{letter,lowercase}
+2> unicode:category($Ä).
+{letter,uppercase}
+3> unicode:category($😊).
+{symbol,other}
+4> unicode:category($€).
+{symbol,currency}
+5> unicode:category($[).
+{punctuation,open}
+```
+""".
+-doc(#{since => ~"@OTP-19858@"}).
+-spec category(char()) -> category().
+category(Char) when ?IS_CP(Char) ->
+    unicode_util:category(Char);
+category(Term) ->
+    badarg_with_info([Term]).
+
+%% internals
+
 acc_to_binary(Acc) ->
     list_to_binary(lists:reverse(Acc)).
 prepend_row_to_acc(Row, Acc) ->
     [characters_to_binary(lists:reverse(Row))|Acc].
 
-%% internals
-
 -doc false.
 characters_to_list_int(ML, Encoding) ->
     try

diff --git a/lib/stdlib/test/Makefile b/lib/stdlib/test/Makefile
@@ -197,6 +197,9 @@ release_tests_spec: make_emakefile
 		$(ERL_FILES) $(COVERFILE) $(EXTRA_FILES) "$(RELSYSDIR)"
 	chmod -R u+w "$(RELSYSDIR)"
 	@tar cf - *_SUITE_data property_test | (cd "$(RELSYSDIR)"; tar xf -)
+	$(INSTALL_DIR) "$(RELSYSDIR)/unicode_SUITE_data"
+	$(INSTALL_DATA) ../uc_spec/PropList.txt ../uc_spec/DerivedCoreProperties.txt \
+	       "$(RELSYSDIR)/unicode_SUITE_data"
 	$(INSTALL_DIR) "$(RELSYSDIR)/stdlib_SUITE_data"
 	$(INSTALL_DATA) $(ERL_TOP)/make/otp_version_tickets "$(RELSYSDIR)/stdlib_SUITE_data"