2121% %
2222-module (unicode ).
2323-moduledoc """
24- Functions for converting Unicode characters.
24+ Functions for converting and classifying Unicode characters.
2525
2626This module contains functions for converting between different character
2727representations. It converts between ISO Latin-1 characters and Unicode
@@ -71,9 +71,12 @@ normalization can be found in the
7171 characters_to_nfkc_list /1 , characters_to_nfkc_binary /1
7272 ]).
7373
74+ -export ([is_whitespace /1 , is_ID_start /1 , is_ID_continue /1 , category /1 ]).
75+
7476-export_type ([chardata / 0 , charlist / 0 , encoding / 0 , external_chardata / 0 ,
7577 external_charlist / 0 , latin1_char / 0 , latin1_chardata / 0 ,
76- latin1_charlist / 0 , latin1_binary / 0 , unicode_binary / 0 ]).
78+ latin1_charlist / 0 , latin1_binary / 0 , unicode_binary / 0 ,
79+ category / 0 ]).
7780
7881-type encoding () :: 'latin1' | 'unicode' | 'utf8'
7982 | 'utf16' | {'utf16' , endian ()}
@@ -108,6 +111,38 @@ than UTF-8 (that is, UTF-16 or UTF-32).
108111 latin1_binary () |
109112 latin1_charlist (),
110113 latin1_binary () | nil ()).
114+ -doc " Character category" .
115+ -type category () ::
116+ {letter ,uppercase } |
117+ {letter ,lowercase } |
118+ {letter ,titlecase } |
119+ {letter ,modifier } |
120+ {letter ,other } |
121+ {mark ,non_spacing } |
122+ {mark ,spacing_combining } |
123+ {mark ,enclosing } |
124+ {number ,decimal } |
125+ {number ,letter } |
126+ {number ,other } |
127+ {separator ,space } |
128+ {separator ,line } |
129+ {separator ,paragraph } |
130+ {other ,control } |
131+ {other ,format } |
132+ {other ,surrogate } |
133+ {other ,private } |
134+ {other ,not_assigned } |
135+ {punctuation ,connector } |
136+ {punctuation ,dash } |
137+ {punctuation ,open } |
138+ {punctuation ,close } |
139+ {punctuation ,initial } |
140+ {punctuation ,final } |
141+ {punctuation ,other } |
142+ {symbol ,math } |
143+ {symbol ,currency } |
144+ {symbol ,modifier } |
145+ {symbol ,other }.
111146
112147% % We must inline these functions so that the stacktrace points to
113148% % the correct function.
@@ -122,6 +157,8 @@ than UTF-8 (that is, UTF-16 or UTF-32).
122157
123158-export ([bin_is_7bit /1 , characters_to_binary /2 , characters_to_list /2 ]).
124159
160+ -define (IS_CP (CP ), is_integer (CP , 0 , 16#10FFFF )).
161+
125162-doc false .
126163-spec bin_is_7bit (Binary ) -> boolean () when
127164 Binary :: binary ().
@@ -681,13 +718,141 @@ characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
681718characters_to_nfkc_binary (CD , _ , Row , Acc ) ->
682719 characters_to_nfkc_binary (CD , ? GC_N , [], prepend_row_to_acc (Row , Acc )).
683720
721+ -doc """
722+ Returns true if `Char` is a whitespace.
723+
724+ Whitespace is defined in
725+ [Unicode Standard Annex #44](http://unicode.org/reports/tr44/).
726+
727+ ```erlang
728+ 1> unicode:is_whitespace($\s).
729+ true
730+ 2> unicode:is_whitespace($😊).
731+ false
732+ ```
733+ """ .
734+ -doc (#{since => ~ " @OTP-19858@" }).
735+ -spec is_whitespace (char ()) -> boolean ().
736+ is_whitespace (Char ) when is_integer (Char , 9 , 13 ) ->
737+ true ;
738+ is_whitespace (32 ) ->
739+ true ;
740+ is_whitespace (133 ) ->
741+ true ;
742+ is_whitespace (160 ) ->
743+ true ;
744+ is_whitespace (Char ) when is_integer (Char , 0 , 5000 ) ->
745+ false ;
746+ is_whitespace (Char ) when ? IS_CP (Char ) ->
747+ unicode_util :is_whitespace (Char );
748+ is_whitespace (Term ) ->
749+ badarg_with_info ([Term ]).
750+
751+
752+ -doc """
753+ Returns true if `Char` is an identifier start.
754+
755+ Identifier start is defined by the ID_Start property in
756+ [Unicode Standard Annex #31](http://unicode.org/reports/tr31/).
757+
758+ ```erlang
759+ 1> unicode:is_ID_start($a).
760+ true
761+ 2> unicode:is_ID_start($_).
762+ false
763+ 3> unicode:is_ID_start($-).
764+ false
765+ ```
766+ """ .
767+ -doc (#{since => ~ " @OTP-19858@" }).
768+ -spec is_ID_start (char ()) -> boolean ().
769+ is_ID_start (Char ) % % ASCII optimizations
770+ when is_integer (Char , 16#41 , 16#5A );
771+ is_integer (Char , 16#61 , 16#7A ) ->
772+ true ;
773+ is_ID_start (Char ) when is_integer (Char , 0 , 127 ) ->
774+ false ;
775+ is_ID_start (Char ) when ? IS_CP (Char ) ->
776+ case unicode_util :category (Char ) of
777+ {number ,letter } -> true ;
778+ {letter ,modifier } -> unicode_util :is_letter_not_pattern_syntax (Char );
779+ {letter ,_ } -> true ;
780+ {_ ,_ } -> unicode_util :is_other_id_start (Char )
781+ end ;
782+ is_ID_start (Term ) ->
783+ badarg_with_info ([Term ]).
784+
785+
786+ -doc """
787+ Returns true if `Char` is an identifier continuation.
788+
789+ Identifier continuation is defined by the ID_Continue property in
790+ [Unicode Standard Annex #31](http://unicode.org/reports/tr31/).
791+
792+ ```erlang
793+ 1> unicode:is_ID_continue($a).
794+ true
795+ 2> unicode:is_ID_continue($_).
796+ true
797+ 3> unicode:is_ID_continue($-).
798+ false
799+ ```
800+ """ .
801+ -doc (#{since => ~ " @OTP-19858@" }).
802+ -spec is_ID_continue (char ()) -> boolean ().
803+ is_ID_continue (Char ) % % ASCII optimizations
804+ when is_integer (Char , 16#30 , 16#39 );
805+ is_integer (Char , 16#41 , 16#5A );
806+ (Char =:= 16#5F );
807+ is_integer (Char , 16#61 , 16#7A ) ->
808+ true ;
809+ is_ID_continue (Char ) when is_integer (Char , 0 , 127 ) ->
810+ false ;
811+ is_ID_continue (Char ) when ? IS_CP (Char ) ->
812+ case unicode_util :category (Char ) of
813+ {punctuation , connector } -> true ;
814+ {mark ,non_spacing } -> true ;
815+ {mark ,spacing_combining } -> true ;
816+ {number ,other } -> unicode_util :is_other_id_continue (Char );
817+ {number ,_ } -> true ;
818+ {letter ,modifier } -> unicode_util :is_letter_not_pattern_syntax (Char );
819+ {letter ,_ } -> true ;
820+ {_ ,_ } -> unicode_util :is_other_id_start (Char ) orelse
821+ unicode_util :is_other_id_continue (Char )
822+ end ;
823+ is_ID_continue (Term ) ->
824+ badarg_with_info ([Term ]).
825+
826+ -doc """
827+ Returns the `Char` category.
828+
829+ ```erlang
830+ 1> unicode:category($a).
831+ {letter,lowercase}
832+ 2> unicode:category($Ä).
833+ {letter,uppercase}
834+ 3> unicode:category($😊).
835+ {symbol,other}
836+ 3> unicode:category($€).
837+ {symbol,currency}
838+ 3> unicode:category($[).
839+ {punctuation,open}
840+ ```
841+ """ .
842+ -doc (#{since => ~ " @OTP-19858@" }).
843+ -spec category (char ()) -> category ().
844+ category (Char ) when ? IS_CP (Char ) ->
845+ unicode_util :category (Char );
846+ category (Term ) ->
847+ badarg_with_info ([Term ]).
848+
849+ % % internals
850+
684851acc_to_binary (Acc ) ->
685852 list_to_binary (lists :reverse (Acc )).
686853prepend_row_to_acc (Row , Acc ) ->
687854 [characters_to_binary (lists :reverse (Row ))|Acc ].
688855
689- % % internals
690-
691856-doc false .
692857characters_to_list_int (ML , Encoding ) ->
693858 try
0 commit comments