2121% %
2222-module (unicode ).
2323-moduledoc """
24- Functions for converting Unicode characters.
24+ Functions for converting and classifying Unicode characters.
2525
2626This module contains functions for converting between different character
2727representations. It converts between ISO Latin-1 characters and Unicode
@@ -71,9 +71,12 @@ normalization can be found in the
7171 characters_to_nfkc_list /1 , characters_to_nfkc_binary /1
7272 ]).
7373
74+ -export ([is_whitespace /1 , is_id_start /1 , is_id_continue /1 , category /1 ]).
75+
7476-export_type ([chardata / 0 , charlist / 0 , encoding / 0 , external_chardata / 0 ,
7577 external_charlist / 0 , latin1_char / 0 , latin1_chardata / 0 ,
76- latin1_charlist / 0 , latin1_binary / 0 , unicode_binary / 0 ]).
78+ latin1_charlist / 0 , latin1_binary / 0 , unicode_binary / 0 ,
79+ category / 0 ]).
7780
7881-type encoding () :: 'latin1' | 'unicode' | 'utf8'
7982 | 'utf16' | {'utf16' , endian ()}
@@ -108,6 +111,15 @@ than UTF-8 (that is, UTF-16 or UTF-32).
108111 latin1_binary () |
109112 latin1_charlist (),
110113 latin1_binary () | nil ()).
114+ -doc " Character category" .
115+ -type category () ::
116+ {letter , uppercase | lowercase | titlecase | modifier | other } |
117+ {mark , non_spacing | spacing_combining | enclosing } |
118+ {number , decimal | letter | other } |
119+ {separator , space | line | paragraph } |
120+ {other , control | format | surrogate | private | not_assigned } |
121+ {punctuation , connector | dash | open | close | initial | final | other } |
122+ {symbol , math | currency | modifier | other }.
111123
112124% % We must inline these functions so that the stacktrace points to
113125% % the correct function.
@@ -122,6 +134,8 @@ than UTF-8 (that is, UTF-16 or UTF-32).
122134
123135-export ([bin_is_7bit /1 , characters_to_binary /2 , characters_to_list /2 ]).
124136
137+ -define (IS_CP (CP ), is_integer (CP , 0 , 16#10FFFF )).
138+
125139-doc false .
126140-spec bin_is_7bit (Binary ) -> boolean () when
127141 Binary :: binary ().
@@ -681,13 +695,149 @@ characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
681695characters_to_nfkc_binary (CD , _ , Row , Acc ) ->
682696 characters_to_nfkc_binary (CD , ? GC_N , [], prepend_row_to_acc (Row , Acc )).
683697
698+ -doc """
699+ Returns true if `Char` is a whitespace.
700+
701+ Whitespace is defined in
702+ [Unicode Standard Annex #44](http://unicode.org/reports/tr44/).
703+
704+ ```erlang
705+ 1> unicode:is_whitespace($\s).
706+ true
707+ 2> unicode:is_whitespace($😊).
708+ false
709+ ```
710+ """ .
711+ -doc (#{since => ~ " @OTP-19858@" }).
712+ -spec is_whitespace (char ()) -> boolean ().
713+ is_whitespace (X ) % % ASCII (and low number) Optimizations
714+ when X =:= 9 ; X =:= 10 ; X =:= 11 ; X =:= 12 ; X =:= 13 ; X =:= 32 ;
715+ X =:= 133 ; X =:= 160 ->
716+ true ;
717+ is_whitespace (Char ) when is_integer (Char , 0 , 5000 ) -> % % Arbitrary limit without whitespace
718+ false ;
719+ is_whitespace (Char ) when ? IS_CP (Char ) ->
720+ unicode_util :is_whitespace (Char );
721+ is_whitespace (Term ) ->
722+ badarg_with_info ([Term ]).
723+
724+
725+ -doc """
726+ Returns true if `Char` is an identifier start.
727+
728+ Identifier start is defined by the ID_Start property in
729+ [Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1).
730+
731+ ```erlang
732+ 1> unicode:is_id_start($a).
733+ true
734+ 2> unicode:is_id_start($_).
735+ false
736+ 3> unicode:is_id_start($-).
737+ false
738+ ```
739+ """ .
740+ -doc (#{since => ~ " @OTP-19858@" }).
741+ -spec is_id_start (char ()) -> boolean ().
742+ is_id_start (X ) % % ASCII optimizations
743+ when X =:= 65 ; X =:= 66 ; X =:= 67 ; X =:= 68 ; X =:= 69 ; X =:= 70 ; X =:= 71 ;
744+ X =:= 72 ; X =:= 73 ; X =:= 74 ; X =:= 75 ; X =:= 76 ; X =:= 77 ; X =:= 78 ;
745+ X =:= 79 ; X =:= 80 ; X =:= 81 ; X =:= 82 ; X =:= 83 ; X =:= 84 ; X =:= 85 ;
746+ X =:= 86 ; X =:= 87 ; X =:= 88 ; X =:= 89 ; X =:= 90 ; X =:= 97 ; X =:= 98 ;
747+ X =:= 99 ; X =:= 100 ; X =:= 101 ; X =:= 102 ; X =:= 103 ; X =:= 104 ; X =:= 105 ;
748+ X =:= 106 ; X =:= 107 ; X =:= 108 ; X =:= 109 ; X =:= 110 ; X =:= 111 ; X =:= 112 ;
749+ X =:= 113 ; X =:= 114 ; X =:= 115 ; X =:= 116 ; X =:= 117 ; X =:= 118 ; X =:= 119 ;
750+ X =:= 120 ; X =:= 121 ; X =:= 122 ->
751+ true ;
752+ is_id_start (Char ) when is_integer (Char , 0 , 127 ) ->
753+ false ;
754+ is_id_start (Char ) when ? IS_CP (Char ) ->
755+ case unicode_util :category (Char ) of
756+ {number ,letter } -> true ;
757+ {letter ,modifier } -> unicode_util :is_letter_not_pattern_syntax (Char );
758+ {letter ,_ } -> true ;
759+ {_ ,_ } -> unicode_util :is_other_id_start (Char )
760+ end ;
761+ is_id_start (Term ) ->
762+ badarg_with_info ([Term ]).
763+
764+
765+ -doc """
766+ Returns true if `Char` is an identifier continuation.
767+
768+ Identifier continuation is defined by the ID_Continue property in
769+ [Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1).
770+
771+ ```erlang
772+ 1> unicode:is_id_continue($a).
773+ true
774+ 2> unicode:is_id_continue($_).
775+ true
776+ 3> unicode:is_id_continue($-).
777+ false
778+ ```
779+ """ .
780+ -doc (#{since => ~ " @OTP-19858@" }).
781+ -spec is_id_continue (char ()) -> boolean ().
782+ is_id_continue (X )
783+ when X =:= 48 ; X =:= 49 ; X =:= 50 ; X =:= 51 ; X =:= 52 ; X =:= 53 ; X =:= 54 ;
784+ X =:= 55 ; X =:= 56 ; X =:= 57 ; X =:= 65 ; X =:= 66 ; X =:= 67 ; X =:= 68 ;
785+ X =:= 69 ; X =:= 70 ; X =:= 71 ; X =:= 72 ; X =:= 73 ; X =:= 74 ; X =:= 75 ;
786+ X =:= 76 ; X =:= 77 ; X =:= 78 ; X =:= 79 ; X =:= 80 ; X =:= 81 ; X =:= 82 ;
787+ X =:= 83 ; X =:= 84 ; X =:= 85 ; X =:= 86 ; X =:= 87 ; X =:= 88 ; X =:= 89 ;
788+ X =:= 90 ; X =:= 95 ; X =:= 97 ; X =:= 98 ; X =:= 99 ; X =:= 100 ; X =:= 101 ;
789+ X =:= 102 ; X =:= 103 ; X =:= 104 ; X =:= 105 ; X =:= 106 ; X =:= 107 ;
790+ X =:= 108 ; X =:= 109 ; X =:= 110 ; X =:= 111 ; X =:= 112 ; X =:= 113 ;
791+ X =:= 114 ; X =:= 115 ; X =:= 116 ; X =:= 117 ; X =:= 118 ; X =:= 119 ;
792+ X =:= 120 ; X =:= 121 ; X =:= 122 ->
793+ true ;
794+ is_id_continue (Char ) when is_integer (Char , 0 , 127 ) ->
795+ false ;
796+ is_id_continue (Char ) when ? IS_CP (Char ) ->
797+ case unicode_util :category (Char ) of
798+ {punctuation , connector } -> true ;
799+ {mark ,non_spacing } -> true ;
800+ {mark ,spacing_combining } -> true ;
801+ {number ,other } -> unicode_util :is_other_id_continue (Char );
802+ {number ,_ } -> true ;
803+ {letter ,modifier } -> unicode_util :is_letter_not_pattern_syntax (Char );
804+ {letter ,_ } -> true ;
805+ {_ ,_ } -> unicode_util :is_other_id_start (Char ) orelse
806+ unicode_util :is_other_id_continue (Char )
807+ end ;
808+ is_id_continue (Term ) ->
809+ badarg_with_info ([Term ]).
810+
811+ -doc """
812+ Returns the `Char` category.
813+
814+ ```erlang
815+ 1> unicode:category($a).
816+ {letter,lowercase}
817+ 2> unicode:category($Ä).
818+ {letter,uppercase}
819+ 3> unicode:category($😊).
820+ {symbol,other}
821+ 4> unicode:category($€).
822+ {symbol,currency}
823+ 5> unicode:category($[).
824+ {punctuation,open}
825+ ```
826+ """ .
827+ -doc (#{since => ~ " @OTP-19858@" }).
828+ -spec category (char ()) -> category ().
829+ category (Char ) when ? IS_CP (Char ) ->
830+ unicode_util :category (Char );
831+ category (Term ) ->
832+ badarg_with_info ([Term ]).
833+
834+ % % internals
835+
684836acc_to_binary (Acc ) ->
685837 list_to_binary (lists :reverse (Acc )).
686838prepend_row_to_acc (Row , Acc ) ->
687839 [characters_to_binary (lists :reverse (Row ))|Acc ].
688840
689- % % internals
690-
691841-doc false .
692842characters_to_list_int (ML , Encoding ) ->
693843 try
0 commit comments