@@ -312,8 +312,8 @@ gen_header(Fd) ->
312312 {punctuation,dash} |
313313 {punctuation,open} |
314314 {punctuation,close} |
315- {punctuation,initial} |
316- {punctuation,final} |
315+ {punctuation,initial} | % Punctuation, Initial quote (may behave like open or close depending on usage)
316+ {punctuation,final} | % Punctuation, Final quote (may behave like open or close depending on usage)
317317 {punctuation,other} |
318318 {symbol,math} |
319319 {symbol,currency} |
@@ -337,12 +337,12 @@ gen_static(Fd) ->
337337 'category':= category()}.
338338lookup(Codepoint) when ?IS_CP(Codepoint) ->
339339 {CCC,Can,Comp,Cat} = unicode_table(Codepoint),
340- #{ccc=>CCC, canon=>Can, compat=>Comp, category=>category(Codepoint, Cat)}.
340+ #{ccc=>CCC, canon=>Can, compat=>Comp, category=>category(Cat,Codepoint )}.
341341
342342-spec category(char()) -> category().
343343category(Codepoint) when ?IS_CP(Codepoint) ->
344344 {_,_,_,Cat} = unicode_table(Codepoint),
345- category(Codepoint, Cat).
345+ category(Cat,Codepoint ).
346346
347347
348348""" ),
@@ -420,9 +420,9 @@ category(Codepoint) when ?IS_CP(Codepoint) ->
420420 io :put_chars (Fd , " is_wide_cp(C) orelse is_wide(Cs);\n " ),
421421 io :put_chars (Fd , " is_wide([]) ->\n false.\n\n " ),
422422
423- io :put_chars (Fd , " category(CP, lookup_category ) ->\n "
424- " cat_translate( lookup_category(CP) );\n "
425- " category(_, Def ) -> cat_translate( Def) .\n\n " ),
423+ io :put_chars (Fd , " category(lookup_category, Cp ) ->\n "
424+ " lookup_category(Cp );\n "
425+ " category(Def, _ ) -> Def.\n\n " ),
426426 ok .
427427
428428gen_norm (Fd ) ->
@@ -674,13 +674,13 @@ gen_props(Fd, Props, Data) ->
674674 OIDS = maps :get (other_id_start , Props ),
675675 io :put_chars (Fd , " -spec is_other_id_start(gc()) -> boolean().\n " ),
676676 IsODIS = fun (Range ) -> io :format (Fd , " is_other_id_start~s true;\n " , [gen_single_clause (Range )]) end ,
677- [IsODIS (CP ) || CP <- OIDS ],
677+ [IsODIS (CP ) || CP <- merge_ranges ( OIDS ) ],
678678 io :put_chars (Fd , " is_other_id_start(_) -> false.\n\n " ),
679679
680680 OICS = maps :get (other_id_continue , Props ),
681681 io :put_chars (Fd , " -spec is_other_id_continue(gc()) -> boolean().\n " ),
682682 IsOICS = fun (Range ) -> io :format (Fd , " is_other_id_continue~s true;\n " , [gen_single_clause (Range )]) end ,
683- [IsOICS (CP ) || CP <- OICS ],
683+ [IsOICS (CP ) || CP <- merge_ranges ( OICS ) ],
684684 io :put_chars (Fd , " is_other_id_continue(_) -> false.\n\n " ),
685685
686686 PS0 = maps :get (pattern_syntax , Props ),
@@ -697,7 +697,7 @@ gen_props(Fd, Props, Data) ->
697697 end ,
698698 PS = [{PSC , undefined } || {PSC , undefined } < :- split_ranges (PS0 , []), KeepCat (PSC )],
699699 % % [io:format("~p ~p~n", [P, (array:get(P, Data))#cp.cat]) || {P,_} <- PS],
700- [IsNLPS (CP ) || CP <- PS ],
700+ [IsNLPS (CP ) || CP <- merge_ranges ( PS ) ],
701701 io :put_chars (Fd , " is_letter_not_pattern_syntax(_) -> true.\n\n " ),
702702
703703 ok .
@@ -848,7 +848,7 @@ gen_gc(Fd, GBP) ->
848848 " gc_1([CP|_]) when not ?IS_CP(CP) ->\n "
849849 " error({badarg,CP});\n " ),
850850 io :put_chars (Fd , " \n %% Continue control\n " ),
851- [GenControl (CP ) || CP <- Crs ],
851+ [GenControl (CP ) || CP <- merge_ranges ( Crs ) ],
852852 % % One clause per CP
853853 % % CRs0 = merge_ranges(maps:get(cr, GBP) ++ maps:get(lf, GBP) ++ maps:get(control, GBP)),
854854 % % [GenControl(CP) || CP <- CRs0, CP =/= {$\r, undefined}],
@@ -1153,8 +1153,7 @@ gen_unicode_table(Fd, Data, UpdateTests) ->
11531153 case UpdateTests of
11541154 true ->
11551155 Dict1 = lists :map (fun ({Id ,{CCC , Canon , Compat , Cat }}) ->
1156- {_ , ECat } = lists :keyfind (Cat , 1 , category_translate ()),
1157- {Id , {CCC , Canon , Compat , ECat }}
1156+ {Id , {CCC , Canon , Compat , Cat }}
11581157 end , Dict0 ),
11591158 TestFile = " ../test/unicode_util_SUITE_data/unicode_table.bin" ,
11601159 io :format (" Updating: ~s~n " , [TestFile ]),
@@ -1166,103 +1165,126 @@ gen_unicode_table(Fd, Data, UpdateTests) ->
11661165 [io :format (Fd , " unicode_table(~w ) -> ~w ;~n " , [CP , Map ]) || {CP ,Map } <- NonDef ],
11671166 io :format (Fd , " unicode_table(_) -> ~w .~n~n " ,[Def ]),
11681167
1169- [io :format (Fd , " cat_translate(~w ) -> ~w ;~n " , [Cat , EC ]) || {Cat ,EC } <- category_translate ()],
1170- io :format (Fd , " cat_translate(Cat) -> error({internal_error, Cat}).~n~n " ,[]),
1168+ % % [io:format(Fd, "cat_translate(~w) -> ~w;~n", [Cat, EC]) || {Cat,EC} <- category_translate()],
1169+ % % io:format(Fd, "cat_translate(Cat) -> error({internal_error, Cat}).~n~n",[]),
11711170 gen_category (Fd , CatTable , Data ),
11721171 ok .
11731172
%% Translate a two-character general category string such as "Lu" into
%% the {Major, Minor} tuple listed in category_translate/0.  The first
%% character is lower-cased arithmetically to form the lookup key (lu,
%% ll, ...); an abbreviation missing from the table raises badkey.
category([Major, Minor]) ->
    Abbrev = list_to_atom([Major - $A + $a, Minor]),
    maps:get(Abbrev, category_translate()).
11761176
%% Map from the lower-cased two-letter general category abbreviation to
%% the {Major, Minor} tuple used for categories, grouped by major class.
category_translate() ->
    #{
      %% Letters
      lu => {letter, uppercase},            % Letter, Uppercase
      ll => {letter, lowercase},            % Letter, Lowercase
      lt => {letter, titlecase},            % Letter, Titlecase
      lm => {letter, modifier},             % Letter, Modifier
      lo => {letter, other},                % Letter, Other

      %% Marks
      mn => {mark, non_spacing},            % Mark, Non-Spacing
      mc => {mark, spacing_combining},      % Mark, Spacing Combining
      me => {mark, enclosing},              % Mark, Enclosing

      %% Numbers
      nd => {number, decimal},              % Number, Decimal Digit
      nl => {number, letter},               % Number, Letter
      no => {number, other},                % Number, Other

      %% Separators
      zs => {separator, space},             % Separator, Space
      zl => {separator, line},              % Separator, Line
      zp => {separator, paragraph},         % Separator, Paragraph

      %% Other
      cc => {other, control},               % Other, Control
      cf => {other, format},                % Other, Format
      cs => {other, surrogate},             % Other, Surrogate
      co => {other, private},               % Other, Private Use
      cn => {other, not_assigned},          % Other, Not Assigned (no characters in the file have this property)

      %% Punctuation
      pc => {punctuation, connector},       % Punctuation, Connector
      pd => {punctuation, dash},            % Punctuation, Dash
      ps => {punctuation, open},            % Punctuation, Open
      pe => {punctuation, close},           % Punctuation, Close
      pi => {punctuation, initial},         % Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
      pf => {punctuation, final},           % Punctuation, Final quote (may behave like Ps or Pe depending on usage)
      po => {punctuation, other},           % Punctuation, Other

      %% Symbols
      sm => {symbol, math},                 % Symbol, Math
      sc => {symbol, currency},             % Symbol, Currency
      sk => {symbol, modifier},             % Symbol, Modifier
      so => {symbol, other}                 % Symbol, Other
     }.
12081209
%% Write the generated lookup_category/1 and subcat_letter/1 functions
%% to Fd.
%%
%% gen_category/8 first splits the category table into single code
%% points, ranges and the code points covered by merged letter ranges.
%% Single code points are written before ranges, and each function is
%% closed by a catch-all clause.  Ranges tagged `subcat` delegate to
%% subcat_letter/1 in the generated code.
gen_category(Fd, [{CP, {_, _, _, Cat}}|Rest], All) ->
    {Singles, Ranges, Letters} = gen_category(Rest, Cat, CP, CP, All, [], [], []),
    lists:foreach(
      fun({Point, Category}) ->
              io:format(Fd, " lookup_category(~w ) -> ~w ;~n ", [Point, Category])
      end, Singles),

    %% Right-hand side of a range clause: either a call into the letter
    %% sub-table or the literal category term.
    Rhs = fun(subcat)   -> " subcat_letter(CP)";
             (Category) -> io_lib:format(" ~w ", [Category])
          end,
    lists:foreach(
      fun({First, Last, Category}) ->
              io:format(Fd, " lookup_category(CP) when is_integer(CP, ~w , ~w ) -> ~s ;~n ",
                        [First, Last, Rhs(Category)])
      end, optimize_ranges_1(Ranges)),
    io:put_chars(Fd, " lookup_category(Cp) -> {other, not_assigned}.\n\n "),

    {LetterSingles, LetterRanges} = gen_letter(Letters, All),
    lists:foreach(
      fun({Point, Category}) ->
              io:format(Fd, " subcat_letter(~w ) -> ~w ;~n ", [Point, Category])
      end, LetterSingles),
    lists:foreach(
      fun({First, Last, Category}) ->
              io:format(Fd, " subcat_letter(CP) when is_integer(CP, ~w , ~w ) -> ~w ;~n ",
                        [First, Last, Category])
      end, optimize_ranges_1(LetterRanges)),
    %% Fallback: derive the letter sub-category from the case table.
    io:put_chars(Fd,
                 " subcat_letter(CP) ->\n "
                 " case case_table(CP) of\n "
                 " {CP, CP} -> {letter,other};\n "
                 " {CP, _} -> {letter,uppercase};\n "
                 " {_, CP} -> {letter,lowercase};\n "
                 " {_, _, CP, _} -> {letter,titlecase};\n "
                 " {CP, _, _, _} -> {letter,uppercase};\n "
                 " {_,CP,_,_} -> {letter,lowercase}\n "
                 " end.\n\n "),
    ok.
12111236
%% Walk the category table (assumed to be in ascending code point
%% order) and partition it into runs.
%%
%% Cat/Start/End describe the run currently being built.  Returns
%% {Single, Range, SubCats}:
%%   Single  - [{CP, Cat}] for runs of length one
%%   Range   - [{Start, End, Cat}], where Cat is `subcat` for a run of
%%             mixed letter categories merged under the pseudo-category
%%             `letter`
%%   SubCats - every code point covered by a `subcat` range
%% Each list is returned in the order the runs were encountered.
gen_category([{CP, {_, _, _, NextCat}}|Rest], Cat, Start, End, All, Single, Range, SubCats)
  when End + 1 =:= CP ->
    IsLetterCat = letter_cat(NextCat, Cat),
    if
        NextCat =:= Cat ->
            %% Same category: extend the run.
            gen_category(Rest, Cat, Start, CP, All, Single, Range, SubCats);
        IsLetterCat ->
            %% Adjacent letters of different kinds: merge them into one
            %% `letter` run that is resolved through subcat_letter/1.
            gen_category(Rest, letter, Start, CP, All, Single, Range, SubCats);
        true ->
            {Single1, Range1, SubCats1} =
                close_category_run(Cat, Start, End, Single, Range, SubCats),
            gen_category(Rest, NextCat, CP, CP, All, Single1, Range1, SubCats1)
    end;
gen_category([{CP, {_, _, _, NewCat}}|Rest]=Cont, Cat, Start, End, All, Single, Range, SubCats) ->
    case array:get(End + 1, All) of
        undefined ->
            {Single1, Range1, SubCats1} =
                close_category_run(Cat, Start, End, Single, Range, SubCats),
            gen_category(Rest, NewCat, CP, CP, All, Single1, Range1, SubCats1);
        _ -> %% We can make ranges larger by setting already assigned category
            gen_category(Cont, Cat, Start, End + 1, All, Single, Range, SubCats)
    end;
gen_category([], Cat, Start, End, _All, Single, Range, SubCats) ->
    %% Close the last run through the same helper, so that a trailing
    %% merged letter run is tagged `subcat` (and its code points
    %% collected) instead of leaking the pseudo-category `letter`.
    {Single1, Range1, SubCats1} =
        close_category_run(Cat, Start, End, Single, Range, SubCats),
    {lists:reverse(Single1), lists:reverse(Range1), lists:reverse(SubCats1)}.

%% Record the finished run Start..End of category Cat in the (reversed)
%% accumulators and return {Single, Range, SubCats}.
close_category_run(Cat, Start, Start, Single, Range, SubCats) ->
    {[{Start, Cat}|Single], Range, SubCats};
close_category_run(letter, Start, End, Single, Range, SubCats) ->
    %% SubCats is kept in descending order until the final reverse.
    {Single, [{Start, End, subcat}|Range], lists:seq(End, Start, -1) ++ SubCats};
close_category_run(Cat, Start, End, Single, Range, SubCats) ->
    {Single, [{Start, End, Cat}|Range], SubCats}.
12661288
12671289letter_cat (lm , _ ) ->
12681290 false ;
@@ -1271,59 +1293,47 @@ letter_cat(_, lm) ->
12711293letter_cat (L1 , L2 ) ->
12721294 is_letter (L1 ) andalso (L2 =:= letter orelse is_letter (L2 )).
12731295
%% True only for translated letter categories, i.e. {letter, _} tuples;
%% every other term (including the bare atom `letter`) gives false.
is_letter(Category) ->
    case Category of
        {letter, _Sub} -> true;
        _ -> false
    end.
12761298
%% Build the explicit entries of the generated subcat_letter/1.
%%
%% Letters is the list of code points covered by merged letter ranges
%% and All the array of #cp{} records.  A code point gets an explicit
%% entry only when its real category is a letter category that the
%% case-table fallback of subcat_letter/1 would not produce.
%% Returns {Single, Range} as built by subcat_letter/6.
gen_letter(Letters, All) ->
    gen_letter(Letters, All, []).

gen_letter([CP|Rest], All, Acc) ->
    case array:get(CP, All) of
        undefined ->
            gen_letter(Rest, All, Acc);
        #cp{cat=Cat0, cs=Cs} ->
            Cat = category(Cat0),
            %% category/1 yields {Major, Minor} tuples, so the case-table
            %% result has to be translated before the two are compared.
            FromCase = case_category(case_table(CP, case_data(CP, Cs))),
            %% Modifier letters are never produced by the case table and
            %% therefore always get an explicit entry when present.
            case is_letter(Cat) andalso Cat =/= FromCase of
                true ->
                    gen_letter(Rest, All, [{CP, Cat}|Acc]);
                false ->
                    gen_letter(Rest, All, Acc)
            end
    end;
gen_letter([], _, []) ->
    %% Nothing needs an explicit entry.
    {[], []};
gen_letter([], _, Acc) ->
    [{Start, Cat}|SCletters] = lists:reverse(Acc),
    subcat_letter(SCletters, Start, Start, Cat, [], []).

%% Translate a short category atom (lo, lu, ...) returned by
%% case_table/2 into its {Major, Minor} form; terms that are not keys
%% of category_translate/0 are passed through unchanged.
case_category(Short) ->
    maps:get(Short, category_translate(), Short).
1323+
%% Collapse a list of {CP, Cat} pairs into single entries and ranges.
%%
%% Start/End/Cat describe the run being built.  A pair extends the run
%% when its code point is End + 1 and its category equals Cat; any
%% other pair closes the run and starts a new one.
%% Returns {Single, Range} with Single = [{CP, Cat}] and
%% Range = [{Start, End, Cat}], both in input order.
subcat_letter([{CP, Cat}|R], Start, End, Cat, Single, Range) when End + 1 =:= CP ->
    subcat_letter(R, Start, CP, Cat, Single, Range);
subcat_letter(Rest, Start, End, Cat, Single0, Range0) ->
    %% Close the current run; exact comparison throughout.
    {Single, Range} =
        case Start =:= End of
            true  -> {[{Start, Cat}|Single0], Range0};
            false -> {Single0, [{Start, End, Cat}|Range0]}
        end,
    case Rest of
        [{CP, NewCat}|R] ->
            subcat_letter(R, CP, CP, NewCat, Single, Range);
        [] ->
            {lists:reverse(Single), lists:reverse(Range)}
    end.
13261335
1336+
13271337case_table (CP , CaseData ) ->
13281338 case CaseData of
13291339 {CP , CP } -> lo ;
0 commit comments