Skip to content

Commit bd27796

Browse files
committed
Fix category generation
1 parent eb8b2ff commit bd27796

File tree

1 file changed

+122 -112 lines changed

lib/stdlib/uc_spec/gen_unicode_mod.escript

Lines changed: 122 additions & 112 deletions
Original file line number | Diff line number | Diff line change
@@ -312,8 +312,8 @@ gen_header(Fd) ->
312312
{punctuation,dash} |
313313
{punctuation,open} |
314314
{punctuation,close} |
315-
{punctuation,initial} |
316-
{punctuation,final} |
315+
{punctuation,initial} | % Punctuation, Initial quote (may behave like open or close depending on usage)
316+
{punctuation,final} | % Punctuation, Final quote (may behave like open or close depending on usage)
317317
{punctuation,other} |
318318
{symbol,math} |
319319
{symbol,currency} |
@@ -337,12 +337,12 @@ gen_static(Fd) ->
337337
'category':= category()}.
338338
lookup(Codepoint) when ?IS_CP(Codepoint) ->
339339
{CCC,Can,Comp,Cat} = unicode_table(Codepoint),
340-
#{ccc=>CCC, canon=>Can, compat=>Comp, category=>category(Codepoint,Cat)}.
340+
#{ccc=>CCC, canon=>Can, compat=>Comp, category=>category(Cat,Codepoint)}.
341341
342342
-spec category(char()) -> category().
343343
category(Codepoint) when ?IS_CP(Codepoint) ->
344344
{_,_,_,Cat} = unicode_table(Codepoint),
345-
category(Codepoint,Cat).
345+
category(Cat,Codepoint).
346346
347347
348348
"""),
@@ -420,9 +420,9 @@ category(Codepoint) when ?IS_CP(Codepoint) ->
420420
io:put_chars(Fd, " is_wide_cp(C) orelse is_wide(Cs);\n"),
421421
io:put_chars(Fd, "is_wide([]) ->\n false.\n\n"),
422422

423-
io:put_chars(Fd, "category(CP, lookup_category) ->\n"
424-
" cat_translate(lookup_category(CP));\n"
425-
"category(_, Def) -> cat_translate(Def).\n\n"),
423+
io:put_chars(Fd, "category(lookup_category, Cp) ->\n"
424+
" lookup_category(Cp);\n"
425+
"category(Def, _) -> Def.\n\n"),
426426
ok.
427427

428428
gen_norm(Fd) ->
@@ -674,13 +674,13 @@ gen_props(Fd, Props, Data) ->
674674
OIDS = maps:get(other_id_start, Props),
675675
io:put_chars(Fd, "-spec is_other_id_start(gc()) -> boolean().\n"),
676676
IsODIS = fun(Range) -> io:format(Fd, "is_other_id_start~s true;\n", [gen_single_clause(Range)]) end,
677-
[IsODIS(CP) || CP <- OIDS],
677+
[IsODIS(CP) || CP <- merge_ranges(OIDS)],
678678
io:put_chars(Fd, "is_other_id_start(_) -> false.\n\n"),
679679

680680
OICS = maps:get(other_id_continue, Props),
681681
io:put_chars(Fd, "-spec is_other_id_continue(gc()) -> boolean().\n"),
682682
IsOICS = fun(Range) -> io:format(Fd, "is_other_id_continue~s true;\n", [gen_single_clause(Range)]) end,
683-
[IsOICS(CP) || CP <- OICS],
683+
[IsOICS(CP) || CP <- merge_ranges(OICS)],
684684
io:put_chars(Fd, "is_other_id_continue(_) -> false.\n\n"),
685685

686686
PS0 = maps:get(pattern_syntax, Props),
@@ -697,7 +697,7 @@ gen_props(Fd, Props, Data) ->
697697
end,
698698
PS = [{PSC, undefined} || {PSC, undefined} <:- split_ranges(PS0, []), KeepCat(PSC)],
699699
%% [io:format("~p ~p~n", [P, (array:get(P, Data))#cp.cat]) || {P,_} <- PS],
700-
[IsNLPS(CP) || CP <- PS],
700+
[IsNLPS(CP) || CP <- merge_ranges(PS)],
701701
io:put_chars(Fd, "is_letter_not_pattern_syntax(_) -> true.\n\n"),
702702

703703
ok.
@@ -848,7 +848,7 @@ gen_gc(Fd, GBP) ->
848848
"gc_1([CP|_]) when not ?IS_CP(CP) ->\n"
849849
" error({badarg,CP});\n"),
850850
io:put_chars(Fd, "\n%% Continue control\n"),
851-
[GenControl(CP) || CP <- Crs],
851+
[GenControl(CP) || CP <- merge_ranges(Crs)],
852852
%% One clause per CP
853853
%% CRs0 = merge_ranges(maps:get(cr, GBP) ++ maps:get(lf, GBP) ++ maps:get(control, GBP)),
854854
%% [GenControl(CP) || CP <- CRs0, CP =/= {$\r, undefined}],
@@ -1153,8 +1153,7 @@ gen_unicode_table(Fd, Data, UpdateTests) ->
11531153
case UpdateTests of
11541154
true ->
11551155
Dict1 = lists:map(fun({Id,{CCC, Canon, Compat, Cat}}) ->
1156-
{_, ECat} = lists:keyfind(Cat, 1, category_translate()),
1157-
{Id, {CCC, Canon, Compat, ECat}}
1156+
{Id, {CCC, Canon, Compat, Cat}}
11581157
end, Dict0),
11591158
TestFile = "../test/unicode_util_SUITE_data/unicode_table.bin",
11601159
io:format("Updating: ~s~n", [TestFile]),
@@ -1166,103 +1165,126 @@ gen_unicode_table(Fd, Data, UpdateTests) ->
11661165
[io:format(Fd, "unicode_table(~w) -> ~w;~n", [CP, Map]) || {CP,Map} <- NonDef],
11671166
io:format(Fd, "unicode_table(_) -> ~w.~n~n",[Def]),
11681167

1169-
[io:format(Fd, "cat_translate(~w) -> ~w;~n", [Cat, EC]) || {Cat,EC} <- category_translate()],
1170-
io:format(Fd, "cat_translate(Cat) -> error({internal_error, Cat}).~n~n",[]),
1168+
%% [io:format(Fd, "cat_translate(~w) -> ~w;~n", [Cat, EC]) || {Cat,EC} <- category_translate()],
1169+
%% io:format(Fd, "cat_translate(Cat) -> error({internal_error, Cat}).~n~n",[]),
11711170
gen_category(Fd, CatTable, Data),
11721171
ok.
11731172

11741173
category([C,Sub]) ->
1175-
list_to_atom([C-$A+$a, Sub]).
1174+
Map = category_translate(),
1175+
maps:get(list_to_atom([C-$A+$a, Sub]), Map).
11761176

11771177
category_translate() ->
1178-
[{lu, {letter, uppercase}}, % Letter, Uppercase
1179-
{ll, {letter, lowercase}}, % Letter, Lowercase
1180-
{lt, {letter, titlecase}}, % Letter, Titlecase
1181-
{mn, {mark, non_spacing}}, % Mark, Non-Spacing
1182-
{mc, {mark, spacing_combining}}, % Mark, Spacing Combining
1183-
{me, {mark, enclosing}}, % Mark, Enclosing
1184-
{nd, {number, decimal}}, % Number, Decimal Digit
1185-
{nl, {number, letter}}, % Number, Letter
1186-
{no, {number, other}}, % Number, Other
1187-
{zs, {separator, space}}, % Separator, Space
1188-
{zl, {separator, line}}, % Separator, Line
1189-
{zp, {separator, paragraph}}, % Separator, Paragraph
1190-
{cc, {other, control}}, % Other, Control
1191-
{cf, {other, format}}, % Other, Format
1192-
{cs, {other, surrogate}}, % Other, Surrogate
1193-
{co, {other, private}}, % Other, Private Use
1194-
{cn, {other, not_assigned}}, % Other, Not Assigned (no characters in the file have this property)
1195-
{lm, {letter, modifier}}, % Letter, Modifier
1196-
{lo, {letter, other}}, % Letter, Other
1197-
{pc, {punctuation, connector}}, % Punctuation, Connector
1198-
{pd, {punctuation, dash}}, % Punctuation, Dash
1199-
{ps, {punctuation, open}}, % Punctuation, Open
1200-
{pe, {punctuation, close}}, % Punctuation, Close
1201-
{pi, {punctuation, initial}}, % Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
1202-
{pf, {punctuation, final}}, % Punctuation, Final quote (may behave like Ps or Pe depending on usage)
1203-
{po, {punctuation, other}}, % Punctuation, Other
1204-
{sm, {symbol, math}}, % Symbol, Math
1205-
{sc, {symbol, currency}}, % Symbol, Currency
1206-
{sk, {symbol, modifier}}, % Symbol, Modifier
1207-
{so, {symbol, other}}]. % Symbol, Other
1178+
#{lu => {letter, uppercase}, % Letter, Uppercase
1179+
ll => {letter, lowercase}, % Letter, Lowercase
1180+
lt => {letter, titlecase}, % Letter, Titlecase
1181+
mn => {mark, non_spacing}, % Mark, Non-Spacing
1182+
mc => {mark, spacing_combining}, % Mark, Spacing Combining
1183+
me => {mark, enclosing}, % Mark, Enclosing
1184+
nd => {number, decimal}, % Number, Decimal Digit
1185+
nl => {number, letter}, % Number, Letter
1186+
no => {number, other}, % Number, Other
1187+
zs => {separator, space}, % Separator, Space
1188+
zl => {separator, line}, % Separator, Line
1189+
zp => {separator, paragraph}, % Separator, Paragraph
1190+
cc => {other, control}, % Other, Control
1191+
cf => {other, format}, % Other, Format
1192+
cs => {other, surrogate}, % Other, Surrogate
1193+
co => {other, private}, % Other, Private Use
1194+
cn => {other, not_assigned}, % Other, Not Assigned (no characters in the file have this property)
1195+
lm => {letter, modifier}, % Letter, Modifier
1196+
lo => {letter, other}, % Letter, Other
1197+
pc => {punctuation, connector}, % Punctuation, Connector
1198+
pd => {punctuation, dash}, % Punctuation, Dash
1199+
ps => {punctuation, open}, % Punctuation, Open
1200+
pe => {punctuation, close}, % Punctuation, Close
1201+
pi => {punctuation, initial}, % Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
1202+
pf => {punctuation, final}, % Punctuation, Final quote (may behave like Ps or Pe depending on usage)
1203+
po => {punctuation, other}, % Punctuation, Other
1204+
sm => {symbol, math}, % Symbol, Math
1205+
sc => {symbol, currency}, % Symbol, Currency
1206+
sk => {symbol, modifier}, % Symbol, Modifier
1207+
so => {symbol, other} % Symbol, Other
1208+
}.
12081209

12091210
gen_category(Fd, [{CP, {_, _, _, Cat}}|Rest], All) ->
1210-
gen_category(Fd, Rest, Cat, CP, CP, All, []).
1211+
{Single, Range, SubCat} = gen_category(Rest, Cat, CP, CP, All, [], [], []),
1212+
[io:format(Fd, "lookup_category(~w) -> ~w;~n", [X, C]) || {X,C} <:- Single],
1213+
1214+
Fun = fun(subcat) -> "subcat_letter(CP)";
1215+
(Category) -> io_lib:format("~w", [Category])
1216+
end,
1217+
[io:format(Fd, "lookup_category(CP) when is_integer(CP, ~w, ~w) -> ~s;~n",
1218+
[S, E, Fun(C)]) || {S,E,C} <:- optimize_ranges_1(Range)],
1219+
io:put_chars(Fd, "lookup_category(Cp) -> {other, not_assigned}.\n\n"),
1220+
1221+
{SubSingle, SubRange} = gen_letter(SubCat, All),
1222+
[io:format(Fd, "subcat_letter(~w) -> ~w;~n", [X, C]) || {X,C} <:- SubSingle],
1223+
[io:format(Fd, "subcat_letter(CP) when is_integer(CP, ~w, ~w) -> ~w;~n",
1224+
[S, E, C]) || {S,E,C} <:- optimize_ranges_1(SubRange)],
1225+
io:put_chars(Fd,
1226+
"subcat_letter(CP) ->\n"
1227+
" case case_table(CP) of\n"
1228+
" {CP, CP} -> {letter,other};\n"
1229+
" {CP, _} -> {letter,uppercase};\n"
1230+
" {_, CP} -> {letter,lowercase};\n"
1231+
" {_, _, CP, _} -> {letter,titlecase};\n"
1232+
" {CP, _, _, _} -> {letter,uppercase};\n"
1233+
" {_,CP,_,_} -> {letter,lowercase}\n"
1234+
" end.\n\n"),
1235+
ok.
12111236

1212-
gen_category(Fd, [{CP, {_, _, _, NextCat}}|Rest], Cat, Start, End, All, Acc)
1237+
gen_category([{CP, {_, _, _, NextCat}}|Rest], Cat, Start, End, All, Single, Range, SubCats)
12131238
when End+1 =:= CP ->
12141239
IsLetterCat = letter_cat(NextCat, Cat),
12151240
if NextCat =:= Cat ->
1216-
gen_category(Fd, Rest, Cat, Start, CP, All, Acc);
1241+
gen_category(Rest, Cat, Start, CP, All, Single, Range, SubCats);
12171242
IsLetterCat ->
1218-
gen_category(Fd, Rest, letter, Start, CP, All, Acc);
1243+
gen_category(Rest, letter, Start, CP, All, Single, Range, SubCats);
12191244
Start =:= End ->
1220-
io:format(Fd, "lookup_category(~w) -> ~w;~n", [Start, Cat]),
1221-
gen_category(Fd, Rest, NextCat, CP, CP, All, Acc);
1245+
gen_category(Rest, NextCat, CP, CP, All, [{Start, Cat}|Single], Range, SubCats);
12221246
true ->
12231247
case Cat of
12241248
letter ->
1225-
io:format(Fd, "lookup_category(CP) when is_integer(CP, ~w, ~w) -> subcat_letter(CP);~n",
1226-
[Start, End]),
1227-
gen_category(Fd, Rest, NextCat, CP, CP, All,
1228-
lists:reverse(lists:seq(Start, End)) ++ Acc);
1249+
gen_category(Rest, NextCat, CP, CP, All,
1250+
Single, [{Start, End, subcat}|Range],
1251+
lists:reverse(lists:seq(Start, End)) ++ SubCats);
12291252
_ ->
1230-
io:format(Fd, "lookup_category(CP) when is_integer(CP, ~w, ~w) -> ~w;~n", [Start, End, Cat]),
1231-
gen_category(Fd, Rest, NextCat, CP, CP, All, Acc)
1253+
gen_category(Rest, NextCat, CP, CP, All,
1254+
Single, [{Start, End, Cat}|Range], SubCats)
12321255
end
12331256
end;
1234-
gen_category(Fd, [{CP, {_, _, _, NewCat}}|Rest]=Cont, Cat, Start, End, All, Acc) ->
1257+
gen_category([{CP, {_, _, _, NewCat}}|Rest]=Cont, Cat, Start, End, All, Single, Range, SubCats) ->
12351258
case array:get(End+1, All) of
12361259
undefined ->
12371260
if Start =:= End ->
1238-
io:format(Fd, "lookup_category(~w) -> ~w;~n", [Start, Cat]),
1239-
gen_category(Fd, Rest, NewCat, CP, CP, All, Acc);
1261+
gen_category(Rest, NewCat, CP, CP, All,
1262+
[{Start, Cat}|Single], Range, SubCats);
12401263
true ->
12411264
case Cat of
12421265
letter ->
1243-
io:format(Fd, "lookup_category(CP) when is_integer(CP, ~w, ~w) -> subcat_letter(CP);~n",
1244-
[Start, End]),
1245-
gen_category(Fd, Rest, NewCat, CP, CP, All,
1246-
lists:reverse(lists:seq(Start, End)) ++ Acc);
1266+
gen_category(Rest, NewCat, CP, CP, All,
1267+
Single, [{Start, End, subcat}|Range],
1268+
lists:reverse(lists:seq(Start, End)) ++ SubCats);
12471269
_ ->
1248-
io:format(Fd, "lookup_category(CP) when is_integer(CP, ~w, ~w) -> ~w;~n",
1249-
[Start, End, Cat]),
1250-
gen_category(Fd, Rest, NewCat, CP, CP, All, Acc)
1270+
gen_category(Rest, NewCat, CP, CP, All,
1271+
Single, [{Start, End, Cat}|Range], SubCats)
12511272
end
12521273
end;
12531274
_ -> %% We can make ranges larger by setting already assigned category
1254-
gen_category(Fd, Cont, Cat, Start, End+1, All, Acc)
1275+
gen_category(Cont, Cat, Start, End+1, All, Single, Range, SubCats)
12551276
end;
1256-
gen_category(Fd, [], Cat, Start, End, All, Acc) ->
1277+
gen_category([], Cat, Start, End, _All, Single, Range, SubCats) ->
12571278
case Start =:= End of
12581279
true ->
1259-
io:format(Fd, "lookup_category(~w) -> ~w;~n", [Start, Cat]);
1280+
{lists:reverse([{Start, Cat}|Single]),
1281+
lists:reverse(Range),
1282+
lists:reverse(SubCats)};
12601283
false ->
1261-
io:format(Fd, "lookup_category(CP) when is_integer(CP, ~w, ~w) -> ~w;~n", [Start, End, Cat])
1262-
end,
1263-
io:put_chars(Fd, "lookup_category(Cp) -> cn.\n\n"),
1264-
gen_letter(Fd, lists:reverse(Acc), All),
1265-
ok.
1284+
{lists:reverse(Single),
1285+
lists:reverse([{Start, End,Cat}|Range]),
1286+
lists:reverse(SubCats)}
1287+
end.
12661288

12671289
letter_cat(lm, _) ->
12681290
false;
@@ -1271,59 +1293,47 @@ letter_cat(_, lm) ->
12711293
letter_cat(L1, L2) ->
12721294
is_letter(L1) andalso (L2 =:= letter orelse is_letter(L2)).
12731295

1274-
is_letter(LC) ->
1275-
lists:member(LC, [lu,ll,lt,lo,lm]).
1296+
is_letter({letter, _}) -> true;
1297+
is_letter(_) -> false.
12761298

1277-
gen_letter(Fd, Letters, All) ->
1278-
gen_letter(Fd, Letters, All, []).
1279-
gen_letter(Fd, [CP|Rest], All, Acc) ->
1299+
gen_letter(Letters, All) ->
1300+
gen_letter(Letters, All, []).
1301+
gen_letter([CP|Rest], All, Acc) ->
12801302
case array:get(CP, All) of
12811303
undefined ->
1282-
gen_letter(Fd, Rest, All, Acc);
1304+
gen_letter(Rest, All, Acc);
12831305
#cp{cat=Cat0, cs=Cs} ->
12841306
case {category(Cat0), case_table(CP,case_data(CP, Cs))} of
12851307
{Sub,Sub} ->
1286-
gen_letter(Fd, Rest, All, Acc);
1308+
gen_letter(Rest, All, Acc);
12871309
{lm,_} ->
1288-
gen_letter(Fd, Rest, All, Acc);
1310+
gen_letter(Rest, All, Acc);
12891311
{Cat, _Dbg} ->
12901312
case is_letter(Cat) of
12911313
true ->
1292-
gen_letter(Fd, Rest, All, [{CP, Cat}|Acc]);
1314+
gen_letter(Rest, All, [{CP, Cat}|Acc]);
12931315
false ->
1294-
gen_letter(Fd, Rest, All, Acc)
1316+
gen_letter(Rest, All, Acc)
12951317
end
12961318
end
12971319
end;
1298-
gen_letter(Fd, [], _, Acc) ->
1320+
gen_letter([], _, Acc) ->
12991321
[{Start, Cat}|SCletters] = lists:reverse(Acc),
1300-
subcat_letter(Fd, SCletters, Start, Start, Cat),
1301-
io:put_chars(Fd,
1302-
"subcat_letter(CP) ->\n"
1303-
" case case_table(CP) of\n"
1304-
" {CP, CP} -> lo; %{letter,other};\n"
1305-
" {CP, _} -> lu; %{letter,uppercase};\n"
1306-
" {_, CP} -> ll; %{letter,lowercase};\n"
1307-
" {_, _, CP, _} -> lt; %{letter,titlecase};\n"
1308-
" {CP, _, _, _} -> lu; %{letter,uppercase};\n"
1309-
" {_,CP,_,_} -> ll %{letter,lowercase}\n"
1310-
" end.\n\n").
1311-
1312-
subcat_letter(Fd, [{CP, Cat}|R], Start, End, Cat) when End+1 =:= CP ->
1313-
subcat_letter(Fd, R, Start, CP, Cat);
1314-
subcat_letter(Fd, Rest, Start, Start, Cat) ->
1315-
io:format(Fd, "subcat_letter(~w) -> ~w;\n",[Start,Cat]),
1316-
case Rest of
1317-
[] -> ok;
1318-
[{CP, NewCat}|R] -> subcat_letter(Fd, R, CP, CP, NewCat)
1319-
end;
1320-
subcat_letter(Fd, Rest, Start, End, Cat) ->
1321-
io:format(Fd, "subcat_letter(CP) when is_integer(CP, ~w, ~w) -> ~w;\n",[Start,End,Cat]),
1322-
case Rest of
1323-
[] -> ok;
1324-
[{CP, NewCat}|R] -> subcat_letter(Fd, R, CP, CP, NewCat)
1322+
subcat_letter(SCletters, Start, Start, Cat, [], []).
1323+
1324+
subcat_letter([{CP, Cat}|R], Start, End, Cat, Single, Range) when End+1 =:= CP ->
1325+
subcat_letter(R, Start, CP, Cat, Single, Range);
1326+
subcat_letter([{CP, NewCat}|R], Start, Start, Cat, Single, Range) ->
1327+
subcat_letter(R, CP, CP, NewCat, [{Start, Cat}|Single], Range);
1328+
subcat_letter([{CP, NewCat}|R], Start, End, Cat, Single, Range) ->
1329+
subcat_letter(R, CP, CP, NewCat, Single, [{Start, End, Cat}|Range]);
1330+
subcat_letter([], Start, End, Cat, Single, Range) ->
1331+
case Start == End of
1332+
true -> {lists:reverse([{Start, Cat}|Single]), lists:reverse(Range)};
1333+
false -> {lists:reverse(Single), lists:reverse([{Start, End, Cat}|Range])}
13251334
end.
13261335

1336+
13271337
case_table(CP, CaseData) ->
13281338
case CaseData of
13291339
{CP, CP} -> lo;

0 commit comments

Comments
 (0)