Skip to content

Commit 98e1a00

Browse files
committed
Improve formatting
1 parent d518165 commit 98e1a00

File tree

1 file changed

+138
-66
lines changed

1 file changed

+138
-66
lines changed

scripts/gen-universal-table.py

Lines changed: 138 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,8 @@
2424

2525
files = [io.open(x, encoding='utf-8') for x in files]
2626

27-
headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
27+
headers = [[f.readline() for i in range(2)]
28+
for j, f in enumerate(files) if j != 2]
2829
for j in range(7, 9):
2930
for line in files[j]:
3031
line = line.rstrip()
@@ -95,17 +96,18 @@
9596
data[0][0x111C8] = 'Consonant_Placeholder'
9697

9798
# Merge data into one dict:
98-
for i,v in enumerate (defaults):
99+
for i, v in enumerate(defaults):
99100
values[i][v] = values[i].get (v, 0) + 1
100101
combined = {}
101-
for i,d in enumerate (data):
102-
for u,v in d.items ():
102+
for i, d in enumerate(data):
103+
for u, v in d.items():
103104
if not u in combined:
104105
if i >= 4:
105106
continue
106107
combined[u] = list (defaults)
107108
combined[u][i] = v
108-
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
109+
combined = {k: v for k, v in combined.items(
110+
) if v[6] not in DISABLED_SCRIPTS}
109111
data = combined
110112
del combined
111113

@@ -186,15 +188,20 @@
186188
class PropertyValue(object):
    """A named property value that compares equal to its own name.

    An instance is equal to a plain string holding the same name and to any
    object whose ``name`` attribute holds the same name. It hashes like the
    name string, so instances and strings can be mixed in lists, sets and
    dict keys.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        if isinstance(other, str):
            return self.name == other
        try:
            other_name = other.name
        except AttributeError:
            # Unrelated objects (None, numbers, ...) used to raise here;
            # let Python fall back to its default comparison instead.
            return NotImplemented
        return self.name == other_name

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Must agree with __eq__: equal to the string, so hash like it.
        return hash(str(self))
197203

204+
198205
property_values = {}
199206

200207
for name in property_names:
@@ -214,50 +221,87 @@ def is_BASE(U, UISC, UDI, UGC, AJT):
214221
AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
215222
(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
216223
Consonant_Subjoined, Vowel, Vowel_Dependent]))
224+
225+
217226
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Brahmi_Joining_Number.

    Only UISC is consulted; the other arguments are accepted so that every
    predicate can be called with the same five arguments.
    """
    is_joining_number = UISC == Brahmi_Joining_Number
    return is_joining_number
228+
229+
219230
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    """Return True for Consonant_Placeholder or a listed code point.

    Any character whose UISC is Consonant_Placeholder qualifies; otherwise
    the result is whether U is one of six hard-coded code points.
    """
    listed_code_points = [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    return True if UISC == Consonant_Placeholder else U in listed_code_points
234+
235+
222236
def is_CGJ(U, UISC, UDI, UGC, AJT):
    """Return a truthy value for U+200D or a UDI character in Mc/Me/Mn.

    When U is not 0x200D the result is the value of
    ``UDI and UGC in [Mc, Me, Mn]``, so a falsy UDI is returned as-is.
    """
    # Also includes VARIATION_SELECTOR, WJ, and ZWJ
    if U == 0x200D:
        return True
    return UDI and UGC in [Mc, Me, Mn]
239+
240+
225241
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    """Return True for a non-Lo Consonant_Final or Consonant_Succeeding_Repha.

    A Consonant_Final character counts only when its UGC is not Lo;
    Consonant_Succeeding_Repha counts regardless of UGC.
    """
    if UISC == Consonant_Final and UGC != Lo:
        return True
    return UISC == Consonant_Succeeding_Repha
244+
245+
228246
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Syllable_Modifier.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    is_syllable_modifier = UISC == Syllable_Modifier
    return is_syllable_modifier
248+
249+
230250
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    """Return True for a non-Lo Consonant_Medial or Consonant_Initial_Postfixed.

    A Consonant_Medial character counts only when its UGC is not Lo;
    Consonant_Initial_Postfixed counts regardless of UGC.
    """
    if UISC == Consonant_Medial and UGC != Lo:
        return True
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return UISC == Consonant_Initial_Postfixed
254+
255+
234256
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    """Return True for Nukta/Gemination_Mark/Consonant_Killer, minus SYM_MOD.

    Characters accepted by is_SYM_MOD are excluded even when their UISC is
    one of the three listed values.
    """
    if UISC not in [Nukta, Gemination_Mark, Consonant_Killer]:
        return False
    return not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
259+
260+
237261
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_Subjoined and UGC is not Lo."""
    is_subjoined = UISC == Consonant_Subjoined
    return is_subjoined and UGC != Lo
263+
264+
239265
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Consonant_With_Stacker.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    has_stacker = UISC == Consonant_With_Stacker
    return has_stacker
267+
268+
241269
def is_HALANT(U, UISC, UDI, UGC, AJT):
    """Return True for Virama/Invisible_Stacker not claimed by a split-off class.

    Characters accepted by is_HALANT_OR_VOWEL_MODIFIER or by is_SAKOT are
    excluded, so that they are matched by those predicates only.
    """
    if UISC not in [Virama, Invisible_Stacker]:
        return False
    if is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
        return False
    return not is_SAKOT(U, UISC, UDI, UGC, AJT)
273+
274+
245275
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
    """Return whether U is the single code point 0x1134D.

    Only U is consulted; the remaining arguments are ignored.
    """
    # Split off of HALANT
    # https://github.com/harfbuzz/harfbuzz/issues/1379
    split_off_code_point = 0x1134D
    return U == split_off_code_point
279+
280+
249281
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Number_Joiner.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    is_number_joiner = UISC == Number_Joiner
    return is_number_joiner
283+
284+
251285
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Hieroglyph.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    is_hieroglyph = UISC == Hieroglyph
    return is_hieroglyph
287+
288+
253289
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Hieroglyph_Joiner.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    is_joiner = UISC == Hieroglyph_Joiner
    return is_joiner
291+
292+
255293
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Hieroglyph_Segment_Begin.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    is_segment_begin = UISC == Hieroglyph_Segment_Begin
    return is_segment_begin
295+
296+
257297
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Hieroglyph_Segment_End.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    is_segment_end = UISC == Hieroglyph_Segment_End
    return is_segment_end
299+
300+
259301
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    """Return whether UISC equals Non_Joiner.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    is_non_joiner = UISC == Non_Joiner
    return is_non_joiner
303+
304+
261305
def is_OTHER(U, UISC, UDI, UGC, AJT):
262306
# Also includes BASE_IND, Rsv, and SYM
263307
return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
@@ -266,93 +310,105 @@ def is_OTHER(U, UISC, UDI, UGC, AJT):
266310
and not is_CGJ(U, UISC, UDI, UGC, AJT)
267311
and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
268312
)
313+
314+
269315
def is_REPHA(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_Preceding_Repha or Consonant_Prefixed.

    Only UISC is consulted; the remaining arguments are ignored.
    """
    repha_categories = [Consonant_Preceding_Repha, Consonant_Prefixed]
    return UISC in repha_categories
317+
318+
271319
def is_SAKOT(U, UISC, UDI, UGC, AJT):
    """Return whether U is the single code point 0x1A60.

    Only U is consulted; the remaining arguments are ignored.
    """
    # Split off of HALANT
    sakot_code_point = 0x1A60
    return U == sakot_code_point
322+
323+
274324
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether U is one of the nine code points 0x1B6B..0x1B73.

    Only U is consulted; the remaining arguments are ignored.
    """
    sym_mod_code_points = (
        0x1B6B, 0x1B6C, 0x1B6D,
        0x1B6E, 0x1B6F, 0x1B70,
        0x1B71, 0x1B72, 0x1B73,
    )
    return U in sym_mod_code_points
326+
327+
276328
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    """Return True for Pure_Killer or a non-Lo Vowel/Vowel_Dependent.

    Code point 0xAA29 is excluded from the vowel branch.
    """
    # https://github.com/harfbuzz/harfbuzz/issues/376
    if UISC == Pure_Killer:
        return True
    return (UGC != Lo
            and UISC in [Vowel, Vowel_Dependent]
            and U not in [0xAA29])
332+
333+
280334
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    """Return True for tone-like marks, or a non-Lo Bindu or 0xAA29.

    Tone_Mark, Cantillation_Mark, Register_Shifter and Visarga always
    qualify. Bindu and code point 0xAA29 qualify only when UGC is not Lo.
    """
    # https://github.com/harfbuzz/harfbuzz/issues/376
    if UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga]:
        return True
    return UGC != Lo and (UISC == Bindu or U in [0xAA29])
284338

339+
285340
# Category tag -> predicate deciding whether a character belongs to it.
# map_to_use() calls every predicate with (U, UISC, UDI, UGC, AJT) and
# asserts that exactly one of them accepts each character.
use_mapping = {
    'B': is_BASE,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'G': is_HIEROGLYPH,
    'J': is_HIEROGLYPH_JOINER,
    'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
    'SE': is_HIEROGLYPH_SEGMENT_END,
    'ZWNJ': is_ZWNJ,
    'O': is_OTHER,
    'R': is_REPHA,
    'SK': is_SAKOT,
    'SM': is_SYM_MOD,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
}
311366

312367
# Category tag -> {position suffix: [UIPC values selecting that suffix]}.
# map_to_use() appends the matching suffix to the category tag. A value of
# None marks a category that may carry a UIPC but receives no suffix.
use_positions = {
    'F': {
        'ABV': [Top],
        'BLW': [Bottom],
        'PST': [Right],
    },
    'M': {
        'ABV': [Top],
        'BLW': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'PST': [Right],
        'PRE': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {
        'ABV': [Top],
        'BLW': [Bottom, Overstruck],
    },
    'V': {
        'ABV': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'BLW': [Bottom, Overstruck, Bottom_And_Right],
        'PST': [Right],
        'PRE': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'ABV': [Top],
        'BLW': [Bottom, Overstruck],
        'PST': [Right],
        'PRE': [Left],
    },
    'SM': {
        'ABV': [Top],
        'BLW': [Bottom],
    },
    'H': None,
    'HVM': None,
    'B': None,
    'FM': {
        'ABV': [Top],
        'BLW': [Bottom],
        'PST': [Not_Applicable],
    },
    'R': None,
    'SUB': None,
}
355410

411+
356412
def map_to_use(data):
357413
out = {}
358414
items = use_mapping.items()
@@ -361,55 +417,71 @@ def map_to_use(data):
361417
# Resolve Indic_Syllabic_Category
362418

363419
# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
364-
if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
420+
if 0x1CE2 <= U <= 0x1CE8:
421+
UISC = Cantillation_Mark
365422

366423
# Tibetan:
367424
# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
368-
if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
425+
if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F:
426+
UISC = Vowel_Dependent
369427

370428
# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
371-
if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom
429+
if 0x1BF2 <= U <= 0x1BF3:
430+
UISC = Nukta
431+
UIPC = Bottom
372432

373433
# TODO: U+1CED should only be allowed after some of
374434
# the nasalization marks, maybe only for U+1CE9..U+1CF1.
375-
if U == 0x1CED: UISC = Tone_Mark
435+
if U == 0x1CED:
436+
UISC = Tone_Mark
376437

377438
# TODO: https://github.com/microsoft/font-tools/issues/1
378-
if U == 0xA982: UISC = Consonant_Succeeding_Repha
439+
if U == 0xA982:
440+
UISC = Consonant_Succeeding_Repha
379441

380442
values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
381-
assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, values)
443+
assert len(values) == 1, "%s %s %s %s %s %s" % (
444+
hex(U), UISC, UDI, UGC, AJT, values)
382445
USE = values[0]
383446

384447
# Resolve Indic_Positional_Category
385448

386449
# TODO: These should die, but have UIPC in Unicode 13.0.0
387-
if U in [0x953, 0x954]: UIPC = Not_Applicable
450+
if U in [0x953, 0x954]:
451+
UIPC = Not_Applicable
388452

389453
# TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0
390-
if 0xA926 <= U <= 0xA92A: UIPC = Top
454+
if 0xA926 <= U <= 0xA92A:
455+
UIPC = Top
456+
391457
# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
392458
# and https://github.com/harfbuzz/harfbuzz/issues/1631
393-
if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
394-
if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
459+
if U in [0x11302, 0x11303, 0x114C1]:
460+
UIPC = Top
461+
if 0x1CF8 <= U <= 0x1CF9:
462+
UIPC = Top
395463

396464
# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
397465
# also https://github.com/harfbuzz/harfbuzz/issues/1012
398-
if 0x1112A <= U <= 0x1112B: UIPC = Top
399-
if 0x11131 <= U <= 0x11132: UIPC = Top
466+
if 0x1112A <= U <= 0x1112B:
467+
UIPC = Top
468+
if 0x11131 <= U <= 0x11132:
469+
UIPC = Top
400470

401471
assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
402472
USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)
403473

404474
pos_mapping = use_positions.get(USE, None)
405475
if pos_mapping:
406476
values = [k for k,v in pos_mapping.items() if v and UIPC in v]
407-
assert len(values) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, values)
477+
assert len(values) == 1, "%s %s %s %s %s %s %s %s" % (
478+
hex(U), UIPC, USE, UISC, UDI, UGC, AJT, values)
408479
USE = USE + values[0]
409480

410481
out[U] = (USE, UBlock)
411482
return out
412483

484+
413485
defaults = ('O', 'No_Block')
414486
data = map_to_use(data)
415487

@@ -440,7 +512,7 @@ def print_block(block, start, end, data):
440512
if u in data:
441513
num += 1
442514
d = data.get(u, defaults)
443-
print('%6s,' % d[0].upper(), end='')
515+
print('%6s,' % d[0], end='')
444516

445517
total += end - start + 1
446518
used += num

0 commit comments

Comments
 (0)