24
24
25
25
# Re-bind `files` from a list of paths to a list of UTF-8 text file objects.
files = [io.open(x, encoding='utf-8') for x in files]

# Consume the first two lines of every input except the one at index 2.
# NOTE(review): the previous revision skipped index 4 instead of 2 -- confirm
# which input really lacks the two-line header before relying on this.
headers = [[f.readline() for i in range(2)]
           for j, f in enumerate(files) if j != 2]
28
29
for j in range (7 , 9 ):
29
30
for line in files [j ]:
30
31
line = line .rstrip ()
95
96
data [0 ][0x111C8 ] = 'Consonant_Placeholder'
96
97
97
98
# Merge data into one dict:
# Count each default value once under its own column index.
for i, v in enumerate(defaults):
    values[i][v] = values[i].get(v, 0) + 1

combined = {}
for i, d in enumerate(data):
    for u, v in d.items():
        if u not in combined:
            # Sources at index 4 and above only refine code points that an
            # earlier source already introduced; they never add new ones.
            if i >= 4:
                continue
            combined[u] = list(defaults)
        combined[u][i] = v

# Drop every code point whose column 6 value is listed in DISABLED_SCRIPTS
# (presumably the script column -- confirm against `defaults`).
combined = {k: v for k, v in combined.items()
            if v[6] not in DISABLED_SCRIPTS}
data = combined
del combined
111
113
186
188
class PropertyValue(object):
    """A named value that compares and hashes like its own name.

    An instance is equal to any object exposing the same ``name`` attribute
    and to the plain string holding that name, and its hash is the hash of
    that string, so instances and strings can be mixed as dict keys or set
    members.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        if isinstance(other, str):
            return self.name == other
        if not hasattr(other, 'name'):
            # Unrelated operands (None, numbers, ...) are "not equal" rather
            # than an AttributeError; let Python try the reflected comparison.
            return NotImplemented
        return self.name == other.name

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Must agree with __eq__, which treats the bare name string as equal.
        return hash(str(self))
197
203
204
+
198
205
property_values = {}
199
206
200
207
for name in property_names :
@@ -214,50 +221,87 @@ def is_BASE(U, UISC, UDI, UGC, AJT):
214
221
AJT in [jt_C , jt_D , jt_L , jt_R ] and UISC != Joiner or
215
222
(UGC == Lo and UISC in [Avagraha , Bindu , Consonant_Final , Consonant_Medial ,
216
223
Consonant_Subjoined , Vowel , Vowel_Dependent ]))
224
+
225
+
217
226
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Brahmi_Joining_Number; other arguments are unused."""
    return UISC == Brahmi_Joining_Number
228
+
229
+
219
230
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    """Return True for Consonant_Placeholder or one of six fixed code points."""
    if UISC == Consonant_Placeholder:
        return True
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
234
+
235
+
222
236
def is_CGJ(U, UISC, UDI, UGC, AJT):
    """Match U+200D, or any code point with a truthy UDI whose UGC is Mc, Me or Mn."""
    # Also includes VARIATION_SELECTOR, WJ, and ZWJ
    # Parentheses spell out the precedence the bare `or ... and` already had.
    return U == 0x200D or (UDI and UGC in [Mc, Me, Mn])
239
+
240
+
225
241
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    """Match Consonant_Final whose UGC is not Lo, or any Consonant_Succeeding_Repha."""
    return ((UISC == Consonant_Final and UGC != Lo) or
            UISC == Consonant_Succeeding_Repha)
244
+
245
+
228
246
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Syllable_Modifier; other arguments are unused."""
    return UISC == Syllable_Modifier
248
+
249
+
230
250
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    """Match Consonant_Medial whose UGC is not Lo, or any Consonant_Initial_Postfixed."""
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return ((UISC == Consonant_Medial and UGC != Lo) or
            UISC == Consonant_Initial_Postfixed)
254
+
255
+
234
256
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    """Match Nukta, Gemination_Mark or Consonant_Killer that is_SYM_MOD does not claim."""
    return (UISC in [Nukta, Gemination_Mark, Consonant_Killer] and
            not is_SYM_MOD(U, UISC, UDI, UGC, AJT))
259
+
260
+
237
261
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    """Match Consonant_Subjoined code points whose UGC is not Lo."""
    return UISC == Consonant_Subjoined and UGC != Lo
263
+
264
+
239
265
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_With_Stacker; other arguments are unused."""
    return UISC == Consonant_With_Stacker
267
+
268
+
241
269
def is_HALANT(U, UISC, UDI, UGC, AJT):
    """Match Virama or Invisible_Stacker not claimed by the two split-off predicates."""
    return (UISC in [Virama, Invisible_Stacker]
            and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
            and not is_SAKOT(U, UISC, UDI, UGC, AJT))
273
+
274
+
245
275
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
    """Return whether U is U+1134D; every other argument is unused."""
    # Split off of HALANT
    # https://github.com/harfbuzz/harfbuzz/issues/1379
    return U == 0x1134D
279
+
280
+
249
281
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Number_Joiner; other arguments are unused."""
    return UISC == Number_Joiner
283
+
284
+
251
285
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Hieroglyph; other arguments are unused."""
    return UISC == Hieroglyph
287
+
288
+
253
289
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Hieroglyph_Joiner; other arguments are unused."""
    return UISC == Hieroglyph_Joiner
291
+
292
+
255
293
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Hieroglyph_Segment_Begin; other arguments are unused."""
    return UISC == Hieroglyph_Segment_Begin
295
+
296
+
257
297
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Hieroglyph_Segment_End; other arguments are unused."""
    return UISC == Hieroglyph_Segment_End
299
+
300
+
259
301
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Non_Joiner; other arguments are unused."""
    return UISC == Non_Joiner
303
+
304
+
261
305
def is_OTHER (U , UISC , UDI , UGC , AJT ):
262
306
# Also includes BASE_IND, Rsv, and SYM
263
307
return ((UGC in [Cn , Po ] or UISC in [Consonant_Dead , Joiner , Modifying_Letter , Other ])
@@ -266,93 +310,105 @@ def is_OTHER(U, UISC, UDI, UGC, AJT):
266
310
and not is_CGJ (U , UISC , UDI , UGC , AJT )
267
311
and not is_SYM_MOD (U , UISC , UDI , UGC , AJT )
268
312
)
313
+
314
+
269
315
def is_REPHA(U, UISC, UDI, UGC, AJT):
    """Return whether UISC is Consonant_Preceding_Repha or Consonant_Prefixed."""
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
317
+
318
+
271
319
def is_SAKOT(U, UISC, UDI, UGC, AJT):
    """Return whether U is U+1A60; every other argument is unused."""
    # Split off of HALANT
    return U == 0x1A60
322
+
323
+
274
324
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    """Return whether U lies in U+1B6B..U+1B73 inclusive; other arguments are unused."""
    # The nine code points are consecutive, so a range states the intent
    # directly and keeps the same membership semantics as the literal list.
    return U in range(0x1B6B, 0x1B73 + 1)
326
+
327
+
276
328
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    """Match Pure_Killer, or a non-Lo Vowel/Vowel_Dependent other than U+AA29."""
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC == Pure_Killer or
            (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
332
+
333
+
280
334
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    """Match tone/cantillation/register/visarga marks, or a non-Lo Bindu or U+AA29."""
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
            (UGC != Lo and (UISC == Bindu or U in [0xAA29])))
284
338
339
+
285
340
# Maps each USE category code to the predicate that recognises it.
# map_to_use asserts that exactly one predicate matches every code point.
use_mapping = {
    'B': is_BASE,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'G': is_HIEROGLYPH,
    'J': is_HIEROGLYPH_JOINER,
    'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
    'SE': is_HIEROGLYPH_SEGMENT_END,
    'ZWNJ': is_ZWNJ,
    'O': is_OTHER,
    'R': is_REPHA,
    'SK': is_SAKOT,
    'SM': is_SYM_MOD,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
}
311
366
312
367
# For the categories listed here, maps a positional suffix to the UIPC values
# that select it; map_to_use appends the matching suffix to the category code.
# A value of None means the category is never split by position.
# NOTE(review): the suffix keys carry a trailing space exactly as found in
# SOURCE -- confirm that it is intended and not an extraction artefact.
use_positions = {
    'F': {
        'ABV ': [Top],
        'BLW ': [Bottom],
        'PST ': [Right],
    },
    'M': {
        'ABV ': [Top],
        'BLW ': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'PST ': [Right],
        'PRE ': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {
        'ABV ': [Top],
        'BLW ': [Bottom, Overstruck],
    },
    'V': {
        'ABV ': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'BLW ': [Bottom, Overstruck, Bottom_And_Right],
        'PST ': [Right],
        'PRE ': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'ABV ': [Top],
        'BLW ': [Bottom, Overstruck],
        'PST ': [Right],
        'PRE ': [Left],
    },
    'SM': {
        'ABV ': [Top],
        'BLW ': [Bottom],
    },
    'H': None,
    'HVM': None,
    'B': None,
    'FM': {
        'ABV ': [Top],
        'BLW ': [Bottom],
        'PST ': [Not_Applicable],
    },
    'R': None,
    'SUB': None,
}
355
410
411
+
356
412
def map_to_use (data ):
357
413
out = {}
358
414
items = use_mapping .items ()
@@ -361,55 +417,71 @@ def map_to_use(data):
361
417
# Resolve Indic_Syllabic_Category
362
418
363
419
# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
364
- if 0x1CE2 <= U <= 0x1CE8 : UISC = Cantillation_Mark
420
+ if 0x1CE2 <= U <= 0x1CE8 :
421
+ UISC = Cantillation_Mark
365
422
366
423
# Tibetan:
367
424
# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
368
- if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F : UISC = Vowel_Dependent
425
+ if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F :
426
+ UISC = Vowel_Dependent
369
427
370
428
# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
371
- if 0x1BF2 <= U <= 0x1BF3 : UISC = Nukta ; UIPC = Bottom
429
+ if 0x1BF2 <= U <= 0x1BF3 :
430
+ UISC = Nukta
431
+ UIPC = Bottom
372
432
373
433
# TODO: U+1CED should only be allowed after some of
374
434
# the nasalization marks, maybe only for U+1CE9..U+1CF1.
375
- if U == 0x1CED : UISC = Tone_Mark
435
+ if U == 0x1CED :
436
+ UISC = Tone_Mark
376
437
377
438
# TODO: https://github.com/microsoft/font-tools/issues/1
378
- if U == 0xA982 : UISC = Consonant_Succeeding_Repha
439
+ if U == 0xA982 :
440
+ UISC = Consonant_Succeeding_Repha
379
441
380
442
values = [k for k ,v in items if v (U , UISC , UDI , UGC , AJT )]
381
- assert len (values ) == 1 , "%s %s %s %s %s %s" % (hex (U ), UISC , UDI , UGC , AJT , values )
443
+ assert len (values ) == 1 , "%s %s %s %s %s %s" % (
444
+ hex (U ), UISC , UDI , UGC , AJT , values )
382
445
USE = values [0 ]
383
446
384
447
# Resolve Indic_Positional_Category
385
448
386
449
# TODO: These should die, but have UIPC in Unicode 13.0.0
387
- if U in [0x953 , 0x954 ]: UIPC = Not_Applicable
450
+ if U in [0x953 , 0x954 ]:
451
+ UIPC = Not_Applicable
388
452
389
453
# TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0
390
- if 0xA926 <= U <= 0xA92A : UIPC = Top
454
+ if 0xA926 <= U <= 0xA92A :
455
+ UIPC = Top
456
+
391
457
# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
392
458
# and https://github.com/harfbuzz/harfbuzz/issues/1631
393
- if U in [0x11302 , 0x11303 , 0x114C1 ]: UIPC = Top
394
- if 0x1CF8 <= U <= 0x1CF9 : UIPC = Top
459
+ if U in [0x11302 , 0x11303 , 0x114C1 ]:
460
+ UIPC = Top
461
+ if 0x1CF8 <= U <= 0x1CF9 :
462
+ UIPC = Top
395
463
396
464
# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
397
465
# also https://github.com/harfbuzz/harfbuzz/issues/1012
398
- if 0x1112A <= U <= 0x1112B : UIPC = Top
399
- if 0x11131 <= U <= 0x11132 : UIPC = Top
466
+ if 0x1112A <= U <= 0x1112B :
467
+ UIPC = Top
468
+ if 0x11131 <= U <= 0x11132 :
469
+ UIPC = Top
400
470
401
471
assert (UIPC in [Not_Applicable , Visual_Order_Left ] or U == 0x0F7F or
402
472
USE in use_positions ), "%s %s %s %s %s %s %s" % (hex (U ), UIPC , USE , UISC , UDI , UGC , AJT )
403
473
404
474
pos_mapping = use_positions .get (USE , None )
405
475
if pos_mapping :
406
476
values = [k for k ,v in pos_mapping .items () if v and UIPC in v ]
407
- assert len (values ) == 1 , "%s %s %s %s %s %s %s %s" % (hex (U ), UIPC , USE , UISC , UDI , UGC , AJT , values )
477
+ assert len (values ) == 1 , "%s %s %s %s %s %s %s %s" % (
478
+ hex (U ), UIPC , USE , UISC , UDI , UGC , AJT , values )
408
479
USE = USE + values [0 ]
409
480
410
481
out [U ] = (USE , UBlock )
411
482
return out
412
483
484
+
413
485
# From here on every entry is a (USE category, block) pair; `defaults` is the
# pair used for code points that have no entry of their own.
defaults = ('O', 'No_Block')
data = map_to_use(data)
415
487
@@ -440,7 +512,7 @@ def print_block(block, start, end, data):
440
512
if u in data :
441
513
num += 1
442
514
d = data .get (u , defaults )
443
- print ('%6s,' % d [0 ]. upper () , end = '' )
515
+ print ('%6s,' % d [0 ], end = '' )
444
516
445
517
total += end - start + 1
446
518
used += num
0 commit comments