34
34
headers [j - 1 ].append (line )
35
35
headers .append (["UnicodeData.txt does not have a header." ])
36
36
37
- data = [{} for _ in files ]
37
+ unicode_data = [{} for _ in files ]
38
38
values = [{} for _ in files ]
39
39
for i , f in enumerate (files ):
40
40
for line in f :
68
68
69
69
i0 = i if i < 7 else i - 7
70
70
for u in range (start , end + 1 ):
71
- data [i0 ][u ] = t
71
+ unicode_data [i0 ][u ] = t
72
72
values [i0 ][t ] = values [i0 ].get (t , 0 ) + end - start + 1
73
73
74
74
defaults = ('Other' , 'Not_Applicable' , 'jt_X' , '' , 'Cn' , 'No_Block' , 'Unknown' )
75
75
76
- # TODO Characters that are not in Unicode Indic files, but used in USE
77
- data [0 ][0x1B61 ] = defaults [0 ]
78
- data [0 ][0x1B63 ] = defaults [0 ]
79
- data [0 ][0x1B64 ] = defaults [0 ]
80
- data [0 ][0x1B65 ] = defaults [0 ]
81
- data [0 ][0x1B66 ] = defaults [0 ]
82
- data [0 ][0x1B67 ] = defaults [0 ]
83
- data [0 ][0x1B69 ] = defaults [0 ]
84
- data [0 ][0x1B6A ] = defaults [0 ]
85
- data [0 ][0x2060 ] = defaults [0 ]
86
- # TODO https://github.com/harfbuzz/harfbuzz/pull/1685
87
- data [0 ][0x1B5B ] = 'Consonant_Placeholder'
88
- data [0 ][0x1B5C ] = 'Consonant_Placeholder'
89
- data [0 ][0x1B5F ] = 'Consonant_Placeholder'
90
- data [0 ][0x1B62 ] = 'Consonant_Placeholder'
91
- data [0 ][0x1B68 ] = 'Consonant_Placeholder'
92
- # TODO https://github.com/harfbuzz/harfbuzz/issues/1035
93
- data [0 ][0x11C44 ] = 'Consonant_Placeholder'
94
- data [0 ][0x11C45 ] = 'Consonant_Placeholder'
95
- # TODO https://github.com/harfbuzz/harfbuzz/pull/1399
96
- data [0 ][0x111C8 ] = 'Consonant_Placeholder'
97
-
98
76
# Merge data into one dict:
# Count each default once per property so that every default value is
# present in the matching `values` table even if no parsed range used it.
for i, v in enumerate(defaults):
    values[i][v] = values[i].get(v, 0) + 1

# `combined` maps a code point to a list with one slot per property,
# pre-filled from `defaults` and then overwritten by whatever each
# per-file table in `unicode_data` says about that code point.
combined = {}
for i, d in enumerate(unicode_data):
    for u, v in d.items():
        if u not in combined:
            # Tables at index 4 and above never introduce a new code point;
            # they only refine entries already created by tables 0-3.
            if i >= 4:
                continue
            combined[u] = list(defaults)
        combined[u][i] = v

# Drop every code point whose slot 6 value is listed in DISABLED_SCRIPTS.
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
113
88
114
89
115
90
property_names = [
@@ -234,8 +209,8 @@ def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
234
209
235
210
236
211
def is_CGJ(U, UISC, UDI, UGC, AJT):
    """Tell whether a code point's properties place it in the CGJ category.

    Also includes VARIATION_SELECTOR and ZWJ.

    The predicate holds when ``UISC`` equals ``Joiner``, or when ``UDI`` is
    truthy and ``UGC`` is one of ``Mc``, ``Me`` or ``Mn``. ``U`` and ``AJT``
    are accepted for signature parity with the other ``is_*`` predicates
    and are not consulted here.

    Returns:
        ``True`` for a Joiner; otherwise the value of
        ``UDI and UGC in [Mc, Me, Mn]`` (which is ``UDI`` itself when
        ``UDI`` is falsy).
    """
    if UISC == Joiner:
        return True
    return UDI and UGC in [Mc, Me, Mn]
239
214
240
215
241
216
def is_CONS_FINAL (U , UISC , UDI , UGC , AJT ):
@@ -303,12 +278,13 @@ def is_ZWNJ(U, UISC, UDI, UGC, AJT):
303
278
304
279
305
280
def is_OTHER(U, UISC, UDI, UGC, AJT):
    """Tell whether a code point's properties place it in the OTHER category.

    Also includes BASE_IND and SYM.

    A code point is a candidate when ``UGC`` is ``Po`` or ``UISC`` is one of
    ``Consonant_Dead``, ``Joiner``, ``Modifying_Letter`` or ``Other``. A
    candidate is rejected as soon as one of the more specific predicates
    (``is_BASE``, ``is_BASE_OTHER``, ``is_CGJ``, ``is_SYM_MOD``,
    ``is_Word_Joiner``) claims it, so that it matches only one category.

    Returns:
        ``True`` when the code point is a candidate and none of the more
        specific predicates match, ``False`` otherwise.
    """
    is_candidate = (
        UGC == Po
        or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other]
    )
    if not is_candidate:
        return False

    # Checked in the same order as the original chained `and not ...` terms,
    # stopping at the first predicate that matches.
    more_specific = (is_BASE, is_BASE_OTHER, is_CGJ, is_SYM_MOD, is_Word_Joiner)
    for claims in more_specific:
        if claims(U, UISC, UDI, UGC, AJT):
            return False
    return True
313
289
314
290
@@ -326,16 +302,20 @@ def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
326
302
327
303
328
304
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    """Tell whether a code point's properties place it in the VOWEL category.

    The predicate holds when ``UISC`` is ``Pure_Killer``, or when ``UGC`` is
    not ``Lo`` and ``UISC`` is ``Vowel`` or ``Vowel_Dependent``. ``U``,
    ``UDI`` and ``AJT`` are not consulted.

    Returns:
        ``True`` or ``False``.
    """
    if UISC == Pure_Killer:
        return True
    return UGC != Lo and UISC in [Vowel, Vowel_Dependent]
332
307
333
308
334
309
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    """Tell whether a code point's properties place it in the VOWEL_MOD category.

    The predicate holds when ``UISC`` is one of ``Tone_Mark``,
    ``Cantillation_Mark``, ``Register_Shifter`` or ``Visarga``, or when
    ``UGC`` is not ``Lo`` and ``UISC`` is ``Bindu``. ``U``, ``UDI`` and
    ``AJT`` are not consulted.

    Returns:
        ``True`` or ``False``.
    """
    if UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga]:
        return True
    return UGC != Lo and UISC == Bindu
338
312
313
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
    """Tell whether a code point's properties place it in the WJ category.

    Also includes Rsv.

    The predicate holds in either of two cases:

    * ``UDI`` is truthy, ``U`` is not one of the explicitly excluded code
      points, ``UISC`` is ``Other`` and ``is_CGJ`` does not claim the code
      point; or
    * ``UGC`` is ``Cn``.

    ``AJT`` is only forwarded to ``is_CGJ``.

    Returns:
        ``True`` when the first case holds, otherwise the result of
        ``UGC == Cn``.
    """
    # NOTE(review): presumably code points with UDI set that must not be
    # treated as word joiners -- confirm against the Unicode data files.
    excluded_code_points = [
        0x115F, 0x1160, 0x3164, 0xFFA0,
        0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3,
    ]
    if (UDI
            and U not in excluded_code_points
            and UISC == Other
            and not is_CGJ(U, UISC, UDI, UGC, AJT)):
        return True
    return UGC == Cn
339
319
340
320
use_mapping = {
341
321
'B' : is_BASE ,
@@ -362,6 +342,7 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
362
342
'SM' : is_SYM_MOD ,
363
343
'V' : is_VOWEL ,
364
344
'VM' : is_VOWEL_MOD ,
345
+ 'WJ' : is_Word_Joiner ,
365
346
}
366
347
367
348
use_positions = {
@@ -435,10 +416,6 @@ def map_to_use(data):
435
416
if U == 0x1CED :
436
417
UISC = Tone_Mark
437
418
438
- # TODO: https://github.com/microsoft/font-tools/issues/1
439
- if U == 0xA982 :
440
- UISC = Consonant_Succeeding_Repha
441
-
442
419
values = [k for k ,v in items if v (U , UISC , UDI , UGC , AJT )]
443
420
assert len (values ) == 1 , "%s %s %s %s %s %s" % (
444
421
hex (U ), UISC , UDI , UGC , AJT , values )
@@ -482,19 +459,20 @@ def map_to_use(data):
482
459
return out
483
460
484
461
485
- defaults = ('O' , 'No_Block' )
486
- data = map_to_use (data )
462
+ use_data = map_to_use (combined )
487
463
488
464
print ('// WARNING: this file was generated by ../scripts/gen-universal-table.py' )
489
465
print ()
466
+ print ('use crate::GlyphInfo;' )
467
+ print ('use unicode_properties::GeneralCategory;' )
490
468
print ('use super::universal::{Category, category::*};' )
491
469
492
470
total = 0
493
471
used = 0
494
472
last_block = None
495
473
496
474
497
- def print_block (block , start , end , data ):
475
+ def print_block (block , start , end , use_data ):
498
476
global total , used , last_block
499
477
if block and block != last_block :
500
478
print ()
@@ -509,18 +487,24 @@ def print_block(block, start, end, data):
509
487
if u % 16 == 0 :
510
488
print ()
511
489
print (' /* %04X */' % u , end = '' )
512
- if u in data :
490
+ if u in use_data :
513
491
num += 1
514
- d = data .get (u , defaults )
515
- print ('%6s,' % d [0 ], end = '' )
492
+ d = use_data .get (u )
493
+ if d is not None :
494
+ d = d [0 ]
495
+ elif u in unicode_data [4 ]:
496
+ d = 'O'
497
+ else :
498
+ d = 'WJ'
499
+ print ("%6s," % d , end = '' )
516
500
517
501
total += end - start + 1
518
502
used += num
519
503
if block :
520
504
last_block = block
521
505
522
506
523
- uu = sorted (data .keys ())
507
+ uu = sorted (use_data .keys ())
524
508
525
509
last = - 100000
526
510
num = 0
@@ -534,19 +518,19 @@ def print_block(block, start, end, data):
534
518
for u in uu :
535
519
if u <= last :
536
520
continue
537
- if data [u ][0 ] == 'O' :
521
+ if use_data [u ][0 ] == 'O' :
538
522
continue
539
- block = data [u ][1 ]
523
+ block = use_data [u ][1 ]
540
524
541
525
start = u // 8 * 8
542
526
end = start + 1
543
- while end in uu and block == data [end ][1 ]:
527
+ while end in uu and block == use_data [end ][1 ]:
544
528
end += 1
545
529
end = (end - 1 ) // 8 * 8 + 7
546
530
547
531
if start != last + 1 :
548
532
if start - last <= 1 + 16 * 3 :
549
- print_block (None , last + 1 , start - 1 , data )
533
+ print_block (None , last + 1 , start - 1 , use_data )
550
534
last = start - 1
551
535
else :
552
536
if last >= 0 :
@@ -556,7 +540,7 @@ def print_block(block, start, end, data):
556
540
(start , offset ))
557
541
starts .append (start )
558
542
559
- print_block (block , start , end , data )
543
+ print_block (block , start , end , use_data )
560
544
last = end
561
545
ends .append (last + 1 )
562
546
offset += ends [- 1 ] - starts [- 1 ]
@@ -570,7 +554,8 @@ def print_block(block, start, end, data):
570
554
print (o )
571
555
print ()
572
556
print ('#[rustfmt::skip]' )
573
- print ('pub fn get_category(u: u32) -> Category {' )
557
+ print ('pub fn get_category(info: &GlyphInfo) -> Category {' )
558
+ print (' let u = info.glyph_id;' )
574
559
print (' match u >> %d {' % page_bits )
575
560
pages = set ([u >> page_bits for u in starts + ends ])
576
561
for p in sorted (pages ):
@@ -585,6 +570,10 @@ def print_block(block, start, end, data):
585
570
print (' _ => {}' )
586
571
print (' }' )
587
572
print ()
573
+ print (' if info.general_category() == GeneralCategory::Unassigned {' )
574
+ print (' return WJ;' )
575
+ print (' }' )
576
+ print ()
588
577
print (' O' )
589
578
print ('}' )
590
579
0 commit comments