Skip to content

Commit 06d7fbd

Browse files
ralfstx and claude committed
✨ Add script detection for text shaping
Text shaping previously used no `scriptTag`, so all text was shaped with `DFLT`. This missed script-specific shaping rules for non-Latin scripts like Arabic contextual forms or Devanagari conjuncts. This commit adds a new `script-detection` module that detects the Unicode script of text characters using a range-based lookup table, segments text into script runs, and maps scripts to OpenType tags. The table of Unicode script ranges is generated from the official Unicode Character Database (Scripts.txt) with a custom script (`scripts/generate-script-ranges.ts`). It includes all scripts with OpenType tag mappings plus Common and Inherited. `extractTextSegments()` now segments by script before splitting into chunks, passing the resolved `scriptTag` through to `font.shapeText()`. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 22cc038 commit 06d7fbd

File tree

9 files changed

+1103
-29
lines changed

9 files changed

+1103
-29
lines changed

NOTICE.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
This project includes data derived from the Unicode Character Database.
2+
3+
Copyright © Unicode, Inc.
4+
Licensed under the Unicode License.
5+
https://www.unicode.org/license.html
6+
7+
See scripts/generate-script-ranges.ts for details on how the data was derived.

examples/text.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,17 @@ const document = {
8989
margin: { y: 10 },
9090
},
9191
),
92+
// Text kerning and ligatures
93+
rows(
94+
[
95+
text('AV, Ta with kerning, fi, ffi with ligatures.'),
96+
text('AV, Ta without kerning, fi, ffi without ligatures.', {
97+
fontKerning: 'none',
98+
fontVariantLigatures: 'none',
99+
}),
100+
],
101+
{ margin: { y: 10 } },
102+
),
92103
],
93104
};
94105

scripts/generate-script-ranges.ts

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/**
2+
* Generates src/script-ranges.gen.ts from Unicode Scripts.txt.
3+
*
4+
* Usage: node scripts/generate-script-ranges.ts
5+
*
6+
* Input: vendor.local/unicode/UCD/Scripts.txt (Unicode Character Database) in the project root.
7+
* Output: src/script-ranges.gen.ts — sorted, merged script range table.
8+
*/
9+
import { readFileSync, writeFileSync } from 'node:fs';
10+
import { dirname, resolve } from 'node:path';
11+
import { fileURLToPath } from 'node:url';
12+
13+
// All paths are derived from this script's own location (not the current
// working directory): the project root is the parent of the scripts folder.
const scriptDir = dirname(fileURLToPath(import.meta.url));
const projectRoot = resolve(scriptDir, '..');

// Local copy of the Unicode Character Database file that is parsed.
const inputPath = resolve(projectRoot, 'vendor.local/unicode/UCD/Scripts.txt');

// Destination of the generated range table.
const outputPath = resolve(projectRoot, 'src/script-ranges.gen.ts');
16+
17+
/**
 * Names of the Unicode scripts that are kept in the generated table.
 *
 * The list consists of Common and Inherited (needed for script resolution in
 * segmentByScript) followed by the scripts that have OpenType tag mappings
 * (used for text shaping). Ranges of any other script are dropped while
 * parsing.
 */
const includedScriptNames: string[] = [
  // Resolved against neighbouring text rather than shaped on their own
  'Common',
  'Inherited',
  // European and Middle Eastern scripts
  'Latin',
  'Greek',
  'Cyrillic',
  'Armenian',
  'Hebrew',
  'Arabic',
  // Indic scripts
  'Devanagari',
  'Bengali',
  'Gurmukhi',
  'Gujarati',
  'Tamil',
  'Telugu',
  'Kannada',
  'Malayalam',
  // Other scripts
  'Thai',
  'Georgian',
  'Hangul',
  'Hiragana',
  'Katakana',
  'Han',
];
const includedScripts = new Set(includedScriptNames);
46+
47+
/** A code point range (both ends inclusive) tagged with a Unicode script name. */
type Range = { start: number; end: number; script: string };

// --- Parse ---

// Data lines have the format:
// 0000..001F ; Common # Cc (...more comments)
// 0020 ; Common # Zs (...more comments)
// i.e. a single code point or a `first..last` pair in uppercase hex, a
// semicolon, and the script name. Anything after the name is ignored.
const dataLinePattern = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\w+)/;

const input = readFileSync(inputPath, 'utf-8');
const ranges: Range[] = [];

input.split('\n').forEach((rawLine) => {
  const text = rawLine.trim();

  // Blank lines and comment-only lines carry no data.
  if (text === '' || text.startsWith('#')) return;

  const fields = dataLinePattern.exec(text);
  if (fields === null) return;

  const [, first, last, script] = fields;

  // Only scripts on the include list make it into the table.
  if (!includedScripts.has(script)) return;

  const start = parseInt(first, 16);
  // A line with a single code point describes a one-element range.
  const end = last ? parseInt(last, 16) : start;
  ranges.push({ start, end, script });
});
72+
73+
// --- Sort ---

// Order by first code point so that ranges which touch each other end up
// next to each other in the list.
ranges.sort((left, right) => left.start - right.start);

// --- Merge adjacent ranges for the same script ---

// Walk the sorted list once. A range is folded into the most recent output
// entry when both have the same script and the range overlaps or directly
// follows that entry; otherwise it starts a new entry. Output entries are
// fresh objects, so extending one never touches the parsed entries.
const merged: Range[] = [];
for (const { start, end, script } of ranges) {
  const last = merged.length > 0 ? merged[merged.length - 1] : undefined;
  if (last === undefined || last.script !== script || start > last.end + 1) {
    merged.push({ start, end, script });
    continue;
  }
  last.end = Math.max(last.end, end);
}
88+
89+
// --- Validate ---

// In the sorted, merged list every range has to start strictly after the
// previous one ends. If it does not, two entries claim the same code point;
// abort with both offenders named instead of writing an ambiguous table.
merged.forEach((current, index) => {
  if (index === 0) return;
  const previous = merged[index - 1];
  if (current.start <= previous.end) {
    throw new Error(
      `Overlapping ranges: ${formatRange(previous)} and ${formatRange(current)}`,
    );
  }
});
98+
99+
// --- Generate ---

// Scripts.txt begins with a header line such as `# Scripts-15.1.0.txt`. Take
// the Unicode version from there so the generated file names the data it was
// really built from; fall back to the previously hard-coded version when no
// such line is present.
const versionMatch = /^#\s*Scripts-(\d+)\.(\d+)\.\d+\.txt/m.exec(input);
const unicodeVersion = versionMatch ? `${versionMatch[1]}.${versionMatch[2]}` : '15.1';

// Source text of the generated module, one array element per output line.
// Starts with the file header, the ScriptRange type, and the opening of the
// scriptRanges array.
const lines: string[] = [
  '/**',
  ` * This file is generated from Unicode Scripts.txt (Unicode ${unicodeVersion}).`,
  ' *',
  ' * Source: https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt',
  ' * Licensed under the Unicode License: https://www.unicode.org/license.html',
  ' *',
  ' * Do not edit manually. Regenerate with: scripts/generate-script-ranges.ts',
  ' */',
  '',
  '/** A Unicode code point range with an associated script. */',
  'export type ScriptRange = {',
  ' start: number;',
  ' end: number;',
  ' script: string;',
  '};',
  '',

  `/** A list of sorted, non-overlapping Unicode code point ranges covering ${includedScripts.size} scripts. */`,
  'export const scriptRanges: ScriptRange[] = [',
];

// One array entry per merged range, with code points written as hex literals
// padded to at least four digits.
for (const range of merged) {
  const s = `0x${range.start.toString(16).padStart(4, '0')}`;
  const e = `0x${range.end.toString(16).padStart(4, '0')}`;
  lines.push(` { start: ${s}, end: ${e}, script: '${range.script}' },`);
}

// Close the array with a plain `];`. The declaration is annotated as the
// mutable `ScriptRange[]`; a trailing `as const` would turn the literal into a
// readonly tuple, which TypeScript does not allow to be assigned to a mutable
// array type, so the generated module would fail to type-check.
lines.push('];', '');

writeFileSync(outputPath, lines.join('\n'));

console.log(`Generated ${outputPath}`);
console.log(` ${merged.length} ranges (merged from ${ranges.length} entries)`);
135+
136+
/**
 * Formats a range for use in error messages, in the notation Scripts.txt
 * itself uses for code points, e.g. `0041..005A Latin`.
 *
 * @param r The range to format.
 * @returns `START..END script`, with both code points in uppercase hex,
 *   zero-padded to at least four digits.
 */
function formatRange(r: Range): string {
  // Pad to four digits like the generated table does, so a reported range
  // reads the same way it is written in Scripts.txt (0041, not 41).
  const hex = (codePoint: number): string =>
    codePoint.toString(16).toUpperCase().padStart(4, '0');
  return `${hex(r.start)}..${hex(r.end)} ${r.script}`;
}

scripts/tsconfig.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"extends": "../tsconfig.json",
3+
"compilerOptions": {
4+
"noEmit": true
5+
},
6+
"include": ["./**/*.ts"]
7+
}

src/script-detection.test.ts

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
import { describe, expect, it } from 'vitest';
2+
3+
import { detectScript, scriptToOpenTypeTag, segmentByScript } from './script-detection.ts';
4+
5+
describe('script-detection', () => {
6+
describe('detectScript', () => {
  /** Asserts, in order, that each of the given code points is detected as `script`. */
  const expectScript = (script: string, ...codePoints: number[]): void => {
    for (const codePoint of codePoints) {
      expect(detectScript(codePoint)).toBe(script);
    }
  };

  it('returns Latin for ASCII letters', () => {
    expectScript('Latin', 0x41, 0x5a, 0x61, 0x7a); // A, Z, a, z
  });

  it('returns Common for ASCII digits', () => {
    expectScript('Common', 0x30, 0x39); // 0, 9
  });

  it('returns Common for space and punctuation', () => {
    expectScript('Common', 0x20, 0x2c, 0x2e, 0x21); // space, comma, period, !
  });

  it('returns Latin for extended Latin characters', () => {
    expectScript('Latin', 0x00c0, 0x00e9, 0x017e); // À, é, ž
  });

  it('returns Inherited for combining diacritical marks', () => {
    // combining grave accent, combining acute accent, last in range
    expectScript('Inherited', 0x0300, 0x0301, 0x036f);
  });

  it('returns Greek for Greek characters', () => {
    expectScript('Greek', 0x0391, 0x03c9); // Α (Alpha), ω (omega)
  });

  it('returns Cyrillic for Cyrillic characters', () => {
    expectScript('Cyrillic', 0x0410, 0x044f); // А, я
  });

  it('returns Hebrew for Hebrew characters', () => {
    expectScript('Hebrew', 0x05d0); // א (alef)
  });

  it('returns Arabic for Arabic characters', () => {
    expectScript('Arabic', 0x0627, 0x0645); // ا (alif), م (meem)
  });

  it('returns Devanagari for Devanagari characters', () => {
    expectScript('Devanagari', 0x0915, 0x0928); // क, न
  });

  it('returns Thai for Thai characters', () => {
    expectScript('Thai', 0x0e01); // ก
  });

  it('returns Han for CJK ideographs', () => {
    expectScript('Han', 0x4e2d, 0x6587); // 中, 文
  });

  it('returns Hiragana for Hiragana characters', () => {
    expectScript('Hiragana', 0x3042); // あ
  });

  it('returns Katakana for Katakana characters', () => {
    expectScript('Katakana', 0x30a2); // ア
  });

  it('returns Hangul for Hangul syllables', () => {
    expectScript('Hangul', 0xac00); // 가
  });

  it('returns Han for CJK Extension B (supplementary plane)', () => {
    expectScript('Han', 0x20000);
  });

  it('returns Common for unmapped code points', () => {
    expectScript('Common', 0x10ffff);
  });
});
91+
92+
describe('segmentByScript', () => {
  /** Builds the expected value for one script run. */
  const run = (text: string, script: string) => ({ text, script });

  it('returns empty array for empty string', () => {
    expect(segmentByScript('')).toEqual([]);
  });

  it('returns single run for pure Latin text', () => {
    expect(segmentByScript('Hello')).toEqual([run('Hello', 'Latin')]);
  });

  it('returns single run for Latin text with spaces', () => {
    expect(segmentByScript('Hello world')).toEqual([run('Hello world', 'Latin')]);
  });

  it('resolves all-Common text to Common', () => {
    expect(segmentByScript('123 456')).toEqual([run('123 456', 'Common')]);
  });

  it('splits Latin and Cyrillic', () => {
    const runs = segmentByScript('Hello Мир');
    expect(runs).toEqual([run('Hello ', 'Latin'), run('Мир', 'Cyrillic')]);
  });

  it('resolves leading Common to first concrete script', () => {
    expect(segmentByScript('(Мир)')).toEqual([run('(Мир)', 'Cyrillic')]);
  });

  it('resolves trailing punctuation to preceding script', () => {
    const runs = segmentByScript('Hello, Мир!');
    expect(runs).toEqual([run('Hello, ', 'Latin'), run('Мир!', 'Cyrillic')]);
  });

  it('splits Latin and CJK', () => {
    const runs = segmentByScript('abc中文def');
    expect(runs).toEqual([run('abc', 'Latin'), run('中文', 'Han'), run('def', 'Latin')]);
  });

  it('handles Arabic text', () => {
    expect(segmentByScript('مرحبا')).toEqual([run('مرحبا', 'Arabic')]);
  });

  it('handles Devanagari text', () => {
    expect(segmentByScript('नमस्ते')).toEqual([run('नमस्ते', 'Devanagari')]);
  });

  it('keeps combining marks with base character', () => {
    // Latin e followed by a combining acute accent: one Latin run
    expect(segmentByScript('e\u0301')).toEqual([run('e\u0301', 'Latin')]);
  });

  it('resolves combining marks to preceding script', () => {
    // Arabic letter followed by a combining mark: one Arabic run
    expect(segmentByScript('\u0627\u0300')).toEqual([run('\u0627\u0300', 'Arabic')]);
  });

  it('handles three different scripts', () => {
    const runs = segmentByScript('Hello Мир 中文');
    expect(runs).toEqual([
      run('Hello ', 'Latin'),
      run('Мир ', 'Cyrillic'),
      run('中文', 'Han'),
    ]);
  });

  it('handles supplementary plane characters', () => {
    // U+20000 is a CJK Extension B character
    expect(segmentByScript('\u{20000}')).toEqual([run('\u{20000}', 'Han')]);
  });
});
166+
167+
describe('scriptToOpenTypeTag', () => {
  /** Asserts that `script` is mapped to the OpenType script tag `tag`. */
  const expectTag = (script: string, tag: string): void => {
    expect(scriptToOpenTypeTag(script)).toBe(tag);
  };

  it('maps Latin to latn', () => {
    expectTag('Latin', 'latn');
  });

  it('maps Cyrillic to cyrl', () => {
    expectTag('Cyrillic', 'cyrl');
  });

  it('maps Arabic to arab', () => {
    expectTag('Arabic', 'arab');
  });

  it('maps Devanagari to dev2', () => {
    expectTag('Devanagari', 'dev2');
  });

  it('maps Han to hani', () => {
    expectTag('Han', 'hani');
  });

  // Hiragana and Katakana are both expected to map to the same tag.
  it('maps Hiragana to kana', () => {
    expectTag('Hiragana', 'kana');
  });

  it('maps Katakana to kana', () => {
    expectTag('Katakana', 'kana');
  });

  it('returns DFLT for Common', () => {
    expectTag('Common', 'DFLT');
  });

  it('returns DFLT for unknown scripts', () => {
    expectTag('Unknown', 'DFLT');
  });
});
204+
});

0 commit comments

Comments
 (0)