eclipsesource
diff --git a/‎NOTICE.txt‎
Lines changed: 7 additions & 0 deletions b/‎NOTICE.txt‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎examples/text.ts‎
Lines changed: 11 additions & 0 deletions b/‎examples/text.ts‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎scripts/generate-script-ranges.ts‎
Lines changed: 138 additions & 0 deletions b/‎scripts/generate-script-ranges.ts‎
Lines changed: 138 additions & 0 deletions
diff --git a/‎scripts/tsconfig.json‎
Lines changed: 7 additions & 0 deletions b/‎scripts/tsconfig.json‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/script-detection.test.ts‎
Lines changed: 204 additions & 0 deletions b/‎src/script-detection.test.ts‎
Lines changed: 204 additions & 0 deletions
@@ -0,0 +1,7 @@
+This project includes data derived from the Unicode Character Database.
+
+Copyright © Unicode, Inc.
+Licensed under the Unicode License.
+https://www.unicode.org/license.html
+
+See scripts/generate-script-ranges.ts for details on how the data was derived.
@@ -89,6 +89,17 @@ const document = {
         margin: { y: 10 },
       },
     ),
+    // Text kerning and ligatures
+    rows(
+      [
+        text('AV, Ta with kerning, fi, ffi with ligatures.'),
+        text('AV, Ta without kerning, fi, ffi without ligatures.', {
+          fontKerning: 'none',
+          fontVariantLigatures: 'none',
+        }),
+      ],
+      { margin: { y: 10 } },
+    ),
   ],
 };
 
 
@@ -0,0 +1,138 @@
+/**
+ * Generates src/script-ranges.gen.ts from Unicode Scripts.txt.
+ *
+ * Usage: node scripts/generate-script-ranges.ts
+ *
+ * Input:  vendor.local/unicode/UCD/Scripts.txt (Unicode Character Database) in the project root.
+ * Output: src/script-ranges.gen.ts — sorted, merged script range table.
+ */
+import { readFileSync, writeFileSync } from 'node:fs';
+import { dirname, resolve } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+// All locations are resolved from this script's own path rather than from the
+// current working directory.
+const scriptDir = dirname(fileURLToPath(import.meta.url));
+const projectRoot = resolve(scriptDir, '..');
+const inputPath = resolve(projectRoot, 'vendor.local/unicode/UCD/Scripts.txt');
+const outputPath = resolve(projectRoot, 'src/script-ranges.gen.ts');
+
+/**
+ * Names of the scripts that end up in the generated table: the scripts that
+ * have OpenType tag mappings (used for text shaping), plus Common and
+ * Inherited (needed for script resolution in segmentByScript).
+ *
+ * Ranges of any script not listed here are dropped while parsing.
+ */
+const includedScriptNames: string[] = [
+  'Common', 'Inherited',
+  'Latin', 'Greek', 'Cyrillic', 'Armenian',
+  'Hebrew', 'Arabic',
+  'Devanagari', 'Bengali', 'Gurmukhi', 'Gujarati',
+  'Tamil', 'Telugu', 'Kannada', 'Malayalam',
+  'Thai', 'Georgian',
+  'Hangul', 'Hiragana', 'Katakana', 'Han',
+];
+
+/** Lookup set built from the list above; its size is reported in the generated doc comment. */
+const includedScripts = new Set(includedScriptNames);
+
+/** A run of code points (`start` and `end` both inclusive) assigned to one script. */
+type Range = {
+  start: number;
+  end: number;
+  script: string;
+};
+
+// --- Parse ---
+
+// Data lines have the format:
+// 0000..001F    ; Common # Cc (...more comments)
+// 0020          ; Common # Zs (...more comments)
+// Capture groups: first code point, optional last code point, script name.
+const dataLinePattern = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\w+)/;
+
+const input = readFileSync(inputPath, 'utf-8');
+const ranges: Range[] = [];
+
+input.split('\n').forEach((rawLine) => {
+  const text = rawLine.trim();
+  // Blank lines and comment lines carry no data.
+  if (text === '' || text.startsWith('#')) return;
+
+  const fields = dataLinePattern.exec(text);
+  if (fields === null) return;
+
+  const [, firstHex, lastHex, script] = fields;
+  if (!includedScripts.has(script)) return;
+
+  // A line without the `..XXXX` part describes a single code point.
+  const start = parseInt(firstHex, 16);
+  const end = lastHex ? parseInt(lastHex, 16) : start;
+  ranges.push({ start, end, script });
+});
+
+// --- Sort ---
+
+// Order by first code point so that neighbouring entries can be merged below.
+ranges.sort((left, right) => left.start - right.start);
+
+// --- Merge adjacent ranges for the same script ---
+
+const merged: Range[] = [];
+for (const current of ranges) {
+  const last = merged[merged.length - 1];
+
+  // Same script and touching (or overlapping) the previous entry: grow that
+  // entry instead of appending a new one.
+  if (last && last.script === current.script && current.start <= last.end + 1) {
+    if (current.end > last.end) {
+      last.end = current.end;
+    }
+    continue;
+  }
+
+  // Push a copy so that growing a merged entry never mutates `ranges`.
+  merged.push({ ...current });
+}
+
+// --- Validate ---
+
+// Each entry must start strictly after the end of the entry before it;
+// otherwise the table is ambiguous and generation is aborted.
+merged.forEach((range, index) => {
+  if (index === 0) return;
+
+  const previous = merged[index - 1];
+  if (range.start > previous.end) return;
+
+  throw new Error(`Overlapping ranges: ${formatRange(previous)} and ${formatRange(range)}`);
+});
+
+// --- Generate ---
+
+// Source text of src/script-ranges.gen.ts, one entry per output line.
+// NOTE: the Unicode version in the header is hard-coded; update it when
+// regenerating from a different release of Scripts.txt.
+const lines: string[] = [
+  '/**',
+  ' * This file is generated from Unicode Scripts.txt (Unicode 15.1).',
+  ' *',
+  ' * Source: https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt',
+  ' * Licensed under the Unicode License: https://www.unicode.org/license.html',
+  ' *',
+  ' * Do not edit manually. Regenerate with: scripts/generate-script-ranges.ts',
+  ' */',
+  '',
+  '/** A Unicode code point range with an associated script. */',
+  'export type ScriptRange = {',
+  '  start: number;',
+  '  end: number;',
+  '  script: string;',
+  '};',
+  '',
+  `/** A list of sorted, non-overlapping Unicode code point ranges covering ${includedScripts.size} scripts. */`,
+  'export const scriptRanges: ScriptRange[] = [',
+];
+
+for (const range of merged) {
+  // Code points are written as lower-case hex, zero-padded to at least four digits.
+  const s = `0x${range.start.toString(16).padStart(4, '0')}`;
+  const e = `0x${range.end.toString(16).padStart(4, '0')}`;
+  lines.push(`  { start: ${s}, end: ${e}, script: '${range.script}' },`);
+}
+
+// Close the array with a plain `];`. A trailing `as const` would turn the
+// literal into a readonly tuple, which TypeScript refuses to assign to the
+// mutable `ScriptRange[]` annotation emitted above (error TS4104), so the
+// generated file would not type-check.
+lines.push('];', '');
+
+writeFileSync(outputPath, lines.join('\n'));
+
+console.log(`Generated ${outputPath}`);
+console.log(`  ${merged.length} ranges (merged from ${ranges.length} entries)`);
+
+/**
+ * Renders a range for error messages as `START..END Script`, with both code
+ * points in upper-case hex (no padding, no prefix).
+ */
+function formatRange(r: Range): string {
+  const hex = (codePoint: number): string => codePoint.toString(16).toUpperCase();
+  const span = [hex(r.start), hex(r.end)].join('..');
+  return `${span} ${r.script}`;
+}
@@ -0,0 +1,7 @@
+{
+  "extends": "../tsconfig.json",
+  "compilerOptions": {
+    "noEmit": true
+  },
+  "include": ["./**/*.ts"]
+}
@@ -0,0 +1,204 @@
+import { describe, expect, it } from 'vitest';
+
+import { detectScript, scriptToOpenTypeTag, segmentByScript } from './script-detection.ts';
+
+describe('script-detection', () => {
+  describe('detectScript', () => {
+    /** Asserts, in order, that each given code point is detected as `script`. */
+    const expectScript = (script: string, ...codePoints: number[]): void => {
+      for (const codePoint of codePoints) {
+        expect(detectScript(codePoint)).toBe(script);
+      }
+    };
+
+    it('returns Latin for ASCII letters', () => {
+      expectScript('Latin', 0x41, 0x5a, 0x61, 0x7a); // A, Z, a, z
+    });
+
+    it('returns Common for ASCII digits', () => {
+      expectScript('Common', 0x30, 0x39); // 0, 9
+    });
+
+    it('returns Common for space and punctuation', () => {
+      expectScript('Common', 0x20, 0x2c, 0x2e, 0x21); // space, comma, period, !
+    });
+
+    it('returns Latin for extended Latin characters', () => {
+      expectScript('Latin', 0x00c0, 0x00e9, 0x017e); // À, é, ž
+    });
+
+    it('returns Inherited for combining diacritical marks', () => {
+      // combining grave accent, combining acute accent, last in range
+      expectScript('Inherited', 0x0300, 0x0301, 0x036f);
+    });
+
+    it('returns Greek for Greek characters', () => {
+      expectScript('Greek', 0x0391, 0x03c9); // Α (Alpha), ω (omega)
+    });
+
+    it('returns Cyrillic for Cyrillic characters', () => {
+      expectScript('Cyrillic', 0x0410, 0x044f); // А, я
+    });
+
+    it('returns Hebrew for Hebrew characters', () => {
+      expectScript('Hebrew', 0x05d0); // א (alef)
+    });
+
+    it('returns Arabic for Arabic characters', () => {
+      expectScript('Arabic', 0x0627, 0x0645); // ا (alif), م (meem)
+    });
+
+    it('returns Devanagari for Devanagari characters', () => {
+      expectScript('Devanagari', 0x0915, 0x0928); // क, न
+    });
+
+    it('returns Thai for Thai characters', () => {
+      expectScript('Thai', 0x0e01); // ก
+    });
+
+    it('returns Han for CJK ideographs', () => {
+      expectScript('Han', 0x4e2d, 0x6587); // 中, 文
+    });
+
+    it('returns Hiragana for Hiragana characters', () => {
+      expectScript('Hiragana', 0x3042); // あ
+    });
+
+    it('returns Katakana for Katakana characters', () => {
+      expectScript('Katakana', 0x30a2); // ア
+    });
+
+    it('returns Hangul for Hangul syllables', () => {
+      expectScript('Hangul', 0xac00); // 가
+    });
+
+    it('returns Han for CJK Extension B (supplementary plane)', () => {
+      expectScript('Han', 0x20000);
+    });
+
+    it('returns Common for unmapped code points', () => {
+      expectScript('Common', 0x10ffff);
+    });
+  });
+
+  describe('segmentByScript', () => {
+    /** Asserts that `input` is segmented into exactly the given runs, in order. */
+    const expectRuns = (input: string, ...runs: { text: string; script: string }[]): void => {
+      expect(segmentByScript(input)).toEqual(runs);
+    };
+
+    it('returns empty array for empty string', () => {
+      expectRuns('');
+    });
+
+    it('returns single run for pure Latin text', () => {
+      expectRuns('Hello', { text: 'Hello', script: 'Latin' });
+    });
+
+    it('returns single run for Latin text with spaces', () => {
+      expectRuns('Hello world', { text: 'Hello world', script: 'Latin' });
+    });
+
+    it('resolves all-Common text to Common', () => {
+      expectRuns('123 456', { text: '123 456', script: 'Common' });
+    });
+
+    it('splits Latin and Cyrillic', () => {
+      expectRuns(
+        'Hello Мир',
+        { text: 'Hello ', script: 'Latin' },
+        { text: 'Мир', script: 'Cyrillic' },
+      );
+    });
+
+    it('resolves leading Common to first concrete script', () => {
+      expectRuns('(Мир)', { text: '(Мир)', script: 'Cyrillic' });
+    });
+
+    it('resolves trailing punctuation to preceding script', () => {
+      expectRuns(
+        'Hello, Мир!',
+        { text: 'Hello, ', script: 'Latin' },
+        { text: 'Мир!', script: 'Cyrillic' },
+      );
+    });
+
+    it('splits Latin and CJK', () => {
+      expectRuns(
+        'abc中文def',
+        { text: 'abc', script: 'Latin' },
+        { text: '中文', script: 'Han' },
+        { text: 'def', script: 'Latin' },
+      );
+    });
+
+    it('handles Arabic text', () => {
+      expectRuns('مرحبا', { text: 'مرحبا', script: 'Arabic' });
+    });
+
+    it('handles Devanagari text', () => {
+      expectRuns('नमस्ते', { text: 'नमस्ते', script: 'Devanagari' });
+    });
+
+    it('keeps combining marks with base character', () => {
+      // e + combining acute accent -> both Latin
+      expectRuns('e\u0301', { text: 'e\u0301', script: 'Latin' });
+    });
+
+    it('resolves combining marks to preceding script', () => {
+      // Arabic letter + combining mark -> both Arabic
+      expectRuns('\u0627\u0300', { text: '\u0627\u0300', script: 'Arabic' });
+    });
+
+    it('handles three different scripts', () => {
+      expectRuns(
+        'Hello Мир 中文',
+        { text: 'Hello ', script: 'Latin' },
+        { text: 'Мир ', script: 'Cyrillic' },
+        { text: '中文', script: 'Han' },
+      );
+    });
+
+    it('handles supplementary plane characters', () => {
+      // CJK Extension B character (U+20000)
+      expectRuns('\u{20000}', { text: '\u{20000}', script: 'Han' });
+    });
+  });
+
+  describe('scriptToOpenTypeTag', () => {
+    // Script name -> expected OpenType tag; one test is registered per row,
+    // in this order.
+    const mappings: [string, string][] = [
+      ['Latin', 'latn'],
+      ['Cyrillic', 'cyrl'],
+      ['Arabic', 'arab'],
+      ['Devanagari', 'dev2'],
+      ['Han', 'hani'],
+      ['Hiragana', 'kana'],
+      ['Katakana', 'kana'],
+    ];
+
+    for (const [script, tag] of mappings) {
+      it(`maps ${script} to ${tag}`, () => {
+        expect(scriptToOpenTypeTag(script)).toBe(tag);
+      });
+    }
+
+    // Fallback cases: both are expected to yield the default tag.
+    it('returns DFLT for Common', () => {
+      const tag = scriptToOpenTypeTag('Common');
+      expect(tag).toBe('DFLT');
+    });
+
+    it('returns DFLT for unknown scripts', () => {
+      const tag = scriptToOpenTypeTag('Unknown');
+      expect(tag).toBe('DFLT');
+    });
+  });
+});