|
| 1 | +import { describe, expect, it } from 'vitest'; |
| 2 | + |
| 3 | +import { detectScript, scriptToOpenTypeTag, segmentByScript } from './script-detection.ts'; |
| 4 | + |
// Test suite for the script-detection module. It exercises three exports:
//   - detectScript(codePoint): classifies a single code point by script name.
//   - segmentByScript(text): splits a string into { text, script } runs.
//   - scriptToOpenTypeTag(script): maps a script name to a four-letter tag.
describe('script-detection', () => {
  describe('detectScript', () => {
    it('returns Latin for ASCII letters', () => {
      // Boundaries of both the uppercase and lowercase ASCII ranges.
      const asciiLetters = [
        0x41, // A
        0x5a, // Z
        0x61, // a
        0x7a, // z
      ];
      for (const codePoint of asciiLetters) {
        expect(detectScript(codePoint)).toBe('Latin');
      }
    });

    it('returns Common for ASCII digits', () => {
      const digitBounds = [
        0x30, // 0
        0x39, // 9
      ];
      for (const codePoint of digitBounds) {
        expect(detectScript(codePoint)).toBe('Common');
      }
    });

    it('returns Common for space and punctuation', () => {
      const neutralCharacters = [
        0x20, // space
        0x2c, // comma
        0x2e, // period
        0x21, // !
      ];
      for (const codePoint of neutralCharacters) {
        expect(detectScript(codePoint)).toBe('Common');
      }
    });

    it('returns Latin for extended Latin characters', () => {
      const extendedLatin = [
        0x00c0, // À
        0x00e9, // é
        0x017e, // ž
      ];
      for (const codePoint of extendedLatin) {
        expect(detectScript(codePoint)).toBe('Latin');
      }
    });

    it('returns Inherited for combining diacritical marks', () => {
      const combiningMarks = [
        0x0300, // combining grave accent
        0x0301, // combining acute accent
        0x036f, // last in range
      ];
      for (const codePoint of combiningMarks) {
        expect(detectScript(codePoint)).toBe('Inherited');
      }
    });

    it('returns Greek for Greek characters', () => {
      const greekLetters = [
        0x0391, // Α (Alpha)
        0x03c9, // ω (omega)
      ];
      for (const codePoint of greekLetters) {
        expect(detectScript(codePoint)).toBe('Greek');
      }
    });

    it('returns Cyrillic for Cyrillic characters', () => {
      const cyrillicLetters = [
        0x0410, // А
        0x044f, // я
      ];
      for (const codePoint of cyrillicLetters) {
        expect(detectScript(codePoint)).toBe('Cyrillic');
      }
    });

    it('returns Hebrew for Hebrew characters', () => {
      const alef = 0x05d0; // א (alef)
      expect(detectScript(alef)).toBe('Hebrew');
    });

    it('returns Arabic for Arabic characters', () => {
      const arabicLetters = [
        0x0627, // ا (alif)
        0x0645, // م (meem)
      ];
      for (const codePoint of arabicLetters) {
        expect(detectScript(codePoint)).toBe('Arabic');
      }
    });

    it('returns Devanagari for Devanagari characters', () => {
      const devanagariLetters = [
        0x0915, // क
        0x0928, // न
      ];
      for (const codePoint of devanagariLetters) {
        expect(detectScript(codePoint)).toBe('Devanagari');
      }
    });

    it('returns Thai for Thai characters', () => {
      const koKai = 0x0e01; // ก
      expect(detectScript(koKai)).toBe('Thai');
    });

    it('returns Han for CJK ideographs', () => {
      const ideographs = [
        0x4e2d, // 中
        0x6587, // 文
      ];
      for (const codePoint of ideographs) {
        expect(detectScript(codePoint)).toBe('Han');
      }
    });

    it('returns Hiragana for Hiragana characters', () => {
      const hiraganaA = 0x3042; // あ
      expect(detectScript(hiraganaA)).toBe('Hiragana');
    });

    it('returns Katakana for Katakana characters', () => {
      const katakanaA = 0x30a2; // ア
      expect(detectScript(katakanaA)).toBe('Katakana');
    });

    it('returns Hangul for Hangul syllables', () => {
      const firstSyllable = 0xac00; // 가
      expect(detectScript(firstSyllable)).toBe('Hangul');
    });

    it('returns Han for CJK Extension B (supplementary plane)', () => {
      // U+20000 is above U+FFFF, so this covers a code point outside the BMP.
      const extensionB = 0x20000;
      expect(detectScript(extensionB)).toBe('Han');
    });

    it('returns Common for unmapped code points', () => {
      // Highest valid Unicode code point; expected to fall through to Common.
      const lastCodePoint = 0x10ffff;
      expect(detectScript(lastCodePoint)).toBe('Common');
    });
  });

  describe('segmentByScript', () => {
    it('returns empty array for empty string', () => {
      const runs = segmentByScript('');
      expect(runs).toEqual([]);
    });

    it('returns single run for pure Latin text', () => {
      const text = 'Hello';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Latin' }]);
    });

    it('returns single run for Latin text with spaces', () => {
      // The space is expected to be absorbed into the surrounding Latin run.
      const text = 'Hello world';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Latin' }]);
    });

    it('resolves all-Common text to Common', () => {
      // With no concrete script present, the whole input stays one Common run.
      const text = '123 456';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Common' }]);
    });

    it('splits Latin and Cyrillic', () => {
      // The separating space is expected to stay with the preceding Latin run.
      const expectedRuns = [
        { text: 'Hello ', script: 'Latin' },
        { text: 'Мир', script: 'Cyrillic' },
      ];
      expect(segmentByScript('Hello Мир')).toEqual(expectedRuns);
    });

    it('resolves leading Common to first concrete script', () => {
      // Both parentheses are expected to join the Cyrillic run they wrap.
      const text = '(Мир)';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Cyrillic' }]);
    });

    it('resolves trailing punctuation to preceding script', () => {
      const expectedRuns = [
        { text: 'Hello, ', script: 'Latin' },
        { text: 'Мир!', script: 'Cyrillic' },
      ];
      expect(segmentByScript('Hello, Мир!')).toEqual(expectedRuns);
    });

    it('splits Latin and CJK', () => {
      // No neutral characters here: every boundary is a hard script change.
      const expectedRuns = [
        { text: 'abc', script: 'Latin' },
        { text: '中文', script: 'Han' },
        { text: 'def', script: 'Latin' },
      ];
      expect(segmentByScript('abc中文def')).toEqual(expectedRuns);
    });

    it('handles Arabic text', () => {
      const text = 'مرحبا';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Arabic' }]);
    });

    it('handles Devanagari text', () => {
      const text = 'नमस्ते';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Devanagari' }]);
    });

    it('keeps combining marks with base character', () => {
      // e + combining acute accent -> a single Latin run
      const text = 'e\u0301';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Latin' }]);
    });

    it('resolves combining marks to preceding script', () => {
      // Arabic letter + combining mark -> a single Arabic run
      const text = '\u0627\u0300';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Arabic' }]);
    });

    it('handles three different scripts', () => {
      // Each separating space is expected to attach to the run before it.
      const expectedRuns = [
        { text: 'Hello ', script: 'Latin' },
        { text: 'Мир ', script: 'Cyrillic' },
        { text: '中文', script: 'Han' },
      ];
      expect(segmentByScript('Hello Мир 中文')).toEqual(expectedRuns);
    });

    it('handles supplementary plane characters', () => {
      // CJK Extension B character (U+20000); the run text must come back intact.
      const text = '\u{20000}';
      expect(segmentByScript(text)).toEqual([{ text, script: 'Han' }]);
    });
  });

  describe('scriptToOpenTypeTag', () => {
    it('maps Latin to latn', () => {
      const tag = scriptToOpenTypeTag('Latin');
      expect(tag).toBe('latn');
    });

    it('maps Cyrillic to cyrl', () => {
      const tag = scriptToOpenTypeTag('Cyrillic');
      expect(tag).toBe('cyrl');
    });

    it('maps Arabic to arab', () => {
      const tag = scriptToOpenTypeTag('Arabic');
      expect(tag).toBe('arab');
    });

    it('maps Devanagari to dev2', () => {
      const tag = scriptToOpenTypeTag('Devanagari');
      expect(tag).toBe('dev2');
    });

    it('maps Han to hani', () => {
      const tag = scriptToOpenTypeTag('Han');
      expect(tag).toBe('hani');
    });

    // Hiragana and Katakana are both expected to map to the same tag.
    it('maps Hiragana to kana', () => {
      const tag = scriptToOpenTypeTag('Hiragana');
      expect(tag).toBe('kana');
    });

    it('maps Katakana to kana', () => {
      const tag = scriptToOpenTypeTag('Katakana');
      expect(tag).toBe('kana');
    });

    it('returns DFLT for Common', () => {
      const tag = scriptToOpenTypeTag('Common');
      expect(tag).toBe('DFLT');
    });

    it('returns DFLT for unknown scripts', () => {
      const tag = scriptToOpenTypeTag('Unknown');
      expect(tag).toBe('DFLT');
    });
  });
});
0 commit comments