Skip to content

Commit a46b63e

Browse files
committed
add substitution support for Thai, Devanagari and Hangul
1 parent 36adf14 commit a46b63e

21 files changed

Lines changed: 2063 additions & 14 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ phpunit.xml
1818
rector.php
1919
target
2020
vendor
21+
PLAN_*

README.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ It is built to handle multilingual text paths where normalization, code-point ha
4242
- Right-to-left and mixed-direction text processing
4343
- Supporting shaping/step logic for complex scripts
4444

45+
### Character Substitution
46+
- Context-sensitive codepoint-level substitution via `Substitution::replaceChars()`
47+
- **Thai** — repositions leading vowels (Sara E/AE/O/AI, U+0E40–U+0E44, U+0E4D) to follow their base consonant, matching PDF visual-order glyph streams
48+
- **Devanagari** — moves left-positional matras (U+093F) to precede their base consonant cluster, including conjuncts joined by Virama (U+094D)
49+
- **Hangul** — composes Hangul Jamo sequences (U+1100–U+11FF, U+A960–U+A97F, U+D7B0–U+D7FF) into precomposed syllables (U+AC00–U+D7A3) per Unicode Standard §3.12
50+
4551
---
4652

4753
## Requirements
@@ -73,6 +79,47 @@ echo $bidi->getString();
7379

7480
---
7581

82+
## Character substitution
83+
84+
`Substitution::replaceChars()` takes an array of Unicode codepoints and returns a transformed array with script-specific substitutions applied. It is a pure codepoint-level transform with no font or PDF dependency.
85+
86+
```php
87+
<?php
88+
89+
require_once __DIR__ . '/vendor/autoload.php';
90+
91+
$sub = new \Com\Tecnick\Unicode\Substitution();
92+
93+
// Thai: leading vowel repositioned after its base consonant
94+
// Logical order: [U+0E40 SARA E, U+0E01 KO KAI]
95+
// Visual order: [U+0E01 KO KAI, U+0E40 SARA E]
96+
$result = $sub->replaceChars([0x0E40, 0x0E01]);
97+
// $result === [0x0E01, 0x0E40]
98+
99+
// Devanagari: left matra repositioned before its base consonant cluster
100+
// Logical order: [U+0915 KA, U+093F VOWEL SIGN I]
101+
// Visual order: [U+093F VOWEL SIGN I, U+0915 KA]
102+
$result = $sub->replaceChars([0x0915, 0x093F]);
103+
// $result === [0x093F, 0x0915]
104+
105+
// Hangul: Jamo composed into a precomposed syllable
106+
// [U+1100 CHOSEONG KIYEOK, U+1161 JUNGSEONG A, U+11A8 JONGSEONG KIYEOK] → [U+AC01 각]
107+
$result = $sub->replaceChars([0x1100, 0x1161, 0x11A8]);
108+
// $result === [0xAC01]
109+
```
110+
111+
### Supported scripts and Unicode ranges
112+
113+
| Script | Unicode range(s) | Transformation |
114+
|---|---|---|
115+
| Thai | U+0E00–U+0E7F | Leading vowels repositioned after base consonant |
116+
| Devanagari | U+0900–U+097F | Left matras repositioned before consonant cluster |
117+
| Hangul Jamo | U+1100–U+11FF, U+A960–U+A97F, U+D7B0–U+D7FF | Jamo composed to precomposed syllables (U+AC00–U+D7A3) |
118+
119+
Codepoints belonging to unsupported scripts are passed through unchanged.
120+
121+
---
122+
76123
## Development
77124

78125
```bash

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.0.52
1+
2.1.0

src/Bidi/StepX.php

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,10 @@ protected function processX(): void
156156
protected function processXcase(int $pos, int $ord): void
157157
{
158158
$edss = \end($this->dss);
159-
if ($edss === false) {
160-
return;
161-
}
159+
160+
// $this->dss always has the paragraph-level entry pushed in __construct and
161+
// is never fully emptied (pop guards require count > 1), so end() is not false.
162+
assert($edss !== false);
162163

163164
switch ($ord) {
164165
case UniConstant::RLE:
@@ -376,9 +377,8 @@ protected function processPdiCase(int $pos, int $ord, array $edss): void
376377
\array_pop($this->dss);
377378
--$count_dss;
378379
$edss = \end($this->dss);
379-
if ($edss === false) {
380-
break;
381-
}
380+
// Loop guard $count_dss > 1 ensures the array has at least 1 entry after pop.
381+
assert($edss !== false);
382382
}
383383

384384
// - Pop the last entry from the directional status stack and decrement the valid isolate
@@ -389,16 +389,17 @@ protected function processPdiCase(int $pos, int $ord, array $edss): void
389389
--$this->vic;
390390

391391
$edss = \end($this->dss);
392-
if ($edss === false) {
393-
return;
394-
}
395392

396393
// - In all cases, look up the last entry on the directional status stack left after the
397394
// steps above and:
398395
// - Set the PDI’s level to the entry's embedding level.
399396
// - If the entry's directional override status is not neutral, reset the current character type
400397
// from PDI to L if the override status is left-to-right, and to R if the override status is
401398
// right-to-left.
399+
// UAX#9 §X6a guarantees the preceding step left >= 2 entries, so this pop
400+
// does not empty the stack.
401+
assert($edss !== false);
402+
402403
$this->pushChar($pos, $ord, $edss);
403404
}
404405

src/Bidi/StepXten.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,9 @@ protected function setStartEndOfSequence(): void
221221
// last character of the sequence is an isolate initiator (lacking a matching PDI), with the paragraph
222222
// embedding level.
223223
$lastchr = \end($seq['item']);
224-
if ($lastchr === false) {
225-
return;
226-
}
224+
225+
// A level run always contains at least one character, so end() is not false.
226+
assert($lastchr !== false);
227227

228228
$lev = $lastchr['level'];
229229
if ((! isset($this->chardata[($seq['end'] + 1)]['level'])) || $this->isIsolateInitiator($lastchr['char'])) {

src/Convert.php

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,12 @@ public function chr(int $ord): string
5252
*/
5353
public function ord(string $chr): int
5454
{
55-
$uni = \unpack('N', \mb_convert_encoding($chr, 'UCS-4BE', 'UTF-8'));
55+
$ucs = \mb_convert_encoding($chr, 'UCS-4BE', 'UTF-8');
56+
if (\strlen($ucs) < 4) {
57+
throw new UniException('Error converting string');
58+
}
59+
60+
$uni = \unpack('N', $ucs);
5661
if (($uni === false) || (!isset($uni[1])) || (!\is_int($uni[1]))) {
5762
throw new UniException('Error converting string');
5863
}

src/Data/Devanagari.php

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
<?php
2+
3+
/**
4+
* Devanagari.php
5+
*
6+
* @since 2026-04-30
7+
* @category Library
8+
* @package Unicode
9+
* @author Nicola Asuni <info@tecnick.com>
10+
* @copyright 2011-2026 Nicola Asuni - Tecnick.com LTD
11+
* @license https://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
12+
* @link https://github.com/tecnickcom/tc-lib-unicode
13+
*
14+
* This file is part of tc-lib-unicode software library.
15+
*
16+
* Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
17+
* https://unicode.org/Public/15.1.0/ucd/IndicPositionalCategory.txt
18+
* Unicode Standard version: 15.1
19+
*/
20+
21+
namespace Com\Tecnick\Unicode\Data;
22+
23+
/**
24+
* Com\Tecnick\Unicode\Data\Devanagari
25+
*
26+
* Devanagari codepoint tables for character substitution and cluster
27+
* reordering.
28+
*
29+
* @since 2026-04-30
30+
* @category Library
31+
* @package Unicode
32+
* @author Nicola Asuni <info@tecnick.com>
33+
* @copyright 2011-2026 Nicola Asuni - Tecnick.com LTD
34+
* @license https://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
35+
* @link https://github.com/tecnickcom/tc-lib-unicode
36+
*/
37+
final class Devanagari
38+
{
39+
/**
40+
* Devanagari vowel signs with Indic Positional Category "Left".
41+
*
42+
* These matras are stored after their base consonant (or consonant
43+
* cluster) in Unicode logical order but must be rendered to the LEFT of
44+
* the base in a PDF glyph stream. They are therefore repositioned to
45+
* precede the consonant cluster during substitution.
46+
*
47+
* Codepoints:
48+
* U+093F DEVANAGARI VOWEL SIGN I
49+
*
50+
* Source: https://unicode.org/Public/15.1.0/ucd/IndicPositionalCategory.txt
51+
*
52+
* @var array<int, true>
53+
*/
54+
public const LEFT_MATRAS = [
55+
0x093F => true,
56+
];
57+
58+
/**
59+
* Devanagari Virama (U+094D).
60+
*
61+
* Joins two consonants into a conjunct cluster. When scanning a consonant
62+
* cluster for pre-base matra reordering, consecutive (consonant + VIRAMA)
63+
* pairs extend the cluster.
64+
*
65+
* Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
66+
*/
67+
public const VIRAMA = 0x094D;
68+
69+
/**
70+
* First codepoint of the standard Devanagari consonant range.
71+
*
72+
* U+0915 DEVANAGARI LETTER KA
73+
*
74+
* Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
75+
*/
76+
public const BASE_CONSONANT_FIRST = 0x0915;
77+
78+
/**
79+
* Last codepoint of the standard Devanagari consonant range.
80+
*
81+
* U+0939 DEVANAGARI LETTER HA
82+
*
83+
* Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
84+
*/
85+
public const BASE_CONSONANT_LAST = 0x0939;
86+
87+
/**
88+
* First codepoint of the extended Devanagari consonant range
89+
* (consonants with nukta — precomposed forms excluded from canonical composition).
90+
*
91+
* U+0958 DEVANAGARI LETTER QA
92+
*
93+
* Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
94+
*/
95+
public const BASE_CONSONANT_EXT_FIRST = 0x0958;
96+
97+
/**
98+
* Last codepoint of the extended Devanagari consonant range.
99+
*
100+
* U+095F DEVANAGARI LETTER YYA
101+
*
102+
* Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
103+
*/
104+
public const BASE_CONSONANT_EXT_LAST = 0x095F;
105+
}

src/Data/Hangul.php

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
<?php
2+
3+
/**
4+
* Hangul.php
5+
*
6+
* @since 2026-04-30
7+
* @category Library
8+
* @package Unicode
9+
* @author Nicola Asuni <info@tecnick.com>
10+
* @copyright 2011-2026 Nicola Asuni - Tecnick.com LTD
11+
* @license https://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
12+
* @link https://github.com/tecnickcom/tc-lib-unicode
13+
*
14+
* This file is part of tc-lib-unicode software library.
15+
*
16+
* Source: The Unicode Standard, version 15.1, section 3.12 "Conjoining Jamo Behavior"
17+
* https://www.unicode.org/versions/Unicode15.1.0/
18+
* https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
19+
* Unicode Standard version: 15.1
20+
*/
21+
22+
namespace Com\Tecnick\Unicode\Data;
23+
24+
/**
25+
* Com\Tecnick\Unicode\Data\Hangul
26+
*
27+
* Algorithmic constants for Hangul Jamo → precomposed Hangul syllable
28+
* composition, as defined in section 3.12 of the Unicode Standard.
29+
*
30+
* Precomposed syllables occupy the range U+AC00–U+D7A3 and are derived by:
31+
*
32+
* S = SBase + (L − LBase) × NCount + (V − VBase) × TCount + (T − TBase)
33+
*
34+
* where T = TBase means "no trailing consonant" (TBase itself is not a
35+
* trailing consonant; the effective trailing index is 0 in that case).
36+
*
37+
* @since 2026-04-30
38+
* @category Library
39+
* @package Unicode
40+
* @author Nicola Asuni <info@tecnick.com>
41+
* @copyright 2011-2026 Nicola Asuni - Tecnick.com LTD
42+
* @license https://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
43+
* @link https://github.com/tecnickcom/tc-lib-unicode
44+
*/
45+
final class Hangul
46+
{
47+
/**
48+
* First precomposed Hangul syllable: U+AC00 HANGUL SYLLABLE GA.
49+
*
50+
* Source: Unicode Standard 15.1, section 3.12
51+
*/
52+
public const SBASE = 0xAC00;
53+
54+
/**
55+
* First Hangul leading consonant (choseong): U+1100 HANGUL CHOSEONG KIYEOK.
56+
*
57+
* Source: Unicode Standard 15.1, section 3.12
58+
*/
59+
public const LBASE = 0x1100;
60+
61+
/**
62+
* First Hangul vowel (jungseong): U+1161 HANGUL JUNGSEONG A.
63+
*
64+
* Source: Unicode Standard 15.1, section 3.12
65+
*/
66+
public const VBASE = 0x1161;
67+
68+
/**
69+
* Trailing consonant base value: U+11A7.
70+
*
71+
* The first actual trailing consonant (jongseong) is U+11A8; TBase is
72+
* one below that, so that (T − TBase) gives a 1-based index and a T of
73+
* TBase itself encodes "no trailing consonant" (index 0).
74+
*
75+
* Source: Unicode Standard 15.1, section 3.12
76+
*/
77+
public const TBASE = 0x11A7;
78+
79+
/**
80+
* Number of leading consonants (19).
81+
*
82+
* Covers U+1100–U+1112.
83+
*
84+
* Source: Unicode Standard 15.1, section 3.12
85+
*/
86+
public const LCOUNT = 19;
87+
88+
/**
89+
* Number of vowels (21).
90+
*
91+
* Covers U+1161–U+1175.
92+
*
93+
* Source: Unicode Standard 15.1, section 3.12
94+
*/
95+
public const VCOUNT = 21;
96+
97+
/**
98+
* Number of trailing consonant slots (28), including the "none" slot.
99+
*
100+
* Effective trailing consonants: U+11A8–U+11C2 (27 codepoints).
101+
* The remaining slot (index 0, i.e. T = TBase) represents absence of a trailing consonant.
102+
*
103+
* Source: Unicode Standard 15.1, section 3.12
104+
*/
105+
public const TCOUNT = 28;
106+
107+
/**
108+
* Number of precomposed syllables per leading consonant.
109+
*
110+
* NCount = VCount × TCount = 21 × 28 = 588.
111+
*
112+
* Source: Unicode Standard 15.1, section 3.12
113+
*/
114+
public const NCOUNT = self::VCOUNT * self::TCOUNT;
115+
116+
/**
117+
* Total number of precomposed Hangul syllables.
118+
*
119+
* SCount = LCount × NCount = 19 × 588 = 11172.
120+
*
121+
* Source: Unicode Standard 15.1, section 3.12
122+
*/
123+
public const SCOUNT = self::LCOUNT * self::NCOUNT;
124+
}

0 commit comments

Comments
 (0)