tecnickcom
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 47 additions & 0 deletions b/‎README.md‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎VERSION‎
Lines changed: 1 addition & 1 deletion b/‎VERSION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/Bidi/StepX.php‎
Lines changed: 10 additions & 9 deletions b/‎src/Bidi/StepX.php‎
Lines changed: 10 additions & 9 deletions
diff --git a/‎src/Bidi/StepXten.php‎
Lines changed: 3 additions & 3 deletions b/‎src/Bidi/StepXten.php‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/Convert.php‎
Lines changed: 6 additions & 1 deletion b/‎src/Convert.php‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎src/Data/Devanagari.php‎
Lines changed: 105 additions & 0 deletions b/‎src/Data/Devanagari.php‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎src/Data/Hangul.php‎
Lines changed: 124 additions & 0 deletions b/‎src/Data/Hangul.php‎
Lines changed: 124 additions & 0 deletions
@@ -18,3 +18,4 @@ phpunit.xml
 rector.php
 target
 vendor
+PLAN_*
@@ -42,6 +42,12 @@ It is built to handle multilingual text paths where normalization, code-point ha
 - Right-to-left and mixed-direction text processing
 - Supporting shaping/step logic for complex scripts
 
+### Character Substitution
+- Context-sensitive codepoint-level substitution via `Substitution::replaceChars()`
+- **Thai** — repositions leading vowels (Sara E/AE/O/AI, U+0E40–U+0E44, U+0E4D) to follow their base consonant, matching PDF visual-order glyph streams
+- **Devanagari** — moves left-positional matras (U+093F) to precede their base consonant cluster, including conjuncts joined by Virama (U+094D)
+- **Hangul** — composes Hangul Jamo sequences (U+1100–U+11FF, U+A960–U+A97F, U+D7B0–U+D7FF) into precomposed syllables (U+AC00–U+D7A3) per Unicode Standard §3.12
+
 ---
 
 ## Requirements
@@ -73,6 +79,47 @@ echo $bidi->getString();
 
 ---
 
+## Character substitution
+
+`Substitution::replaceChars()` takes an array of Unicode codepoints and returns a transformed array with script-specific substitutions applied. It is a pure codepoint-level transform with no font or PDF dependency.
+
+```php
+<?php
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+$sub = new \Com\Tecnick\Unicode\Substitution();
+
+// Thai: leading vowel repositioned after its base consonant
+// Logical order:  [U+0E40 SARA E, U+0E01 KO KAI]
+// Visual order:   [U+0E01 KO KAI, U+0E40 SARA E]
+$result = $sub->replaceChars([0x0E40, 0x0E01]);
+// $result === [0x0E01, 0x0E40]
+
+// Devanagari: left matra repositioned before its base consonant cluster
+// Logical order:  [U+0915 KA, U+093F VOWEL SIGN I]
+// Visual order:   [U+093F VOWEL SIGN I, U+0915 KA]
+$result = $sub->replaceChars([0x0915, 0x093F]);
+// $result === [0x093F, 0x0915]
+
+// Hangul: Jamo composed into a precomposed syllable
+// [U+1100 KIYEOK, U+1161 JUNGSEONG A, U+11A8 JONGSEONG KIYEOK] → [U+AC01 각]
+$result = $sub->replaceChars([0x1100, 0x1161, 0x11A8]);
+// $result === [0xAC01]
+```
+
+### Supported scripts and Unicode ranges
+
+| Script | Unicode range(s) | Transformation |
+|---|---|---|
+| Thai | U+0E00–U+0E7F | Leading vowels repositioned after base consonant |
+| Devanagari | U+0900–U+097F | Left matras repositioned before consonant cluster |
+| Hangul Jamo | U+1100–U+11FF, U+A960–U+A97F, U+D7B0–U+D7FF | Jamo composed to precomposed syllables (U+AC00–U+D7A3) |
+
+Codepoints belonging to unsupported scripts are passed through unchanged.
+
+---
+
 ## Development
 
 ```bash
 
@@ -1 +1 @@
-2.0.52
+2.1.0
@@ -156,9 +156,10 @@ protected function processX(): void
     protected function processXcase(int $pos, int $ord): void
     {
         $edss = \end($this->dss);
-        if ($edss === false) {
-            return;
-        }
+
+        // $this->dss always has the paragraph-level entry pushed in __construct and
+        // is never fully emptied (pop guards require count > 1), so end() is not false.
+        assert($edss !== false);
 
         switch ($ord) {
             case UniConstant::RLE:
@@ -376,9 +377,8 @@ protected function processPdiCase(int $pos, int $ord, array $edss): void
             \array_pop($this->dss);
             --$count_dss;
             $edss = \end($this->dss);
-            if ($edss === false) {
-                break;
-            }
+            // Loop guard $count_dss > 1 ensures the array has at least 1 entry after pop.
+            assert($edss !== false);
         }
 
         //        - Pop the last entry from the directional status stack and decrement the valid isolate
@@ -389,16 +389,17 @@ protected function processPdiCase(int $pos, int $ord, array $edss): void
         --$this->vic;
 
         $edss = \end($this->dss);
-        if ($edss === false) {
-            return;
-        }
 
         //      - In all cases, look up the last entry on the directional status stack left after the
         //        steps above and:
         //        - Set the PDI’s level to the entry's embedding level.
         //        - If the entry's directional override status is not neutral, reset the current character type
         //          from PDI to L if the override status is left-to-right, and to R if the override status is
         //          right-to-left.
+        // UAX#9 §X6a guarantees the preceding step left >= 2 entries, so this pop
+        // does not empty the stack.
+        assert($edss !== false);
+
         $this->pushChar($pos, $ord, $edss);
     }
 
 
@@ -221,9 +221,9 @@ protected function setStartEndOfSequence(): void
             // last character of the sequence is an isolate initiator (lacking a matching PDI), with the paragraph
             // embedding level.
             $lastchr = \end($seq['item']);
-            if ($lastchr === false) {
-                return;
-            }
+
+            // A level run always contains at least one character, so end() is not false.
+            assert($lastchr !== false);
 
             $lev = $lastchr['level'];
             if ((! isset($this->chardata[($seq['end'] + 1)]['level'])) || $this->isIsolateInitiator($lastchr['char'])) {
 
@@ -52,7 +52,12 @@ public function chr(int $ord): string
      */
     public function ord(string $chr): int
     {
-        $uni = \unpack('N', \mb_convert_encoding($chr, 'UCS-4BE', 'UTF-8'));
+        $ucs = \mb_convert_encoding($chr, 'UCS-4BE', 'UTF-8');
+        if (\strlen($ucs) < 4) {
+            throw new UniException('Error converting string');
+        }
+
+        $uni = \unpack('N', $ucs);
         if (($uni === false) || (!isset($uni[1])) || (!\is_int($uni[1]))) {
             throw new UniException('Error converting string');
         }
 
@@ -0,0 +1,105 @@
+<?php
+
+/**
+ * Devanagari.php
+ *
+ * @since     2026-04-30
+ * @category  Library
+ * @package   Unicode
+ * @author    Nicola Asuni <info@tecnick.com>
+ * @copyright 2011-2026 Nicola Asuni - Tecnick.com LTD
+ * @license   https://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
+ * @link      https://github.com/tecnickcom/tc-lib-unicode
+ *
+ * This file is part of tc-lib-unicode software library.
+ *
+ * Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
+ *         https://unicode.org/Public/15.1.0/ucd/IndicPositionalCategory.txt
+ * Unicode Standard version: 15.1
+ */
+
+namespace Com\Tecnick\Unicode\Data;
+
+/**
+ * Com\Tecnick\Unicode\Data\Devanagari
+ *
+ * Devanagari codepoint tables for character substitution and cluster
+ * reordering.
+ *
+ * @since     2026-04-30
+ * @category  Library
+ * @package   Unicode
+ * @author    Nicola Asuni <info@tecnick.com>
+ * @copyright 2011-2026 Nicola Asuni - Tecnick.com LTD
+ * @license   https://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
+ * @link      https://github.com/tecnickcom/tc-lib-unicode
+ */
+final class Devanagari
+{
+    /**
+     * Set (codepoint => true) of vowel signs with Indic Positional Category "Left".
+     *
+     * Stored after the base consonant (or cluster) in logical order but drawn to
+     * its LEFT in a PDF glyph stream, so substitution moves them before the cluster.
+     *
+     * Codepoints:
+     *   U+093F DEVANAGARI VOWEL SIGN I
+     * NOTE(review): the source file below also lists U+094E (VOWEL SIGN
+     * PRISHTHAMATRA E) as "Left"; confirm that omitting it here is intentional.
+     *
+     * Source: https://unicode.org/Public/15.1.0/ucd/IndicPositionalCategory.txt
+     *
+     * @var array<int, true>
+     */
+    public const LEFT_MATRAS = [
+        0x093F => true,
+    ];
+
+    /**
+     * Devanagari Virama (U+094D).
+     *
+     * Joins two consonants into a conjunct cluster. When scanning a consonant
+     * cluster for pre-base matra reordering, consecutive (consonant + VIRAMA)
+     * pairs extend the cluster.
+     *
+     * Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
+     */
+    public const VIRAMA = 0x094D;
+
+    /**
+     * First codepoint of the standard Devanagari consonant range.
+     *
+     * U+0915 DEVANAGARI LETTER KA
+     *
+     * Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
+     */
+    public const BASE_CONSONANT_FIRST = 0x0915;
+
+    /**
+     * Last codepoint of the standard Devanagari consonant range.
+     *
+     * U+0939 DEVANAGARI LETTER HA
+     *
+     * Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
+     */
+    public const BASE_CONSONANT_LAST = 0x0939;
+
+    /**
+     * First codepoint of the extended Devanagari consonant range (precomposed
+     * consonant + nukta letters, which Unicode excludes from canonical composition).
+     *
+     * U+0958 DEVANAGARI LETTER QA
+     *
+     * Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
+     */
+    public const BASE_CONSONANT_EXT_FIRST = 0x0958;
+
+    /**
+     * Last codepoint of the extended Devanagari consonant range.
+     *
+     * U+095F DEVANAGARI LETTER YYA
+     *
+     * Source: https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
+     */
+    public const BASE_CONSONANT_EXT_LAST = 0x095F;
+}
@@ -0,0 +1,124 @@
+<?php
+
+/**
+ * Hangul.php
+ *
+ * @since     2026-04-30
+ * @category  Library
+ * @package   Unicode
+ * @author    Nicola Asuni <info@tecnick.com>
+ * @copyright 2011-2026 Nicola Asuni - Tecnick.com LTD
+ * @license   https://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
+ * @link      https://github.com/tecnickcom/tc-lib-unicode
+ *
+ * This file is part of tc-lib-unicode software library.
+ *
+ * Source: The Unicode Standard, version 15.1, section 3.12 "Conjoining Jamo Behavior"
+ *         https://www.unicode.org/versions/Unicode15.1.0/
+ *         https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt
+ * Unicode Standard version: 15.1
+ */
+
+namespace Com\Tecnick\Unicode\Data;
+
+/**
+ * Com\Tecnick\Unicode\Data\Hangul
+ *
+ * Algorithmic constants for Hangul Jamo → precomposed Hangul syllable
+ * composition, as defined in section 3.12 of the Unicode Standard.
+ *
+ * Precomposed syllables occupy the range U+AC00–U+D7A3 and are derived by:
+ *
+ *   S = SBase + (L − LBase) × NCount + (V − VBase) × TCount + (T − TBase)
+ *
+ * where T = TBase means "no trailing consonant" (TBase itself is not a
+ * trailing consonant; the effective trailing index is 0 in that case).
+ *
+ * @since     2026-04-30
+ * @category  Library
+ * @package   Unicode
+ * @author    Nicola Asuni <info@tecnick.com>
+ * @copyright 2011-2026 Nicola Asuni - Tecnick.com LTD
+ * @license   https://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
+ * @link      https://github.com/tecnickcom/tc-lib-unicode
+ */
+final class Hangul
+{
+    /**
+     * First precomposed Hangul syllable: U+AC00 HANGUL SYLLABLE GA.
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const SBASE = 0xAC00;
+
+    /**
+     * First Hangul leading consonant (choseong): U+1100 HANGUL CHOSEONG KIYEOK.
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const LBASE = 0x1100;
+
+    /**
+     * First Hangul vowel (jungseong): U+1161 HANGUL JUNGSEONG A.
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const VBASE = 0x1161;
+
+    /**
+     * Trailing consonant base value: U+11A7.
+     *
+     * The first actual trailing consonant (jongseong) is U+11A8; TBase is
+     * one below that, so that (T − TBase) gives a 1-based index and a T of
+     * TBase itself encodes "no trailing consonant" (index 0).
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const TBASE = 0x11A7;
+
+    /**
+     * Number of leading consonants (19).
+     *
+     * Covers U+1100–U+1112 (LBASE through LBASE + 18).
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const LCOUNT = 19;
+
+    /**
+     * Number of vowels (21).
+     *
+     * Covers U+1161–U+1175 (VBASE through VBASE + 20).
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const VCOUNT = 21;
+
+    /**
+     * Number of trailing consonant slots (28), including the "none" slot.
+     *
+     * Effective trailing consonants: U+11A8–U+11C2 (27 codepoints, indices 1–27).
+     * Index 0 (T = TBASE) is the extra slot meaning "no trailing consonant".
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const TCOUNT = 28;
+
+    /**
+     * Number of precomposed syllables per leading consonant.
+     *
+     * NCount = VCount × TCount = 21 × 28 = 588.
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const NCOUNT = self::VCOUNT * self::TCOUNT;
+
+    /**
+     * Total number of precomposed Hangul syllables.
+     *
+     * SCount = LCount × NCount = 19 × 588 = 11172, i.e. U+AC00–U+D7A3.
+     *
+     * Source: Unicode Standard 15.1, section 3.12
+     */
+    public const SCOUNT = self::LCOUNT * self::NCOUNT;
+}
-Original file line number
+Diff line change
 rector.php
 target
 vendor
 +PLAN_*
Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,12 @@ public function chr(int $ord): string`
`52`	`52`	`*/`
`53`	`53`	`public function ord(string $chr): int`
`54`	`54`	`{`
`55`		`- $uni = \unpack('N', \mb_convert_encoding($chr, 'UCS-4BE', 'UTF-8'));`
	`55`	`+ $ucs = \mb_convert_encoding($chr, 'UCS-4BE', 'UTF-8');`
	`56`	`+ if (\strlen($ucs) < 4) {`
	`57`	`+ throw new UniException('Error converting string');`
	`58`	`+ }`
	`59`	`+`
	`60`	`+ $uni = \unpack('N', $ucs);`
`56`	`61`	`if (($uni === false) \|\| (!isset($uni[1])) \|\| (!\is_int($uni[1]))) {`
`57`	`62`	`throw new UniException('Error converting string');`
`58`	`63`	`}`