🔤 Update Unicode scanner to v2026.03.0

dragonfire1119 · dragonfire1119 · commit f32e56e8bf46 · 2026-03-07T18:12:08.000-06:00
refactor: Expand common Unicode exclusions and improve
documentation

Update version to 2026.03.0 across all references. Enhance the
--exclude-common flag to cover additional typography characters
including soft hyphens, superscripts, subscripts, Roman numerals,
combining diacritical marks, and the replacement character. These
additions reduce false positives in documentation and markdown
repositories while maintaining security against actual threats.
Improve help text clarity for the --exclude-common option.
diff --git a/check-for-unicode/run.sh b/check-for-unicode/run.sh
@@ -1,11 +1,11 @@
 #!/usr/bin/env bash
 
-# Unicode Security Scanner v2025.11.0 AI+
+# Unicode Security Scanner v2026.03.0 AI+
 # Detects dangerous Unicode characters that can be used in security attacks
 # Including Trojan Source attacks (CVE-2021-42574) and other invisible characters
 
 # Script configuration
-VERSION="2025.11.0"
+VERSION="2026.03.0"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 # Command-line options (defaults)
@@ -53,7 +53,9 @@ OPTIONS:
                         (comma-separated, e.g., "critical,high")
     --allowlist FILE    Path to allowlist file (default: .unicode-allowlist)
     --exclude-emojis    Exclude emoji characters and variation selectors (reduces false positives)
-    --exclude-common    Exclude common Unicode like smart quotes, dashes (very permissive)
+    --exclude-common    Exclude common Unicode typography: smart quotes, dashes,
+                        ellipsis, superscripts, subscripts, Roman numerals,
+                        combining accents (recommended for docs/markdown repos)
     --include-binary    Include binary files (archives, images, executables, etc.)
                         By default, only text files are scanned to avoid false positives
 
@@ -202,6 +204,19 @@ is_common_unicode() {
     [[ "$unicode_code" =~ ^203[9A]$ ]] && return 0
     # Per mille: U+2030
     [[ "$unicode_code" == "2030" ]] && return 0
+    # Soft Hyphen: U+00AD (invisible formatting hint, harmless in HTML/markdown)
+    [[ "$unicode_code" == "00AD" ]] && return 0
+    # Superscripts: U+00B2 (²), U+00B3 (³), U+00B9 (¹), U+2070-U+2079 (range also spans U+2071 ⁱ and unassigned U+2072-U+2073)
+    [[ "$unicode_code" =~ ^00B[239]$ ]] && return 0
+    [[ "$unicode_code" =~ ^207[0-9]$ ]] && return 0
+    # Subscript digits ₀-₄: U+2080-U+2084 (NOTE: U+2085-U+2089, subscripts ₅-₉, are not matched here)
+    [[ "$unicode_code" =~ ^208[0-4]$ ]] && return 0
+    # Roman numerals: U+2160-U+217F (used in outlines, legal docs, lists)
+    [[ "$unicode_code" =~ ^21[67][0-9A-F]$ ]] && return 0
+    # Combining diacritical marks (U+0300-U+030C) — used in accented Latin text (French, Spanish, etc.)
+    [[ "$unicode_code" =~ ^030[0-9A-C]$ ]] && return 0
+    # Replacement Character U+FFFD — appears naturally from encoding tools/editors, not a security threat
+    [[ "$unicode_code" == "FFFD" ]] && return 0
     return 1
 }