|
1 | 1 | #!/usr/bin/env bash |
2 | 2 |
|
3 | | -# Unicode Security Scanner v2025.11.0 AI+ |
| 3 | +# Unicode Security Scanner v2026.03.0 AI+ |
4 | 4 | # Detects dangerous Unicode characters that can be used in security attacks |
5 | 5 | # Including Trojan Source attacks (CVE-2021-42574) and other invisible characters |
6 | 6 |
|
7 | 7 | # Script configuration |
8 | | -VERSION="2025.11.0" |
| 8 | +VERSION="2026.03.0" |
9 | 9 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" |
10 | 10 |
|
11 | 11 | # Command-line options (defaults) |
@@ -53,7 +53,9 @@ OPTIONS: |
53 | 53 | (comma-separated, e.g., "critical,high") |
54 | 54 | --allowlist FILE Path to allowlist file (default: .unicode-allowlist) |
55 | 55 | --exclude-emojis Exclude emoji characters and variation selectors (reduces false positives) |
56 | | - --exclude-common Exclude common Unicode like smart quotes, dashes (very permissive) |
| 56 | + --exclude-common Exclude common Unicode typography: smart quotes, dashes, |
| 57 | + ellipsis, superscripts, subscripts, Roman numerals, |
| 58 | + combining accents (recommended for docs/markdown repos) |
57 | 59 | --include-binary Include binary files (archives, images, executables, etc.) |
58 | 60 | By default, only text files are scanned to avoid false positives |
59 | 61 |
|
@@ -202,6 +204,19 @@ is_common_unicode() { |
202 | 204 | [[ "$unicode_code" =~ ^203[9A]$ ]] && return 0 |
203 | 205 | # Per mille: U+2030 |
204 | 206 | [[ "$unicode_code" == "2030" ]] && return 0 |
| 207 | + # Soft Hyphen: U+00AD (invisible formatting hint, harmless in HTML/markdown) |
| 208 | + [[ "$unicode_code" == "00AD" ]] && return 0 |
| 209 | + # Superscript digits: U+00B2 (²), U+00B3 (³), U+00B9 (¹), U+2070-U+2079 |
| 210 | + [[ "$unicode_code" =~ ^00B[239]$ ]] && return 0 |
| 211 | + [[ "$unicode_code" =~ ^207[0-9]$ ]] && return 0 |
| 212 | + # Subscript digits 0-4 only: U+2080-U+2084 (U+2085-U+2089, subscripts 5-9, are not matched by the pattern below) |
| 213 | + [[ "$unicode_code" =~ ^208[0-4]$ ]] && return 0 |
| 214 | + # Roman numerals: U+2160-U+217F (used in outlines, legal docs, lists) |
| 215 | + [[ "$unicode_code" =~ ^21[67][0-9A-F]$ ]] && return 0 |
| 216 | + # Combining diacritical marks (U+0300-U+030C) — used in accented Latin text (French, Spanish, etc.) |
| 217 | + [[ "$unicode_code" =~ ^030[0-9A-C]$ ]] && return 0 |
| 218 | + # Replacement Character U+FFFD — appears naturally from encoding tools/editors, not a security threat |
| 219 | + [[ "$unicode_code" == "FFFD" ]] && return 0 |
205 | 220 | return 1 |
206 | 221 | } |
207 | 222 |
|
|
0 commit comments