|
| 1 | +package rust |
| 2 | + |
| 3 | +import ( |
| 4 | + "fmt" |
| 5 | + "testing" |
| 6 | +) |
| 7 | + |
| 8 | +// TestEscapeRustParity validates escapeRune, EscapeString, EscapeCharAll, |
| 9 | +// isPrintable, and isGraphemeExtended against exact Rust 1.93 |
| 10 | +// char::escape_debug / str::escape_debug reference output. |
| 11 | +func TestEscapeRustParity(t *testing.T) { |
| 12 | + t.Parallel() |
| 13 | + |
| 14 | + // escapeRune with ESCAPE_ALL must match Rust char::escape_debug(). |
| 15 | + // Reference data generated by running Rust 1.93 char::escape_debug() |
| 16 | + // on each codepoint. |
| 17 | + t.Run("char_escape_debug", func(t *testing.T) { |
| 18 | + t.Parallel() |
| 19 | + ref := map[rune]string{ |
| 20 | + // C0 controls 0x00-0x1F |
| 21 | + 0x0000: `\0`, 0x0001: `\u{1}`, 0x0002: `\u{2}`, 0x0003: `\u{3}`, |
| 22 | + 0x0004: `\u{4}`, 0x0005: `\u{5}`, 0x0006: `\u{6}`, 0x0007: `\u{7}`, |
| 23 | + 0x0008: `\u{8}`, 0x0009: `\t`, 0x000A: `\n`, 0x000B: `\u{b}`, |
| 24 | + 0x000C: `\u{c}`, 0x000D: `\r`, 0x000E: `\u{e}`, 0x000F: `\u{f}`, |
| 25 | + 0x0010: `\u{10}`, 0x0011: `\u{11}`, 0x0012: `\u{12}`, 0x0013: `\u{13}`, |
| 26 | + 0x0014: `\u{14}`, 0x0015: `\u{15}`, 0x0016: `\u{16}`, 0x0017: `\u{17}`, |
| 27 | + 0x0018: `\u{18}`, 0x0019: `\u{19}`, 0x001A: `\u{1a}`, 0x001B: `\u{1b}`, |
| 28 | + 0x001C: `\u{1c}`, 0x001D: `\u{1d}`, 0x001E: `\u{1e}`, 0x001F: `\u{1f}`, |
| 29 | + // Special printable ASCII |
| 30 | + 0x0022: `\"`, 0x0027: `\'`, 0x005C: `\\`, |
| 31 | + // DEL |
| 32 | + 0x007F: `\u{7f}`, |
| 33 | + // C1 controls 0x80-0x9F |
| 34 | + 0x0080: `\u{80}`, 0x0081: `\u{81}`, 0x0082: `\u{82}`, 0x0083: `\u{83}`, |
| 35 | + 0x0084: `\u{84}`, 0x0085: `\u{85}`, 0x0086: `\u{86}`, 0x0087: `\u{87}`, |
| 36 | + 0x0088: `\u{88}`, 0x0089: `\u{89}`, 0x008A: `\u{8a}`, 0x008B: `\u{8b}`, |
| 37 | + 0x008C: `\u{8c}`, 0x008D: `\u{8d}`, 0x008E: `\u{8e}`, 0x008F: `\u{8f}`, |
| 38 | + 0x0090: `\u{90}`, 0x0091: `\u{91}`, 0x0092: `\u{92}`, 0x0093: `\u{93}`, |
| 39 | + 0x0094: `\u{94}`, 0x0095: `\u{95}`, 0x0096: `\u{96}`, 0x0097: `\u{97}`, |
| 40 | + 0x0098: `\u{98}`, 0x0099: `\u{99}`, 0x009A: `\u{9a}`, 0x009B: `\u{9b}`, |
| 41 | + 0x009C: `\u{9c}`, 0x009D: `\u{9d}`, 0x009E: `\u{9e}`, 0x009F: `\u{9f}`, |
| 42 | + // Latin-1 supplement 0xA0-0xBF |
| 43 | + 0x00A0: `\u{a0}`, |
| 44 | + 0x00A1: "\u00A1", 0x00A2: "\u00A2", 0x00A3: "\u00A3", |
| 45 | + 0x00A4: "\u00A4", 0x00A5: "\u00A5", 0x00A6: "\u00A6", 0x00A7: "\u00A7", |
| 46 | + 0x00A8: "\u00A8", 0x00A9: "\u00A9", 0x00AA: "\u00AA", 0x00AB: "\u00AB", |
| 47 | + 0x00AC: "\u00AC", |
| 48 | + 0x00AD: `\u{ad}`, |
| 49 | + 0x00AE: "\u00AE", 0x00AF: "\u00AF", |
| 50 | + 0x00B0: "\u00B0", 0x00B1: "\u00B1", 0x00B2: "\u00B2", 0x00B3: "\u00B3", |
| 51 | + 0x00B4: "\u00B4", 0x00B5: "\u00B5", 0x00B6: "\u00B6", 0x00B7: "\u00B7", |
| 52 | + 0x00B8: "\u00B8", 0x00B9: "\u00B9", 0x00BA: "\u00BA", 0x00BB: "\u00BB", |
| 53 | + 0x00BC: "\u00BC", 0x00BD: "\u00BD", 0x00BE: "\u00BE", 0x00BF: "\u00BF", |
| 54 | + // Combining marks (Mn) — grapheme extend → escaped |
| 55 | + 0x0300: `\u{300}`, 0x0301: `\u{301}`, 0x0302: `\u{302}`, |
| 56 | + // Combining marks (Me) — grapheme extend → escaped |
| 57 | + 0x0488: `\u{488}`, 0x0489: `\u{489}`, |
| 58 | + 0x1ABE: `\u{1abe}`, |
| 59 | + 0x20DD: `\u{20dd}`, 0x20DE: `\u{20de}`, |
| 60 | + // Spacing marks (Mc) — NOT grapheme extend, printable → pass through |
| 61 | + 0x0903: "\u0903", 0x093E: "\u093E", 0x0940: "\u0940", |
| 62 | + // Format characters (Cf) — not printable → escaped |
| 63 | + 0x200B: `\u{200b}`, 0x200C: `\u{200c}`, 0x200D: `\u{200d}`, |
| 64 | + 0x200E: `\u{200e}`, 0x200F: `\u{200f}`, |
| 65 | + 0x202A: `\u{202a}`, 0x202E: `\u{202e}`, |
| 66 | + 0x0600: `\u{600}`, 0x0601: `\u{601}`, 0x061C: `\u{61c}`, |
| 67 | + // Separators — not printable |
| 68 | + 0x2028: `\u{2028}`, 0x2029: `\u{2029}`, |
| 69 | + // Invisible operators |
| 70 | + 0x2060: `\u{2060}`, 0x2061: `\u{2061}`, |
| 71 | + // BOM |
| 72 | + 0xFEFF: `\u{feff}`, |
| 73 | + // Interlinear annotation |
| 74 | + 0xFFF9: `\u{fff9}`, 0xFFFA: `\u{fffa}`, 0xFFFB: `\u{fffb}`, |
| 75 | + // Specials |
| 76 | + 0xFFFC: "\uFFFC", 0xFFFD: "\uFFFD", |
| 77 | + 0xFFFE: `\u{fffe}`, 0xFFFF: `\u{ffff}`, |
| 78 | + // Halfwidth katakana (Lm, Other_Grapheme_Extend) |
| 79 | + 0xFF9E: `\u{ff9e}`, 0xFF9F: `\u{ff9f}`, |
| 80 | + // Unassigned |
| 81 | + 0x0378: `\u{378}`, 0x0379: `\u{379}`, |
| 82 | + // Hangul jungseong — printable, not grapheme extend |
| 83 | + 0x1160: "\u1160", 0x1161: "\u1161", |
| 84 | + // Tag characters — not printable |
| 85 | + 0xE0001: `\u{e0001}`, 0xE0020: `\u{e0020}`, 0xE007F: `\u{e007f}`, |
| 86 | + // Variation selectors supplement — grapheme extend → escaped |
| 87 | + 0xE0100: `\u{e0100}`, 0xE01EF: `\u{e01ef}`, |
| 88 | + // Past variation selectors — not printable |
| 89 | + 0xE01F0: `\u{e01f0}`, |
| 90 | + // SMP |
| 91 | + 0x10000: "\U00010000", 0x1000C: `\u{1000c}`, 0x1F600: "\U0001F600", |
| 92 | + // CJK extension B boundary |
| 93 | + 0x20000: "\U00020000", 0x2A6E0: `\u{2a6e0}`, |
| 94 | + } |
| 95 | + // Fill in all printable ASCII 0x20-0x7E |
| 96 | + for r := rune(0x20); r <= 0x7E; r++ { |
| 97 | + if _, ok := ref[r]; !ok { |
| 98 | + ref[r] = string(r) |
| 99 | + } |
| 100 | + } |
| 101 | + for r, want := range ref { |
| 102 | + got := escapeRune(r, true) |
| 103 | + if got != want { |
| 104 | + t.Errorf("U+%04X: got %q, want %q", r, got, want) |
| 105 | + } |
| 106 | + } |
| 107 | + }) |
| 108 | + |
| 109 | + // EscapeString must match Rust str::escape_debug(): |
| 110 | + // first char uses ESCAPE_ALL, continuation uses escape_grapheme_extended=false. |
| 111 | + t.Run("str_escape_debug", func(t *testing.T) { |
| 112 | + t.Parallel() |
| 113 | + tests := []struct { |
| 114 | + name string |
| 115 | + in string |
| 116 | + want string |
| 117 | + }{ |
| 118 | + {"hello", "hello", "hello"}, |
| 119 | + {"empty", "", ""}, |
| 120 | + {"null", "\x00", `\0`}, |
| 121 | + {"whitespace_controls", "\t\n\r", `\t\n\r`}, |
| 122 | + {"single_quote_mid", "a'b", `a\'b`}, |
| 123 | + {"single_quote_first", "'a", `\'a`}, |
| 124 | + {"double_quote_mid", "a\"b", `a\"b`}, |
| 125 | + {"backslash_mid", "a\\b", `a\\b`}, |
| 126 | + // Grapheme extend: first escaped, continuation raw |
| 127 | + {"combining_cont", "a\u0300", "a\u0300"}, |
| 128 | + {"combining_first", "\u0300a", `\u{300}` + "a"}, |
| 129 | + {"two_combiners_after_base", "a\u0300\u0301", "a\u0300\u0301"}, |
| 130 | + {"two_combiners_no_base", "\u0300\u0301", `\u{300}` + "\u0301"}, |
| 131 | + {"katakana_cont", "a\uFF9E", "a\uFF9E"}, |
| 132 | + {"katakana_first", "\uFF9Ea", `\u{ff9e}` + "a"}, |
| 133 | + {"enclosing_cont", "a\u20DD", "a\u20DD"}, |
| 134 | + {"enclosing_first", "\u20DDa", `\u{20dd}` + "a"}, |
| 135 | + // Mc (NOT grapheme extend) — always raw |
| 136 | + {"visarga_only", "\u0903", "\u0903"}, |
| 137 | + {"visarga_cont", "a\u0903", "a\u0903"}, |
| 138 | + // Non-printable — always escaped |
| 139 | + {"zwj_only", "\u200D", `\u{200d}`}, |
| 140 | + {"zwj_cont", "a\u200D", `a\u{200d}`}, |
| 141 | + {"zwnj_only", "\u200C", `\u{200c}`}, |
| 142 | + {"zwnj_cont", "a\u200C", `a\u{200c}`}, |
| 143 | + {"bom", "\uFEFF", `\u{feff}`}, |
| 144 | + {"bom_cont", "a\uFEFF", `a\u{feff}`}, |
| 145 | + {"e_acute", "\u00E9", "\u00E9"}, |
| 146 | + {"soft_hyphen", string(rune(0xAD)), `\u{ad}`}, |
| 147 | + {"soft_hyphen_cont", "a" + string(rune(0xAD)), `a\u{ad}`}, |
| 148 | + {"nbsp", string(rune(0xA0)), `\u{a0}`}, |
| 149 | + {"nbsp_cont", "a" + string(rune(0xA0)), `a\u{a0}`}, |
| 150 | + } |
| 151 | + for _, tc := range tests { |
| 152 | + t.Run(tc.name, func(t *testing.T) { |
| 153 | + t.Parallel() |
| 154 | + if got := EscapeString(tc.in); got != tc.want { |
| 155 | + t.Errorf("got %q, want %q", got, tc.want) |
| 156 | + } |
| 157 | + }) |
| 158 | + } |
| 159 | + }) |
| 160 | + |
| 161 | + // EscapeCharAll must escape grapheme extend in ALL positions |
| 162 | + // (matches per-char char::escape_debug used by Rust Cedar patterns). |
| 163 | + t.Run("char_all", func(t *testing.T) { |
| 164 | + t.Parallel() |
| 165 | + tests := []struct { |
| 166 | + name string |
| 167 | + in string |
| 168 | + want string |
| 169 | + }{ |
| 170 | + {"plain", "hello", "hello"}, |
| 171 | + {"combining_cont", "a\u0300", `a\u{300}`}, |
| 172 | + {"combining_first", "\u0300a", `\u{300}` + "a"}, |
| 173 | + {"katakana_cont", "a\uFF9E", `a\u{ff9e}`}, |
| 174 | + {"visarga", "a\u0903", "a\u0903"}, // Mc only → raw |
| 175 | + } |
| 176 | + for _, tc := range tests { |
| 177 | + t.Run(tc.name, func(t *testing.T) { |
| 178 | + t.Parallel() |
| 179 | + if got := EscapeCharAll(tc.in); got != tc.want { |
| 180 | + t.Errorf("got %q, want %q", got, tc.want) |
| 181 | + } |
| 182 | + }) |
| 183 | + } |
| 184 | + }) |
| 185 | + |
| 186 | + // Grapheme extend chars must be escaped as first char, raw in continuation. |
| 187 | + // Non-printable non-grapheme-extend chars must always be escaped. |
| 188 | + t.Run("first_vs_continuation", func(t *testing.T) { |
| 189 | + t.Parallel() |
| 190 | + for _, r := range []rune{0x0300, 0x0301, 0x20DD, 0xFF9E, 0xFF9F, 0x0488} { |
| 191 | + t.Run(fmt.Sprintf("grapheme_extend_U+%04X", r), func(t *testing.T) { |
| 192 | + t.Parallel() |
| 193 | + wantFirst := fmt.Sprintf(`\u{%x}`, r) |
| 194 | + if got := EscapeString(string(r)); got != wantFirst { |
| 195 | + t.Errorf("as first: got %q, want %q", got, wantFirst) |
| 196 | + } |
| 197 | + wantCont := "a" + string(r) |
| 198 | + if got := EscapeString("a" + string(r)); got != wantCont { |
| 199 | + t.Errorf("in continuation: got %q, want %q", got, wantCont) |
| 200 | + } |
| 201 | + }) |
| 202 | + } |
| 203 | + for _, r := range []rune{0x200D, 0x00AD, 0x0080, 0xFFFE} { |
| 204 | + t.Run(fmt.Sprintf("always_escaped_U+%04X", r), func(t *testing.T) { |
| 205 | + t.Parallel() |
| 206 | + wantEsc := fmt.Sprintf(`\u{%x}`, r) |
| 207 | + if got := EscapeString(string(r)); got != wantEsc { |
| 208 | + t.Errorf("as first: got %q, want %q", got, wantEsc) |
| 209 | + } |
| 210 | + if got := EscapeString("a" + string(r)); got != "a"+wantEsc { |
| 211 | + t.Errorf("in continuation: got %q, want %q", got, "a"+wantEsc) |
| 212 | + } |
| 213 | + }) |
| 214 | + } |
| 215 | + }) |
| 216 | + |
| 217 | + // isPrintable must match Rust's is_printable() lookup tables. |
| 218 | + t.Run("isPrintable", func(t *testing.T) { |
| 219 | + t.Parallel() |
| 220 | + tests := []struct { |
| 221 | + r rune |
| 222 | + want bool |
| 223 | + }{ |
| 224 | + // ASCII |
| 225 | + {0x00, false}, {0x1f, false}, {0x20, true}, {0x7e, true}, {0x7f, false}, |
| 226 | + // C1 controls |
| 227 | + {0x80, false}, {0x85, false}, {0x9f, false}, |
| 228 | + // BMP specials |
| 229 | + {0xa0, false}, {0xa1, true}, {0xad, false}, {0xae, true}, {0xe9, true}, |
| 230 | + {0x0300, true}, {0x0378, false}, {0x0903, true}, |
| 231 | + {0x200c, false}, {0x200d, false}, {0x20dd, true}, {0xff9e, true}, {0xfffe, false}, |
| 232 | + // SMP |
| 233 | + {0x10000, true}, {0x1f600, true}, {0x1000c, false}, {0x10100, true}, |
| 234 | + // Supplementary — hardcoded ranges |
| 235 | + {0x20000, true}, |
| 236 | + {0x2a6e0, false}, {0x2b81e, false}, {0x2ceae, false}, |
| 237 | + {0x2ebe1, false}, {0x2ee5e, false}, {0x2fa1e, false}, |
| 238 | + {0x3134b, false}, {0x3347a, false}, {0xe01f0, false}, |
| 239 | + {0xe0100, true}, |
| 240 | + } |
| 241 | + for _, tc := range tests { |
| 242 | + t.Run(fmt.Sprintf("U+%04X", tc.r), func(t *testing.T) { |
| 243 | + t.Parallel() |
| 244 | + if got := isPrintable(tc.r); got != tc.want { |
| 245 | + t.Errorf("got %v, want %v", got, tc.want) |
| 246 | + } |
| 247 | + }) |
| 248 | + } |
| 249 | + }) |
| 250 | + |
| 251 | + // isGraphemeExtended must match Unicode Grapheme_Extend = Mn + Me + Other_Grapheme_Extend. |
| 252 | + t.Run("isGraphemeExtended", func(t *testing.T) { |
| 253 | + t.Parallel() |
| 254 | + tests := []struct { |
| 255 | + r rune |
| 256 | + want bool |
| 257 | + }{ |
| 258 | + {'A', false}, {0x01, false}, |
| 259 | + {0x0300, true}, {0x0301, true}, // Mn |
| 260 | + {0x0488, true}, {0x1ABE, true}, {0x20DD, true}, // Me |
| 261 | + {0x0903, false}, {0x093E, false}, // Mc only |
| 262 | + {0x200C, true}, // Other_Grapheme_Extend |
| 263 | + {0x200D, false}, // Join_Control, NOT Grapheme_Extend |
| 264 | + {0xFF9E, true}, {0xFF9F, true}, // Other_Grapheme_Extend |
| 265 | + } |
| 266 | + for _, tc := range tests { |
| 267 | + t.Run(fmt.Sprintf("U+%04X", tc.r), func(t *testing.T) { |
| 268 | + t.Parallel() |
| 269 | + if got := isGraphemeExtended(tc.r); got != tc.want { |
| 270 | + t.Errorf("got %v, want %v", got, tc.want) |
| 271 | + } |
| 272 | + }) |
| 273 | + } |
| 274 | + }) |
| 275 | + |
| 276 | + // Round-trip: EscapeString/EscapeCharAll → Unquote must recover the original. |
| 277 | + t.Run("round_trip", func(t *testing.T) { |
| 278 | + t.Parallel() |
| 279 | + inputs := []string{ |
| 280 | + "", "hello", "a\tb\nc\r\x00", "a'b\"c\\d", |
| 281 | + "a\u0300b", "\u0300a", "a\u0300\u0301", |
| 282 | + "\u0903", "a\uFF9Eb", "\u20DDa", |
| 283 | + string([]rune{0x01, 0x7f, 0x80, 0x9f}), |
| 284 | + "\u00ad", "\u200c\u200d", "\ufffe", |
| 285 | + "\U0001F600", "\U00010000", |
| 286 | + string([]rune{0x0300, 'a', 0xFF9E, 'b', 0x20DD}), |
| 287 | + string([]rune{0xa0, 0xa1, 0xad, 0xae, 0xbf}), |
| 288 | + string([]rune{0x200b, 0x200c, 0x200d, 0x200e, 0x200f}), |
| 289 | + string([]rune{0xe0001, 0xe0020, 0xe007f}), |
| 290 | + // Go-style escapes that Cedar doesn't support (\a, \b, \f, \v) |
| 291 | + "\a\b\f\v", |
| 292 | + // All ASCII controls |
| 293 | + string([]rune{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0b, 0x0c, 0x0e, 0x0f}), |
| 294 | + // Mixed printable and non-printable |
| 295 | + "hello\x00world\tfoo\nbar", |
| 296 | + } |
| 297 | + for _, in := range inputs { |
| 298 | + for _, mode := range []struct { |
| 299 | + name string |
| 300 | + fn func(string) string |
| 301 | + }{ |
| 302 | + {"EscapeString", EscapeString}, |
| 303 | + {"EscapeCharAll", EscapeCharAll}, |
| 304 | + } { |
| 305 | + escaped := mode.fn(in) |
| 306 | + unescaped, _, err := Unquote([]byte(escaped), false) |
| 307 | + if err != nil { |
| 308 | + t.Errorf("%s(%q): Unquote failed: %v (escaped=%q)", mode.name, in, err, escaped) |
| 309 | + } else if unescaped != in { |
| 310 | + t.Errorf("%s round-trip: %q → %q → %q", mode.name, in, escaped, unescaped) |
| 311 | + } |
| 312 | + } |
| 313 | + } |
| 314 | + }) |
| 315 | +} |
0 commit comments