Skip to content

Commit 388bb82

Browse files
philhassey and claude committed
Add exhaustive escape_debug parity tests against Rust 1.93 reference
Validate escapeRune, EscapeString, EscapeCharAll, isPrintable, and isGraphemeExtended against exact Rust 1.93 char::escape_debug() and str::escape_debug() output for 160+ codepoints covering:

- All C0/C1 controls, full Latin-1 supplement boundary (0xA0-0xBF)
- Combining marks (Mn, Me, Mc), format chars, separators, BOM
- Halfwidth katakana, hangul jungseong, tag chars, variation selectors
- SMP and supplementary plane boundaries
- str first-vs-continuation grapheme extend distinction
- Round-trip correctness through Unquote for both escape modes

Signed-off-by: Phil Hassey <phil@strongdm.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 609b143 commit 388bb82

File tree

1 file changed

+315
-0
lines changed

1 file changed

+315
-0
lines changed

internal/rust/exhaustive_test.go

Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
package rust
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
)
7+
8+
// TestEscapeRustParity validates escapeRune, EscapeString, EscapeCharAll,
9+
// isPrintable, and isGraphemeExtended against exact Rust 1.93
10+
// char::escape_debug / str::escape_debug reference output.
11+
func TestEscapeRustParity(t *testing.T) {
12+
t.Parallel()
13+
14+
// escapeRune with ESCAPE_ALL must match Rust char::escape_debug().
15+
// Reference data generated by running Rust 1.93 char::escape_debug()
16+
// on each codepoint.
17+
t.Run("char_escape_debug", func(t *testing.T) {
18+
t.Parallel()
19+
ref := map[rune]string{
20+
// C0 controls 0x00-0x1F
21+
0x0000: `\0`, 0x0001: `\u{1}`, 0x0002: `\u{2}`, 0x0003: `\u{3}`,
22+
0x0004: `\u{4}`, 0x0005: `\u{5}`, 0x0006: `\u{6}`, 0x0007: `\u{7}`,
23+
0x0008: `\u{8}`, 0x0009: `\t`, 0x000A: `\n`, 0x000B: `\u{b}`,
24+
0x000C: `\u{c}`, 0x000D: `\r`, 0x000E: `\u{e}`, 0x000F: `\u{f}`,
25+
0x0010: `\u{10}`, 0x0011: `\u{11}`, 0x0012: `\u{12}`, 0x0013: `\u{13}`,
26+
0x0014: `\u{14}`, 0x0015: `\u{15}`, 0x0016: `\u{16}`, 0x0017: `\u{17}`,
27+
0x0018: `\u{18}`, 0x0019: `\u{19}`, 0x001A: `\u{1a}`, 0x001B: `\u{1b}`,
28+
0x001C: `\u{1c}`, 0x001D: `\u{1d}`, 0x001E: `\u{1e}`, 0x001F: `\u{1f}`,
29+
// Special printable ASCII
30+
0x0022: `\"`, 0x0027: `\'`, 0x005C: `\\`,
31+
// DEL
32+
0x007F: `\u{7f}`,
33+
// C1 controls 0x80-0x9F
34+
0x0080: `\u{80}`, 0x0081: `\u{81}`, 0x0082: `\u{82}`, 0x0083: `\u{83}`,
35+
0x0084: `\u{84}`, 0x0085: `\u{85}`, 0x0086: `\u{86}`, 0x0087: `\u{87}`,
36+
0x0088: `\u{88}`, 0x0089: `\u{89}`, 0x008A: `\u{8a}`, 0x008B: `\u{8b}`,
37+
0x008C: `\u{8c}`, 0x008D: `\u{8d}`, 0x008E: `\u{8e}`, 0x008F: `\u{8f}`,
38+
0x0090: `\u{90}`, 0x0091: `\u{91}`, 0x0092: `\u{92}`, 0x0093: `\u{93}`,
39+
0x0094: `\u{94}`, 0x0095: `\u{95}`, 0x0096: `\u{96}`, 0x0097: `\u{97}`,
40+
0x0098: `\u{98}`, 0x0099: `\u{99}`, 0x009A: `\u{9a}`, 0x009B: `\u{9b}`,
41+
0x009C: `\u{9c}`, 0x009D: `\u{9d}`, 0x009E: `\u{9e}`, 0x009F: `\u{9f}`,
42+
// Latin-1 supplement 0xA0-0xBF
43+
0x00A0: `\u{a0}`,
44+
0x00A1: "\u00A1", 0x00A2: "\u00A2", 0x00A3: "\u00A3",
45+
0x00A4: "\u00A4", 0x00A5: "\u00A5", 0x00A6: "\u00A6", 0x00A7: "\u00A7",
46+
0x00A8: "\u00A8", 0x00A9: "\u00A9", 0x00AA: "\u00AA", 0x00AB: "\u00AB",
47+
0x00AC: "\u00AC",
48+
0x00AD: `\u{ad}`,
49+
0x00AE: "\u00AE", 0x00AF: "\u00AF",
50+
0x00B0: "\u00B0", 0x00B1: "\u00B1", 0x00B2: "\u00B2", 0x00B3: "\u00B3",
51+
0x00B4: "\u00B4", 0x00B5: "\u00B5", 0x00B6: "\u00B6", 0x00B7: "\u00B7",
52+
0x00B8: "\u00B8", 0x00B9: "\u00B9", 0x00BA: "\u00BA", 0x00BB: "\u00BB",
53+
0x00BC: "\u00BC", 0x00BD: "\u00BD", 0x00BE: "\u00BE", 0x00BF: "\u00BF",
54+
// Combining marks (Mn) — grapheme extend → escaped
55+
0x0300: `\u{300}`, 0x0301: `\u{301}`, 0x0302: `\u{302}`,
56+
// Combining marks (Me) — grapheme extend → escaped
57+
0x0488: `\u{488}`, 0x0489: `\u{489}`,
58+
0x1ABE: `\u{1abe}`,
59+
0x20DD: `\u{20dd}`, 0x20DE: `\u{20de}`,
60+
// Spacing marks (Mc) — NOT grapheme extend, printable → pass through
61+
0x0903: "\u0903", 0x093E: "\u093E", 0x0940: "\u0940",
62+
// Format characters (Cf) — not printable → escaped
63+
0x200B: `\u{200b}`, 0x200C: `\u{200c}`, 0x200D: `\u{200d}`,
64+
0x200E: `\u{200e}`, 0x200F: `\u{200f}`,
65+
0x202A: `\u{202a}`, 0x202E: `\u{202e}`,
66+
0x0600: `\u{600}`, 0x0601: `\u{601}`, 0x061C: `\u{61c}`,
67+
// Separators — not printable
68+
0x2028: `\u{2028}`, 0x2029: `\u{2029}`,
69+
// Invisible operators
70+
0x2060: `\u{2060}`, 0x2061: `\u{2061}`,
71+
// BOM
72+
0xFEFF: `\u{feff}`,
73+
// Interlinear annotation
74+
0xFFF9: `\u{fff9}`, 0xFFFA: `\u{fffa}`, 0xFFFB: `\u{fffb}`,
75+
// Specials
76+
0xFFFC: "\uFFFC", 0xFFFD: "\uFFFD",
77+
0xFFFE: `\u{fffe}`, 0xFFFF: `\u{ffff}`,
78+
// Halfwidth katakana (Lm, Other_Grapheme_Extend)
79+
0xFF9E: `\u{ff9e}`, 0xFF9F: `\u{ff9f}`,
80+
// Unassigned
81+
0x0378: `\u{378}`, 0x0379: `\u{379}`,
82+
// Hangul jungseong — printable, not grapheme extend
83+
0x1160: "\u1160", 0x1161: "\u1161",
84+
// Tag characters — not printable
85+
0xE0001: `\u{e0001}`, 0xE0020: `\u{e0020}`, 0xE007F: `\u{e007f}`,
86+
// Variation selectors supplement — grapheme extend → escaped
87+
0xE0100: `\u{e0100}`, 0xE01EF: `\u{e01ef}`,
88+
// Past variation selectors — not printable
89+
0xE01F0: `\u{e01f0}`,
90+
// SMP
91+
0x10000: "\U00010000", 0x1000C: `\u{1000c}`, 0x1F600: "\U0001F600",
92+
// CJK extension B boundary
93+
0x20000: "\U00020000", 0x2A6E0: `\u{2a6e0}`,
94+
}
95+
// Fill in all printable ASCII 0x20-0x7E
96+
for r := rune(0x20); r <= 0x7E; r++ {
97+
if _, ok := ref[r]; !ok {
98+
ref[r] = string(r)
99+
}
100+
}
101+
for r, want := range ref {
102+
got := escapeRune(r, true)
103+
if got != want {
104+
t.Errorf("U+%04X: got %q, want %q", r, got, want)
105+
}
106+
}
107+
})
108+
109+
// EscapeString must match Rust str::escape_debug():
110+
// first char uses ESCAPE_ALL, continuation uses escape_grapheme_extended=false.
111+
t.Run("str_escape_debug", func(t *testing.T) {
112+
t.Parallel()
113+
tests := []struct {
114+
name string
115+
in string
116+
want string
117+
}{
118+
{"hello", "hello", "hello"},
119+
{"empty", "", ""},
120+
{"null", "\x00", `\0`},
121+
{"whitespace_controls", "\t\n\r", `\t\n\r`},
122+
{"single_quote_mid", "a'b", `a\'b`},
123+
{"single_quote_first", "'a", `\'a`},
124+
{"double_quote_mid", "a\"b", `a\"b`},
125+
{"backslash_mid", "a\\b", `a\\b`},
126+
// Grapheme extend: first escaped, continuation raw
127+
{"combining_cont", "a\u0300", "a\u0300"},
128+
{"combining_first", "\u0300a", `\u{300}` + "a"},
129+
{"two_combiners_after_base", "a\u0300\u0301", "a\u0300\u0301"},
130+
{"two_combiners_no_base", "\u0300\u0301", `\u{300}` + "\u0301"},
131+
{"katakana_cont", "a\uFF9E", "a\uFF9E"},
132+
{"katakana_first", "\uFF9Ea", `\u{ff9e}` + "a"},
133+
{"enclosing_cont", "a\u20DD", "a\u20DD"},
134+
{"enclosing_first", "\u20DDa", `\u{20dd}` + "a"},
135+
// Mc (NOT grapheme extend) — always raw
136+
{"visarga_only", "\u0903", "\u0903"},
137+
{"visarga_cont", "a\u0903", "a\u0903"},
138+
// Non-printable — always escaped
139+
{"zwj_only", "\u200D", `\u{200d}`},
140+
{"zwj_cont", "a\u200D", `a\u{200d}`},
141+
{"zwnj_only", "\u200C", `\u{200c}`},
142+
{"zwnj_cont", "a\u200C", `a\u{200c}`},
143+
{"bom", "\uFEFF", `\u{feff}`},
144+
{"bom_cont", "a\uFEFF", `a\u{feff}`},
145+
{"e_acute", "\u00E9", "\u00E9"},
146+
{"soft_hyphen", string(rune(0xAD)), `\u{ad}`},
147+
{"soft_hyphen_cont", "a" + string(rune(0xAD)), `a\u{ad}`},
148+
{"nbsp", string(rune(0xA0)), `\u{a0}`},
149+
{"nbsp_cont", "a" + string(rune(0xA0)), `a\u{a0}`},
150+
}
151+
for _, tc := range tests {
152+
t.Run(tc.name, func(t *testing.T) {
153+
t.Parallel()
154+
if got := EscapeString(tc.in); got != tc.want {
155+
t.Errorf("got %q, want %q", got, tc.want)
156+
}
157+
})
158+
}
159+
})
160+
161+
// EscapeCharAll must escape grapheme extend in ALL positions
162+
// (matches per-char char::escape_debug used by Rust Cedar patterns).
163+
t.Run("char_all", func(t *testing.T) {
164+
t.Parallel()
165+
tests := []struct {
166+
name string
167+
in string
168+
want string
169+
}{
170+
{"plain", "hello", "hello"},
171+
{"combining_cont", "a\u0300", `a\u{300}`},
172+
{"combining_first", "\u0300a", `\u{300}` + "a"},
173+
{"katakana_cont", "a\uFF9E", `a\u{ff9e}`},
174+
{"visarga", "a\u0903", "a\u0903"}, // Mc only → raw
175+
}
176+
for _, tc := range tests {
177+
t.Run(tc.name, func(t *testing.T) {
178+
t.Parallel()
179+
if got := EscapeCharAll(tc.in); got != tc.want {
180+
t.Errorf("got %q, want %q", got, tc.want)
181+
}
182+
})
183+
}
184+
})
185+
186+
// Grapheme extend chars must be escaped as first char, raw in continuation.
187+
// Non-printable non-grapheme-extend chars must always be escaped.
188+
t.Run("first_vs_continuation", func(t *testing.T) {
189+
t.Parallel()
190+
for _, r := range []rune{0x0300, 0x0301, 0x20DD, 0xFF9E, 0xFF9F, 0x0488} {
191+
t.Run(fmt.Sprintf("grapheme_extend_U+%04X", r), func(t *testing.T) {
192+
t.Parallel()
193+
wantFirst := fmt.Sprintf(`\u{%x}`, r)
194+
if got := EscapeString(string(r)); got != wantFirst {
195+
t.Errorf("as first: got %q, want %q", got, wantFirst)
196+
}
197+
wantCont := "a" + string(r)
198+
if got := EscapeString("a" + string(r)); got != wantCont {
199+
t.Errorf("in continuation: got %q, want %q", got, wantCont)
200+
}
201+
})
202+
}
203+
for _, r := range []rune{0x200D, 0x00AD, 0x0080, 0xFFFE} {
204+
t.Run(fmt.Sprintf("always_escaped_U+%04X", r), func(t *testing.T) {
205+
t.Parallel()
206+
wantEsc := fmt.Sprintf(`\u{%x}`, r)
207+
if got := EscapeString(string(r)); got != wantEsc {
208+
t.Errorf("as first: got %q, want %q", got, wantEsc)
209+
}
210+
if got := EscapeString("a" + string(r)); got != "a"+wantEsc {
211+
t.Errorf("in continuation: got %q, want %q", got, "a"+wantEsc)
212+
}
213+
})
214+
}
215+
})
216+
217+
// isPrintable must match Rust's is_printable() lookup tables.
218+
t.Run("isPrintable", func(t *testing.T) {
219+
t.Parallel()
220+
tests := []struct {
221+
r rune
222+
want bool
223+
}{
224+
// ASCII
225+
{0x00, false}, {0x1f, false}, {0x20, true}, {0x7e, true}, {0x7f, false},
226+
// C1 controls
227+
{0x80, false}, {0x85, false}, {0x9f, false},
228+
// BMP specials
229+
{0xa0, false}, {0xa1, true}, {0xad, false}, {0xae, true}, {0xe9, true},
230+
{0x0300, true}, {0x0378, false}, {0x0903, true},
231+
{0x200c, false}, {0x200d, false}, {0x20dd, true}, {0xff9e, true}, {0xfffe, false},
232+
// SMP
233+
{0x10000, true}, {0x1f600, true}, {0x1000c, false}, {0x10100, true},
234+
// Supplementary — hardcoded ranges
235+
{0x20000, true},
236+
{0x2a6e0, false}, {0x2b81e, false}, {0x2ceae, false},
237+
{0x2ebe1, false}, {0x2ee5e, false}, {0x2fa1e, false},
238+
{0x3134b, false}, {0x3347a, false}, {0xe01f0, false},
239+
{0xe0100, true},
240+
}
241+
for _, tc := range tests {
242+
t.Run(fmt.Sprintf("U+%04X", tc.r), func(t *testing.T) {
243+
t.Parallel()
244+
if got := isPrintable(tc.r); got != tc.want {
245+
t.Errorf("got %v, want %v", got, tc.want)
246+
}
247+
})
248+
}
249+
})
250+
251+
// isGraphemeExtended must match Unicode Grapheme_Extend = Mn + Me + Other_Grapheme_Extend.
252+
t.Run("isGraphemeExtended", func(t *testing.T) {
253+
t.Parallel()
254+
tests := []struct {
255+
r rune
256+
want bool
257+
}{
258+
{'A', false}, {0x01, false},
259+
{0x0300, true}, {0x0301, true}, // Mn
260+
{0x0488, true}, {0x1ABE, true}, {0x20DD, true}, // Me
261+
{0x0903, false}, {0x093E, false}, // Mc only
262+
{0x200C, true}, // Other_Grapheme_Extend
263+
{0x200D, false}, // Join_Control, NOT Grapheme_Extend
264+
{0xFF9E, true}, {0xFF9F, true}, // Other_Grapheme_Extend
265+
}
266+
for _, tc := range tests {
267+
t.Run(fmt.Sprintf("U+%04X", tc.r), func(t *testing.T) {
268+
t.Parallel()
269+
if got := isGraphemeExtended(tc.r); got != tc.want {
270+
t.Errorf("got %v, want %v", got, tc.want)
271+
}
272+
})
273+
}
274+
})
275+
276+
// Round-trip: EscapeString/EscapeCharAll → Unquote must recover the original.
277+
t.Run("round_trip", func(t *testing.T) {
278+
t.Parallel()
279+
inputs := []string{
280+
"", "hello", "a\tb\nc\r\x00", "a'b\"c\\d",
281+
"a\u0300b", "\u0300a", "a\u0300\u0301",
282+
"\u0903", "a\uFF9Eb", "\u20DDa",
283+
string([]rune{0x01, 0x7f, 0x80, 0x9f}),
284+
"\u00ad", "\u200c\u200d", "\ufffe",
285+
"\U0001F600", "\U00010000",
286+
string([]rune{0x0300, 'a', 0xFF9E, 'b', 0x20DD}),
287+
string([]rune{0xa0, 0xa1, 0xad, 0xae, 0xbf}),
288+
string([]rune{0x200b, 0x200c, 0x200d, 0x200e, 0x200f}),
289+
string([]rune{0xe0001, 0xe0020, 0xe007f}),
290+
// Go-style escapes that Cedar doesn't support (\a, \b, \f, \v)
291+
"\a\b\f\v",
292+
// All ASCII controls
293+
string([]rune{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0b, 0x0c, 0x0e, 0x0f}),
294+
// Mixed printable and non-printable
295+
"hello\x00world\tfoo\nbar",
296+
}
297+
for _, in := range inputs {
298+
for _, mode := range []struct {
299+
name string
300+
fn func(string) string
301+
}{
302+
{"EscapeString", EscapeString},
303+
{"EscapeCharAll", EscapeCharAll},
304+
} {
305+
escaped := mode.fn(in)
306+
unescaped, _, err := Unquote([]byte(escaped), false)
307+
if err != nil {
308+
t.Errorf("%s(%q): Unquote failed: %v (escaped=%q)", mode.name, in, err, escaped)
309+
} else if unescaped != in {
310+
t.Errorf("%s round-trip: %q → %q → %q", mode.name, in, escaped, unescaped)
311+
}
312+
}
313+
}
314+
})
315+
}

0 commit comments

Comments
 (0)