Skip to content

Commit 2abee60

Browse files
authored
Merge pull request #2 from carpentry-org/claude/fix-invalid-escape-sequences
Fix silent acceptance of invalid escape sequences in string parser
2 parents 267bc57 + bf9efb3 commit 2abee60

3 files changed

Lines changed: 67 additions & 29 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ the project follows [Semantic Versioning](https://semver.org/).
55

66
## Unreleased
77

8+
- String parser now rejects invalid escape sequences (e.g. `\z`,
9+
`\0`) with a parse error instead of silently emitting raw bytes.
10+
`\u` and `\U` escapes also validate that the required hex digits
11+
are actually hex characters.
812
- `Form.str` uses `StringBuf` for `join-children`, `join-segments`,
913
and `escape-string`, replacing O(n²) `String.append` loops with
1014
amortised O(n) appends. Adds a dependency on `strbuf@0.1.0`.

carp-reader.carp

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -528,17 +528,17 @@ there is no exponent here.")
528528
(let-do [c (Cursor.advance cur \")
529529
bytes (the (Array Byte) [])
530530
going true
531-
err false]
532-
(while-do (and going (not err))
531+
err @""]
532+
(while-do (and going (Int.= 0 (String.length &err)))
533533
(if (Int.>= @(Cursor.pos &c) len)
534-
(set! err true)
534+
(set! err @"closing quote")
535535
(let [ch (String.char-at src @(Cursor.pos &c))]
536536
(cond
537537
(Char.= ch \")
538538
(do (set! c (Cursor.advance &c ch)) (set! going false))
539539
(Char.= ch \\)
540540
(if (Int.>= (Int.+ @(Cursor.pos &c) 1) len)
541-
(set! err true)
541+
(set! err @"closing quote")
542542
(let [esc (String.char-at src
543543
(Int.+ @(Cursor.pos &c) 1))]
544544
(cond
@@ -600,40 +600,49 @@ there is no exponent here.")
600600
(Int.- end
601601
@(Cursor.pos &c))
602602
false)))
603-
(set! err true)))
603+
(set! err @"hex digits after \\x")))
604604
(Char.= esc \u)
605605
(if (Int.>= (Int.+ @(Cursor.pos &c) 6)
606606
(Int.inc len))
607-
(set! err true)
608-
(do
609-
(set! bytes
610-
(push-utf8-bytes bytes
611-
(read-hex src
612-
(Int.+ @(Cursor.pos &c)
613-
2)
614-
4)))
615-
(set! c (Cursor.advance-by &c 6 false))))
607+
(set! err @"4 hex digits in \\u escape")
608+
(let-do [hex-start (Int.+ @(Cursor.pos &c) 2)
609+
all-hex true]
610+
(for [j hex-start (Int.+ hex-start 4)]
611+
(unless (hex-digit? (String.char-at src j))
612+
(set! all-hex false)))
613+
(if all-hex
614+
(do
615+
(set! bytes
616+
(push-utf8-bytes bytes
617+
(read-hex src
618+
hex-start
619+
4)))
620+
(set! c (Cursor.advance-by &c 6 false)))
621+
(set! err @"hex digits in \\u escape"))))
616622
(Char.= esc \U)
617623
(if (Int.>= (Int.+ @(Cursor.pos &c) 10)
618624
(Int.inc len))
619-
(set! err true)
620-
(do
621-
(set! bytes
622-
(push-utf8-bytes bytes
623-
(read-hex src
624-
(Int.+ @(Cursor.pos &c)
625-
2)
626-
8)))
627-
(set! c (Cursor.advance-by &c 10 false))))
628-
(do
629-
(set! bytes (push-byte bytes \\))
630-
(set! bytes (push-byte bytes esc))
631-
(set! c (Cursor.advance-by &c 2 false))))))
625+
(set! err @"8 hex digits in \\U escape")
626+
(let-do [hex-start (Int.+ @(Cursor.pos &c) 2)
627+
all-hex true]
628+
(for [j hex-start (Int.+ hex-start 8)]
629+
(unless (hex-digit? (String.char-at src j))
630+
(set! all-hex false)))
631+
(if all-hex
632+
(do
633+
(set! bytes
634+
(push-utf8-bytes bytes
635+
(read-hex src
636+
hex-start
637+
8)))
638+
(set! c (Cursor.advance-by &c 10 false)))
639+
(set! err @"hex digits in \\U escape"))))
640+
(set! err @"valid escape sequence"))))
632641
(do
633642
(set! bytes (push-byte bytes ch))
634643
(set! c (Cursor.advance &c ch)))))))
635-
(if err
636-
(Reply.ErrConsumed (ParseErr.expected-at &c @"closing quote"))
644+
(if (Int.> (String.length &err) 0)
645+
(Reply.ErrConsumed (ParseErr.expected-at &c err))
637646
(Reply.OkConsumed (Form.Str (String.from-bytes &bytes)) c))))))))
638647

639648
(private raw-string)

test/carp-reader.carp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,31 @@
193193
b\"" &(Located.str &(first-form "\"a
194194
b\"")) "string with literal newline stays as literal newline")
195195

196+
; --- valid escape sequences in strings ---
197+
(assert-equal t
198+
"\"\\t\""
199+
&(Located.str &(first-form "\"\\t\""))
200+
"round-trip: tab escape")
201+
(assert-equal t
202+
"\"A\""
203+
&(Located.str &(first-form "\"\\x41\""))
204+
"hex escape \\x41 produces A")
205+
(assert-equal t
206+
"\"A\""
207+
&(Located.str &(first-form "\"\\u0041\""))
208+
"unicode escape \\u0041 produces A")
209+
210+
; --- invalid escape sequences in strings ---
211+
(assert-true t (parse-error? "\"\\z\"") "invalid escape \\z errors")
212+
(assert-true t (parse-error? "\"\\q\"") "invalid escape \\q errors")
213+
(assert-true t (parse-error? "\"\\0\"") "\\0 is not a valid escape")
214+
(assert-true t (parse-error? "\"\\x\"") "\\x without hex digits errors")
215+
(assert-true t (parse-error? "\"\\xZZ\"") "\\x with non-hex chars errors")
216+
(assert-true t (parse-error? "\"\\uGHIJ\"") "\\u with non-hex chars errors")
217+
(assert-true t
218+
(parse-error? "\"\\U0000GHIJ\"")
219+
"\\U with non-hex chars errors")
220+
196221
; --- error cases ---
197222
(assert-true t (parse-error? "(") "unbalanced open paren errors")
198223
(assert-true t (parse-error? "\"unterminated") "unterminated string errors"))

0 commit comments

Comments
 (0)