Skip to content

Commit 05127d9

Browse files
stormpat and ivov authored
feat: multi-line string literals (#208)
Co-authored-by: Iván Ovejero <ivov.src@gmail.com>
1 parent 30c05ea commit 05127d9

48 files changed

Lines changed: 1648 additions & 254 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

crates/emit/src/expressions/literals.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,15 +137,16 @@ impl Emitter<'_> {
137137
}
138138

139139
pub(crate) fn emit_raw_string(value: &str) -> String {
140-
// Go discards `\r` from backtick raw strings, so we must fall back to
141-
// double-quoted form when the content contains CR.
140+
// Go backtick raw strings cannot contain backticks, and Go discards `\r`
141+
// from them, so fall back to double-quoted form in either case.
142142
if !value.contains('`') && !value.contains('\r') {
143143
format!("`{}`", value)
144144
} else {
145145
let escaped = value
146146
.replace('\\', "\\\\")
147147
.replace('"', "\\\"")
148-
.replace('\r', "\\r");
148+
.replace('\r', "\\r")
149+
.replace('\n', "\\n");
149150
format!("\"{}\"", escaped)
150151
}
151152
}
@@ -188,6 +189,10 @@ pub(crate) fn convert_escape_sequences(s: &str) -> String {
188189
} else {
189190
result.push(c);
190191
}
192+
} else if c == '\n' {
193+
result.push_str("\\n");
194+
} else if c == '\r' {
195+
result.push_str("\\r");
191196
} else {
192197
result.push(c);
193198
}

crates/format/src/formatter.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,13 @@ impl<'a> Formatter<'a> {
534534
}
535535
}
536536
Literal::Boolean(b) => Document::str(if *b { "true" } else { "false" }),
537+
Literal::String { value, raw: true } if value.contains('\n') => {
538+
Document::verbatim(format!("r\"{value}\""))
539+
}
537540
Literal::String { value, raw: true } => Document::string(format!("r\"{value}\"")),
541+
Literal::String { value, raw: false } if value.contains('\n') => {
542+
Document::verbatim(format!("\"{value}\""))
543+
}
538544
Literal::String { value, raw: false } => Document::string(format!("\"{value}\"")),
539545
Literal::Char(c) => Document::string(format!("'{c}'")),
540546
Literal::Slice(elements) => self.slice(elements),
@@ -564,6 +570,9 @@ impl<'a> Formatter<'a> {
564570

565571
for part in parts {
566572
match part {
573+
FormatStringPart::Text(s) if s.contains('\n') => {
574+
docs.push(Document::verbatim(s.clone()))
575+
}
567576
FormatStringPart::Text(s) => docs.push(Document::string(s.clone())),
568577
FormatStringPart::Expression(e) => {
569578
docs.push(Document::str("{"));

crates/format/src/lindig.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ fn fits(
4949
current_width += s.graphemes(true).count() as isize;
5050
}
5151

52+
Document::VerbatimText(s) => {
53+
if s.contains('\n') {
54+
return false;
55+
}
56+
current_width += s.graphemes(true).count() as isize;
57+
}
58+
5259
Document::StrictBreak { unbroken, .. } | Document::FlexBreak { unbroken, .. } => {
5360
match mode {
5461
Mode::Broken | Mode::ForcedBroken => return true,
@@ -149,6 +156,23 @@ fn format(
149156
output.push_str(s);
150157
}
151158

159+
Document::VerbatimText(s) => {
160+
if pending_indent >= 0 {
161+
write_indent(output, pending_indent);
162+
pending_indent = -1;
163+
}
164+
let mut segments = s.split('\n');
165+
if let Some(first) = segments.next() {
166+
output.push_str(first);
167+
width += first.graphemes(true).count() as isize;
168+
}
169+
for segment in segments {
170+
output.push('\n');
171+
output.push_str(segment);
172+
width = segment.graphemes(true).count() as isize;
173+
}
174+
}
175+
152176
Document::Sequence(vec) => {
153177
for doc in vec.iter().rev() {
154178
docs.push((indent, mode, doc));
@@ -195,6 +219,7 @@ pub enum Document<'a> {
195219
NestIfBroken(isize, Box<Self>),
196220
Group(Box<Self>),
197221
Text(Cow<'a, str>),
222+
VerbatimText(Cow<'a, str>),
198223
}
199224

200225
impl<'a> Document<'a> {
@@ -206,6 +231,10 @@ impl<'a> Document<'a> {
206231
Document::Text(Cow::Owned(string))
207232
}
208233

234+
pub fn verbatim(string: String) -> Self {
235+
Document::VerbatimText(Cow::Owned(string))
236+
}
237+
209238
pub fn group(self) -> Self {
210239
Self::Group(Box::new(self))
211240
}

crates/syntax/src/lex/mod.rs

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -873,8 +873,6 @@ impl<'source> Lexer<'source> {
873873
terminated = true;
874874
self.next();
875875
break;
876-
} else if byte == b'\n' {
877-
break; // unterminated string literal across newline, handled below
878876
}
879877

880878
self.next();
@@ -888,7 +886,7 @@ impl<'source> Lexer<'source> {
888886
}
889887

890888
if !terminated {
891-
self.error_unterminated_string(start_offset, length);
889+
self.error_unterminated_string(start_offset, 1);
892890
}
893891

894892
Token {
@@ -911,8 +909,6 @@ impl<'source> Lexer<'source> {
911909
terminated = true;
912910
self.next();
913911
break;
914-
} else if byte == b'\n' {
915-
break;
916912
} else if byte == 0 {
917913
self.error_disallowed_byte_in_raw_string(self.current_offset, byte);
918914
self.next();
@@ -925,7 +921,7 @@ impl<'source> Lexer<'source> {
925921
let length = end_offset - start_offset;
926922

927923
if !terminated {
928-
self.error_unterminated_raw_string(start_offset, length);
924+
self.error_unterminated_raw_string(start_offset, 2);
929925
}
930926

931927
Token {
@@ -1163,14 +1159,49 @@ impl<'source> Lexer<'source> {
11631159
None
11641160
}
11651161

1162+
// Caller has just consumed `{` of the broken interpolation, so we start
1163+
// inside it (depth=1). Newlines are not a recovery boundary now that
1164+
// f-string text spans them, so we balance braces and skip past quoted
1165+
// strings to avoid stopping at the first inner `"`.
11661166
fn skip_to_format_string_end(&mut self) {
1167+
let mut depth = 1;
11671168
while !self.at_eof() {
11681169
match self.current_byte() {
1169-
b'"' => {
1170+
b'\\' => {
1171+
self.next();
1172+
if !self.at_eof() {
1173+
self.next();
1174+
}
1175+
}
1176+
b'"' if depth == 0 => {
11701177
self.next();
11711178
return;
11721179
}
1173-
b'\n' => return,
1180+
b'"' => {
1181+
self.next();
1182+
while !self.at_eof() && self.current_byte() != b'"' {
1183+
if self.current_byte() == b'\\' {
1184+
self.next();
1185+
if self.at_eof() {
1186+
break;
1187+
}
1188+
}
1189+
self.next();
1190+
}
1191+
if !self.at_eof() {
1192+
self.next();
1193+
}
1194+
}
1195+
b'{' => {
1196+
depth += 1;
1197+
self.next();
1198+
}
1199+
b'}' => {
1200+
if depth > 0 {
1201+
depth -= 1;
1202+
}
1203+
self.next();
1204+
}
11741205
_ => self.next(),
11751206
}
11761207
}
@@ -1234,12 +1265,6 @@ impl<'source> Lexer<'source> {
12341265
return tokens;
12351266
}
12361267

1237-
b'\n' => {
1238-
let length = self.current_offset.saturating_sub(start_offset);
1239-
self.error_unterminated_format_string(start_offset, length);
1240-
return tokens;
1241-
}
1242-
12431268
b'{' => {
12441269
self.push_format_string_text_if_needed(&mut tokens, text_segment_start);
12451270

@@ -1258,8 +1283,7 @@ impl<'source> Lexer<'source> {
12581283
}
12591284
}
12601285

1261-
let length = self.current_offset.saturating_sub(start_offset);
1262-
self.error_unterminated_format_string(start_offset, length);
1286+
self.error_unterminated_format_string(start_offset, 2);
12631287
tokens
12641288
}
12651289

crates/syntax/src/parse/expressions.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use ecow::EcoString;
22

3+
use super::strings::cook_string_contents;
34
use super::{MAX_TUPLE_ARITY, ParseError, Parser};
45
use crate::ast::{
56
Annotation, Attribute, BinaryOperator, Binding, Expression, FormatStringPart, ImportAlias,
@@ -182,7 +183,7 @@ impl<'source> Parser<'source> {
182183
s
183184
};
184185
Literal::String {
185-
value: s_stripped.to_string(),
186+
value: cook_string_contents(s_stripped),
186187
raw: false,
187188
}
188189
}
@@ -203,7 +204,7 @@ impl<'source> Parser<'source> {
203204
s
204205
};
205206
Literal::String {
206-
value: s_stripped.to_string(),
207+
value: cook_string_contents(s_stripped),
207208
raw: true,
208209
}
209210
}
@@ -1067,7 +1068,7 @@ impl<'source> Parser<'source> {
10671068
FormatStringText => {
10681069
let text = self.current_token().text;
10691070
self.next();
1070-
parts.push(FormatStringPart::Text(text.to_string()));
1071+
parts.push(FormatStringPart::Text(cook_string_contents(text)));
10711072
}
10721073
FormatStringInterpolationStart => {
10731074
self.ensure(FormatStringInterpolationStart);

crates/syntax/src/parse/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ mod expressions;
1919
mod identifiers;
2020
mod patterns;
2121
mod pratt;
22+
mod strings;
2223

2324
pub use error::ParseError;
2425

crates/syntax/src/parse/patterns.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use ecow::EcoString;
22

3+
use super::strings::cook_string_contents;
34
use super::{MAX_TUPLE_ARITY, ParseError, Parser};
45
use crate::ast::{Annotation, Binding, Literal, Pattern, RestPattern, Span, StructFieldPattern};
56
use crate::lex::Token;
@@ -243,20 +244,20 @@ impl<'source> Parser<'source> {
243244
self.next();
244245
let (value, raw) = if kind == crate::lex::TokenKind::RawString {
245246
let stripped = if s.len() >= 3 && s.starts_with("r\"") && s.ends_with('"') {
246-
s[2..s.len() - 1].to_string()
247+
&s[2..s.len() - 1]
247248
} else if s.len() >= 2 && s.starts_with("r\"") {
248-
s[2..].to_string()
249+
&s[2..]
249250
} else {
250-
s.to_string()
251+
s
251252
};
252-
(stripped, true)
253+
(cook_string_contents(stripped), true)
253254
} else {
254255
let stripped = if s.len() >= 2 && s.starts_with('"') && s.ends_with('"') {
255-
s[1..s.len() - 1].to_string()
256+
&s[1..s.len() - 1]
256257
} else {
257-
s.to_string()
258+
s
258259
};
259-
(stripped, false)
260+
(cook_string_contents(stripped), false)
260261
};
261262

262263
Pattern::Literal {

crates/syntax/src/parse/strings.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/// Normalise CRLF line endings to LF; pass everything else through unchanged.
///
/// Only the exact two-byte sequence `\r\n` is rewritten (to a single `\n`).
/// A lone `\r` that is not immediately followed by `\n` is preserved, and
/// escape sequences spelled out in the source text (for example a backslash
/// followed by `n`) are not interpreted.
///
/// Returns a freshly allocated `String` holding the normalised contents.
pub(crate) fn cook_string_contents(content: &str) -> String {
    // `str::replace` walks non-overlapping matches left to right and always
    // yields valid UTF-8, so no manual byte copying or `unsafe` UTF-8
    // reconstruction is needed to get this behaviour.
    content.replace("\r\n", "\n")
}
29+
30+
#[cfg(test)]
31+
mod tests {
32+
use super::*;
33+
34+
#[test]
35+
fn preserves_single_line() {
36+
assert_eq!(cook_string_contents("hello"), "hello");
37+
}
38+
39+
#[test]
40+
fn preserves_embedded_newline() {
41+
assert_eq!(cook_string_contents("a\nb"), "a\nb");
42+
}
43+
44+
#[test]
45+
fn normalises_crlf_to_lf() {
46+
assert_eq!(cook_string_contents("a\r\nb"), "a\nb");
47+
}
48+
49+
#[test]
50+
fn preserves_lone_cr() {
51+
assert_eq!(cook_string_contents("a\rb"), "a\rb");
52+
}
53+
54+
#[test]
55+
fn passes_other_escapes_through() {
56+
assert_eq!(cook_string_contents("a\\nb"), "a\\nb");
57+
assert_eq!(cook_string_contents("a\\\\b"), "a\\\\b");
58+
}
59+
60+
#[test]
61+
fn preserves_multibyte_utf8() {
62+
assert_eq!(cook_string_contents("héllo"), "héllo");
63+
assert_eq!(cook_string_contents("a\r\né"), "a\né");
64+
}
65+
}

0 commit comments

Comments
 (0)