Skip to content

Commit 063159d

Browse files
committed
feat(fmt): An attempt at aesthetic items into PL
This is an attempt to get around the complications of managing lexer + parser output -- which PRQL#4397 has hit in a few incarnations -- by just adding comments ('aesthetics') to PL. This very, very nearly works -- with chumsky we can create a function that wraps anything that might have a comment, implement a trait on the AST items that contain it, and away we go (though it did require a lot of debugging in the end). This would then be really easy to write back out. I think there's literally a single case where it doesn't work -- where a comment doesn't come directly before or directly after an AST item -- in the final trailing comma of a tuple or array. So tests fail at the moment. Next we need to consider: - Can we work around that one case? We don't actually care about whether there's a trailing comma, so we could likely hack around it... - Are there actually other cases of this model failing? I know this approach -- of putting aesthetic items into AST -- is not generally favored, and it's really rare that there's even a single case of something not working.
1 parent a8d29b8 commit 063159d

33 files changed

+370
-62
lines changed

prqlc/prqlc-ast/src/expr.rs

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,15 +11,17 @@ pub use self::ident::Ident;
1111
pub use self::ops::{BinOp, UnOp};
1212
pub use self::token::{Literal, ValueAndUnit};
1313
use super::token;
14-
use crate::span::Span;
15-
use crate::Ty;
14+
use crate::{span::Span, WithAesthetics};
15+
use crate::{TokenKind, Ty};
1616

1717
impl Expr {
1818
pub fn new<K: Into<ExprKind>>(kind: K) -> Self {
1919
Expr {
2020
kind: kind.into(),
2121
span: None,
2222
alias: None,
23+
aesthetics_before: Vec::new(),
24+
aesthetics_after: Vec::new(),
2325
}
2426
}
2527
}
@@ -38,6 +40,24 @@ pub struct Expr {
3840

3941
#[serde(skip_serializing_if = "Option::is_none")]
4042
pub alias: Option<String>,
43+
44+
// Maybe should be Token?
45+
#[serde(skip_serializing_if = "Vec::is_empty")]
46+
pub aesthetics_before: Vec<TokenKind>,
47+
#[serde(skip_serializing_if = "Vec::is_empty")]
48+
pub aesthetics_after: Vec<TokenKind>,
49+
}
50+
51+
impl WithAesthetics for Expr {
52+
fn with_aesthetics(
53+
mut self,
54+
aesthetics_before: Vec<TokenKind>,
55+
aesthetics_after: Vec<TokenKind>,
56+
) -> Self {
57+
self.aesthetics_before = aesthetics_before;
58+
self.aesthetics_after = aesthetics_after;
59+
self
60+
}
4161
}
4262

4363
#[derive(Debug, EnumAsInner, PartialEq, Clone, Serialize, Deserialize, strum::AsRefStr)]

prqlc/prqlc-ast/src/lib.rs

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,3 +9,11 @@ pub use span::*;
99
pub use stmt::*;
1010
pub use token::*;
1111
pub use types::*;
12+
13+
pub trait WithAesthetics {
14+
fn with_aesthetics(
15+
self,
16+
aesthetics_before: Vec<TokenKind>,
17+
aesthetics_after: Vec<TokenKind>,
18+
) -> Self;
19+
}

prqlc/prqlc-ast/src/stmt.rs

Lines changed: 41 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ use enum_as_inner::EnumAsInner;
44
use semver::VersionReq;
55
use serde::{Deserialize, Serialize};
66

7-
use crate::{expr::Expr, Ident, Span, Ty};
7+
use crate::{expr::Expr, Ident, Span, TokenKind, Ty, WithAesthetics};
88

99
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
1010
pub struct QueryDef {
@@ -31,6 +31,26 @@ pub struct Stmt {
3131

3232
#[serde(skip_serializing_if = "Vec::is_empty", default)]
3333
pub annotations: Vec<Annotation>,
34+
35+
// Maybe should be Token?
36+
#[serde(skip_serializing_if = "Vec::is_empty")]
37+
pub aesthetics_before: Vec<TokenKind>,
38+
#[serde(skip_serializing_if = "Vec::is_empty")]
39+
pub aesthetics_after: Vec<TokenKind>,
40+
}
41+
42+
impl WithAesthetics for Stmt {
43+
fn with_aesthetics(
44+
self,
45+
aesthetics_before: Vec<TokenKind>,
46+
aesthetics_after: Vec<TokenKind>,
47+
) -> Self {
48+
Stmt {
49+
aesthetics_before,
50+
aesthetics_after,
51+
..self
52+
}
53+
}
3454
}
3555

3656
#[derive(Debug, EnumAsInner, PartialEq, Clone, Serialize, Deserialize)]
@@ -73,6 +93,24 @@ pub struct ImportDef {
7393
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
7494
pub struct Annotation {
7595
pub expr: Box<Expr>,
96+
#[serde(skip_serializing_if = "Vec::is_empty")]
97+
pub aesthetics_before: Vec<TokenKind>,
98+
#[serde(skip_serializing_if = "Vec::is_empty")]
99+
pub aesthetics_after: Vec<TokenKind>,
100+
}
101+
102+
impl WithAesthetics for Annotation {
103+
fn with_aesthetics(
104+
self,
105+
aesthetics_before: Vec<TokenKind>,
106+
aesthetics_after: Vec<TokenKind>,
107+
) -> Self {
108+
Annotation {
109+
aesthetics_before,
110+
aesthetics_after,
111+
..self
112+
}
113+
}
76114
}
77115

78116
impl Stmt {
@@ -81,6 +119,8 @@ impl Stmt {
81119
kind,
82120
span: None,
83121
annotations: Vec::new(),
122+
aesthetics_before: Vec::new(),
123+
aesthetics_after: Vec::new(),
84124
}
85125
}
86126
}

prqlc/prqlc-parser/src/expr.rs

Lines changed: 24 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ use super::interpolation;
1010
use crate::err::parse_error::PError;
1111
use crate::types::type_expr;
1212

13-
pub fn expr_call() -> impl Parser<TokenKind, Expr, Error = PError> {
13+
pub fn expr_call() -> impl Parser<TokenKind, Expr, Error = PError> + Clone {
1414
let expr = expr();
1515

1616
lambda_func(expr.clone()).or(func_call(expr))
@@ -27,7 +27,9 @@ pub fn expr() -> impl Parser<TokenKind, Expr, Error = PError> + Clone {
2727
.map(|x| x.to_string())
2828
.map(ExprKind::Internal);
2929

30-
let nested_expr = pipeline(lambda_func(expr.clone()).or(func_call(expr.clone()))).boxed();
30+
let nested_expr = with_aesthetics(
31+
pipeline(lambda_func(expr.clone()).or(func_call(expr.clone()))).boxed(),
32+
);
3133

3234
let tuple = ident_part()
3335
.then_ignore(ctrl('='))
@@ -122,18 +124,20 @@ pub fn expr() -> impl Parser<TokenKind, Expr, Error = PError> + Clone {
122124

123125
let param = select! { TokenKind::Param(id) => ExprKind::Param(id) };
124126

125-
let term = choice((
126-
literal,
127-
internal,
128-
tuple,
129-
array,
130-
interpolation,
131-
ident_kind,
132-
case,
133-
param,
134-
))
135-
.map_with_span(into_expr)
136-
.or(pipeline)
127+
let term = with_aesthetics(
128+
choice((
129+
literal,
130+
internal,
131+
tuple,
132+
array,
133+
interpolation,
134+
ident_kind,
135+
case,
136+
param,
137+
))
138+
.map_with_span(into_expr)
139+
.or(pipeline),
140+
)
137141
.boxed();
138142

139143
// indirections
@@ -229,9 +233,9 @@ pub fn expr() -> impl Parser<TokenKind, Expr, Error = PError> + Clone {
229233
})
230234
}
231235

232-
pub fn pipeline<E>(expr: E) -> impl Parser<TokenKind, Expr, Error = PError>
236+
pub fn pipeline<E>(expr: E) -> impl Parser<TokenKind, Expr, Error = PError> + Clone
233237
where
234-
E: Parser<TokenKind, Expr, Error = PError>,
238+
E: Parser<TokenKind, Expr, Error = PError> + Clone,
235239
{
236240
// expr has to be a param, because it can be either a normal expr() or
237241
// a recursive expr called from within expr()
@@ -264,7 +268,7 @@ where
264268
pub fn binary_op_parser<'a, Term, Op>(
265269
term: Term,
266270
op: Op,
267-
) -> impl Parser<TokenKind, Expr, Error = PError> + 'a
271+
) -> impl Parser<TokenKind, Expr, Error = PError> + 'a + Clone
268272
where
269273
Term: Parser<TokenKind, Expr, Error = PError> + 'a,
270274
Op: Parser<TokenKind, BinOp, Error = PError> + 'a,
@@ -290,7 +294,7 @@ where
290294
.boxed()
291295
}
292296

293-
fn func_call<E>(expr: E) -> impl Parser<TokenKind, Expr, Error = PError>
297+
fn func_call<E>(expr: E) -> impl Parser<TokenKind, Expr, Error = PError> + Clone
294298
where
295299
E: Parser<TokenKind, Expr, Error = PError> + Clone,
296300
{
@@ -342,7 +346,7 @@ where
342346
.labelled("function call")
343347
}
344348

345-
fn lambda_func<E>(expr: E) -> impl Parser<TokenKind, Expr, Error = PError>
349+
fn lambda_func<E>(expr: E) -> impl Parser<TokenKind, Expr, Error = PError> + Clone
346350
where
347351
E: Parser<TokenKind, Expr, Error = PError> + Clone + 'static,
348352
{
@@ -402,7 +406,7 @@ where
402406
.labelled("function definition")
403407
}
404408

405-
pub fn ident() -> impl Parser<TokenKind, Ident, Error = PError> {
409+
pub fn ident() -> impl Parser<TokenKind, Ident, Error = PError> + Clone {
406410
ident_part()
407411
.separated_by(ctrl('.'))
408412
.at_least(1)

prqlc/prqlc-parser/src/interpolation.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,8 @@ fn parse_interpolate() {
9999
0:8-9,
100100
),
101101
alias: None,
102+
aesthetics_before: [],
103+
aesthetics_after: [],
102104
},
103105
format: None,
104106
},
@@ -144,6 +146,8 @@ fn parse_interpolate() {
144146
0:14-15,
145147
),
146148
alias: None,
149+
aesthetics_before: [],
150+
aesthetics_after: [],
147151
},
148152
format: None,
149153
},

prqlc/prqlc-parser/src/lib.rs

Lines changed: 47 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ pub fn parse_source(source: &str, source_id: u16) -> Result<Vec<Stmt>, Vec<Error
2323
let mut errors = Vec::new();
2424

2525
let (tokens, lex_errors) = ::chumsky::Parser::parse_recovery(&lexer::lexer(), source);
26+
// let (tokens, lex_errors) = ::chumsky::Parser::parse_recovery_verbose(&lexer::lexer(), source);
2627

2728
log::debug!("Lex errors: {:?}", lex_errors);
2829
errors.extend(
@@ -31,21 +32,12 @@ pub fn parse_source(source: &str, source_id: u16) -> Result<Vec<Stmt>, Vec<Error
3132
.map(|e| convert_lexer_error(source, e, source_id)),
3233
);
3334

34-
// We don't want comments in the AST (but we do intend to use them as part of
35-
// formatting)
36-
let semantic_tokens: Option<_> = tokens.map(|tokens| {
37-
tokens.into_iter().filter(|token| {
38-
!matches!(
39-
token.kind,
40-
TokenKind::Comment(_) | TokenKind::LineWrap(_) | TokenKind::DocComment(_)
41-
)
42-
})
43-
});
44-
45-
let ast = if let Some(semantic_tokens) = semantic_tokens {
46-
let stream = prepare_stream(semantic_tokens, source, source_id);
35+
let ast = if let Some(tokens) = tokens {
36+
let stream = prepare_stream(tokens.into_iter(), source, source_id);
4737

48-
let (ast, parse_errors) = ::chumsky::Parser::parse_recovery(&stmt::source(), stream);
38+
let (ast, parse_errors) =
39+
// ::chumsky::Parser::parse_recovery_verbose(&stmt::source(), stream);
40+
::chumsky::Parser::parse_recovery(&stmt::source(), stream);
4941

5042
log::debug!("parse errors: {:?}", parse_errors);
5143
errors.extend(parse_errors.into_iter().map(|e| e.into()));
@@ -72,16 +64,16 @@ pub fn lex_source(source: &str) -> Result<TokenVec, Vec<Error>> {
7264

7365
mod common {
7466
use chumsky::prelude::*;
75-
use prqlc_ast::expr::*;
7667
use prqlc_ast::stmt::*;
7768
use prqlc_ast::token::*;
7869
use prqlc_ast::Span;
7970
use prqlc_ast::Ty;
8071
use prqlc_ast::TyKind;
72+
use prqlc_ast::{expr::*, WithAesthetics};
8173

8274
use crate::err::parse_error::PError;
8375

84-
pub fn ident_part() -> impl Parser<TokenKind, String, Error = PError> {
76+
pub fn ident_part() -> impl Parser<TokenKind, String, Error = PError> + Clone {
8577
return select! {
8678
TokenKind::Ident(ident) => ident,
8779
TokenKind::Keyword(ident) if &ident == "module" => ident,
@@ -112,6 +104,8 @@ mod common {
112104
kind,
113105
span: Some(span),
114106
annotations,
107+
aesthetics_before: Vec::new(),
108+
aesthetics_after: Vec::new(),
115109
}
116110
}
117111

@@ -128,6 +122,43 @@ mod common {
128122
..Ty::new(kind)
129123
}
130124
}
125+
126+
pub fn aesthetic() -> impl Parser<TokenKind, TokenKind, Error = PError> + Clone {
127+
select! {
128+
TokenKind::Comment(comment) => TokenKind::Comment(comment),
129+
TokenKind::LineWrap(lw) => TokenKind::LineWrap(lw),
130+
TokenKind::DocComment(dc) => TokenKind::DocComment(dc),
131+
}
132+
}
133+
134+
pub fn with_aesthetics<P, O>(parser: P) -> impl Parser<TokenKind, O, Error = PError> + Clone
135+
where
136+
P: Parser<TokenKind, O, Error = PError> + Clone,
137+
O: WithAesthetics,
138+
{
139+
// We can have newlines between the aesthetics and the actual token to
140+
// cover a case like `# foo` here:
141+
//
142+
// ```prql
143+
// # foo
144+
//
145+
// from bar
146+
// # baz
147+
// select artists
148+
// ```
149+
//
150+
// ...but not between the token and its trailing aesthetics, since we
151+
// don't want to eat the newline after `from bar`
152+
//
153+
let aesthetics_before = aesthetic().then_ignore(new_line().repeated()).repeated();
154+
let aesthetics_after = aesthetic().separated_by(new_line());
155+
156+
aesthetics_before.then(parser).then(aesthetics_after).map(
157+
|((aesthetics_before, inner), aesthetics_after)| {
158+
inner.with_aesthetics(aesthetics_before, aesthetics_after)
159+
},
160+
)
161+
}
131162
}
132163

133164
/// Convert the output of the lexer into the input of the parser. Requires

prqlc/prqlc-parser/src/snapshots/prqlc_parser__test__pipeline_parse_tree.snap

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@ expression: "parse_single(r#\"\nfrom employees\nfilter country == \"USA\"
2424
right:
2525
Literal:
2626
String: USA
27+
aesthetics_after:
28+
- Comment: " Each line transforms the previous result."
2729
- FuncCall:
2830
name:
2931
Ident: derive
@@ -36,12 +38,16 @@ expression: "parse_single(r#\"\nfrom employees\nfilter country == \"USA\"
3638
right:
3739
Ident: payroll_tax
3840
alias: gross_salary
41+
aesthetics_before:
42+
- Comment: " This adds columns / variables."
3943
- Binary:
4044
left:
4145
Ident: gross_salary
4246
op: Add
4347
right:
4448
Ident: benefits_cost
49+
aesthetics_after:
50+
- Comment: " Variables can use other variables."
4551
alias: gross_cost
4652
- FuncCall:
4753
name:
@@ -71,6 +77,8 @@ expression: "parse_single(r#\"\nfrom employees\nfilter country == \"USA\"
7177
Ident: average
7278
args:
7379
- Ident: salary
80+
aesthetics_before:
81+
- Comment: " Aggregate each group to a single row"
7482
- FuncCall:
7583
name:
7684
Ident: average

0 commit comments

Comments
 (0)