Skip to content

Commit 2aa7477

Browse files
max-sixtyclaude
andauthored
fix: Prevent DISTINCT ON internal sorting from leaking past joins (#5633)
Co-authored-by: Claude <[email protected]>
1 parent 377a1f3 commit 2aa7477

File tree

3 files changed

+170
-9
lines changed

3 files changed

+170
-9
lines changed

prqlc/prqlc/src/lib.rs

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,4 +614,117 @@ mod tests {
614614
.map(|name| Target::from_str(&name))
615615
.collect();
616616
}
617+
618+
/// Regression test for #4633: sort inside group should not leak to outer query after join.
619+
///
620+
/// Per PRQL spec, `group` resets the order. The `sort` inside a group is for
621+
/// row selection (which row to keep), not output ordering. After the group,
622+
/// there is no defined order, so it should not appear in the outer query.
623+
#[test]
624+
fn test_sort_not_propagated_after_join() {
625+
use insta::assert_snapshot;
626+
627+
// DISTINCT ON (postgres) uses the sort for row selection within the CTE.
628+
// This internal sorting must not leak to the outer query after a join.
629+
assert_snapshot!(
630+
super::compile(
631+
r#"
632+
prql target:sql.postgres
633+
634+
from tracks
635+
group media_type_id (
636+
sort name
637+
take 1
638+
)
639+
join media_types (== media_type_id)
640+
select {
641+
tracks.track_id,
642+
media_types.name
643+
}
644+
"#,
645+
&super::Options::default().no_signature()
646+
).unwrap(),
647+
@"
648+
WITH table_0 AS (
649+
SELECT
650+
DISTINCT ON (media_type_id) track_id,
651+
media_type_id,
652+
name
653+
FROM
654+
tracks
655+
ORDER BY
656+
media_type_id,
657+
name
658+
)
659+
SELECT
660+
table_0.track_id,
661+
media_types.name
662+
FROM
663+
table_0
664+
INNER JOIN media_types ON table_0.media_type_id = media_types.media_type_id
665+
"
666+
);
667+
}
668+
669+
/// Verify that explicit sorts after group are preserved past joins.
670+
///
671+
/// Per PRQL spec, `sort` introduces a new order. When the user explicitly
672+
/// sorts AFTER a group, that becomes the new output order and should
673+
/// propagate through subsequent transforms including joins.
674+
#[test]
675+
fn test_explicit_sort_after_distinct_on_preserved() {
676+
use insta::assert_snapshot;
677+
678+
// Explicit `sort media_type_id` after the group introduces a new order.
679+
// This user-requested ordering should propagate past the join.
680+
assert_snapshot!(
681+
super::compile(
682+
r#"
683+
prql target:sql.postgres
684+
685+
from tracks
686+
group media_type_id (
687+
sort name
688+
take 1
689+
)
690+
sort media_type_id
691+
join media_types (== media_type_id)
692+
select {
693+
tracks.track_id,
694+
media_types.name
695+
}
696+
"#,
697+
&super::Options::default().no_signature()
698+
).unwrap(),
699+
@"
700+
WITH table_0 AS (
701+
SELECT
702+
DISTINCT ON (media_type_id) track_id,
703+
media_type_id,
704+
name
705+
FROM
706+
tracks
707+
ORDER BY
708+
media_type_id,
709+
name
710+
),
711+
table_1 AS (
712+
SELECT
713+
table_0.track_id,
714+
media_types.name,
715+
table_0.media_type_id
716+
FROM
717+
table_0
718+
INNER JOIN media_types ON table_0.media_type_id = media_types.media_type_id
719+
)
720+
SELECT
721+
track_id,
722+
name
723+
FROM
724+
table_1
725+
ORDER BY
726+
media_type_id
727+
"
728+
);
729+
}
617730
}

prqlc/prqlc/src/sql/pq/postprocess.rs

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ pub(super) fn postprocess(query: SqlQuery, ctx: &mut Context) -> SqlQuery {
2727
fn infer_sorts(query: SqlQuery, ctx: &mut Context) -> SqlQuery {
2828
let mut s = SortingInference {
2929
last_sorting: Vec::new(),
30+
last_sorting_from_distinct_on: false,
3031
ctes_sorting: HashMap::new(),
3132
main_relation: false,
3233
ctx,
@@ -37,6 +38,12 @@ fn infer_sorts(query: SqlQuery, ctx: &mut Context) -> SqlQuery {
3738

3839
struct SortingInference<'a> {
3940
last_sorting: Sorting,
41+
/// True if last_sorting originated from DISTINCT ON (used for row selection).
42+
///
43+
/// Per the PRQL spec, `group` resets the order - any `sort` inside a group
44+
/// is for internal row selection, not output ordering. This flag tracks such
45+
/// internal sorting so it doesn't propagate past transforms like `join`.
46+
last_sorting_from_distinct_on: bool,
4047
ctes_sorting: HashMap<TId, CteSorting>,
4148
main_relation: bool,
4249
ctx: &'a mut Context,
@@ -148,6 +155,12 @@ impl SortingInference<'_> {
148155
#[derive(Debug)]
149156
struct CteSorting {
150157
sorting: Sorting,
158+
/// True if the CTE's sorting originated from DISTINCT ON (row selection).
159+
///
160+
/// Per the PRQL spec, `group` resets the order. DISTINCT ON sorting is
161+
/// internal to the group - it determines which row to keep, not output
162+
/// ordering. This flag ensures such sorting doesn't leak to outer queries.
163+
from_distinct_on: bool,
151164
}
152165

153166
impl RqFold for SortingInference<'_> {}
@@ -239,7 +252,11 @@ impl PqFold for SortingInference<'_> {
239252
// store sorting to be used later in From references
240253
let sorting = self.last_sorting.drain(..).collect();
241254
log::debug!("--- sorting {sorting:?}");
242-
let sorting = CteSorting { sorting };
255+
let sorting = CteSorting {
256+
sorting,
257+
from_distinct_on: self.last_sorting_from_distinct_on,
258+
};
259+
self.last_sorting_from_distinct_on = false;
243260
self.ctes_sorting.insert(cte.tid, sorting);
244261

245262
ctes.push(cte);
@@ -305,6 +322,9 @@ impl PqMapper<RelationExpr, RelationExpr, (), ()> for SortingInference<'_> {
305322
transforms: Vec<SqlTransform<RelationExpr, ()>>,
306323
) -> Result<Vec<SqlTransform<RelationExpr, ()>>> {
307324
let mut sorting = Vec::new();
325+
// Track whether sorting originated from DISTINCT ON (internal row selection).
326+
// Per PRQL spec, `group` resets order - internal sorts don't define output order.
327+
let mut sorting_from_distinct_on = false;
308328

309329
let mut result = Vec::with_capacity(transforms.len() + 1);
310330

@@ -314,17 +334,20 @@ impl PqMapper<RelationExpr, RelationExpr, (), ()> for SortingInference<'_> {
314334
match expr.kind {
315335
RelationExprKind::Ref(ref tid) => {
316336
// infer sorting from referenced pipeline
317-
if let Some(cte_sorting) = self.ctes_sorting.get_mut(tid) {
337+
if let Some(cte_sorting) = self.ctes_sorting.get(tid) {
318338
sorting.clone_from(&cte_sorting.sorting);
339+
sorting_from_distinct_on = cte_sorting.from_distinct_on;
319340
} else {
320-
sorting = Vec::new();
341+
sorting.clear();
342+
sorting_from_distinct_on = false;
321343
};
322344
}
323345
RelationExprKind::SubQuery(rel) => {
324346
let rel = self.fold_sql_relation(rel)?;
325347

326348
// infer sorting from sub-query
327349
sorting = self.last_sorting.drain(..).collect();
350+
sorting_from_distinct_on = self.last_sorting_from_distinct_on;
328351

329352
expr.kind = RelationExprKind::SubQuery(rel);
330353
}
@@ -337,16 +360,40 @@ impl PqMapper<RelationExpr, RelationExpr, (), ()> for SortingInference<'_> {
337360
// just store sorting and don't emit Sort
338361
SqlTransform::Sort(expr) => {
339362
sorting.clone_from(&expr);
363+
// A new explicit Sort clears the DISTINCT ON flag - this is a
364+
// user-requested ordering, not an internal DISTINCT ON sort.
365+
sorting_from_distinct_on = false;
340366
continue;
341367
}
342368

343369
// clear sorting
344370
SqlTransform::Distinct | SqlTransform::Aggregate { .. } => {
345-
sorting = Vec::new();
371+
sorting.clear();
372+
sorting_from_distinct_on = false;
373+
}
374+
375+
// Per PRQL spec: `group` resets order, `join` retains left's order.
376+
// DISTINCT ON sorting is internal to the group (for row selection),
377+
// so it must not propagate past joins. Explicit user sorts are preserved.
378+
// See issue #4633.
379+
SqlTransform::Join { .. } => {
380+
if sorting_from_distinct_on {
381+
sorting.clear();
382+
sorting_from_distinct_on = false;
383+
}
346384
}
347385

348386
// emit Sort before Take
349-
SqlTransform::Take(_) | SqlTransform::DistinctOn(_) => {
387+
SqlTransform::Take(_) => {
388+
result.push(SqlTransform::Sort(sorting.clone()));
389+
}
390+
391+
SqlTransform::DistinctOn(_) => {
392+
// DISTINCT ON uses sorting for row selection within the group.
393+
// Mark it so this internal sorting doesn't leak to outer queries.
394+
// Note: ROW_NUMBER (used for take > 1 or non-Postgres) doesn't have
395+
// this issue because its sorting is embedded in the window function.
396+
sorting_from_distinct_on = true;
350397
result.push(SqlTransform::Sort(sorting.clone()));
351398
}
352399
_ => {}
@@ -371,6 +418,7 @@ impl PqMapper<RelationExpr, RelationExpr, (), ()> for SortingInference<'_> {
371418

372419
// remember sorting for this pipeline
373420
self.last_sorting = sorting;
421+
self.last_sorting_from_distinct_on = sorting_from_distinct_on;
374422

375423
Ok(result)
376424
}

prqlc/prqlc/tests/integration/sql.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3299,7 +3299,7 @@ fn test_prql_to_sql_table() {
32993299
"#;
33003300
let sql = compile(query).unwrap();
33013301
assert_snapshot!(sql,
3302-
@r"
3302+
@"
33033303
WITH newest_employees AS (
33043304
SELECT
33053305
*
@@ -6338,7 +6338,7 @@ fn test_sort_cast_filter_join_select() {
63386338
join side:left artists (this.`artist_id` == that.`artist_id`)
63396339
select {this.`artist_id`, this.`title`, this.`name`}
63406340
"###
6341-
).unwrap(), @r"
6341+
).unwrap(), @"
63426342
WITH table_1 AS (
63436343
SELECT
63446344
CAST(artist_id AS double precision) AS artist_id,
@@ -6394,7 +6394,7 @@ fn test_sort_filter_derive_join_select() {
63946394
) (this.`artist_id` == that.`artist_id_right`)
63956395
select {this.`artist_id`, this.`title`}
63966396
"###
6397-
).unwrap(), @r"
6397+
).unwrap(), @"
63986398
WITH table_2 AS (
63996399
SELECT
64006400
CAST(artist_id AS double precision) AS artist_id,
@@ -6442,7 +6442,7 @@ fn test_sort_cast_filter_join_select_with_alias() {
64426442
join side:left artists (this.`double_artist_id` == that.`artist_id`)
64436443
select {this.`double_artist_id`, this.`title`, this.`name`}
64446444
"###
6445-
).unwrap(), @r"
6445+
).unwrap(), @"
64466446
WITH table_1 AS (
64476447
SELECT
64486448
CAST(artist_id AS double precision) AS double_artist_id,

0 commit comments

Comments
 (0)