Skip to content

Commit 622e651

Browse files
committed
feat: promote qualify metadata from subquery-based dedup WHERE pattern (Gap 2)
Detect the common dedup pattern (ROW_NUMBER() OVER (...) AS rn in a subquery plus WHERE rn = 1 in the outer query) and promote it to qualify_info on the outer unit. Supports =, <=, and < comparisons between a ranking-function alias (ROW_NUMBER, RANK, DENSE_RANK, NTILE) and a literal. Adds ranking_window_columns to the QueryUnit model for cross-unit metadata propagation.
1 parent 67859c7 commit 622e651

3 files changed

Lines changed: 256 additions & 0 deletions

File tree

src/clgraph/models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,11 @@ class QueryUnit:
297297
# Stores info about WHERE clause columns for filter lineage edges
298298
where_predicates: List["WherePredicateInfo"] = field(default_factory=list)
299299

300+
# Ranking window columns for dedup qualify promotion (Gap 2)
301+
# Maps alias -> {function, partition_by, order_by} for ranking functions
302+
# Example: {'rn': {'function': 'ROW_NUMBER', 'partition_by': ['id'], 'order_by': [...]}}
303+
ranking_window_columns: Dict[str, Dict[str, Any]] = field(default_factory=dict)
304+
300305
# Metadata
301306
depth: int = 0 # Nesting depth (0 = main query)
302307
order: int = 0 # Topological order for CTEs

src/clgraph/query_parser.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,9 @@ def _parse_select_unit(
211211
)
212212
)
213213

214+
# 4c. Promote dedup qualify info from WHERE (Gap 2)
215+
self._promote_dedup_qualify_if_applicable(select_node, unit)
216+
214217
# 5. Parse HAVING clause (may contain subqueries)
215218
having_clause = select_node.args.get("having")
216219
if having_clause:
@@ -1687,6 +1690,53 @@ def _parse_qualify_clause(self, qualify_node: exp.Qualify, unit: QueryUnit):
16871690
"window_functions": window_functions,
16881691
}
16891692

1693+
def _promote_dedup_qualify_if_applicable(self, select_node: exp.Select, unit: QueryUnit):
    """
    Promote dedup qualify info from a subquery-based WHERE pattern (Gap 2).

    Detects the common dedup pattern:
        SELECT ... FROM (SELECT *, ROW_NUMBER() OVER (...) AS rn FROM t) WHERE rn = 1
    and promotes it to qualify_info on the outer unit.

    Only ranking functions (ROW_NUMBER, RANK, DENSE_RANK, NTILE) are eligible;
    eligibility is decided by what the dependency unit recorded in its
    ranking_window_columns.

    A predicate is recognized only when it is a top-level, AND-ed conjunct of
    the WHERE clause (optionally parenthesized). Predicates under OR / NOT or
    inside a nested subquery do not restrict the outer rows to the top ranks,
    so they are ignored.

    Recognized shapes (upper-bound / equality filters on the rank):
        <column> =  <literal>     <literal> =  <column>
        <column> <= <literal>     <literal> >= <column>
        <column> <  <literal>     <literal> >  <column>
    The mirrored forms matter: ``1 < rn`` means ``rn > 1`` and is NOT a
    dedup filter, so it must not be promoted.

    NOTE(review): columns are matched by unqualified name only; a table
    qualifier on the WHERE column is not checked against the dependency unit.

    Args:
        select_node: The SELECT expression
        unit: The query unit to potentially add qualify_info to

    Returns:
        None. On a match, unit.qualify_info is set in place; an existing
        qualify_info is never overwritten.
    """
    where_clause = select_node.args.get("where")
    if not where_clause or unit.qualify_info:
        return

    # Operators that bound the rank from above, split by which side holds the column.
    column_left_ops = (exp.EQ, exp.LTE, exp.LT)
    column_right_ops = (exp.EQ, exp.GTE, exp.GT)

    # Flatten the WHERE condition into its top-level AND-ed conjuncts and
    # collect the column names used in eligible comparisons. This is done
    # once, independent of how many dependency units there are.
    candidate_columns = []
    pending = [where_clause.this]
    while pending:
        node = pending.pop()
        if isinstance(node, exp.Paren):
            pending.append(node.this)
        elif isinstance(node, exp.And):
            # Push right first so the left operand is examined first.
            pending.append(node.right)
            pending.append(node.left)
        elif isinstance(node, exp.Binary):
            left, right = node.left, node.right
            col_name = None
            if isinstance(left, exp.Column) and isinstance(right, exp.Literal):
                if isinstance(node, column_left_ops):
                    col_name = left.name
            elif isinstance(right, exp.Column) and isinstance(left, exp.Literal):
                if isinstance(node, column_right_ops):
                    col_name = right.name
            if col_name:
                candidate_columns.append(col_name)

    if not candidate_columns:
        return

    for dep_unit_id in unit.depends_on_units:
        dep_unit = self.unit_graph.units.get(dep_unit_id)
        if not dep_unit or not dep_unit.ranking_window_columns:
            continue

        for col_name in candidate_columns:
            window_meta = dep_unit.ranking_window_columns.get(col_name)
            if window_meta is None:
                continue

            unit.qualify_info = {
                "condition": where_clause.this.sql(),
                "partition_columns": list(window_meta["partition_by"]),
                # order_by entries may be dicts carrying a "column" key or
                # plain values; normalize to the bare column value.
                "order_columns": [
                    c["column"] if isinstance(c, dict) else c
                    for c in window_meta["order_by"]
                ],
                "window_functions": [window_meta["function"]],
                "promoted_from_subquery": True,
            }
            # First match wins; one promoted qualify per unit.
            return
1739+
16901740
def _parse_grouping_sets(self, group_clause: exp.Group, unit: QueryUnit):
16911741
"""
16921742
Parse GROUP BY clause for GROUPING SETS, CUBE, and ROLLUP constructs.
@@ -1842,6 +1892,18 @@ def _parse_window_functions(self, select_node: exp.Select, unit: QueryUnit):
18421892
if windows:
18431893
unit.window_info = {"windows": windows}
18441894

1895+
# Populate ranking_window_columns for dedup qualify promotion (Gap 2)
1896+
RANKING_FUNCTIONS = {"ROW_NUMBER", "RANK", "DENSE_RANK", "NTILE"}
1897+
for window_def in windows:
1898+
func_name = window_def.get("function", "").upper()
1899+
output_col = window_def.get("output_column")
1900+
if func_name in RANKING_FUNCTIONS and output_col:
1901+
unit.ranking_window_columns[output_col] = {
1902+
"function": func_name,
1903+
"partition_by": window_def.get("partition_by", []),
1904+
"order_by": window_def.get("order_by", []),
1905+
}
1906+
18451907
def _parse_single_window(
18461908
self,
18471909
window: exp.Window,
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
"""
2+
Test suite for Gap 2: Subquery-Based Dedup Qualify Promotion.
3+
4+
Tests that the common dedup pattern:
5+
SELECT ... FROM (SELECT *, ROW_NUMBER() OVER (...) AS rn FROM t) WHERE rn = 1
6+
is promoted to qualify metadata on the outer query unit.
7+
"""
8+
9+
import pytest
10+
11+
from clgraph.query_parser import RecursiveQueryParser
12+
13+
# ============================================================================
14+
# Test Group 1: Qualify Promotion from Subquery Dedup Pattern
15+
# ============================================================================
16+
17+
18+
class TestDedupQualifyPromotion:
    """Outer-query WHERE filters on a ranking alias become qualify_info."""

    def test_qualify_promotion_eq(self):
        """An equality filter (rn = 1) on a ROW_NUMBER alias is promoted."""
        sql = """
        SELECT id, name
        FROM (
            SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY ts DESC) AS rn
            FROM t
        )
        WHERE rn = 1
        """
        unit_graph = RecursiveQueryParser(sql, dialect="bigquery").parse()
        outer = unit_graph.units["main"]

        assert outer.qualify_info is not None
        promoted = outer.qualify_info
        assert promoted["promoted_from_subquery"] is True
        assert "ROW_NUMBER" in promoted["window_functions"]
        assert "id" in promoted["partition_columns"]

    def test_qualify_promotion_lte(self):
        """A top-N filter written with <= (rn <= 3) is promoted."""
        sql = """
        SELECT id, name
        FROM (
            SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY ts DESC) AS rn
            FROM t
        )
        WHERE rn <= 3
        """
        unit_graph = RecursiveQueryParser(sql, dialect="bigquery").parse()
        outer = unit_graph.units["main"]

        assert outer.qualify_info is not None
        promoted = outer.qualify_info
        assert promoted["promoted_from_subquery"] is True
        assert "ROW_NUMBER" in promoted["window_functions"]

    def test_qualify_promotion_lt(self):
        """A top-N filter written with < (rn < 3) is promoted."""
        sql = """
        SELECT id, name
        FROM (
            SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY ts DESC) AS rn
            FROM t
        )
        WHERE rn < 3
        """
        unit_graph = RecursiveQueryParser(sql, dialect="bigquery").parse()
        outer = unit_graph.units["main"]

        assert outer.qualify_info is not None
        promoted = outer.qualify_info
        assert promoted["promoted_from_subquery"] is True
        assert "ROW_NUMBER" in promoted["window_functions"]
75+
76+
77+
# ============================================================================
78+
# Test Group 2: Non-Ranking Functions Should NOT Promote
79+
# ============================================================================
80+
81+
82+
class TestNonRankingNotPromoted:
    """Test that non-ranking window functions are not promoted."""

    def test_sum_window_not_promoted(self):
        """SUM() OVER (...) + WHERE total > 100 should NOT produce qualify_info."""
        sql = """
        SELECT id, total
        FROM (
            SELECT id, SUM(amount) OVER (PARTITION BY id) AS total
            FROM t
        )
        WHERE total > 100
        """
        parser = RecursiveQueryParser(sql, dialect="bigquery")
        graph = parser.parse()

        main_unit = graph.units["main"]
        assert main_unit.qualify_info is None

    def test_sum_window_eq_not_promoted(self):
        """SUM() OVER (...) + WHERE total = 100 should NOT produce qualify_info.

        Unlike ``>``, the ``=`` operator is one that promotion recognizes, so
        this case passes only because SUM is not a ranking function.
        """
        sql = """
        SELECT id, total
        FROM (
            SELECT id, SUM(amount) OVER (PARTITION BY id) AS total
            FROM t
        )
        WHERE total = 100
        """
        parser = RecursiveQueryParser(sql, dialect="bigquery")
        graph = parser.parse()

        main_unit = graph.units["main"]
        assert main_unit.qualify_info is None
100+
101+
102+
# ============================================================================
103+
# Test Group 3: Explicit QUALIFY Not Overwritten
104+
# ============================================================================
105+
106+
107+
class TestExplicitQualifyNotOverwritten:
    """An explicit QUALIFY clause keeps its own qualify_info."""

    def test_explicit_qualify_preserved(self):
        """qualify_info from a written QUALIFY is not flagged as promoted."""
        sql = """
        SELECT customer_id, order_date
        FROM orders
        QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) = 1
        """
        unit_graph = RecursiveQueryParser(sql, dialect="bigquery").parse()
        outer = unit_graph.units["main"]

        assert outer.qualify_info is not None
        # The promotion marker must be absent (or at least not True) here.
        promoted_flag = outer.qualify_info.get("promoted_from_subquery")
        assert promoted_flag is not True
124+
125+
126+
# ============================================================================
127+
# Test Group 4: rn Not in Output Columns
128+
# ============================================================================
129+
130+
131+
class TestRnNotInOutput:
    """The ranking alias must not leak into the outer unit's output columns."""

    def test_rn_not_in_output(self):
        """Selecting only id and name leaves rn out of the outer output."""
        sql = """
        SELECT id, name
        FROM (
            SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY ts DESC) AS rn
            FROM t
        )
        WHERE rn = 1
        """
        unit_graph = RecursiveQueryParser(sql, dialect="bigquery").parse()
        outer = unit_graph.units["main"]

        emitted_names = []
        for column in outer.output_columns:
            emitted_names.append(column.get("name", ""))
        assert "rn" not in emitted_names
150+
151+
152+
# ============================================================================
153+
# Test Group 5: ranking_window_columns Populated on Inner Unit
154+
# ============================================================================
155+
156+
157+
class TestRankingWindowColumns:
    """The inner subquery unit records its ranking window aliases."""

    def test_ranking_window_columns_populated(self):
        """ranking_window_columns on the inner unit describes the rn alias."""
        sql = """
        SELECT id, name
        FROM (
            SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY ts DESC) AS rn
            FROM t
        )
        WHERE rn = 1
        """
        unit_graph = RecursiveQueryParser(sql, dialect="bigquery").parse()

        # Every unit other than 'main' counts as an inner unit.
        non_main = []
        for unit_id, candidate in unit_graph.units.items():
            if unit_id != "main":
                non_main.append(candidate)
        assert len(non_main) >= 1

        # Keep only the inner units that recorded ranking metadata.
        ranked = [candidate for candidate in non_main if candidate.ranking_window_columns]
        assert len(ranked) >= 1

        ranking_columns = ranked[0].ranking_window_columns
        assert "rn" in ranking_columns
        rn_entry = ranking_columns["rn"]
        assert rn_entry["function"] == "ROW_NUMBER"
        assert "id" in rn_entry["partition_by"]
186+
187+
188+
if __name__ == "__main__":
    # pytest.main returns the run's exit code; propagate it so that running
    # this file directly reports failures through the process exit status.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))

0 commit comments

Comments
 (0)