feat: Improve star (*) handling in cross-query lineage with EXCEPT/REPLACE support

mingjerli · claude · mingjerli · commit af3a079fbef8 · 2025-12-10T18:16:06.000-08:00
- Enhanced _add_cross_query_edges to properly handle SELECT * in pipelines
- Added support for SELECT * EXCEPT to exclude columns from cross-query edges
- Added support for SELECT * REPLACE to maintain lineage for transformed columns
- Fixed issue where upstream columns weren't connected to * in downstream queries
- Copy is_star, except_columns, and replace_columns fields to pipeline nodes

Tests:
- Added test_star_expansion_in_cross_query_edges
- Added test_star_except_in_cross_query_edges
- Added test_star_replace_in_cross_query_edges
- All 44 tests in test_multi_query.py pass

This ensures that:
- Star notation is properly expanded in cross-query scenarios
- SELECT * EXCEPT correctly excludes specified columns from lineage
- SELECT * REPLACE maintains proper lineage for transformed columns
- Backward tracing correctly follows through all pipeline layers

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
diff --git a/src/clgraph/pipeline.py b/src/clgraph/pipeline.py
@@ -149,6 +149,10 @@ def _add_query_columns(
                 owner=owner,
                 tags=tags,
                 custom_metadata=custom_metadata,
+                # Star expansion fields
+                is_star=node.is_star,
+                except_columns=node.except_columns,
+                replace_columns=node.replace_columns,
             )
             pipeline.add_column(column)
 
@@ -187,6 +191,7 @@ def _add_cross_query_edges(self, pipeline: "Pipeline"):
           - For each column C in Q1's output:
               - For each query Qi that reads T:
                   - If Qi references T.C, create edge: Q1.C -> Qi.C
+                  - If Qi references T.*, create edge: Q1.C -> Qi.* (unless C is in Qi's EXCEPT list)
         """
         for table_name, table_node in pipeline.table_graph.tables.items():
             # Find query that creates this table
@@ -204,7 +209,56 @@ def _add_cross_query_edges(self, pipeline: "Pipeline"):
 
             # Find queries that read this table
             for reading_query_id in table_node.read_by:
-                # Match columns by name
+                # Check if reading query has a * column for this table (input layer)
+                input_star_column = None
+                for col in pipeline.columns.values():
+                    if (
+                        col.query_id == reading_query_id
+                        and col.table_name == table_name
+                        and col.column_name == "*"
+                        and col.layer == "input"
+                    ):
+                        input_star_column = col
+                        break
+
+                # Also look for an output-layer * in the same query (matched by query only,
+                # not by table); it carries EXCEPT info, e.g. SELECT * EXCEPT (...) FROM table
+                output_star_column = None
+                if input_star_column:
+                    for col in pipeline.columns.values():
+                        if (
+                            col.query_id == reading_query_id
+                            and col.column_name == "*"
+                            and col.layer == "output"
+                        ):
+                            output_star_column = col
+                            break
+
+                # Take the EXCEPT list from the output *, but attach the edges to the input *
+                star_column = input_star_column
+                except_columns = output_star_column.except_columns if output_star_column else set()
+
+                # If there's a star column, connect all output columns to it
+                # BUT respect EXCEPT clause - skip columns that are excepted
+                if star_column:
+                    for output_col in output_columns:
+                        # Skip columns in EXCEPT clause
+                        if output_col.column_name in except_columns:
+                            continue
+
+                        edge = ColumnEdge(
+                            from_node=output_col,
+                            to_node=star_column,
+                            edge_type="cross_query",
+                            context="cross_query",
+                            transformation=f"{creating_query_id} -> {reading_query_id}",
+                            query_id=None,  # Cross-query edge
+                        )
+                        pipeline.add_edge(edge)
+
+                # ALWAYS match columns by name (not just when there's no star)
+                # This handles cases where the query uses both * (for COUNT(*))
+                # and specific columns (for SUM(amount), etc.)
                 for output_col in output_columns:
                     # Find corresponding input column in reading query
                     # Search for this column in reading query's lineage
diff --git a/tests/test_multi_query.py b/tests/test_multi_query.py
@@ -909,6 +909,171 @@ def test_get_lineage_path(self):
         # Should have edges connecting them
         assert len(path) > 0
 
+    def test_star_expansion_in_cross_query_edges(self):
+        """
+        Test that * is properly handled in cross-query lineage.
+
+        When upstream query creates a table with known columns, and downstream
+        query uses both * (for COUNT(*)) and specific columns (for SUM(amount)),
+        we should create edges for both:
+        1. All upstream columns -> * node (for COUNT(*))
+        2. Specific column matches (for individual column references)
+        """
+        queries = [
+            """
+            CREATE TABLE staging.user_orders AS
+            SELECT
+                user_id,
+                order_id,
+                amount,
+                order_date
+            FROM raw.orders
+            WHERE status = 'completed'
+            """,
+            """
+            CREATE TABLE analytics.user_metrics AS
+            SELECT
+                user_id,
+                COUNT(*) as order_count,
+                SUM(amount) as total_revenue,
+                AVG(amount) as avg_order_value
+            FROM staging.user_orders
+            GROUP BY user_id
+            """,
+        ]
+
+        parser = MultiQueryParser()
+        table_graph = parser.parse_queries(queries)
+
+        builder = PipelineLineageBuilder()
+        pipeline = builder.build(table_graph)
+
+        # Verify that cross-query edges include connections to * column
+        cross_query_edges = [e for e in pipeline.edges if e.query_id is None]
+
+        # Should have edges from all staging.user_orders columns to the * node
+        star_edges = [
+            e
+            for e in cross_query_edges
+            if e.to_node.column_name == "*" and e.to_node.table_name == "staging.user_orders"
+        ]
+        # All 4 columns from query_0 should connect to the * in query_1
+        assert len(star_edges) == 4
+
+        # Should ALSO have edges for specifically referenced columns (user_id, amount)
+        specific_edges = [
+            e
+            for e in cross_query_edges
+            if e.to_node.column_name in ("user_id", "amount")
+            and e.to_node.table_name == "staging.user_orders"
+            and e.to_node.layer == "input"
+        ]
+        # user_id and amount should each have an edge
+        assert len(specific_edges) >= 2
+
+        # Verify backward lineage traces all the way to source
+        sources = pipeline.trace_column_backward("analytics.user_metrics", "total_revenue")
+        assert any(s.table_name == "raw.orders" and s.column_name == "amount" for s in sources)
+
+    def test_star_except_in_cross_query_edges(self):
+        """
+        Test that SELECT * EXCEPT properly excludes columns in cross-query lineage.
+
+        When downstream query uses SELECT * EXCEPT (col1, col2), we should NOT
+        create cross-query edges for the excepted columns.
+        """
+        queries = [
+            """
+            CREATE TABLE staging.orders AS
+            SELECT
+                order_id,
+                user_id,
+                amount,
+                sensitive_data,
+                order_date
+            FROM raw.orders
+            """,
+            """
+            CREATE TABLE analytics.clean_orders AS
+            SELECT * EXCEPT (sensitive_data)
+            FROM staging.orders
+            """,
+        ]
+
+        parser = MultiQueryParser()
+        table_graph = parser.parse_queries(queries)
+
+        builder = PipelineLineageBuilder()
+        pipeline = builder.build(table_graph)
+
+        # Verify that cross-query edges exclude sensitive_data
+        cross_query_edges = [e for e in pipeline.edges if e.query_id is None]
+
+        # Get edges from staging.orders to the * in analytics
+        star_edges = [
+            e
+            for e in cross_query_edges
+            if e.to_node.column_name == "*" and e.to_node.table_name == "staging.orders"
+        ]
+
+        # Should have edges for order_id, user_id, amount, order_date
+        # Should NOT have edge for sensitive_data
+        edge_from_columns = {e.from_node.column_name for e in star_edges}
+        assert "order_id" in edge_from_columns
+        assert "user_id" in edge_from_columns
+        assert "amount" in edge_from_columns
+        assert "order_date" in edge_from_columns
+        assert "sensitive_data" not in edge_from_columns  # This should be excluded!
+
+    def test_star_replace_in_cross_query_edges(self):
+        """
+        Test that SELECT * REPLACE maintains lineage for replaced columns.
+
+        REPLACE doesn't remove columns, it transforms them. Cross-query edges
+        should still exist for replaced columns.
+        """
+        queries = [
+            """
+            CREATE TABLE staging.orders AS
+            SELECT
+                order_id,
+                user_id,
+                amount,
+                status,
+                order_date
+            FROM raw.orders
+            """,
+            """
+            CREATE TABLE analytics.orders_normalized AS
+            SELECT * REPLACE (UPPER(status) as status)
+            FROM staging.orders
+            """,
+        ]
+
+        parser = MultiQueryParser()
+        table_graph = parser.parse_queries(queries)
+
+        builder = PipelineLineageBuilder()
+        pipeline = builder.build(table_graph)
+
+        # Verify that cross-query edges include ALL columns (including status)
+        cross_query_edges = [e for e in pipeline.edges if e.query_id is None]
+
+        # Get edges from staging.orders to the * in analytics
+        star_edges = [
+            e
+            for e in cross_query_edges
+            if e.to_node.column_name == "*" and e.to_node.table_name == "staging.orders"
+        ]
+
+        # Should have edges for ALL columns including the replaced one
+        edge_from_columns = {e.from_node.column_name for e in star_edges}
+        assert "order_id" in edge_from_columns
+        assert "user_id" in edge_from_columns
+        assert "amount" in edge_from_columns
+        assert "status" in edge_from_columns  # Replaced column should still have edge
+        assert "order_date" in edge_from_columns
+
 
 # ============================================================================
 # Part 4: Edge Cases Tests