Skip to content

Commit ffb25bb

Browse files
authored
Merge pull request #23 from mingjerli/feat/add-repr-methods
feat: Add __repr__ methods to QueryUnit, QueryUnitGraph, and Pipeline
2 parents a3e8e29 + 99cdfa0 commit ffb25bb

3 files changed

Lines changed: 130 additions & 1 deletion

File tree

README.md

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,37 @@ for table in source_tables:
8080
print(f" {table}")
8181
```
8282

83+
**Output:**
84+
```
85+
ColumnLineageGraph(
86+
query_units=[cte:monthly_sales, main]
87+
nodes=[
88+
monthly_sales.month (layer=cte, type=expression)
89+
monthly_sales.total_amount (layer=cte, type=aggregate)
90+
monthly_sales.user_id (layer=cte, type=direct_column)
91+
orders.amount (layer=input, type=base_column)
92+
orders.order_date (layer=input, type=base_column)
93+
orders.user_id (layer=input, type=base_column)
94+
output.month (layer=output, type=direct_column)
95+
output.name (layer=output, type=direct_column)
96+
output.total_amount (layer=output, type=direct_column)
97+
users.name (layer=input, type=base_column)
98+
]
99+
edges=[
100+
monthly_sales.month -> output.month (direct_column)
101+
monthly_sales.total_amount -> output.total_amount (direct_column)
102+
orders.amount -> monthly_sales.total_amount (aggregate)
103+
orders.order_date -> monthly_sales.month (expression)
104+
orders.user_id -> monthly_sales.user_id (direct_column)
105+
users.name -> output.name (direct_column)
106+
]
107+
)
108+
------------------------------------------------------------
109+
2 source tables:
110+
users
111+
orders
112+
```
113+
83114
### Multi-Query Pipeline Lineage
84115

85116
```python
@@ -135,6 +166,24 @@ for impact in impacts:
135166
print(f" {impact}")
136167
```
137168

169+
**Output:**
170+
```
171+
Pipeline with 3 queries
172+
------------------------------------------------------------
173+
Execution order (5 tables):
174+
1. source_events
175+
2. users
176+
3. raw_events
177+
4. daily_active_users
178+
5. user_summary
179+
------------------------------------------------------------
180+
Backward lineage for user_summary.event_count (1 sources):
181+
ColumnNode('daily_active_users:raw_events.*')
182+
------------------------------------------------------------
183+
Forward lineage for source_events.event_timestamp (1 impacts):
184+
ColumnNode('user_summary:user_summary.activity_date')
185+
```
186+
138187
### Metadata from SQL Comments
139188

140189
```python
@@ -176,6 +225,16 @@ for col in pipeline.columns.values():
176225
print(f" PII: {col.sql_metadata.pii}")
177226
```
178227

228+
**Output:**
229+
```
230+
Total columns: 6
231+
------------------------------------------------------------
232+
PII columns (1):
233+
select:select.email
234+
Owner: data-team
235+
------------------------------------------------------------
236+
```
237+
179238
### Metadata Management and Export
180239

181240
```python
@@ -229,6 +288,25 @@ GraphVizExporter.export_to_file(pipeline, "lineage.dot")
229288
print("✓ Exported to lineage.json, columns.csv, lineage.dot")
230289
```
231290

291+
**Output:**
292+
```
293+
📊 Propagating metadata for 8 columns...
294+
✅ Done! Propagated metadata for 8 columns
295+
Found 3 PII columns:
296+
ColumnNode('raw.orders:raw.orders.user_email')
297+
Owner: data-team
298+
Tags: contact, sensitive
299+
ColumnNode('analytics.revenue:analytics.revenue.user_email')
300+
Owner: data-team
301+
Tags: contact, sensitive
302+
ColumnNode('analytics.revenue:raw.orders.user_email')
303+
Owner: data-team
304+
Tags: contact, sensitive
305+
------------------------------------------------------------
306+
Exporting to multiple formats...
307+
✓ Exported to lineage.json, columns.csv, lineage.dot
308+
```
309+
232310
### LLM-Powered Description Generation
233311

234312
<!-- skip-test -->
@@ -255,7 +333,7 @@ pipeline = Pipeline(queries, dialect="bigquery")
255333

256334
# Configure LLM (Ollama - free, local), or replace to any LangChain Chat models.
257335
llm = ChatOllama(model="qwen3-coder:30b", temperature=0.3)
258-
pipeline.column_graph.llm = llm
336+
pipeline.llm = llm
259337

260338
# Generate descriptions for all columns
261339
print(f"Generating descriptions for {len(pipeline.columns)} columns...")
@@ -273,6 +351,31 @@ for col in columns_with_descriptions:
273351
print(f" {col.description}")
274352
```
275353

354+
**Output:**
355+
```
356+
Generating descriptions for 12 columns...
357+
📊 Generating descriptions for 8 columns...
358+
✅ Done! Generated 8 descriptions
359+
------------------------------------------------------------
360+
Generated descriptions for 8 columns:
361+
raw.orders:raw.orders.order_id:
362+
Unique identifier for each customer order placed in the system per order record.
363+
raw.orders:raw.orders.user_email:
364+
User email addresses from the orders table, one per order record.
365+
raw.orders:raw.orders.amount:
366+
Order total amount in USD per customer.
367+
raw.orders:raw.orders.order_date:
368+
Order date when customers placed their purchases per day.
369+
analytics.revenue:analytics.revenue.user_email:
370+
User email addresses from order records, one per order entry.
371+
analytics.revenue:raw.orders.user_email:
372+
User email addresses from order records, one per order entry.
373+
analytics.revenue:analytics.revenue.total_revenue:
374+
Total revenue aggregated per customer from order amounts in USD.
375+
analytics.revenue:raw.orders.amount:
376+
Order total amount in USD per customer from raw orders table.
377+
```
378+
276379
## Architecture
277380

278381
### Conceptual Structure

src/clgraph/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ def __eq__(self, other):
9696
return False
9797
return self.unit_id == other.unit_id
9898

99+
def __repr__(self):
100+
return f"{self.unit_type.value}:{self.unit_id}"
101+
99102
def is_leaf(self) -> bool:
100103
"""Check if this is a leaf unit (only depends on base tables)"""
101104
return len(self.depends_on_units) == 0
@@ -147,6 +150,13 @@ def get_unit_by_name(self, name: str) -> Optional[QueryUnit]:
147150
return unit
148151
return None
149152

153+
def __repr__(self):
154+
"""Show topologically sorted query units"""
155+
sorted_units = self.get_topological_order()
156+
unit_reprs = [repr(unit) for unit in sorted_units]
157+
units_str = ", ".join(unit_reprs)
158+
return f"QueryUnitGraph([{units_str}])"
159+
150160

151161
# ============================================================================
152162
# Column Lineage Models

src/clgraph/pipeline.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,22 @@ def _remap_query_ids(self):
13411341
table.read_by = [self.query_mapping.get(qid, qid) for qid in table.read_by]
13421342
table.modified_by = [self.query_mapping.get(qid, qid) for qid in table.modified_by]
13431343

1344+
def __repr__(self):
1345+
"""Show topologically sorted SQL statements"""
1346+
sorted_query_ids = self.table_graph.topological_sort()
1347+
query_strs = []
1348+
1349+
for query_id in sorted_query_ids:
1350+
query = self.table_graph.queries[query_id]
1351+
# Truncate SQL to first 60 chars for readability
1352+
sql_preview = query.sql.strip().replace("\n", " ")
1353+
if len(sql_preview) > 60:
1354+
sql_preview = sql_preview[:57] + "..."
1355+
query_strs.append(f"{query_id}: {sql_preview}")
1356+
1357+
queries_display = "\n ".join(query_strs)
1358+
return f"Pipeline(\n {queries_display}\n)"
1359+
13441360
def split(self, sinks: List) -> List["Pipeline"]:
13451361
"""
13461362
Split pipeline into non-overlapping subpipelines based on target tables.

0 commit comments

Comments
 (0)