diff --git a/.gitignore b/.gitignore index 7591726..8b262b0 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ test_*.py .planning/ .ruff_cache/ Start_MCP_Server.bat + diff --git a/src/teradata_mcp_server/app.py b/src/teradata_mcp_server/app.py index aab1366..c9d0c8d 100644 --- a/src/teradata_mcp_server/app.py +++ b/src/teradata_mcp_server/app.py @@ -34,6 +34,7 @@ from teradata_mcp_server.config import Settings from teradata_mcp_server.middleware import RequestContextMiddleware from teradata_mcp_server.tools import ContextCatalog +from teradata_mcp_server.tools.graph.graph_edge_contract import GRAPH_EDGE_CONTRACT from teradata_mcp_server.tools.utils import ( convert_tdml_docstring_to_mcp_docstring, execute_analytic_function, @@ -1287,5 +1288,19 @@ def get_glossary_term(term_name: str) -> dict[str, Any]: else: return {"error": f"Glossary term not found: {term_name}"} + # ── Graph Edge Contract Resource ────────────────────────────────────── + # Registered only when a profile `resource` pattern matches "graph_edge_contract" (content is static). + # AI agents retrieve this to understand the edge_repository schema + # required by all graph_* tools. 
+ # ────────────────────────────────────────────────────────────────────── + if any(re.match(pattern, "graph_edge_contract") for pattern in config.get("resource", [])): + + @mcp.resource("graph://edge-contract") + def get_graph_edge_contract() -> str: + """Return the Graph Edge Contract schema definition.""" + return GRAPH_EDGE_CONTRACT + + logger.info("Registered resource: graph_edge_contract") + # Return the configured app and some handles used by the entrypoint if needed return mcp, logger diff --git a/src/teradata_mcp_server/config/profiles.yml b/src/teradata_mcp_server/config/profiles.yml index 4141766..59ce35f 100644 --- a/src/teradata_mcp_server/config/profiles.yml +++ b/src/teradata_mcp_server/config/profiles.yml @@ -44,7 +44,7 @@ eda: - "base_(?!(writeQuery|dynamicQuery)$).*" - qlty_.* - sec_userDbPermissions - + bar: tool: - ^bar_* @@ -60,4 +60,13 @@ llmUser: - ^base_* - ^chat_* prompt: - - ^chat_* \ No newline at end of file + - ^chat_* + +graph: + tool: + - ^graph_.* + prompt: + - ^graph_.* + resource: + - ^graph_edge_contract$ + diff --git a/src/teradata_mcp_server/tools/graph/README.md b/src/teradata_mcp_server/tools/graph/README.md new file mode 100644 index 0000000..16fa96c --- /dev/null +++ b/src/teradata_mcp_server/tools/graph/README.md @@ -0,0 +1,669 @@ +# Graph Dependency Analysis Tools + +**Version:** 3.0 +**Last Updated:** 2026-04-10 +**Purpose:** Directed dependency graph analysis for Teradata object lineage + +This package provides seven complementary tools for analysing object dependencies +in Teradata. All tools are stored-procedure-free — the only Teradata privilege +required is `SELECT` on an edge repository conforming to the +[Graph Edge Contract](#graph-edge-contract). 
+ +--- + +## Quick Start + +```python +# Step 0 — Generate an edge repository (once, if you don't have one) +# For AI-Native Data Products, skip this — use lineage_graph directly: +# edge_repository="{ProductName}_Semantic.lineage_graph" +ddl = handle_graph_edgeContractDDL( + conn=connection, + target_database="MY_PROJECT_Semantic", + object_name="EdgeRepository", + output_type="TABLE" +) + +# Step 1 — Find root objects (seed points for analysis) +roots = handle_graph_findRootObjects( + conn=connection, + container_pattern="%MY_PROJECT%", + object_types="Table", + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) + +# Step 2 — Compute BFS hop distances and group into migration waves +waves = handle_graph_bfsLevels( + conn=connection, + root_node_list="MY_DB_STD_T.source_table_a,MY_DB_STD_T.source_table_b", + include_containers="MY_DB%", + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) + +# Objects grouped by nearest_root = migration wave grouping +# Objects ordered by downstream_level = deployment sequence within each wave + +# Step 3 — Trace lineage and impact paths for a specific object +lineage = handle_graph_traceLineage( + conn=connection, + object_name="MY_DB_STD_T.source_table_a", + max_depth_down=5, + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) +``` + +--- + +## Tools + +Seven complementary tools covering the full graph analysis workflow: + +| Step | Tool | Implementation | Purpose | +|------|------|---------------|---------| +| 0 | [`graph_edgeContractDDL`](#graph_edgecontractddl) | Template | Generate edge repository DDL — start here | +| 1 | [`graph_findRootObjects`](#graph_findrootobjects) | SQL | Discover objects with no upstream dependencies | +| 2 | [`graph_bfsLevels`](#graph_bfslevels) | Python BFS | Wave planning, deployment sequencing, blast-radius sizing | +| 3 | [`graph_traceLineage`](#graph_tracelineage) | Python + recursive CTE | Full lineage tracing, impact path analysis, edge detail | +| 4 | 
[`graph_detectCycles`](#graph_detectcycles) | Python DFS | Circular reference detection, DAG validation | +| 5 | [`graph_connectedComponents`](#graph_connectedcomponents) | Python Union-Find | Graph partitioning, isolated sub-graph identification | +| 6 | [`graph_analyseDatabase`](#graph_analysedatabase) | Composite | All four analyses in one call, one shared edge fetch | + +**Typical workflow:** `edgeContractDDL` → `findRootObjects` → `bfsLevels` → `traceLineage` → `detectCycles` + +**When to use `graph_analyseDatabase`:** if you need two or more of the individual analyses, use this instead — it fetches the edge set once and shares it across all four analyses in a single MCP response. + +--- + +## Graph Edge Contract + +All tools require an **edge repository** — a Teradata table or view conforming to the Graph Edge Contract. The contract defines six required columns and two optional enrichment columns: + +### Required Columns + +| Column | Type | Description | +|--------|------|-------------| +| `Src_Container_Name` | `VARCHAR(128) NOT NULL` | Source (upstream) container — Teradata database name, ETL workflow folder, dbt project, etc. | +| `Src_Object_Name` | `VARCHAR(128) NOT NULL` | Source object name | +| `Src_Kind` | `VARCHAR(30) NOT NULL` | Source object type (e.g. `Table`, `View`, `Job`) | +| `Tgt_Container_Name` | `VARCHAR(128) NOT NULL` | Target (downstream) container | +| `Tgt_Object_Name` | `VARCHAR(128) NOT NULL` | Target object name | +| `Tgt_Kind` | `VARCHAR(30) NOT NULL` | Target object type | + +### Optional Enrichment Columns + +| Column | Type | Description | +|--------|------|-------------| +| `Edge_Relationship` | `VARCHAR(50)` | Nature of the edge: `DIRECT`, `ETL_INPUT`, `ETL_OUTPUT`, `JOIN`, `TRANSFORM` | +| `Transformation_Type` | `VARCHAR(50)` | Process category: `ETL`, `FEATURE_ENG`, `AGGREGATION`, `EMBEDDING_GEN` | + +Optional columns are ignored by graph analysis tools but surfaced to graph visualisation clients for edge labelling. 
+ +### Edge Semantics + +All edges share a single consistent direction — Src is always upstream, Tgt is always downstream. The `Edge_Relationship` optional column carries the semantic label for visualisation clients; the graph analysis tools traverse all edges identically regardless of label. + +The same Src→Tgt direction is read differently depending on edge type: + +| Edge type | How to read it | Example | +|---|---|---| +| Object dependency | Src *is referenced by* Tgt | `CUSTOMER_TABLE` → `CUSTOMER_VIEW` | +| ETL input | Src *is read by* Tgt | `CUSTOMER_TABLE` → `ETL_LOAD_JOB` | +| ETL output | Src *writes to* Tgt | `ETL_LOAD_JOB` → `CUSTOMER_FEATURES` | + +In all three cases: Src is the prerequisite, Tgt is the consumer. A single edge repository can hold both object dependency edges and data lineage edges and be traversed uniformly by the graph tools. + +The `lineage_graph` view (Observability Module v1.5) surfaces ETL jobs as first-class nodes, producing two edges per declared flow: +- **Leg 1:** `source_table` →*(is read by)*→ `job_name` (`Edge_Relationship = ETL_INPUT`) +- **Leg 2:** `job_name` →*(writes to)*→ `target_table` (`Edge_Relationship = ETL_OUTPUT`) + +This enables end-to-end lineage traversal through jobs, not just between tables. + +### AI-Native Data Product Shortcut + +If you have a data product built on the [AI-Native Data Product standard](https://github.com/Teradata/ai-native-data-product), the `{ProductName}_Semantic.lineage_graph` view (Observability Module v1.5) already conforms to this contract. Use it directly: + +```python +edge_repository="{ProductName}_Semantic.lineage_graph" +``` + +No DDL generation required. 
+ +--- + +## Package Structure + +``` +teradata_mcp_server/tools/ +├── graph_tools.py # Registration hub (imports + GRAPH_TOOLS list only) +├── graph/ +│ ├── __init__.py # Re-exports all handle_* for ModuleLoader +│ ├── _graph_utils.py # Shared utilities (internal — not an MCP tool) +│ ├── graph_edge_contract.py # Tool: DDL generator + Graph Edge Contract text +│ ├── graph_find_root_objects.py # Tool: SQL-based root object discovery +│ ├── graph_bfs_levels.py # Tool: Pure-Python BFS +│ ├── graph_trace_lineage.py # Tool: Python + recursive CTE lineage analysis +│ ├── graph_detect_cycles.py # Tool: Python Union-Find + iterative DFS +│ ├── graph_connected_components.py # Tool: Python Union-Find WCC analysis +│ └── graph_analyse_database.py # Tool: Composite single-fetch analysis +└── utils.py # Shared MCP utilities +``` + +`graph_tools.py` is intentionally thin — it contains no logic, only imports and the `GRAPH_TOOLS` registration list. See the comments in that file for the rationale. + +`_graph_utils.py` is an internal module. It is not registered as an MCP tool. It exports: +- `parse_csv_patterns` — normalise CSV input strings +- `build_like_or` — build single-column LIKE clauses for SQL WHERE +- `bfs_safe_int` — safe int conversion for nullable level columns +- `create_bfs_summary` — BFS result statistics +- `extract_cycle_candidates` — identify direction=BOTH nodes + +--- + +## Tool Reference + +### `graph_edgeContractDDL` + +Generate Teradata DDL for a Graph Edge Contract-conforming edge repository. + +Call this first if you don't yet have an edge repository. No database connection is used — DDL is returned as text ready to run. + +#### Parameters + +| Parameter | Type | Default | Required | Description | +|-----------|------|---------|----------|-------------| +| `target_database` | string | — | ✅ | Database in which to create the edge repository.
For AI-Native Data Products: `{ProductName}_Semantic` | +| `object_name` | string | `EdgeRepository` | ❌ | Name for the edge table or view | +| `output_type` | string | `TABLE` | ❌ | `TABLE`: CREATE TABLE DDL + sample DML
`VIEW`: customisable template for mapping an existing lineage source | + +#### Example + +```python +# Generate a CREATE TABLE with sample DML +result = handle_graph_edgeContractDDL( + conn=connection, + target_database="MY_PROJECT_Semantic", + object_name="EdgeRepository", + output_type="TABLE" +) +print(result[0]['ddl']) # Run this in Teradata +print(result[0]['sample_dml']) # Optional: insert sample rows + +# Generate a VIEW template to wrap an existing lineage source +result = handle_graph_edgeContractDDL( + conn=connection, + target_database="MY_PROJECT_Semantic", + object_name="lineage_graph", + output_type="VIEW" +) +``` + +--- + +### `graph_findRootObjects` + +Find objects with no upstream dependencies in specified containers. + +Root objects are foundational data sources that nothing else feeds into. They are the natural starting points for downstream impact analysis and migration wave planning. + +#### Parameters + +| Parameter | Type | Default | Required | Description | +|-----------|------|---------|----------|-------------| +| `container_pattern` | string | — | ✅ | Database/schema LIKE pattern(s). Supports `%` wildcards and CSV.
Examples: `MY_DB%`, `%PROJECT_A%,%PROJECT_B%` | +| `exclude_objects` | string | `''` | ❌ | LIKE patterns to exclude. Matches `Container.Object`.
Example: `SANDBOX%,%.temp_%` | +| `edge_repository` | string | — | ✅ | Edge repository conforming to the Graph Edge Contract.
AI-Native Data Products: `{ProductName}_Semantic.lineage_graph` | +| `object_types` | string | `''` | ❌ | Filter by object type: `Table`, `View`, `Procedure`, `Macro`.
CSV supported: `Table,View`. Empty = all types. | +| `return_format` | string | `detailed` | ❌ | `detailed` — full list with metadata
`summary` — statistics only | + +#### Use Cases + +| Use Case | Configuration | +|----------|---------------| +| Migration seed discovery | `container_pattern="%MY_PROJECT%"` | +| Source table discovery | `object_types="Table"` | +| Exclude sandbox schemas | `exclude_objects="SANDBOX%,%.temp_%"` | +| Quick count | `return_format="summary"` | + +#### Example + +```python +# Find root tables, ordered by downstream impact +result = handle_graph_findRootObjects( + conn=connection, + container_pattern="MY_DB_STD_T,MY_DB_STD_V", + object_types="Table", + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) +for obj in result['results']['summary']['top_impact_objects']: + print(f" {obj['name']} → {obj['downstream_count']} dependents") +``` + +--- + +### `graph_bfsLevels` + +Compute BFS shortest-path hop distances from one or more root nodes. + +**Implementation:** Pure Python — one SQL round-trip fetches the scoped edge set; all BFS computation runs in the MCP server process. + +**Use this tool for:** deployment sequencing, migration wave grouping, blast-radius sizing, cycle candidate depth analysis. + +**Do not use this tool for:** lineage tracing, impact path detail, edge-level analysis — use `graph_traceLineage` for those. + +#### Direction Convention + +Each edge row: `Src` "is referenced by" `Tgt` → Src is the dependency (upstream); Tgt is the dependent (downstream). + +| Direction | Traversal | Meaning | +|-----------|-----------|---------| +| Upstream BFS | Reverse adjacency (Tgt → Src) | Discovers what a node depends on | +| Downstream BFS | Forward adjacency (Src → Tgt) | Discovers what depends on a node | + +When the BFS roots have in-degree zero (no upstream sources), the upstream pass discovers nothing, so `upstream_level=None` on every non-root node is the expected, correct result. + +#### Parameters + +| Parameter | Type | Default | Required | Description | +|-----------|------|---------|----------|-------------| +| `root_node_list` | string | — | ✅ | CSV of exact fully-qualified node names. No wildcards.
Example: `MY_DB.table_a,MY_DB.table_b` | +| `max_depth_up` | integer | `10` | ❌ | Maximum upstream hops. `0` = skip upstream analysis. | +| `max_depth_down` | integer | `10` | ❌ | Maximum downstream hops. `0` = skip downstream analysis. | +| `exclude_objects` | string | `''` | ❌ | CSV LIKE patterns to exclude from BFS traversal | +| `include_containers` | string | `''` | ❌ | CSV container LIKE patterns (whitelist). Always supply when scope is known — pushed into SQL to reduce fetch volume. | +| `edge_repository` | string | — | ✅ | Edge repository conforming to the Graph Edge Contract | + +#### Example + +```python +# Wave planning: downstream only, scoped to project containers +result = handle_graph_bfsLevels( + conn=connection, + root_node_list="MY_DB_STD_T.source_a,MY_DB_STD_T.source_b", + max_depth_up=0, + max_depth_down=10, + include_containers="MY_DB%,REPORTING%", + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) +# Sort by downstream_level ascending for deployment order +# Group by nearest_root for wave assignment +``` + +--- + +### `graph_traceLineage` + +Analyse object dependencies — finds upstream dependencies (what the object depends on) and downstream dependents (what depends on the object). + +**Implementation:** Hybrid — Python constructs Teradata recursive CTEs that execute entirely server-side. Only the reachable subgraph crosses the network. + +**Use this tool for:** impact analysis, lineage tracing, pre-deployment validation, edge-level dependency detail. + +**Do not use this tool for:** migration wave sequencing — use `graph_bfsLevels` for that. + +#### Parameters + +| Parameter | Type | Default | Required | Description | +|-----------|------|---------|----------|-------------| +| `object_name` | string | — | ✅ | Object name pattern(s). Supports `%` wildcards and CSV.
Single: `MY_DB.my_table`
Wildcard: `MY_DB%.%`
Multiple: `MY_DB_A.%,MY_DB_B.%` | +| `max_depth_up` | integer | `3` | ❌ | Maximum upstream levels to traverse (0–10) | +| `max_depth_down` | integer | `3` | ❌ | Maximum downstream levels to traverse (0–10) | +| `exclude_objects` | string | `''` | ❌ | CSV LIKE patterns to exclude. Matches `DB.Object` format. | +| `include_containers` | string | `''` | ❌ | CSV container LIKE patterns (whitelist). Empty = all containers. | +| `edge_repository` | string | — | ✅ | Edge repository conforming to the Graph Edge Contract | +| `return_format` | string | `detailed` | ❌ | `detailed`, `summary`, or `edges_only` | + +#### Example + +```python +# Full impact analysis — what breaks if this object changes? +result = handle_graph_traceLineage( + conn=connection, + object_name="MY_DB_STD_T.core_entity", + max_depth_up=0, + max_depth_down=5, + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) +print(f"Downstream dependents: {len(result['results']['downstream_edges'])}") +``` + +--- + +### `graph_detectCycles` + +Detect circular references (cycles) in the dependency graph. + +**Implementation:** Pure Python — one SQL SELECT fetches the scoped edge set; Union-Find WCC partitioning followed by iterative DFS cycle detection runs in the MCP server process. + +Run this tool before wave planning to confirm the graph is a valid DAG. A cycle will cause topological sort to hang silently. + +#### Parameters + +| Parameter | Type | Default | Required | Description | +|-----------|------|---------|----------|-------------| +| `container_pattern` | string | — | ✅ | CSV LIKE patterns for container scope.
Example: `MY_DB%` or `%PROJECT_A%,%PROJECT_B%` | +| `exclude_objects` | string | `''` | ❌ | CSV LIKE patterns to exclude from the scan | +| `edge_repository` | string | — | ✅ | Edge repository conforming to the Graph Edge Contract | + +#### Example + +```python +result = handle_graph_detectCycles( + conn=connection, + container_pattern="MY_DB%", + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) +print(result['results']['summary_stats'][0]['Summary_Message']) +# "No cycles detected — graph is a DAG." +# or: "3 cycle(s) detected." +for cycle in result['results']['cycle_summaries']: + print(f" Cycle {cycle['Cycle_Id']}: {cycle['Cycle_Path']}") +``` + +--- + +### `graph_connectedComponents` + +Identify all Weakly Connected Components (WCC) in the dependency graph. + +**Implementation:** Pure Python — one SQL SELECT, then Union-Find WCC partitioning in the MCP server process. + +A connected component is a maximal set of nodes reachable from one another when edge direction is ignored. Use this to understand graph structure, identify isolated sub-graphs, and pre-filter before cycle detection. 
+ +#### Parameters + +| Parameter | Type | Default | Required | Description | +|-----------|------|---------|----------|-------------| +| `container_pattern` | string | — | ✅ | CSV LIKE patterns for container scope | +| `exclude_objects` | string | `''` | ❌ | CSV LIKE patterns to exclude from the scan | +| `edge_repository` | string | — | ✅ | Edge repository conforming to the Graph Edge Contract | + +#### Example + +```python +result = handle_graph_connectedComponents( + conn=connection, + container_pattern="MY_DB%", + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) +stats = result['results']['summary_stats'][0] +print(f"{stats['Component_Count']} components, " + f"largest has {stats['Largest_Component']} nodes") +``` + +--- + +### `graph_analyseDatabase` + +Composite analysis — runs root object discovery, connected component analysis, cycle detection, and BFS wave planning in a **single MCP call** with **one shared edge fetch**. + +Use this instead of calling the four individual tools when you need two or more of those analyses together. It eliminates the scalability bottleneck of serial MCP round-trips (4 SQL fetches → 1; 4 MCP responses → 1). + +#### Parameters + +| Parameter | Type | Default | Required | Description | +|-----------|------|---------|----------|-------------| +| `container_pattern` | string | — | ✅ | CSV LIKE patterns for container scope | +| `exclude_objects` | string | `''` | ❌ | CSV LIKE patterns to exclude | +| `top_n_roots` | integer | `4` | ❌ | Number of top root objects (by downstream impact) to include in BFS wave analysis | +| `max_depth_down` | integer | `10` | ❌ | Maximum downstream BFS hops from roots | +| `max_depth_up` | integer | `0` | ❌ | Maximum upstream BFS hops. `0` = skip upstream. 
| +| `edge_repository` | string | — | ✅ | Edge repository conforming to the Graph Edge Contract | + +#### Example + +```python +# Full database readiness assessment — one call +result = handle_graph_analyseDatabase( + conn=connection, + container_pattern="MY_DB%", + top_n_roots=6, + max_depth_down=10, + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) + +root_count = result['results']['root_objects']['summary']['total_root_objects'] +cycle_count = result['results']['cycles']['stats'][0]['Cycle_Count'] +comp_count = result['results']['components']['stats'][0]['Component_Count'] +bfs_nodes = result['results']['bfs_waves']['summary']['total_nodes'] +total_ms = result['results']['edge_stats']['total_time_ms'] + +print(f"{root_count} roots | {cycle_count} cycles | " + f"{comp_count} components | {bfs_nodes} BFS nodes | {total_ms}ms") +``` + +--- + +## Architecture + +### Python/SQL Design + +The only Teradata privilege required across the entire package is `SELECT` on the edge repository view or table. 
+ +| Tool | Implementation strategy | +|------|------------------------| +| `graph_edgeContractDDL` | Pure template generation — no SQL executed | +| `graph_findRootObjects` | Single SQL SELECT with NOT EXISTS subquery | +| `graph_bfsLevels` | One bulk edge SELECT; standard queue-based BFS (O(V+E)) in Python | +| `graph_traceLineage` | Python constructs recursive CTEs; traversal runs server-side in Teradata spool | +| `graph_detectCycles` | One scoped edge SELECT; Union-Find WCC + iterative DFS in Python | +| `graph_connectedComponents` | One scoped edge SELECT; path-compressed Union-Find in Python | +| `graph_analyseDatabase` | One shared edge SELECT; all four algorithms run in Python | + +### Progressive Disclosure + +The package supports both MCP registration modes simultaneously: + +- **Static mode:** `graph_tools.py` → `GRAPH_TOOLS` list → MCP server registration at startup +- **Progressive Disclosure mode:** `__init__.py` → ModuleLoader discovers `handle_*` functions → `ContextCatalog` registers them using docstrings + +In Progressive Disclosure mode the ContextCatalog uses the function docstrings for both approximate-match summaries and exact-match full documentation. The `*_TOOL` descriptor dicts serve static mode only. + +--- + +## Dependencies + +### Teradata + +| Requirement | Details | +|------------|---------| +| `SELECT` on edge repository | The only privilege required — applies to all tools | +| Edge repository | A table or view conforming to the Graph Edge Contract.
Generate one with `graph_edgeContractDDL`, or use an existing `{ProductName}_Semantic.lineage_graph` view. | + +No server-side DDL objects required. + +### Python + +All packages are standard library or already included in the base MCP server: + +| Package | Used by | Source | +|---------|---------|--------| +| `teradatasql` | All tools | MCP server base | +| `collections` | `graph_bfsLevels`, `graph_analyseDatabase` | Standard library | +| `fnmatch` | `graph_bfsLevels` | Standard library | +| `logging` | All tools | Standard library | + +--- + +## Installation + +### File Placement + +``` +teradata_mcp_server/tools/ +├── graph_tools.py +├── graph/ +│ ├── __init__.py +│ ├── _graph_utils.py +│ ├── graph_edge_contract.py +│ ├── graph_find_root_objects.py +│ ├── graph_bfs_levels.py +│ ├── graph_trace_lineage.py +│ ├── graph_detect_cycles.py +│ ├── graph_connected_components.py +│ └── graph_analyse_database.py +└── utils.py +``` + +### Configuration + +Add to your `profiles.yml`: + +```yaml +graph: + allmodule: True + tool: + graph_edgeContractDDL: True + graph_findRootObjects: True + graph_bfsLevels: True + graph_traceLineage: True + graph_detectCycles: True + graph_connectedComponents: True + graph_analyseDatabase: True +``` + +--- + +## Performance + +### Key Principles + +**Always supply `include_containers` for `graph_bfsLevels`** — this filter is pushed into the SQL WHERE clause, dramatically reducing edge fetch volume. Without it, every edge in the repository is fetched. One additional LIKE pattern costs almost nothing; fetching a million irrelevant edges costs significantly. + +**Use `graph_analyseDatabase` when you need multiple analyses** — it runs four analyses from one edge fetch instead of four separate fetches. + +**Start with the default `max_depth_up=3` / `max_depth_down=3` for `graph_traceLineage`** — incrementally increase only if needed. Recursive CTE depth directly affects server-side spool consumption. 
+ +**Use `exclude_objects` aggressively** — filter out sandbox schemas, temporary objects, and personal schemas. Document and version-control your team's standard exclusion patterns. + +**Run `graph_detectCycles` before wave planning** — a cycle will cause topological sort to hang silently. + +--- + +## Troubleshooting + +| Issue | Cause | Solution | +|-------|-------|----------| +| **Empty BFS results** | Root node FQ name incorrect | Verify exact name via `graph_findRootObjects` — no wildcards in `root_node_list` | +| **`upstream_level` always None** | Correct behaviour for root objects | Root objects with in-degree zero have no upstream sources — this is expected | +| **Large edge fetch for BFS** | No `include_containers` specified | Always supply `include_containers` when scope is known | +| **Query timeout** | Depth too high or large graph | Reduce `max_depth_up` / `max_depth_down` or add `exclude_objects` / `include_containers` | +| **`edge_repository` error** | Parameter not supplied | Pass the FQ name of your edge repository. AI-Native Data Products: `{ProductName}_Semantic.lineage_graph`. Otherwise run `graph_edgeContractDDL` first. | +| **NULL check violations** | Edge repository has NULL required columns | Run the validation query from the `graph_edgeContractDDL` sample DML output | + +### Debug Steps + +```python +# 1. Verify object exists and find exact FQ name +result = handle_graph_findRootObjects( + conn=connection, + container_pattern="MY_DB_STD_T", + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) +# Check result for the exact FullyQualifiedName + +# 2. Test BFS with minimal scope and shallow depth +result = handle_graph_bfsLevels( + conn=connection, + root_node_list="MY_DB_STD_T.my_root_table", + max_depth_down=2, + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) + +# 3. 
Check cycle-free before wave planning +result = handle_graph_detectCycles( + conn=connection, + container_pattern="MY_DB%", + edge_repository="MY_PROJECT_Semantic.EdgeRepository" +) +print(result['results']['summary_stats'][0]['Summary_Message']) + +# 4. Validate edge repository conforms to contract +# (Run the validation query from graph_edgeContractDDL sample_dml output) +base_readQuery(sql=""" + SELECT 'NULL_CHECK' AS Validation, COUNT(*) AS Violations + FROM MY_PROJECT_Semantic.EdgeRepository + WHERE Src_Container_Name IS NULL + OR Src_Object_Name IS NULL + OR Src_Kind IS NULL + OR Tgt_Container_Name IS NULL + OR Tgt_Object_Name IS NULL + OR Tgt_Kind IS NULL +""") +``` + +--- + +## Best Practices + +1. **Always run `graph_detectCycles` before migration planning** — a cycle will cause topological sort to hang silently. + +2. **Use `graph_findRootObjects` to seed `graph_bfsLevels`** — never guess root node names; they must be exact FQ names with no wildcards. + +3. **Always supply `include_containers` for `graph_bfsLevels`** — without it, every edge in the repository is fetched regardless of scope. + +4. **Deploy in `downstream_level` ascending order within each wave** — depth 0 (root) first, then +1, +2, and so on. Never deploy a consumer before its dependency. + +5. **Check `cycle_candidates` in BFS results** — `direction='BOTH'` nodes with unequal absolute levels indicate back-edges. Investigate before treating them as simple dependents. + +6. **Prefer `graph_analyseDatabase` for full readiness assessments** — one call, one edge fetch, four analyses. 
+ +--- + +## Future Enhancements + +| Tool | Status | Notes | +|------|--------|-------| +| `graph_edgeContractDDL` | ✅ v1.1 | Graph Edge Contract v1.1 — optional enrichment columns | +| `graph_findRootObjects` | ✅ v1.1 | | +| `graph_bfsLevels` | ✅ v2.0 | SP replaced by pure-Python BFS | +| `graph_traceLineage` | ✅ v1.0 | Renamed from `graph_queryDependenciesAgent` | +| `graph_detectCycles` | ✅ v2.0 | SP replaced by Python Union-Find + iterative DFS | +| `graph_connectedComponents` | ✅ v2.0 | SP replaced by Python Union-Find | +| `graph_analyseDatabase` | ✅ v1.0 | Composite single-fetch analysis | +| `graph_findOrphanedObjects` | 🔲 Planned | Objects with no upstream or downstream | +| `graph_calculateMetrics` | 🔲 Planned | Centrality, clustering coefficient | +| `graph_suggestRefactoring` | 🔲 Planned | Structure-based refactoring opportunities | + +--- + +## Version History + +### 3.0 (2026-04-10) + +Compliance pass, Graph Edge Contract v1.1, SP-free architecture for all tools. + +- **Rename:** `graph_queryDependenciesAgent` → `graph_traceLineage`. The tool is a deterministic recursive CTE query, not an agent. +- **New tools:** `graph_edgeContractDDL` (DDL generator + canonical contract text) and `graph_analyseDatabase` (composite single-fetch analysis). +- **SP-free:** `graph_detectCycles` and `graph_connectedComponents` converted from SP-based to pure-Python (Union-Find WCC + iterative DFS). No stored procedures remain anywhere in the package. +- **Graph Edge Contract v1.1:** Column names corrected from `SrcContainer`/`SrcObject`/`SrcKind` to `Src_Container_Name`/`Src_Object_Name`/`Src_Kind` (and Tgt equivalents) — prior generated tables were incompatible with the tool SQL. Optional enrichment columns `Edge_Relationship` and `Transformation_Type` added. `Src_Kind`/`Tgt_Kind` COMPRESS lists expanded to cover both single-letter codes and full-word values. 
+- **Parameter standardisation:** `object_dependency_table` → `edge_repository`; `excl_patterns` → `exclude_objects` across `graph_detectCycles` and `graph_connectedComponents`. +- **Dead parameter removal:** `strategy` and `max_edges_for_cte` removed from `graph_detectCycles`. +- **Helper consolidation (phase 1):** `parse_csv_patterns` and `build_like_or` extracted to `_graph_utils.py`; 10 local copies removed across 6 files. +- **AI-Native Data Product convention:** `{ProductName}_Semantic.lineage_graph` (Observability Module v1.5) documented as a ready-to-use edge repository requiring no DDL generation. +- Progressive Disclosure compliance: all 7 tools registered in `GRAPH_TOOLS`; `GRAPH_EDGE_CONTRACT_DDL_TOOL` descriptor added. + +### 2.0 (2026-03-31) + +Major refactor — modular package structure, SP replaced by Python BFS for `graph_bfsLevels`. + +- Split monolithic `graph_tools.py` into one file per tool under `graph/` sub-package +- `graph_tools.py` reduced to a thin registration hub +- `graph_bfsLevels` SP replaced by pure-Python BFS — no stored procedure, one SQL round-trip, standard queue-based BFS (O(V+E)) +- BFS traversal direction fix: upstream BFS now correctly uses reverse adjacency (Tgt→Src) +- Shared BFS helpers extracted to `graph/_graph_utils.py` + +### 1.3 (2026-01-15) + +Added `graph_connectedComponents` — Weakly Connected Component analysis. + +### 1.2 (2025-12-01) + +Added `graph_detectCycles` — WCC-partitioned cycle detection. + +### 1.1 (2025-03-05) + +Added `graph_findRootObjects` — root object discovery with CSV pattern support, object type filtering, and two return formats. + +### 1.0 (2025-03-04) + +Initial release — `graph_queryDependenciesAgent` (now `graph_traceLineage`): bidirectional dependency analysis via server-side recursive CTEs. 
diff --git a/src/teradata_mcp_server/tools/graph/__init__.py b/src/teradata_mcp_server/tools/graph/__init__.py new file mode 100644 index 0000000..abffc58 --- /dev/null +++ b/src/teradata_mcp_server/tools/graph/__init__.py @@ -0,0 +1,44 @@ +# graph/__init__.py +""" +Graph analysis tools package for dependency graph analysis. + +This __init__.py re-exports all handle_* functions from the individual +tool modules so that the MCP server's ModuleLoader can discover them +via inspect.getmembers() when it loads this package. + +The ModuleLoader (module_loader.py) maps the 'graph' prefix to +'teradata_mcp_server.tools.graph' and then calls: + + module = importlib.import_module('teradata_mcp_server.tools.graph') + for name, func in inspect.getmembers(module, inspect.isfunction): + all_functions[name] = func + +If the handle_* functions are not importable at the package level, +the ModuleLoader finds nothing and no graph tools are registered. + +Import order follows the logical workflow: + findRootObjects → bfsLevels → traceLineage + → detectCycles → connectedComponents → analyseDatabase (composite) + +Author: Paul Dancer — Teradata Consulting Services +""" + +# ── Step 1: Root object discovery (SQL-only) ────────────────────── +# ── Step 6: Composite analysis (single call, shared edge fetch) ── +from .graph_analyse_database import handle_graph_analyseDatabase + +# ── Step 2: BFS wave planning (pure Python) ─────────────────────── +from .graph_bfs_levels import handle_graph_bfsLevels + +# ── Step 5: Connected components (Python Union-Find WCC) ───────── +from .graph_connected_components import handle_graph_connectedComponents + +# ── Step 4: Cycle detection (Python Union-Find + iterative DFS) ── +from .graph_detect_cycles import handle_graph_detectCycles + +# ── Step 7: Edge contract DDL generator (no DB connection needed) ─ +from .graph_edge_contract import handle_graph_edgeContractDDL +from .graph_find_root_objects import handle_graph_findRootObjects + +# ── Step 3: Full 
lineage / impact analysis (hybrid CTE) ────────── +from .graph_trace_lineage import handle_graph_traceLineage diff --git a/src/teradata_mcp_server/tools/graph/_graph_utils.py b/src/teradata_mcp_server/tools/graph/_graph_utils.py new file mode 100644 index 0000000..179f1e7 --- /dev/null +++ b/src/teradata_mcp_server/tools/graph/_graph_utils.py @@ -0,0 +1,196 @@ +""" +_graph_utils.py — Shared utility functions for graph analysis tools. + +This module is INTERNAL to the graph tool package — it is not registered +as an MCP tool and is not imported by the server directly. It exists to +avoid duplicating the BFS helper logic across individual tool files. + +Naming convention: the leading underscore signals internal use only. + +Contents: + bfs_safe_int — Safe int conversion for nullable level columns + create_bfs_summary — Summary statistics from a BFS node result list + extract_cycle_candidates — Extract direction='BOTH' nodes as cycle candidates + +These helpers were originally private functions (_bfs_safe_int, +_create_bfs_summary, _extract_cycle_candidates) embedded in the monolithic +graph_tools.py. They are lifted here unchanged so each tool file can import +them rather than carrying local copies. + +Author: Paul Dancer — Teradata Global Field Tech +""" + + +def parse_csv_patterns(csv_str: str) -> list[str]: + """ + Split a CSV pattern string into a list of trimmed, non-empty tokens. + + Used by all graph tools to normalise container_pattern, exclude_objects, + include_containers, root_node_list, and similar CSV inputs before use. + + Arguments: + csv_str - Comma-separated string (may contain whitespace around commas, + or be empty / None) + + Returns: + List of trimmed non-empty strings; empty list if csv_str is blank or None + """ + return [p.strip() for p in (csv_str or "").split(",") if p.strip()] + + +def build_like_or(patterns: list[str], column: str) -> str: + """ + Build a parenthesised OR-joined LIKE clause for a SQL WHERE predicate. 
+ + Used by graph tools to construct container-scoping predicates against a + single SQL column (typically Src_Container_Name or Tgt_Container_Name). + + Arguments: + patterns - List of SQL LIKE pattern strings (e.g. ['%SALES%', '%FIN%']) + column - SQL column reference (e.g. 'Src_Container_Name') + + Returns: + SQL fragment of the form "(col LIKE 'A%' OR col LIKE 'B%')". + Callers must ensure patterns is non-empty before calling — an empty + list produces the degenerate string "()" which is invalid SQL. + """ + clauses = [f"{column} LIKE '{p}'" for p in patterns] + return "(" + " OR ".join(clauses) + ")" + + +def bfs_safe_int(value) -> int | None: + """ + Safely convert a value to int, returning None if conversion fails. + + Used for upstream_level and downstream_level columns which may be None + (NULL from Teradata) when a node is unreachable in one direction. + + Arguments: + value - Any value from a node dict or Teradata result row + + Returns: + int or None + """ + if value is None: + return None + try: + return int(value) + except (ValueError, TypeError): + return None + + +def create_bfs_summary(nodes: list, cycle_candidates: list) -> dict: + """ + Create summary statistics from a BFS node result list. + + cycle_candidates is passed in from the caller rather than being + computed internally — extract_cycle_candidates is called once in + the handler and the result is shared here and in response_data, + avoiding a redundant second pass over the node list. 
+ + Arguments: + nodes - List of node dicts (one per reachable node) + cycle_candidates - Pre-computed list from extract_cycle_candidates + + Returns: + Dictionary with counts by direction and depth extremes: + total_nodes, root_nodes, upstream_only, downstream_only, + both_directions, cycle_candidates, max_upstream_depth, + max_downstream_depth, nodes_per_nearest_root, object_kind_counts + """ + root_nodes = [n for n in nodes if n.get("is_root") == "Y"] + upstream_nodes = [n for n in nodes if n.get("direction") == "U"] + downstream_nodes = [n for n in nodes if n.get("direction") == "D"] + both_nodes = [n for n in nodes if n.get("direction") == "BOTH"] + cycle_cands = cycle_candidates + + # Deepest upstream level (most negative → largest absolute value) + up_levels = [ + abs(bfs_safe_int(n.get("upstream_level")) or 0) + for n in nodes + if bfs_safe_int(n.get("upstream_level")) is not None + ] + + # Deepest downstream level (most positive) + down_levels = [ + bfs_safe_int(n.get("downstream_level")) or 0 + for n in nodes + if bfs_safe_int(n.get("downstream_level")) is not None + ] + + # Nearest root grouping — how many nodes per root + root_groups: dict[str, int] = {} + for n in nodes: + nearest = n.get("nearest_root") + if nearest: + root_groups[nearest] = root_groups.get(nearest, 0) + 1 + + # Object kind breakdown + kind_counts: dict[str, int] = {} + for n in nodes: + kind = n.get("object_kind") or "Unknown" + kind_counts[kind] = kind_counts.get(kind, 0) + 1 + + return { + "total_nodes": len(nodes), + "root_nodes": len(root_nodes), + "upstream_only": len(upstream_nodes), + "downstream_only": len(downstream_nodes), + "both_directions": len(both_nodes), + "cycle_candidates": len(cycle_cands), + "max_upstream_depth": max(up_levels, default=0), + "max_downstream_depth": max(down_levels, default=0), + "nodes_per_nearest_root": root_groups, + "object_kind_counts": kind_counts, + } + + +def extract_cycle_candidates(nodes: list) -> list: + """ + Extract nodes that are 
reachable in both directions with unequal + absolute upstream and downstream levels. + + A node with direction='BOTH' and abs(upstream_level) != downstream_level + is a cycle candidate — the asymmetry indicates a back-edge in the graph, + which is the hallmark of a circular reference when traversing the + object dependency graph. + + Nodes with direction='BOTH' and equal absolute levels are shared + dependencies (reachable in both directions at the same hop count) + and are included with cycle_likely=False for completeness. + + Arguments: + nodes - List of node dicts + + Returns: + List of cycle candidate dicts enriched with: + cycle_likely - True if abs(upstream_level) != downstream_level + upstream_abs - Absolute value of upstream_level for easy comparison + """ + candidates = [] + + for n in nodes: + if n.get("direction") != "BOTH": + continue + + up_level = bfs_safe_int(n.get("upstream_level")) + down_level = bfs_safe_int(n.get("downstream_level")) + + if up_level is None or down_level is None: + continue + + up_abs = abs(up_level) + cycle_likely = up_abs != down_level + + candidates.append( + { + **n, + "upstream_abs": up_abs, + "cycle_likely": cycle_likely, + } + ) + + # Sort: most likely cycles first (asymmetric), then by node name + candidates.sort(key=lambda x: (not x["cycle_likely"], x.get("node", ""))) + + return candidates diff --git a/src/teradata_mcp_server/tools/graph/graph_analyse_database.py b/src/teradata_mcp_server/tools/graph/graph_analyse_database.py new file mode 100644 index 0000000..da6a5b3 --- /dev/null +++ b/src/teradata_mcp_server/tools/graph/graph_analyse_database.py @@ -0,0 +1,880 @@ +""" +graph_analyseDatabase.py — Composite graph analysis tool. + +Provides handle_graph_analyseDatabase and GRAPH_ANALYSE_DATABASE_TOOL. + +Runs all four core graph analyses in a single MCP tool call: + 1. Root object discovery (findRootObjects logic) + 2. Connected component analysis (connectedComponents logic) + 3. 
def _build_excl_where(excl_patterns: list[str]) -> str:
    """
    Build exclusion predicates for SQL WHERE clause.

    Supports both database-only patterns ('SANDBOX%') and fully-qualified
    patterns ('DB.Object%') containing a dot separator; the latter are split
    on the first dot into a container pattern and an object pattern. Only the
    source side of an edge (Src_Container_Name / Src_Object_Name) is tested.

    Embedded single quotes are doubled so a pattern cannot terminate the SQL
    string literal it is placed in.

    Arguments:
        excl_patterns - List of exclusion LIKE patterns

    Returns:
        SQL fragment starting with ' AND NOT (...)', or '' if no patterns
    """
    if not excl_patterns:
        return ""

    def _literal(pattern: str) -> str:
        """Escape a pattern for use inside a single-quoted SQL literal."""
        return pattern.replace("'", "''")

    clauses = []
    for p in excl_patterns:
        if "." in p:
            db_part, obj_part = p.split(".", 1)
            clauses.append(
                f"(Src_Container_Name LIKE '{_literal(db_part)}' AND Src_Object_Name LIKE '{_literal(obj_part)}')"
            )
        else:
            clauses.append(f"Src_Container_Name LIKE '{_literal(p)}'")
    return " AND NOT (" + " OR ".join(clauses) + ")"


# ═══════════════════════════════════════════════════════════════════
# Union-Find (path-compressed, union-by-rank)
# ═══════════════════════════════════════════════════════════════════


class _UnionFind:
    """
    Union-Find (disjoint set) over string node identifiers.

    Nodes are registered lazily the first time they are passed to find()
    or union(). find() shortens parent chains as it walks them (path
    halving) and union() attaches the lower-rank root under the higher.
    """

    def __init__(self):
        """Initialise empty Union-Find structure."""
        # node -> parent node; a root is its own parent
        self._parent: dict[str, str] = {}
        # root -> rank (upper bound on tree height), used by union()
        self._rank: dict[str, int] = {}

    def find(self, x: str) -> str:
        """
        Find the root representative of x, registering x if unseen.

        Arguments:
            x - Node identifier

        Returns:
            Root representative of x's component
        """
        if x not in self._parent:
            self._parent[x] = x
            self._rank[x] = 0
        while self._parent[x] != x:
            # Path halving: point x at its grandparent, then step up.
            self._parent[x] = self._parent[self._parent[x]]
            x = self._parent[x]
        return x

    def union(self, a: str, b: str) -> None:
        """
        Merge the components containing a and b (union-by-rank).

        Arguments:
            a - First node identifier
            b - Second node identifier
        """
        ra, rb = self.find(a), self.find(b)
        if ra == rb:
            return
        # Ensure ra is the root with the higher (or equal) rank.
        if self._rank[ra] < self._rank[rb]:
            ra, rb = rb, ra
        self._parent[rb] = ra
        if self._rank[ra] == self._rank[rb]:
            self._rank[ra] += 1

    def components(self) -> dict[str, list[str]]:
        """
        Return all components as {root: [members]} dict.

        Returns:
            Dictionary mapping component root to sorted member list
        """
        comps: dict[str, list[str]] = defaultdict(list)
        # find() only rewrites values of existing keys here, so iterating
        # the parent dict while calling it is safe.
        for node in self._parent:
            comps[self.find(node)].append(node)
        return {k: sorted(v) for k, v in comps.items()}


# ═══════════════════════════════════════════════════════════════════
# Iterative DFS cycle detection
# ═══════════════════════════════════════════════════════════════════


def _find_cycles_dfs(nodes: set, adj: dict[str, list[str]]) -> list[list[str]]:
    """
    Find directed cycles via iterative DFS (grey/black colouring).

    One cycle is reported for every back-edge met during the traversal
    (an edge into a node that is still on the current DFS path). This
    detects whether cycles exist and gives a witness path for each
    back-edge; it is not an enumeration of every simple cycle in the graph.

    Start nodes are visited in sorted order so the reported cycles are
    the same on every run. An explicit stack is used instead of recursion
    so deep graphs do not hit Python's recursion limit.

    Arguments:
        nodes - Set of node FQ names in this component
        adj   - Adjacency list {src: [tgt, ...]}

    Returns:
        List of cycles; each cycle is a list of FQ names (start == end)
    """
    grey, black = 1, 2  # unvisited nodes are simply absent from `colour`
    colour: dict[str, int] = {}
    cycles: list[list[str]] = []

    for start in sorted(nodes):
        if colour.get(start) == black:
            continue

        # `path` mirrors the DFS stack; `position` maps each node on the
        # path to its index so a cycle can be sliced out without a scan.
        path: list[str] = [start]
        position: dict[str, int] = {start: 0}
        stack: list[tuple[str, Iterator[str]]] = [(start, iter(adj.get(start, [])))]
        colour[start] = grey

        while stack:
            node, neighbours = stack[-1]
            # The iterator remembers where it stopped, so re-entering this
            # loop resumes with the node's next unexplored neighbour.
            for nxt in neighbours:
                state = colour.get(nxt)
                if state == grey:
                    # Back-edge: nxt is on the current path.
                    cycles.append(path[position[nxt]:] + [nxt])
                elif state != black:
                    colour[nxt] = grey
                    position[nxt] = len(path)
                    path.append(nxt)
                    stack.append((nxt, iter(adj.get(nxt, []))))
                    break
            else:
                # All neighbours explored: retire the node.
                colour[node] = black
                stack.pop()
                path.pop()
                del position[node]

    return cycles


# ═══════════════════════════════════════════════════════════════════
# BFS engine
# ═══════════════════════════════════════════════════════════════════


def _bfs_assign_levels(
    root_fqs: list[str],
    adj: dict[str, list[str]],
    max_depth: int,
    sign: int,
    levels: dict[str, int],
    nearest_root: dict[str, str],
) -> None:
    """Run one multi-source BFS pass, writing sign * hop-count into levels."""
    if max_depth <= 0:
        return
    queue: deque[tuple[str, int, str]] = deque((r, 0, r) for r in root_fqs)
    while queue:
        node, depth, root = queue.popleft()
        new_depth = depth + 1
        if new_depth > max_depth:
            continue
        for neighbour in adj.get(node, []):
            if neighbour in levels:
                continue
            levels[neighbour] = sign * new_depth
            # Keep the first root that reached this node in either pass.
            nearest_root.setdefault(neighbour, root)
            queue.append((neighbour, new_depth, root))


def _run_bfs(
    root_fqs: list[str],
    fwd_adj: dict[str, list[str]],
    rev_adj: dict[str, list[str]],
    node_meta: dict[str, dict],
    max_depth_down: int,
    max_depth_up: int,
) -> dict:
    """
    Run multi-source BFS from the given roots on the in-memory edge set.

    Roots are seeded at level 0 in both directions. Downstream levels are
    positive hop counts along fwd_adj; upstream levels are negative hop
    counts along rev_adj. A direction whose max depth is 0 (or less) is
    skipped entirely.

    Arguments:
        root_fqs       - List of root node fully-qualified names
        fwd_adj        - Forward adjacency {src: [tgt, ...]} (for downstream)
        rev_adj        - Reverse adjacency {tgt: [src, ...]} (for upstream)
        node_meta      - {fq: {container, object, kind}} metadata lookup
        max_depth_down - Maximum downstream hops
        max_depth_up   - Maximum upstream hops

    Returns:
        Dict with 'nodes', 'cycle_candidates', 'summary' keys. Each node
        dict carries: node, container_name, object_name, object_kind,
        upstream_level, downstream_level, nearest_root, direction
        ('ROOT', 'BOTH', 'U' or 'D') and is_root ('Y' / 'N').
    """
    # ── Seed roots at level 0 ──
    down_level: dict[str, int] = dict.fromkeys(root_fqs, 0)
    up_level: dict[str, int] = dict.fromkeys(root_fqs, 0)
    nearest_root: dict[str, str] = {r: r for r in root_fqs}

    # ── Downstream BFS (forward: src → tgt) ──
    _bfs_assign_levels(root_fqs, fwd_adj, max_depth_down, 1, down_level, nearest_root)
    # ── Upstream BFS (reverse: tgt → src) ──
    _bfs_assign_levels(root_fqs, rev_adj, max_depth_up, -1, up_level, nearest_root)

    # ── Assemble node list ──
    root_set = set(root_fqs)
    nodes = []
    for fq in sorted(down_level.keys() | up_level.keys()):
        is_root = fq in root_set
        d_val = down_level.get(fq)
        u_val = up_level.get(fq)

        if is_root:
            direction = "ROOT"
        elif d_val is not None and u_val is not None:
            direction = "BOTH"
        elif u_val is not None:
            direction = "U"
        else:
            direction = "D"

        # Fallback names when metadata is missing: split on the FIRST dot
        # only, so an object name that itself contains a dot is kept whole.
        fallback_container, dot, fallback_object = fq.partition(".")
        if not dot:
            fallback_container, fallback_object = "", fq

        meta = node_meta.get(fq, {})
        nodes.append(
            {
                "node": fq,
                "container_name": meta.get("container", fallback_container),
                "object_name": meta.get("object", fallback_object),
                "object_kind": meta.get("kind", "Unknown"),
                "upstream_level": u_val if not is_root else 0,
                "downstream_level": d_val if d_val is not None else (0 if is_root else None),
                "nearest_root": nearest_root.get(fq, ""),
                "direction": direction,
                "is_root": "Y" if is_root else "N",
            }
        )

    cycle_cands = extract_cycle_candidates(nodes)
    summary = create_bfs_summary(nodes, cycle_cands)

    return {
        "nodes": nodes,
        "cycle_candidates": cycle_cands,
        "summary": summary,
    }
+ + Performance vs individual tools: + - 1 SQL round-trip instead of 4 (shared edge fetch) + - 1 MCP response instead of 4 (eliminates stdio serialisation overhead) + - Same algorithmic complexity (O(V+E) BFS, O(α·N) Union-Find, O(V+E) DFS) + - In-memory edge sharing: all analyses operate on the same Python list + + Use this for: + - Full database migration readiness assessment + - Pre-migration cycle + root + wave analysis in one call + - Dashboard data population (all four analyses needed simultaneously) + - Any workflow that would otherwise call 3+ individual graph tools + + Arguments: + container_pattern - str: CSV LIKE patterns for container scope. + Supports wildcards (%) and CSV format. + Examples: '%SALES%', '%SALES%,%FINANCE%', 'PROD_%' + + CRITICAL: STRING type, not array. + CORRECT: container_pattern="%SALES%,%FINANCE%" + WRONG: container_pattern=["%SALES%", "%FINANCE%"] + + exclude_objects - str: CSV LIKE patterns to exclude. + Default: '' (no exclusions) + + top_n_roots - int: Number of top root objects (by downstream + dependent count) to include in BFS wave analysis. + Default: 4 + + max_depth_down - int: Maximum downstream BFS hops from roots. + Default: 10 + + max_depth_up - int: Maximum upstream BFS hops from roots. + 0 = skip upstream analysis. + Default: 0 + + edge_repository - str: Edge repository view/table conforming to the + Graph Edge Contract (Src_Container_Name, + Src_Object_Name, Src_Kind, Tgt_Container_Name, + Tgt_Object_Name, Tgt_Kind columns). + Call graph_edgeContractDDL to generate one. + Required parameter — no default. + + Returns: + ResponseType: single response containing all four analyses: + + { + "root_objects": { "objects": [...], "summary": {...} }, + "components": { "node_details": [...], "summaries": [...], "stats": [...] }, + "cycles": { "details": [...], "summaries": [...], "stats": [...] 
}, + "bfs_waves": { "nodes": [...], "cycle_candidates": [...], "summary": {...} }, + "edge_stats": { "total_edges": N, "fetch_time_ms": N } + } + + Example calls: + # Full analysis of Sales and Finance databases + handle_graph_analyseDatabase( + conn=connection, + container_pattern="%SALES%,%FINANCE%", + edge_repository="MY_LINEAGE_DB.EdgeRepository" + ) + + # Single database family with top 8 roots + handle_graph_analyseDatabase( + conn=connection, + container_pattern="%FINANCE%", + top_n_roots=8, + edge_repository="MY_LINEAGE_DB.EdgeRepository" + ) + + # Exclude sandbox schemas + handle_graph_analyseDatabase( + conn=connection, + container_pattern="PROD_%,STAGE_%", + exclude_objects="SANDBOX%,%.temp_%", + edge_repository="MY_LINEAGE_DB.EdgeRepository" + ) + """ + logger.debug( + "Tool: handle_graph_analyseDatabase: Args: " + "container_pattern=%s, exclude_objects=%s, top_n_roots=%d, " + "max_depth_down=%d, max_depth_up=%d, edge_repository=%s", + container_pattern, + exclude_objects, + top_n_roots, + max_depth_down, + max_depth_up, + edge_repository, + ) + + t_start = time.time() + container_patterns = parse_csv_patterns(container_pattern) + excl_patterns = parse_csv_patterns(exclude_objects) + + if not container_patterns: + return create_response( + {"error": "container_pattern must not be empty"}, + {"tool_name": tool_name or "graph_analyseDatabase", "status": "error"}, + ) + + if not edge_repository: + return create_response( + {"error": "edge_repository is required. 
Call graph_edgeContractDDL to generate one."}, + {"tool_name": tool_name or "graph_analyseDatabase", "status": "error"}, + ) + + try: + # ═══════════════════════════════════════════════════════════ + # STEP 0 — Single shared edge fetch (ONE SQL round-trip) + # ═══════════════════════════════════════════════════════════ + container_where = build_like_or(container_patterns, "Src_Container_Name") + excl_where = _build_excl_where(excl_patterns) + + edge_sql = f""" +LOCKING ROW FOR ACCESS +SELECT + TRIM(Src_Container_Name) AS SrcDB + ,TRIM(Src_Object_Name) AS SrcObj + ,Src_Kind AS SrcKind + ,TRIM(Tgt_Container_Name) AS TgtDB + ,TRIM(Tgt_Object_Name) AS TgtObj + ,Tgt_Kind AS TgtKind +FROM {edge_repository} +WHERE {container_where} + {excl_where} +""" + logger.debug("Tool: handle_graph_analyseDatabase: Edge SQL:\n%s", edge_sql) + + with conn.cursor() as cur: + cur.execute(edge_sql) + raw_edges = cur.fetchall() + + t_fetch = time.time() + fetch_ms = round((t_fetch - t_start) * 1000) + edge_count = len(raw_edges) + + logger.info("Tool: handle_graph_analyseDatabase: Fetched %d edges in %dms", edge_count, fetch_ms) + + # ── Build in-memory structures shared by all analyses ── + # Forward adjacency: src → [tgt, ...] (directed: dependency → dependent) + fwd_adj: dict[str, list[str]] = defaultdict(list) + # Reverse adjacency: tgt → [src, ...] 
(for upstream BFS) + rev_adj: dict[str, list[str]] = defaultdict(list) + # Node metadata registry + node_meta: dict[str, dict] = {} + # Union-Find for connected components + uf = _UnionFind() + # Track downstream dependent counts for root discovery + src_nodes: dict[str, int] = defaultdict(int) + tgt_nodes: set[str] = set() + + for src_db, src_obj, src_kind, tgt_db, tgt_obj, tgt_kind in raw_edges: + if not src_obj or not tgt_obj: + continue # Skip null edges + + src_fq = f"{src_db}.{src_obj}" + tgt_fq = f"{tgt_db}.{tgt_obj}" + + fwd_adj[src_fq].append(tgt_fq) + rev_adj[tgt_fq].append(src_fq) + uf.union(src_fq, tgt_fq) + + # Count downstream dependents per source + src_nodes[src_fq] += 1 + tgt_nodes.add(tgt_fq) + + # Store node metadata + if src_fq not in node_meta: + node_meta[src_fq] = { + "container": src_db, + "object": src_obj, + "kind": src_kind or "Unknown", + } + if tgt_fq not in node_meta: + node_meta[tgt_fq] = { + "container": tgt_db, + "object": tgt_obj, + "kind": tgt_kind or "Unknown", + } + + # ═══════════════════════════════════════════════════════════ + # STEP 1 — Root objects (objects never appearing as targets) + # ═══════════════════════════════════════════════════════════ + root_objects = [] + for fq, downstream_count in src_nodes.items(): + if fq not in tgt_nodes: + meta = node_meta.get(fq, {}) + root_objects.append( + { + "DatabaseName": meta.get("container", ""), + "ObjectName": meta.get("object", ""), + "FullyQualifiedName": fq, + "ObjectType": meta.get("kind", "Unknown"), + "DownstreamDependentCount": downstream_count, + } + ) + + # Sort by downstream impact descending + root_objects.sort(key=lambda x: (-x["DownstreamDependentCount"], x["FullyQualifiedName"])) + + # Summary statistics + type_counts: dict[str, int] = {} + db_counts: dict[str, int] = {} + for obj in root_objects: + t = obj["ObjectType"] + type_counts[t] = type_counts.get(t, 0) + 1 + d = obj["DatabaseName"] + db_counts[d] = db_counts.get(d, 0) + 1 + + root_summary = { + 
"total_root_objects": len(root_objects), + "object_type_counts": type_counts, + "database_counts": db_counts, + "total_downstream_dependencies": sum(o["DownstreamDependentCount"] for o in root_objects), + } + + t_roots = time.time() + logger.info( + "Tool: handle_graph_analyseDatabase: Found %d root objects in %dms", + len(root_objects), + round((t_roots - t_fetch) * 1000), + ) + + # ═══════════════════════════════════════════════════════════ + # STEP 2 — Connected components (reuse Union-Find from step 0) + # ═══════════════════════════════════════════════════════════ + raw_comps = uf.components() + + # Assign sequential integer IDs sorted by descending size + sorted_roots = sorted(raw_comps.keys(), key=lambda r: -len(raw_comps[r])) + root_to_id = {r: i + 1 for i, r in enumerate(sorted_roots)} + + comp_node_details = [] + comp_id_map: dict[str, int] = {} + for root, members in raw_comps.items(): + cid = root_to_id[root] + for fq in members: + comp_id_map[fq] = cid + meta = node_meta.get(fq, {}) + comp_node_details.append( + { + "Node_FQ": fq, + "DatabaseName": meta.get("container", ""), + "ObjectName": meta.get("object", ""), + "Component_Id": cid, + "Object_Kind": meta.get("kind", "Unknown"), + } + ) + + comp_summaries = [] + for root in sorted_roots: + cid = root_to_id[root] + members = raw_comps[root] + comp_summaries.append( + { + "Component_Id": cid, + "Node_Count": len(members), + "Node_List": ", ".join(members), + } + ) + + comp_stats = [ + { + "Component_Count": len(raw_comps), + "Node_Count": len(comp_id_map), + "Edge_Count": edge_count, + "Largest_Component": max(len(m) for m in raw_comps.values()) if raw_comps else 0, + "Smallest_Component": min(len(m) for m in raw_comps.values()) if raw_comps else 0, + "Singleton_Count": sum(1 for m in raw_comps.values() if len(m) == 1), + "Summary_Message": ( + f"{len(raw_comps)} connected component(s) identified " + f"across {len(comp_id_map)} node(s) and {edge_count} edge(s)." 
+ ), + } + ] + + t_comps = time.time() + logger.info( + "Tool: handle_graph_analyseDatabase: %d components in %dms", + len(raw_comps), + round((t_comps - t_roots) * 1000), + ) + + # ═══════════════════════════════════════════════════════════ + # STEP 3 — Cycle detection (reuse adj + UF from step 0) + # ═══════════════════════════════════════════════════════════ + all_cycles: list[list[str]] = [] + components_scanned = 0 + + for root in sorted_roots: + cycle_members = set(raw_comps[root]) + if len(cycle_members) < 2: + continue + components_scanned += 1 + cycles = _find_cycles_dfs(cycle_members, fwd_adj) + all_cycles.extend(cycles) + + # Deduplicate by canonical form (min rotation) + seen_canonical: set[tuple[str, ...]] = set() + unique_cycles: list[list[str]] = [] + for cycle in all_cycles: + inner = cycle[:-1] + if not inner: + continue + min_idx = inner.index(min(inner)) + canonical = tuple(inner[min_idx:] + inner[:min_idx]) + if canonical not in seen_canonical: + seen_canonical.add(canonical) + unique_cycles.append(cycle) + + # Build cycle details and summaries + cycle_details = [] + cycle_summaries = [] + cycle_node_set: set[str] = set() + + for cycle_id, cycle in enumerate(unique_cycles, 1): + cycle_len = len(cycle) - 1 + for pos, fq in enumerate(cycle[:-1], 1): + cycle_node_set.add(fq) + cycle_details.append( + { + "Cycle_Id": cycle_id, + "Cycle_Pos": pos, + "Node_FQ": fq, + "Cycle_Length": cycle_len, + "Component_Id": comp_id_map.get(fq, 0), + "Strategy": "DFS", + } + ) + cycle_summaries.append( + { + "Cycle_Id": cycle_id, + "Cycle_Length": cycle_len, + "Component_Id": comp_id_map.get(cycle[0], 0), + "Strategy": "DFS", + "Cycle_Path": " -> ".join(cycle), + } + ) + + comps_with_cycles = len({cd["Component_Id"] for cd in cycle_details}) + + cycle_stats = [ + { + "Cycle_Count": len(unique_cycles), + "Total_Nodes_In_Cycles": len(cycle_details), + "Unique_Nodes_In_Cycles": len(cycle_node_set), + "Components_With_Cycles": comps_with_cycles, + "Edge_Count": 
edge_count, + "Components_Scanned": components_scanned, + "Strategy_Used": "DFS", + "Summary_Message": ( + f"{len(unique_cycles)} cycle(s) detected." + if unique_cycles + else "No cycles detected — graph is a DAG." + ), + } + ] + + t_cycles = time.time() + logger.info( + "Tool: handle_graph_analyseDatabase: %d cycles in %dms", + len(unique_cycles), + round((t_cycles - t_comps) * 1000), + ) + + # ═══════════════════════════════════════════════════════════ + # STEP 4 — BFS waves from top N root objects + # ═══════════════════════════════════════════════════════════ + top_roots = root_objects[:top_n_roots] + top_root_fqs = [r["FullyQualifiedName"] for r in top_roots] + + if top_root_fqs: + bfs_result = _run_bfs( + root_fqs=top_root_fqs, + fwd_adj=fwd_adj, + rev_adj=rev_adj, + node_meta=node_meta, + max_depth_down=max_depth_down, + max_depth_up=max_depth_up, + ) + else: + bfs_result = { + "nodes": [], + "cycle_candidates": [], + "summary": { + "total_nodes": 0, + "root_nodes": 0, + "upstream_only": 0, + "downstream_only": 0, + "both_directions": 0, + "cycle_candidates": 0, + "max_upstream_depth": 0, + "max_downstream_depth": 0, + "nodes_per_nearest_root": {}, + "object_kind_counts": {}, + }, + } + + t_bfs = time.time() + logger.info( + "Tool: handle_graph_analyseDatabase: BFS %d nodes in %dms", + len(bfs_result["nodes"]), + round((t_bfs - t_cycles) * 1000), + ) + + # ═══════════════════════════════════════════════════════════ + # Assemble composite response + # ═══════════════════════════════════════════════════════════ + t_total = round((time.time() - t_start) * 1000) + + response_data = { + "root_objects": { + "objects": root_objects, + "summary": root_summary, + }, + "components": { + "node_details": comp_node_details, + "summaries": comp_summaries, + "stats": comp_stats, + }, + "cycles": { + "details": cycle_details, + "summaries": cycle_summaries, + "stats": cycle_stats, + }, + "bfs_waves": bfs_result, + "edge_stats": { + "total_edges": edge_count, + 
"fetch_time_ms": fetch_ms, + "total_time_ms": t_total, + }, + } + + metadata = { + "tool_name": tool_name or "graph_analyseDatabase", + "container_pattern": container_pattern, + "exclude_objects": exclude_objects, + "top_n_roots": top_n_roots, + "max_depth_down": max_depth_down, + "max_depth_up": max_depth_up, + "edge_repository": edge_repository, + "timing": { + "edge_fetch_ms": fetch_ms, + "root_objects_ms": round((t_roots - t_fetch) * 1000), + "components_ms": round((t_comps - t_roots) * 1000), + "cycles_ms": round((t_cycles - t_comps) * 1000), + "bfs_ms": round((t_bfs - t_cycles) * 1000), + "total_ms": t_total, + }, + "counts": { + "edges": edge_count, + "root_objects": len(root_objects), + "components": len(raw_comps), + "cycles": len(unique_cycles), + "bfs_nodes": len(bfs_result["nodes"]), + }, + "status": "success", + "message": ( + f"Composite analysis complete: {len(root_objects)} roots, " + f"{len(raw_comps)} components, {len(unique_cycles)} cycles, " + f"{len(bfs_result['nodes'])} BFS nodes. " + f"Total: {t_total}ms (1 SQL fetch: {fetch_ms}ms)." 
+ ), + } + + logger.info( + "Tool: handle_graph_analyseDatabase: Complete in %dms — %d roots, %d components, %d cycles, %d BFS nodes", + t_total, + len(root_objects), + len(raw_comps), + len(unique_cycles), + len(bfs_result["nodes"]), + ) + + return create_response(response_data, metadata) + + except Exception as e: + logger.error("Tool: handle_graph_analyseDatabase: Error: %s", e, exc_info=True) + return create_response( + {"error": str(e)}, + { + "tool_name": tool_name or "graph_analyseDatabase", + "container_pattern": container_pattern, + "status": "error", + }, + ) + + +# ═══════════════════════════════════════════════════════════════════ +# Tool registration descriptor +# ═══════════════════════════════════════════════════════════════════ + +GRAPH_ANALYSE_DATABASE_TOOL = { + "name": "graph_analyseDatabase", + "handler": handle_graph_analyseDatabase, + "description": ( + "Composite graph analysis — runs root object discovery, connected " + "component analysis, cycle detection, and BFS deployment wave " + "planning in a SINGLE MCP call with one shared edge fetch. " + "Use this instead of calling graph_findRootObjects, " + "graph_connectedComponents, graph_detectCycles, and " + "graph_bfsLevels individually when you need two or more of " + "these analyses. Returns all four result sets in one response. " + "Dramatically faster than sequential calls due to shared edge " + "fetch (1 SQL round-trip instead of 4) and single MCP response. " + "Requires an edge repository conforming to the Graph Edge Contract. " + "If you don't have one yet, call graph_edgeContractDDL first to " + "generate the CREATE TABLE or CREATE VIEW DDL." + ), + "parameters": { + "container_pattern": { + "type": "string", + "description": ( + "CSV LIKE patterns for databases/schemas to analyse. " + "Supports wildcards: '%SALES%' or '%SALES%,%FINANCE%'." + ), + "required": True, + }, + "exclude_objects": { + "type": "string", + "description": ("CSV LIKE patterns to exclude. 
Example: 'SANDBOX%,%.temp_%'. Default: ''."), + "default": "", + }, + "top_n_roots": { + "type": "integer", + "description": ( + "Number of top root objects (by downstream impact) to include in BFS wave analysis. Default: 4." + ), + "default": 4, + }, + "max_depth_down": { + "type": "integer", + "description": ("Maximum downstream BFS hops from roots. Default: 10."), + "default": 10, + }, + "max_depth_up": { + "type": "integer", + "description": ("Maximum upstream BFS hops. 0 = skip upstream. Default: 0."), + "default": 0, + }, + "edge_repository": { + "type": "string", + "description": ( + "Edge repository table or view conforming to the Graph Edge Contract. " + "Call graph_edgeContractDDL to generate one if needed. " + "Required parameter — no default." + ), + "required": True, + }, + }, +} diff --git a/src/teradata_mcp_server/tools/graph/graph_bfs_levels.py b/src/teradata_mcp_server/tools/graph/graph_bfs_levels.py new file mode 100644 index 0000000..61f2b21 --- /dev/null +++ b/src/teradata_mcp_server/tools/graph/graph_bfs_levels.py @@ -0,0 +1,886 @@ +""" +graph_bfsLevels.py — Pure-Python BFS implementation for graph dependency analysis. + +This module provides handle_graph_bfsLevels, a pure-Python BFS +implementation that executes entirely in the MCP server process. + +Key design points: + - One SQL round-trip to Teradata (edge fetch), then all BFS runs in Python. + - Standard queue-based BFS (O(V+E)) rather than iterative SQL relaxation. + - No stored procedure dependency — no volatile tables, no Teradata DDL objects. + - All include_containers, exclude_objects, and depth-cap filtering applied + in Python before BFS starts. + - Output schema: node fields, direction values, nearest_root, + cycle_candidates, summary — fully compatible with the MCP tool + descriptor, tool registration, and all callers. 
+ +Edge direction convention (critical — matches the corrected SP): + Edge Repository edge: Src "referenced by" Tgt + => Src is the DEPENDENCY (upstream of Tgt) + => Tgt is the DEPENDENT (downstream of Src) + + Upstream BFS (finds what a node DEPENDS ON): + Traverse edges in the Src→Tgt direction. + Starting from settled Tgt-side nodes, discover Src-side ancestors. + node_i = Src_Object_Name_FQ (upstream candidate being discovered) + node_j = Tgt_Object_Name_FQ (already-settled downstream neighbour) + + Downstream BFS (finds what DEPENDS ON a node): + Traverse edges in the Tgt→Src direction. + Starting from settled Src-side nodes, discover Tgt-side consumers. + node_i = Tgt_Object_Name_FQ (downstream candidate being discovered) + node_j = Src_Object_Name_FQ (already-settled upstream neighbour) + +Author: Paul Dancer — Teradata Global Field Tech +""" + +import fnmatch +import logging +from collections import defaultdict, deque + +from teradatasql import TeradataConnection + +from teradata_mcp_server.tools.graph._graph_utils import ( + bfs_safe_int, + create_bfs_summary, + extract_cycle_candidates, + parse_csv_patterns, +) +from teradata_mcp_server.tools.utils import create_response, rows_to_json + +logger = logging.getLogger("teradata_mcp_server") + + +# --------------------------------------------------------------------------- +# Public handler +# --------------------------------------------------------------------------- + + +def handle_graph_bfsLevels( + conn: TeradataConnection, + root_node_list: str, + max_depth_up: int = 10, + max_depth_down: int = 10, + exclude_objects: str = "", + include_containers: str = "", + edge_repository: str = "", + tool_name: str | None = None, + *args, + **kwargs, +): + """ + Compute BFS shortest-path hop distances from one or more root nodes. + + Pure-Python implementation — no stored procedure required. 
+ + WHEN TO USE THIS TOOL vs graph_traceLineage: + ------------------------------------------------------- + Use graph_bfsLevels when asked to: + - Sequence objects for deployment or migration (ORDER BY downstream_level + gives correct topological deployment order for root objects) + - Group objects into migration waves (nearest_root identifies which of + the input root tables each object belongs to) + - Find which migration root table each object is closest to across a + multi-root migration scope + - Identify cycle members by depth (direction='BOTH' nodes with unequal + absolute upstream/downstream levels are cycle candidates) + - Count objects within N hops of a change (blast-radius sizing) + - Answer "how far is object X from the migration root tables?" + + Do NOT use graph_bfsLevels for general lineage tracing, impact path + analysis, or questions about which specific objects depend on which. + Use graph_traceLineage for those — it returns the full edge + set with relationship detail. graph_bfsLevels returns distances and + wave groupings, not dependency paths or edge detail. + + KEY DISTINCTION — root_node_list accepts EXACT FQ names only (no + wildcards). Use graph_findRootObjects first to identify the seed + objects, then pass their exact FQ names here. + + Arguments: + root_node_list - str: CSV of exact fully-qualified root node names. + No wildcards — exact names only. + + SINGLE ROOT: + 'DEV01_StGeo_STD_T.mortgage_account' + + MULTIPLE ROOTS (CSV): + 'DEV01_StGeo_STD_T.mortgage_account, + DEV01_StGeo_STD_T.mortgage_borrower, + DEV01_StGeo_STD_T.mortgage_property' + + CRITICAL: Exact FQ names, no wildcards. + Use graph_findRootObjects or + graph_traceLineage first to discover names. + + max_depth_up - int: Maximum upstream hops to traverse. + 0 = skip upstream analysis entirely. + Default: 10 + + Upstream means "what this object DEPENDS ON" — + its sources, prerequisites, and ancestors. 
+ For root objects with in-degree zero, upstream_level + will be NULL for all non-root nodes (correct). + + max_depth_down - int: Maximum downstream hops to traverse. + 0 = skip downstream analysis entirely. + Default: 10 + + Downstream means "what DEPENDS ON this object" — + its consumers, dependents, and impact radius. + For root objects with in-degree zero, downstream_level + will show positive values for all consumers (correct). + + exclude_objects - str: CSV of FQ object name LIKE patterns to exclude. + Matched against both Src and Tgt sides of every edge. + Python fnmatch is used for pattern matching (% → *). + Example: 'DFJ%,C_D02%,%.temp_%' + Default: '' (no exclusions) + + include_containers - str: CSV of container name LIKE patterns to include. + Only edges where BOTH Src and Tgt containers match + at least one pattern are traversed. + Python fnmatch used for matching (% → *). + Empty = all containers included. + Example: 'DEV01_StGeo%,MF_STGEO%,TABLEAU%,POWERBI%' + Default: '' (all containers) + + edge_repository - str: Edge repository view/table conforming to the + Required parameter — no default. + + Returns: + ResponseType: formatted response with BFS node results + metadata. + Schema is identical to handle_graph_bfsLevels (SP-based tool). + + Response structure: + { + "nodes": [ + { + "node": "DEV01_StGeo_STD_T.mortgage_account", + "container_name": "DEV01_StGeo_STD_T", + "object_name": "mortgage_account", + "object_kind": "Table", + "upstream_level": None, // None (NULL) if unreachable or skipped + "downstream_level": 0, // 0 for root, positive for consumers + "nearest_root": "DEV01_StGeo_STD_T.mortgage_account", + "direction": "ROOT", // ROOT / U / D / BOTH + "is_root": "Y" + }, + ... 
+ ], + "cycle_candidates": [...], // direction='BOTH' nodes with unequal + // absolute upstream/downstream levels + "summary": { + "total_nodes": 46, + "root_nodes": 3, + "upstream_only": 12, + "downstream_only": 28, + "both_directions": 3, + "cycle_candidates": 1, + "max_upstream_depth": 4, + "max_downstream_depth": 5, + "nodes_per_nearest_root": {"DB.Root1": 20, "DB.Root2": 26}, + "object_kind_counts": {"Table": 10, "View": 22, "Macro": 8, ...} + } + } + + direction values: + ROOT - One of the input root nodes + U - Reachable upstream only (negative upstream_level) + D - Reachable downstream only (positive downstream_level) + BOTH - Reachable in both directions — possible cycle member. + Unequal absolute levels indicate a back-edge (cycle). + Equal absolute levels indicate a shared dependency. + + Technical Implementation Notes: + - One SQL round-trip to fetch all edges matching the container/exclusion + filters. All BFS computation is then done in Python memory. + - Standard queue-based BFS (O(V+E)) — optimal for unweighted graphs. + This is more correct than the original Bellman-Ford style SQL + relaxation loop that the SP inherited from the notebook. + - Multi-source BFS: all root nodes are seeded simultaneously at level 0. + Each non-root node settles at the distance to its nearest root, with + ties broken deterministically by lexicographic root name order. + - Upstream BFS follows Src→Tgt edges to discover Src-side ancestors. + - Downstream BFS follows Tgt→Src edges to discover Tgt-side consumers. + - This direction convention matches the corrected SP (Option B fix): + upstream_level = NULL for root objects with in-degree zero (correct) + downstream_level = positive for all consumers (correct) + - Filter application order: + 1. SQL WHERE clause: fetch only edges matching include_containers + (both Src and Tgt containers must match at least one pattern) + 2. 
Python post-filter: exclude edges where either endpoint matches + an exclude_objects pattern (applied before building adjacency) + 3. BFS depth cap: enforced during queue processing + - Node metadata (container_name, object_name, object_kind) is derived + from the edge set and stored in a node registry during the fetch phase. + """ + logger.debug( + "Tool: handle_graph_bfsLevels: Args: root_node_list=%s, " + "max_depth_up=%s, max_depth_down=%s, exclude_objects=%s, " + "include_containers=%s, edge_repository=%s", + root_node_list, + max_depth_up, + max_depth_down, + exclude_objects, + include_containers, + edge_repository, + ) + + if not edge_repository: + return create_response( + {"error": "edge_repository is required. Call graph_edgeContractDDL to generate one."}, + { + "tool_name": tool_name or "graph_bfsLevels", + "status": "error", + }, + ) + + # Clamp depth parameters to safe range + max_depth_up = max(0, min(10, int(max_depth_up))) + max_depth_down = max(0, min(10, int(max_depth_down))) + + _tn = tool_name if tool_name else "graph_bfsLevels" + + try: + # ------------------------------------------------------------------ + # Step 1 — Parse root node list + # ------------------------------------------------------------------ + roots: list[str] = parse_csv_patterns(root_node_list) + + if not roots: + raise ValueError(f"root_node_list is empty or could not be parsed: '{root_node_list}'") + + logger.debug(f"Tool: handle_graph_bfsLevels: Parsed {len(roots)} root node(s): {roots}") + + # ------------------------------------------------------------------ + # Step 2 — Parse filter patterns for Python-side matching + # ------------------------------------------------------------------ + excl_patterns = parse_csv_patterns(exclude_objects) # may be empty + incl_patterns = parse_csv_patterns(include_containers) # may be empty + + # ------------------------------------------------------------------ + # Step 3 — Fetch edge set from Teradata (one round-trip) + # + # 
include_containers filter is applied in SQL (WHERE clause) for + # efficiency — avoids fetching edges that will be discarded. + # exclude_objects filter is applied in Python (more flexible LIKE + # patterns that are awkward to push into a single SQL predicate). + # + # Column selection: + # Src_Object_Name_FQ — fully-qualified source (dependency/upstream) + # Tgt_Object_Name_FQ — fully-qualified target (dependent/downstream) + # Src_Container_Name — database of source (for node registry) + # Src_Object_Name — short name of source (for node registry) + # Src_Kind — object type of source + # Tgt_Container_Name — database of target + # Tgt_Object_Name — short name of target + # Tgt_Kind — object type of target + # ------------------------------------------------------------------ + fetch_sql = _build_fetch_sql( + edge_repository=edge_repository, + incl_patterns=incl_patterns, + ) + + logger.debug(f"Tool: handle_graph_bfsLevels: Fetching edges: {fetch_sql}") + + with conn.cursor() as cur: + cur.execute(fetch_sql) + raw_rows = cur.fetchall() + col_names = [d[0].lower() for d in cur.description] + + logger.debug(f"Tool: handle_graph_bfsLevels: Fetched {len(raw_rows)} raw edge rows") + + # ------------------------------------------------------------------ + # Step 4 — Build in-memory graph structures + # + # node_registry: node_fq → {container_name, object_name, object_kind} + # fwd_adj: Src → {Tgt} (Src referenced by Tgt; Src is the dependency) + # rev_adj: Tgt → {Src} (reverse: Tgt depends on Src) + # + # fwd_adj is used by the UPSTREAM BFS to discover Src-side ancestors + # starting from settled Tgt-side neighbours. + # + # rev_adj is used by the DOWNSTREAM BFS to discover Tgt-side consumers + # starting from settled Src-side neighbours. + # + # Exclude-objects filtering is applied here: any edge where either + # endpoint FQ name matches a pattern in excl_patterns is dropped. 
+ # ------------------------------------------------------------------ + node_registry: dict[str, dict] = {} + fwd_adj: dict[str, set[str]] = defaultdict(set) # Src → {Tgt} + rev_adj: dict[str, set[str]] = defaultdict(set) # Tgt → {Src} + + col_idx = {name: i for i, name in enumerate(col_names)} + + edges_total = 0 + edges_excluded = 0 + + for row in raw_rows: + src_fq = _val(row, col_idx, "src_object_name_fq") + tgt_fq = _val(row, col_idx, "tgt_object_name_fq") + src_db = _val(row, col_idx, "src_container_name") + src_nm = _val(row, col_idx, "src_object_name") + src_knd = _val(row, col_idx, "src_kind") + tgt_db = _val(row, col_idx, "tgt_container_name") + tgt_nm = _val(row, col_idx, "tgt_object_name") + tgt_knd = _val(row, col_idx, "tgt_kind") + + if not src_fq or not tgt_fq: + continue + + edges_total += 1 + + # Apply exclude_objects filter — both endpoints checked + if excl_patterns and (_matches_any(src_fq, excl_patterns) or _matches_any(tgt_fq, excl_patterns)): + edges_excluded += 1 + continue + + # Register both nodes in the registry + if src_fq not in node_registry: + node_registry[src_fq] = { + "container_name": src_db or "", + "object_name": src_nm or src_fq.split(".")[-1], + "object_kind": src_knd or "", + } + if tgt_fq not in node_registry: + node_registry[tgt_fq] = { + "container_name": tgt_db or "", + "object_name": tgt_nm or tgt_fq.split(".")[-1], + "object_kind": tgt_knd or "", + } + + # Build forward and reverse adjacency + fwd_adj[src_fq].add(tgt_fq) # Src → Tgt + rev_adj[tgt_fq].add(src_fq) # Tgt → Src + + logger.debug( + f"Tool: handle_graph_bfsLevels: " + f"Graph built — {len(node_registry)} unique nodes, " + f"{edges_total} raw edges, {edges_excluded} excluded. 
" + f"|fwd_adj|={len(fwd_adj)}, |rev_adj|={len(rev_adj)}" + ) + + # Ensure root nodes are registered even if they have no edges + # (isolated roots are valid — they appear only as ROOT in output) + for r in roots: + if r not in node_registry: + parts = r.split(".", 1) + node_registry[r] = { + "container_name": parts[0] if len(parts) > 1 else "", + "object_name": parts[1] if len(parts) > 1 else r, + "object_kind": "", + } + + # ------------------------------------------------------------------ + # Step 5 — Multi-source BFS: UPSTREAM pass + # + # "Upstream" = what a node DEPENDS ON (its sources, ancestors). + # + # Edge Repository: Src "referenced by" Tgt ⟹ Src is the dependency. + # + # Algorithm: + # Seed all root nodes at level 0. + # For each settled Tgt-side node (neighbour), look up its Src-side + # nodes via rev_adj (Tgt → {Src}). + # Each reachable Src node is upstream of the root. + # + # Why rev_adj? + # rev_adj[tgt] = {all Src nodes that Tgt depends on} + # Walking rev_adj from a settled node discovers its dependencies — + # which is exactly "upstream" in data lineage terms. + # + # For root objects with in-degree zero (no rev_adj entry), no Src + # nodes exist, so upstream_level remains None for all non-root nodes. + # This is correct behaviour. 
+ # ------------------------------------------------------------------ + up_level: dict[str, int] = {} # node_fq → hop count (0..N) + up_root: dict[str, str] = {} # node_fq → nearest root + + if max_depth_up > 0: + up_level, up_root = _bfs_multisource( + roots=roots, + adj=rev_adj, # Tgt → {Src}: walk upstream + max_depth=max_depth_up, + label="upstream", + ) + logger.debug( + f"Tool: handle_graph_bfsLevels: Upstream BFS settled {len(up_level)} nodes (max_depth={max_depth_up})" + ) + else: + logger.debug("Tool: handle_graph_bfsLevels: Upstream BFS skipped (max_depth_up=0)") + + # ------------------------------------------------------------------ + # Step 6 — Multi-source BFS: DOWNSTREAM pass + # + # "Downstream" = what DEPENDS ON a node (its consumers, dependents). + # + # Edge Repository: Src "referenced by" Tgt ⟹ Tgt is the dependent. + # + # Algorithm: + # Seed all root nodes at level 0. + # For each settled Src-side node (neighbour), look up its Tgt-side + # nodes via fwd_adj (Src → {Tgt}). + # Each reachable Tgt node is downstream of the root. + # + # Why fwd_adj? + # fwd_adj[src] = {all Tgt nodes that reference Src} + # Walking fwd_adj from a settled node discovers its consumers — + # which is exactly "downstream" in data lineage terms. + # + # For root objects with in-degree zero, all their Tgt-side consumers + # are reachable via fwd_adj, so downstream_level correctly shows + # positive values for views, macros, reports, etc. 
+ # ------------------------------------------------------------------ + dn_level: dict[str, int] = {} + dn_root: dict[str, str] = {} + + if max_depth_down > 0: + dn_level, dn_root = _bfs_multisource( + roots=roots, + adj=fwd_adj, # Src → {Tgt}: walk downstream + max_depth=max_depth_down, + label="downstream", + ) + logger.debug( + f"Tool: handle_graph_bfsLevels: " + f"Downstream BFS settled {len(dn_level)} nodes " + f"(max_depth={max_depth_down})" + ) + else: + logger.debug("Tool: handle_graph_bfsLevels: Downstream BFS skipped (max_depth_down=0)") + + # ------------------------------------------------------------------ + # Step 7 — Assemble result rows + # + # One row per reachable node (including roots themselves). + # Schema matches SP output exactly so callers need no changes. + # + # Rules: + # upstream_level : negative integer (-(hop_count)), None if unreachable + # downstream_level : positive integer (+hop_count), None if unreachable + # Root node : upstream_level=0, downstream_level=0 always + # direction : ROOT / U / D / BOTH + # nearest_root : upstream root takes precedence over downstream root + # is_root : 'Y' if node is in the root set, 'N' otherwise + # ------------------------------------------------------------------ + root_set = set(roots) + + # Union of all settled nodes (roots + BFS-reachable) + all_nodes: set[str] = root_set.copy() + all_nodes.update(up_level.keys()) + all_nodes.update(dn_level.keys()) + + result_nodes: list[dict] = [] + + for node_fq in sorted(all_nodes): + meta = node_registry.get(node_fq, {}) + is_root_node = node_fq in root_set + + upstream_level: int | None + downstream_level: int | None + nearest_root_val: str | None + direction: str | None + + if is_root_node: + upstream_level = 0 + downstream_level = 0 + nearest_root_val = node_fq + direction = "ROOT" + else: + raw_up = up_level.get(node_fq) + raw_dn = dn_level.get(node_fq) + + # upstream_level: negative (opposite sign to hop count) + upstream_level = (-(raw_up)) if 
raw_up is not None else None + # downstream_level: positive (same sign as hop count) + downstream_level = raw_dn if raw_dn is not None else None + + # nearest_root: upstream wins on tie (matches SP behaviour) + nearest_root_val = up_root.get(node_fq) or dn_root.get(node_fq) + + if raw_up is not None and raw_dn is not None: + direction = "BOTH" + elif raw_up is not None: + direction = "U" + elif raw_dn is not None: + direction = "D" + else: + direction = None # Should not occur — node is in all_nodes + + result_nodes.append( + { + "node": node_fq, + "container_name": meta.get("container_name", ""), + "object_name": meta.get("object_name", ""), + "object_kind": meta.get("object_kind", ""), + "upstream_level": upstream_level, + "downstream_level": downstream_level, + "nearest_root": nearest_root_val, + "direction": direction, + "is_root": "Y" if is_root_node else "N", + } + ) + + logger.debug(f"Tool: handle_graph_bfsLevels: Assembled {len(result_nodes)} result nodes") + + # ------------------------------------------------------------------ + # Step 8 — Build summary and extract cycle candidates + # (re-uses existing private helpers from the SP-based tool) + # ------------------------------------------------------------------ + cycle_cands = extract_cycle_candidates(result_nodes) + summary = create_bfs_summary(result_nodes, cycle_cands) + + # ------------------------------------------------------------------ + # Step 9 — Assemble response (identical schema to SP-based tool) + # ------------------------------------------------------------------ + response_data = { + "nodes": result_nodes, + "cycle_candidates": cycle_cands, + "summary": summary, + } + + metadata = { + "tool_name": _tn, + "root_node_list": root_node_list, + "max_depth_up": max_depth_up, + "max_depth_down": max_depth_down, + "exclude_objects": exclude_objects, + "include_containers": include_containers, + "edge_repository": edge_repository, + "implementation": "python_bfs", # distinguishes from SP-based 
tool + "graph_stats": { + "unique_nodes_in_graph": len(node_registry), + "raw_edges_fetched": edges_total, + "edges_excluded": edges_excluded, + "edges_traversed": edges_total - edges_excluded, + }, + "counts": summary, + "status": "success", + "rtn_code": 0, + "message": (f"Module=graph_bfsLevels;RootCount={len(roots)};TotalNodes={len(result_nodes)};Success;"), + } + + logger.debug(f"Tool: handle_graph_bfsLevels: metadata: {metadata}") + return create_response(response_data, metadata) + + except Exception as e: + logger.error(f"Tool: handle_graph_bfsLevels: Error: {e}", exc_info=True) + return create_response( + {"error": str(e)}, + { + "tool_name": _tn, + "root_node_list": root_node_list, + "status": "error", + }, + ) + + +# --------------------------------------------------------------------------- +# Private helpers +# --------------------------------------------------------------------------- +# parse_csv_patterns is imported from _graph_utils. + + +def _matches_any(fq_name: str, patterns: list[str]) -> bool: + """ + Return True if fq_name matches any pattern in patterns. + + Converts SQL LIKE wildcards (%) to fnmatch wildcards (*) before matching. + Case-insensitive to match Teradata NOT CASESPECIFIC behaviour. + + Arguments: + fq_name - Fully-qualified object name (e.g. 'MyDB.MyTable') + patterns - List of LIKE-style patterns (e.g. ['DFJ%', '%.temp_%']) + + Returns: + True if any pattern matches, False otherwise + """ + name_lower = fq_name.lower() + for pat in patterns: + # Convert SQL LIKE % to fnmatch * + fn_pat = pat.replace("%", "*").lower() + if fnmatch.fnmatch(name_lower, fn_pat): + return True + return False + + +def _matches_container_any(container: str, patterns: list[str]) -> bool: + """ + Return True if the container name matches any of the given patterns. + + Used to validate include_containers filter against container names. + Converts SQL LIKE % to fnmatch * for matching. + + Arguments: + container - Database/container name (e.g. 
'DEV01_StGeo_STD_T') + patterns - List of LIKE-style container patterns + + Returns: + True if any pattern matches, False otherwise + """ + if not patterns: + return True # No whitelist = all containers included + name_lower = container.lower() + for pat in patterns: + fn_pat = pat.replace("%", "*").lower() + if fnmatch.fnmatch(name_lower, fn_pat): + return True + return False + + +def _build_fetch_sql( + edge_repository: str, + incl_patterns: list[str], +) -> str: + """ + Build the SQL query to fetch edges from the edge repository. + + include_containers is pushed into the WHERE clause for efficiency. + exclude_objects is applied in Python after fetching. + + Edge repository column usage: + Src_Object_Name_FQ — fully-qualified dependency (upstream) + Tgt_Object_Name_FQ — fully-qualified dependent (downstream) + + Arguments: + edge_repository - Fully-qualified view/table name + incl_patterns - Parsed list of container LIKE patterns (may be empty) + + Returns: + SQL string ready for cursor.execute() + """ + base_sql = f""" +LOCKING ROW FOR ACCESS +SELECT + TRIM(r.Src_Object_Name_FQ) AS Src_Object_Name_FQ + ,TRIM(r.Tgt_Object_Name_FQ) AS Tgt_Object_Name_FQ + ,TRIM(r.Src_Container_Name) AS Src_Container_Name + ,TRIM(r.Src_Object_Name) AS Src_Object_Name + ,TRIM(r.Src_Kind) AS Src_Kind + ,TRIM(r.Tgt_Container_Name) AS Tgt_Container_Name + ,TRIM(r.Tgt_Object_Name) AS Tgt_Object_Name + ,TRIM(r.Tgt_Kind) AS Tgt_Kind +FROM {edge_repository} r +WHERE r.Src_Object_Name_FQ IS NOT NULL +AND TRIM(r.Src_Object_Name_FQ) <> '' +AND r.Tgt_Object_Name_FQ IS NOT NULL +AND TRIM(r.Tgt_Object_Name_FQ) <> ''""" + + if incl_patterns: + # Build OR-expanded WHERE clause for container inclusion. + # Applies to BOTH Src and Tgt containers — an edge is included only + # if both endpoints are within the whitelisted container set. 
+ src_clauses = " OR ".join(f"TRIM(r.Src_Container_Name) LIKE '{p}'" for p in incl_patterns) + tgt_clauses = " OR ".join(f"TRIM(r.Tgt_Container_Name) LIKE '{p}'" for p in incl_patterns) + base_sql += f"\nAND ({src_clauses})" + base_sql += f"\nAND ({tgt_clauses})" + + return base_sql + ";" + + +def _val(row, col_idx: dict, col_name: str) -> str | None: + """ + Safely extract a value from a result row by column name. + + Arguments: + row - Tuple of row values from cursor.fetchall() + col_idx - Dict mapping lowercase column name → position index + col_name - Column name to look up (lowercase) + + Returns: + Stripped string value, or None if missing/null + """ + idx = col_idx.get(col_name) + if idx is None: + return None + val = row[idx] + if val is None: + return None + return str(val).strip() + + +def _bfs_multisource( + roots: list[str], + adj: dict[str, set[str]], + max_depth: int, + label: str, +) -> tuple[dict[str, int], dict[str, str]]: + """ + Standard queue-based multi-source BFS from a set of root nodes. + + All roots are seeded simultaneously at level 0 (multi-source BFS). + Each reachable node settles at the hop count to its nearest root. + Ties are broken deterministically: the lexicographically smallest + root name wins (consistent with MIN(nearest_root) in the SP). + + Importantly, root nodes themselves are NOT added to the level/root + dicts returned — they are handled separately in the caller as + direction='ROOT'. This prevents roots from appearing twice in output. + + Arguments: + roots - List of exact root node FQ names + adj - Adjacency dict: node → {reachable neighbours} + For upstream BFS: rev_adj (Tgt → {Src}) + For downstream BFS: fwd_adj (Src → {Tgt}) + max_depth - Maximum hops to traverse from any root + label - 'upstream' or 'downstream' (used for logging only) + + Returns: + Tuple of: + level_map - Dict: node_fq → hop_count (1..max_depth) + Root nodes are NOT included (handled separately). 
+ root_map - Dict: node_fq → nearest_root FQ name + """ + level_map: dict[str, int] = {} + root_map: dict[str, str] = {} + + # Seed: all root nodes at level 0. + # Visited set initialised with roots so they are never re-settled + # by BFS propagation from other roots. + visited: set[str] = set(roots) + + # Queue entries: (node_fq, nearest_root_fq, current_depth) + queue: deque[tuple[str, str, int]] = deque() + + for r in sorted(roots): # sorted → lexicographic tie-breaking + queue.append((r, r, 0)) + + while queue: + node, nearest_root, depth = queue.popleft() + + if depth >= max_depth: + # At depth cap — do not propagate further from this node + continue + + # Traverse neighbours from the adjacency dict + for neighbour in sorted(adj.get(node, [])): # sorted → determinism + if neighbour in visited: + continue + + visited.add(neighbour) + new_depth = depth + 1 + level_map[neighbour] = new_depth + root_map[neighbour] = nearest_root + queue.append((neighbour, nearest_root, new_depth)) + + logger.debug(f"_bfs_multisource [{label}]: settled {len(level_map)} non-root nodes") + return level_map, root_map + + +# bfs_safe_int — imported from _graph_utils + + +# create_bfs_summary — imported from _graph_utils + + +# extract_cycle_candidates — imported from _graph_utils + + +# --------------------------------------------------------------------------- +# Tool registration descriptor +# +# Register alongside the other GRAPH_*_TOOL descriptors in graph_tools.py. +# --------------------------------------------------------------------------- +GRAPH_BFS_LEVELS_TOOL = { + # Tool name matches the MCP protocol + # interface and all existing agent prompts. + "name": "graph_bfsLevels", + "handler": handle_graph_bfsLevels, + "description": ( + "Compute BFS shortest-path hop distances from one or more root nodes " + "in the dependency graph. Pure-Python implementation — no stored " + "procedure required. 
One SQL round-trip to fetch edges, then all BFS " + "computation runs in the MCP server process. " + "" + "Returns one row per reachable node with: upstream_level (None for root " + "objects with in-degree zero, negative for upstream ancestors), " + "downstream_level (0 for roots, positive for consumers), nearest_root " + "(which of the input root nodes this object is closest to), direction " + "(ROOT/U/D/BOTH), and is_root flag. Output schema is identical to the " + "SP-based graph_bfsLevels tool. " + "" + "USE THIS TOOL — not graph_traceLineage — when asked to: " + "sequence objects for deployment or migration (ORDER BY downstream_level " + "gives correct topological deployment order for objects downstream of " + "root tables); group objects into migration waves (nearest_root groups " + "each object under its closest root table); find which migration root " + "table each object belongs to across a multi-root migration scope; count " + "objects within N hops of a change for blast-radius sizing; identify " + "cycle members by depth (direction=BOTH nodes with unequal absolute " + "upstream/downstream levels are cycle candidates); or answer how far any " + "object is from the migration root tables. " + "" + "Do NOT use this tool for general lineage tracing, impact path analysis, " + "or questions about which specific objects depend on which — use " + "graph_traceLineage for those. graph_bfsLevels returns " + "distances and wave groupings, not dependency paths or edge detail. " + "" + "Requires an edge repository conforming to the Graph Edge Contract. " + "If you don't have one yet, call graph_edgeContractDDL first to " + "generate the CREATE TABLE or CREATE VIEW DDL. " + "" + "IMPORTANT: root_node_list accepts exact fully-qualified names only " + "(no wildcards). Use graph_findRootObjects first if needed." + ), + "parameters": { + "root_node_list": { + "type": "string", + "description": ( + "CSV of exact fully-qualified root node names. No wildcards. 
" + "Single: 'MyDB.MyTable'. " + "Multiple: 'MyDB.TableA,MyDB.TableB,MyDB.TableC'." + ), + "required": True, + }, + "max_depth_up": { + "type": "integer", + "description": ( + "Maximum upstream hops to traverse. Upstream = what the node " + "depends on (its sources and ancestors). " + "0 = skip upstream entirely. Default: 10." + ), + "default": 10, + }, + "max_depth_down": { + "type": "integer", + "description": ( + "Maximum downstream hops to traverse. Downstream = what depends " + "on the node (its consumers and impact radius). " + "0 = skip downstream entirely. Default: 10." + ), + "default": 10, + }, + "exclude_objects": { + "type": "string", + "description": ( + "CSV of FQ object name LIKE patterns to exclude from traversal. " + "Matched against both Src and Tgt sides of every edge. " + "SQL LIKE wildcards (%) supported. " + "Example: 'DFJ%,C_D02%,%.temp_%'. Default: '' (no exclusions)." + ), + "default": "", + }, + "include_containers": { + "type": "string", + "description": ( + "CSV of container name LIKE patterns to include. " + "Only edges where BOTH Src and Tgt containers match at least " + "one pattern are fetched and traversed. " + "SQL LIKE wildcards (%) supported. " + "Example: 'DEV01_StGeo%,MF_STGEO%,TABLEAU%,POWERBI%'. " + "Default: '' (all containers)." + ), + "default": "", + }, + "edge_repository": { + "type": "string", + "description": ( + "Edge repository table or view conforming to the Graph Edge Contract. " + "Call graph_edgeContractDDL to generate one if needed. " + "Required parameter — no default." + ), + "required": True, + }, + }, +} diff --git a/src/teradata_mcp_server/tools/graph/graph_connected_components.py b/src/teradata_mcp_server/tools/graph/graph_connected_components.py new file mode 100644 index 0000000..9e32184 --- /dev/null +++ b/src/teradata_mcp_server/tools/graph/graph_connected_components.py @@ -0,0 +1,479 @@ +""" +graph_connectedComponents.py — Connected components analysis tool. 
+ +Provides handle_graph_connectedComponents and GRAPH_CONNECTED_COMPONENTS_TOOL. + +Pure-Python implementation — no stored procedure required. + +Algorithm overview: + 1. Fetch all edges within the container scope in a single SQL SELECT. + 2. Run Union-Find (path-compressed) to assign every node to a component. + 3. Compute per-component summaries and overall statistics in Python. + 4. Assemble the same three-structure response the SP returned: + node_details — one row per node with Component_Id + component_summaries — one row per component with node count and list + summary_stats — single aggregate row + +Edge direction convention (matches Edge Repository / graph_bfsLevels): + Src_Object_Name is REFERENCED BY Tgt_Object_Name. + For WCC purposes edge direction is ignored — two nodes are in the same + component if there is any path (directed or undirected) between them. + +Author: Paul Dancer — Teradata Global Field Tech +""" + +import logging +from collections import defaultdict +from typing import Any + +from teradatasql import TeradataConnection + +from teradata_mcp_server.tools.graph._graph_utils import ( + build_like_or, + parse_csv_patterns, +) +from teradata_mcp_server.tools.utils import create_response + +logger = logging.getLogger("teradata_mcp_server") + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- +# parse_csv_patterns and build_like_or are imported from _graph_utils. + + +def _build_excl_clauses(patterns: list[str]) -> str: + """ + Build a NOT (...) exclusion fragment for container/object patterns. + + A pattern containing a dot is treated as a fully-qualified DB.Object + pattern; a plain pattern is matched against the container name only. 
+ + Arguments: + patterns - List of exclusion LIKE patterns + + Returns: + SQL fragment beginning with "AND NOT (...)" or empty string + """ + if not patterns: + return "" + + conditions = [] + for p in patterns: + if "." in p: + db_part, obj_part = p.split(".", 1) + conditions.append(f"(Src_Container_Name LIKE '{db_part}' AND Src_Object_Name LIKE '{obj_part}')") + else: + conditions.append(f"Src_Container_Name LIKE '{p}'") + + return "AND NOT (" + " OR ".join(conditions) + ")" + + +# --------------------------------------------------------------------------- +# Union-Find +# --------------------------------------------------------------------------- + + +class _UnionFind: + """ + Union-Find with path compression. + + Assigns every node to a canonical component representative. + union() merges two components; find() returns the representative. + """ + + def __init__(self): + self._parent: dict[str, str] = {} + + def find(self, x: str) -> str: + """Return canonical representative for x (with path compression).""" + self._parent.setdefault(x, x) + # -- Walk to root -- + root = x + while self._parent[root] != root: + root = self._parent[root] + # -- Path compression (flatten all nodes to root) -- + while self._parent[x] != root: + self._parent[x], x = root, self._parent[x] + return root + + def union(self, a, b) -> None: + """Merge the components containing a and b.""" + ra, rb = self.find(a), self.find(b) + if ra != rb: + self._parent[ra] = rb + + def all_nodes(self) -> set: + """Return the set of all nodes known to this Union-Find.""" + return set(self._parent.keys()) + + def component_map(self) -> dict[str, str]: + """Return {node: component_root} for all known nodes.""" + return {n: self.find(n) for n in self._parent} + + +# --------------------------------------------------------------------------- +# Response assembly helpers +# --------------------------------------------------------------------------- + + +def _build_node_details( + component_map: dict[str, 
str], + root_to_id: dict[str, int], + node_kind: dict[str, str], +) -> list[dict]: + """ + Build node_details — one row per node with its Component_Id. + + Arguments: + component_map - {node_fq: component_root} from Union-Find + root_to_id - {component_root: integer_id} mapping + node_kind - {node_fq: object_kind} from the edge fetch + + Returns: + List of node detail dicts + """ + rows = [] + for node_fq, comp_root in sorted(component_map.items()): + parts = node_fq.split(".", 1) + db_name = parts[0] if len(parts) > 1 else "" + obj_name = parts[1] if len(parts) > 1 else parts[0] + rows.append( + { + "Node_FQ": node_fq, + "DatabaseName": db_name, + "ObjectName": obj_name, + "Component_Id": root_to_id[comp_root], + "Object_Kind": node_kind.get(node_fq, "Unknown"), + } + ) + return rows + + +def _build_component_summaries( + component_map: dict[str, str], + root_to_id: dict[str, int], +) -> list[dict]: + """ + Build component_summaries — one row per component. + + Arguments: + component_map - {node_fq: component_root} + root_to_id - {component_root: integer_id} + + Returns: + List of component summary dicts ordered by Component_Id + """ + # Group nodes by component root + comp_nodes: dict[str, list[str]] = defaultdict(list) + for node_fq, comp_root in component_map.items(): + comp_nodes[comp_root].append(node_fq) + + rows: list[dict[str, Any]] = [] + for comp_root, nodes in comp_nodes.items(): + nodes_sorted = sorted(nodes) + rows.append( + { + "Component_Id": root_to_id[comp_root], + "Node_Count": len(nodes_sorted), + "Node_List": ", ".join(nodes_sorted), + } + ) + + rows.sort(key=lambda r: r["Component_Id"]) + return rows + + +def _build_summary_stats( + component_summaries: list[dict], + edge_count: int, +) -> list[dict]: + """ + Build summary_stats — single aggregate row. 
+ + Arguments: + component_summaries - List of component summary dicts + edge_count - Total edges loaded from the repository + + Returns: + Single-element list + """ + node_count = sum(c["Node_Count"] for c in component_summaries) + comp_count = len(component_summaries) + + sizes = [c["Node_Count"] for c in component_summaries] + largest = max(sizes, default=0) + smallest = min(sizes, default=0) + + singleton_count = sum(1 for s in sizes if s == 1) + + return [ + { + "Component_Count": comp_count, + "Node_Count": node_count, + "Edge_Count": edge_count, + "Largest_Component": largest, + "Smallest_Component": smallest, + "Singleton_Count": singleton_count, + "Summary_Message": ( + f"{comp_count} connected component(s) identified across {node_count} node(s) and {edge_count} edge(s)." + ), + } + ] + + +# --------------------------------------------------------------------------- +# Public handler +# --------------------------------------------------------------------------- + + +def handle_graph_connectedComponents( + conn: TeradataConnection, + container_pattern: str, + exclude_objects: str = "", + edge_repository: str = "", + tool_name: str | None = None, + *args, + **kwargs, +): + """ + Identify all Weakly Connected Components (WCC) in the dependency graph. + + Pure-Python implementation — no stored procedure required. Issues a single + SQL SELECT to fetch the scoped edge set, then performs Union-Find WCC + partitioning entirely in the MCP server process. + + A connected component is a maximal set of nodes where every node can reach + every other node when edge direction is ignored. This partitions the graph + into isolated sub-graphs. 
+ + Use this tool for: + - Understanding graph structure and partitioning + - Identifying isolated sub-graphs + - Scoping downstream impact analysis to a single component + - Pre-filtering before cycle detection (cycles exist only within a component) + - Identifying "islands" of related objects for migration or refactoring + - Estimating blast radius + + Arguments: + container_pattern - str: CSV LIKE patterns for container scope. + Supports wildcards (%) and CSV format. + Examples: '%WBC%', '%WBC%,%StGeo%', 'DEV01_%,DEV02_%' + + CRITICAL: STRING type, not array. + CORRECT: container_pattern="%WBC%,%StGeo%" + WRONG: container_pattern=["%WBC%", "%StGeo%"] + + exclude_objects - str: CSV LIKE patterns to exclude. + Matches against container name (or DB.Object if + the pattern contains a dot). + Default: '' (no exclusions) + + edge_repository - str: Edge repository view/table conforming to the + Graph Edge Contract (Src_Container_Name, + Src_Object_Name, Src_Kind, Tgt_Container_Name, + Tgt_Object_Name, Tgt_Kind columns). + For AI-Native Data Products use: + '{ProductName}_Semantic.lineage_graph' + Call graph_edgeContractDDL to generate a new one. + Required — no default. + + Returns: + ResponseType: formatted response with connected component results. + + Response structure: + { + "node_details": [...], // One row per node with Component_Id + "component_summaries": [...], // One row per component + "summary_stats": [...] 
// Single aggregate row + } + + node_details row fields: + Node_FQ, DatabaseName, ObjectName, Component_Id, Object_Kind + + component_summaries row fields: + Component_Id, Node_Count, Node_List + + summary_stats row fields: + Component_Count, Node_Count, Edge_Count, + Largest_Component, Smallest_Component, Singleton_Count, Summary_Message + """ + logger.debug( + "Tool: handle_graph_connectedComponents: Args: container_pattern=%s, exclude_objects=%s, edge_repository=%s", + container_pattern, + exclude_objects, + edge_repository, + ) + + # ----------------------------------------------------------------------- + # Parse and validate inputs + # ----------------------------------------------------------------------- + container_patterns = parse_csv_patterns(container_pattern) + if not container_patterns: + return create_response( + {"error": "container_pattern must not be empty"}, + { + "tool_name": tool_name or "graph_connectedComponents", + "container_pattern": container_pattern, + "status": "error", + }, + ) + + if not edge_repository: + return create_response( + { + "error": ( + "edge_repository is required. " + "For AI-Native Data Products use '{ProductName}_Semantic.lineage_graph'. " + "Call graph_edgeContractDDL to generate a new edge repository." + ) + }, + { + "tool_name": tool_name or "graph_connectedComponents", + "container_pattern": container_pattern, + "status": "error", + }, + ) + + excl_pattern_list = parse_csv_patterns(exclude_objects) + + try: + with conn.cursor() as cur: + # ------------------------------------------------------------------- + # Step 1 — Fetch all scoped edges in one SQL SELECT + # ------------------------------------------------------------------- + container_where = build_like_or(container_patterns, "Src_Container_Name") + excl_where = _build_excl_clauses(excl_pattern_list) + + edge_sql = f""" +LOCKING ROW FOR ACCESS +SELECT + TRIM(Src_Container_Name) || '.' || TRIM(Src_Object_Name) AS Src_FQ + ,TRIM(Tgt_Container_Name) || '.' 
|| TRIM(Tgt_Object_Name) AS Tgt_FQ + ,COALESCE(TRIM(Src_Kind), 'Unknown') AS Src_Kind +FROM {edge_repository} +WHERE {container_where} + {excl_where} +""" + logger.debug("Tool: handle_graph_connectedComponents: Fetching edges:\n%s", edge_sql) + + cur.execute(edge_sql) + raw_edges = cur.fetchall() + + # ------------------------------------------------------------------- + # Step 2 — Build Union-Find and collect node kinds + # ------------------------------------------------------------------- + uf = _UnionFind() + node_kind: dict[str, str] = {} # {node_fq: object_kind} + + for src_fq, tgt_fq, src_kind in raw_edges: + uf.union(src_fq, tgt_fq) + # Record source kind; target kind not available without a second lookup + if src_fq not in node_kind: + node_kind[src_fq] = src_kind or "Unknown" + + edge_count = len(raw_edges) + logger.debug("Tool: handle_graph_connectedComponents: Loaded %d edges", edge_count) + + # ------------------------------------------------------------------- + # Step 3 — Assign integer component IDs + # ------------------------------------------------------------------- + comp_map = uf.component_map() + unique_roots = sorted(set(comp_map.values())) + root_to_id = {r: i + 1 for i, r in enumerate(unique_roots)} + + component_count = len(unique_roots) + logger.debug("Tool: handle_graph_connectedComponents: %d component(s) identified", component_count) + + # ------------------------------------------------------------------- + # Step 4 — Build response structures + # ------------------------------------------------------------------- + node_details = _build_node_details(comp_map, root_to_id, node_kind) + component_summaries = _build_component_summaries(comp_map, root_to_id) + summary_stats = _build_summary_stats(component_summaries, edge_count) + + response_data = { + "node_details": node_details, + "component_summaries": component_summaries, + "summary_stats": summary_stats, + } + + metadata = { + "tool_name": tool_name or "graph_connectedComponents", 
+ "container_pattern": container_pattern, + "exclude_objects": exclude_objects, + "edge_repository": edge_repository, + "result_set_counts": { + "node_details": len(node_details), + "component_summaries": len(component_summaries), + "summary_stats": len(summary_stats), + }, + "status": "success", + "message": summary_stats[0]["Summary_Message"], + } + + logger.debug("Tool: handle_graph_connectedComponents: metadata: %s", metadata) + return create_response(response_data, metadata) + + except Exception as e: + logger.error("Tool: handle_graph_connectedComponents: Error: %s", e, exc_info=True) + return create_response( + {"error": str(e)}, + { + "tool_name": tool_name or "graph_connectedComponents", + "container_pattern": container_pattern, + "status": "error", + }, + ) + + +# --------------------------------------------------------------------------- +# Tool registration descriptor +# --------------------------------------------------------------------------- +GRAPH_CONNECTED_COMPONENTS_TOOL = { + "name": "graph_connectedComponents", + "handler": handle_graph_connectedComponents, + "description": ( + "Identify all Weakly Connected Components (WCC) in the dependency graph. " + "Pure-Python implementation — no stored procedure required. " + "A connected component is a maximal set of nodes reachable from one another " + "when edge direction is ignored. Fetches the scoped edge set in one SQL SELECT, " + "then performs Union-Find WCC partitioning in the MCP server process. " + "Returns node-to-component mapping, per-component summaries, and overall " + "statistics. Use to understand graph structure, identify isolated sub-graphs, " + "scope impact analysis, or pre-filter before cycle detection. " + "Requires an edge repository conforming to the Graph Edge Contract. " + "For AI-Native Data Products use '{ProductName}_Semantic.lineage_graph'. " + "Call graph_edgeContractDDL to generate a new edge repository." 
+ ), + "parameters": { + "container_pattern": { + "type": "string", + "description": ( + "CSV LIKE patterns for containers (databases/schemas) to scan. " + "Supports wildcards: 'DFJ%' or '%WBC%,%StGeo%' for multiple." + ), + "required": True, + }, + "exclude_objects": { + "type": "string", + "description": ( + "CSV LIKE patterns to exclude from the scan. " + "Matches against container name (or DB.Object if pattern contains a dot). " + "Example: 'DFJ%,C_D02%'. Default: '' (no exclusions)." + ), + "default": "", + }, + "edge_repository": { + "type": "string", + "description": ( + "Edge repository table or view conforming to the Graph Edge Contract. " + "For AI-Native Data Products use '{ProductName}_Semantic.lineage_graph'. " + "Call graph_edgeContractDDL to generate one if needed. " + "Required — no default." + ), + "required": True, + }, + }, +} diff --git a/src/teradata_mcp_server/tools/graph/graph_detect_cycles.py b/src/teradata_mcp_server/tools/graph/graph_detect_cycles.py new file mode 100644 index 0000000..5afd73b --- /dev/null +++ b/src/teradata_mcp_server/tools/graph/graph_detect_cycles.py @@ -0,0 +1,555 @@ +""" +graph_detectCycles.py — Cycle detection tool. + +Provides handle_graph_detectCycles and GRAPH_DETECT_CYCLES_TOOL. + +Pure-Python implementation — no stored procedure required. + +Algorithm overview: + 1. Fetch all edges within the container scope in a single SQL SELECT. + 2. Perform Union-Find (WCC partitioning) to identify connected components. + 3. Run iterative DFS (grey/black colouring) independently within each + component. Iterative DFS avoids Python's recursion limit on deep graphs. + 4. Collect and deduplicate all directed cycles found. + 5. 
Assemble the same three-structure response the SP returned: + cycle_details — one row per node per cycle + cycle_summaries — one row per cycle with human-readable path + summary_stats — single aggregate row + +Edge direction convention (matches Edge Repository / graph_bfsLevels): + Src_Object_Name is REFERENCED BY Tgt_Object_Name. + => Src is the DEPENDENCY (upstream of Tgt). + => Tgt is the DEPENDENT (downstream of Src). + The directed edge for cycle detection runs Src → Tgt: + a view (Tgt) DEPENDS ON a table (Src), so the edge Src→Tgt represents + "Src must exist before Tgt". A cycle in this direction is a genuine + circular dependency. + +Author: Paul Dancer — Teradata Global Field Tech +""" + +import logging +from collections import defaultdict +from collections.abc import Iterator + +from teradatasql import TeradataConnection + +from teradata_mcp_server.tools.graph._graph_utils import ( + build_like_or, + parse_csv_patterns, +) +from teradata_mcp_server.tools.utils import create_response + +logger = logging.getLogger("teradata_mcp_server") + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- +# parse_csv_patterns and build_like_or are imported from _graph_utils. + + +def _build_excl_clauses(patterns: list[str]) -> str: + """ + Build a NOT (...) exclusion fragment for Src_Container_Name LIKE patterns. + + A pattern containing a dot is treated as a fully-qualified DB.Object pattern; + a plain pattern is matched against the container name only. + + Arguments: + patterns - List of exclusion LIKE patterns + + Returns: + SQL fragment beginning with "AND NOT (...)" or empty string + """ + if not patterns: + return "" + + conditions = [] + for p in patterns: + if "." 
in p: + db_part, obj_part = p.split(".", 1) + conditions.append(f"(Src_Container_Name LIKE '{db_part}' AND Src_Object_Name LIKE '{obj_part}')") + else: + conditions.append(f"Src_Container_Name LIKE '{p}'") + + return "AND NOT (" + " OR ".join(conditions) + ")" + + +# --------------------------------------------------------------------------- +# Union-Find for WCC partitioning +# --------------------------------------------------------------------------- + + +class _UnionFind: + """ + Simple Union-Find with path compression. + + Used to partition the edge set into Weakly Connected Components before + running per-component DFS. Partitioning dramatically reduces the work + per DFS call on graphs with many isolated sub-graphs. + """ + + def __init__(self): + self._parent: dict[str, str] = {} + + def find(self, x: str) -> str: + """Return canonical representative of x's component (with path compression).""" + self._parent.setdefault(x, x) + if self._parent[x] != x: + self._parent[x] = self.find(self._parent[x]) # path compression + return self._parent[x] + + def union(self, a, b) -> None: + """Merge the components containing a and b.""" + ra, rb = self.find(a), self.find(b) + if ra != rb: + self._parent[ra] = rb + + def component_map(self) -> dict[str, str]: + """Return {node: component_root} for all known nodes.""" + return {n: self.find(n) for n in self._parent} + + +# --------------------------------------------------------------------------- +# Iterative DFS cycle detection +# --------------------------------------------------------------------------- + + +def _detect_cycles_in_subgraph(nodes: set, adj: dict[str, list[str]]) -> list[list[str]]: + """ + Find all simple directed cycles reachable in an adjacency sub-graph. + + Uses an iterative DFS with grey/black node colouring. The iterative + approach is mandatory — Python's default recursion limit (1 000) is + easily exceeded on deep dependency chains. 
+ + A node is GREY while it is on the current DFS stack (being explored). + A node is BLACK once all its descendants have been fully explored. + A back-edge into a GREY node signals a cycle. + + The cycle path is reconstructed from the DFS stack at the moment the + back-edge is detected. + + Arguments: + nodes - Set of node FQ names in this component + adj - Adjacency list {src: [tgt, ...]} for the full graph + (caller is responsible for scoping to this component) + + Returns: + List of cycles; each cycle is a list of FQ node names (start == end). + """ + white, grey, black = 0, 1, 2 + colour: dict[str, int] = {} + cycles: list[list[str]] = [] + + for start in nodes: + if colour.get(start) == black: + continue + + # Stack entries: (node, iterator-over-neighbours, path-so-far) + stack: list[tuple[str, Iterator[str], list[str]]] = [(start, iter(adj.get(start, [])), [start])] + colour[start] = grey + + while stack: + node, neighbours, path = stack[-1] + try: + nxt = next(neighbours) + + if colour.get(nxt) == grey: + # Back-edge found — reconstruct the cycle portion + cycle_start_idx = path.index(nxt) + cycle = path[cycle_start_idx:] + [nxt] + cycles.append(cycle) + + elif colour.get(nxt) != black: + colour[nxt] = grey + stack.append((nxt, iter(adj.get(nxt, [])), path + [nxt])) + + except StopIteration: + colour[node] = black + stack.pop() + + return cycles + + +# --------------------------------------------------------------------------- +# Response assembly helpers +# --------------------------------------------------------------------------- + + +def _build_cycle_details(cycles: list[list[str]], component_id_map: dict[str, int]) -> list[dict]: + """ + Build the cycle_details result set — one row per node per cycle. 
+ + Arguments: + cycles - List of cycle paths (each a list of FQ names, start==end) + component_id_map - {node_fq: component_id} lookup + + Returns: + List of dicts matching the SP's cur_NodeDetails schema + """ + rows = [] + for cycle_id, cycle in enumerate(cycles, start=1): + # The last element is a repeat of the first — omit it for position count + members = cycle[:-1] + for pos, node_fq in enumerate(members, start=1): + rows.append( + { + "Cycle_Id": cycle_id, + "Cycle_Pos": pos, + "Node_FQ": node_fq, + "Cycle_Length": len(members), + "Component_Id": component_id_map.get(node_fq, -1), + "Strategy": "DFS", + } + ) + return rows + + +def _build_cycle_summaries(cycles: list[list[str]], component_id_map: dict[str, int]) -> list[dict]: + """ + Build the cycle_summaries result set — one row per cycle. + + Arguments: + cycles - List of cycle paths + component_id_map - {node_fq: component_id} lookup + + Returns: + List of dicts matching the SP's cur_CompSummaries schema + """ + rows = [] + for cycle_id, cycle in enumerate(cycles, start=1): + members = cycle[:-1] + path_str = " -> ".join(cycle) # start → ... → start + rows.append( + { + "Cycle_Id": cycle_id, + "Cycle_Length": len(members), + "Component_Id": component_id_map.get(members[0], -1), + "Strategy": "DFS", + "Cycle_Path": path_str, + } + ) + return rows + + +def _build_summary_stats(cycles: list[list[str]], edge_count: int, component_count: int) -> list[dict]: + """ + Build the summary_stats result set — single aggregate row. + + Arguments: + cycles - List of detected cycles + edge_count - Total edges loaded from the repository + component_count - Number of WCC components identified + + Returns: + Single-element list matching the SP's cur_SummaryStats schema + """ + total_nodes_in_cycles = sum(len(c) - 1 for c in cycles) # exclude repeated end + components_with_cycles = len({c[0] for c in cycles}) # rough proxy + + if len(cycles) == 0: + message = "No cycles detected — graph is a DAG." 
+ elif len(cycles) == 1: + message = "1 cycle detected." + else: + message = f"{len(cycles)} cycles detected." + + return [ + { + "Cycle_Count": len(cycles), + "Total_Nodes_In_Cycles": total_nodes_in_cycles, + "Components_With_Cycles": components_with_cycles, + "Edge_Count": edge_count, + "Components_Scanned": component_count, + "Strategy_Used": "DFS", + "Summary_Message": message, + } + ] + + +# --------------------------------------------------------------------------- +# Public handler +# --------------------------------------------------------------------------- + + +def handle_graph_detectCycles( + conn: TeradataConnection, + container_pattern: str, + exclude_objects: str = "", + edge_repository: str = "", + tool_name: str | None = None, + *args, + **kwargs, +): + """ + Detect circular dependencies (cycles) in the dependency graph. + + Pure-Python implementation — no stored procedure required. Issues a single + SQL SELECT to fetch the scoped edge set, then performs WCC partitioning + followed by iterative DFS cycle detection entirely in the MCP server process. + + Use this tool for: + - Validating graph integrity (DAG property) + - Finding objects that form circular references + - Identifying stub-then-replace code patterns + - Debugging topological sort hangs + - Pre-deployment cycle checks + + Arguments: + container_pattern - str: CSV LIKE patterns for container scope. + Supports wildcards (%) and CSV format. + Examples: + 'DFJ%' — single database family + '%WBC%,%StGeo%' — multiple families + 'DEV01_%,DEV02_%' — multiple prefixes + + exclude_objects - str: CSV LIKE patterns to exclude from the scan. + Matches against container name (or DB.Object if + the pattern contains a dot). + Default: '' (no exclusions) + + edge_repository - str: Edge repository view/table conforming to the + Graph Edge Contract (Src_Container_Name, + Src_Object_Name, Src_Kind, Tgt_Container_Name, + Tgt_Object_Name, Tgt_Kind columns). 
+ For AI-Native Data Products use: + '{ProductName}_Semantic.lineage_graph' + Call graph_edgeContractDDL to generate a new one. + Required — no default. + + Returns: + ResponseType: formatted response with cycle detection results. + + Response structure: + { + "cycle_details": [...], // One row per node per cycle + "cycle_summaries": [...], // One row per cycle with path string + "summary_stats": [...] // Single aggregate row + } + + cycle_details row fields: + Cycle_Id, Cycle_Pos, Node_FQ, Cycle_Length, Component_Id + + cycle_summaries row fields: + Cycle_Id, Cycle_Length, Component_Id, Cycle_Path + + summary_stats row fields: + Cycle_Count, Total_Nodes_In_Cycles, Components_With_Cycles, + Edge_Count, Components_Scanned, Summary_Message + """ + logger.debug( + "Tool: handle_graph_detectCycles: Args: container_pattern=%s, exclude_objects=%s, edge_repository=%s", + container_pattern, + exclude_objects, + edge_repository, + ) + + # ----------------------------------------------------------------------- + # Parse and validate inputs + # ----------------------------------------------------------------------- + container_patterns = parse_csv_patterns(container_pattern) + if not container_patterns: + return create_response( + {"error": "container_pattern must not be empty"}, + { + "tool_name": tool_name or "graph_detectCycles", + "container_pattern": container_pattern, + "status": "error", + }, + ) + + if not edge_repository: + return create_response( + { + "error": ( + "edge_repository is required. " + "For AI-Native Data Products use '{ProductName}_Semantic.lineage_graph'. " + "Call graph_edgeContractDDL to generate a new edge repository." 
+ ) + }, + { + "tool_name": tool_name or "graph_detectCycles", + "container_pattern": container_pattern, + "status": "error", + }, + ) + + excl_pattern_list = parse_csv_patterns(exclude_objects) + + try: + with conn.cursor() as cur: + # ------------------------------------------------------------------- + # Step 1 — Fetch all scoped edges in one SQL SELECT + # ------------------------------------------------------------------- + container_where = build_like_or(container_patterns, "Src_Container_Name") + excl_where = _build_excl_clauses(excl_pattern_list) + + edge_sql = f""" +LOCKING ROW FOR ACCESS +SELECT + TRIM(Src_Container_Name) || '.' || TRIM(Src_Object_Name) AS Src_FQ + ,TRIM(Tgt_Container_Name) || '.' || TRIM(Tgt_Object_Name) AS Tgt_FQ +FROM {edge_repository} +WHERE {container_where} + {excl_where} +""" + logger.debug("Tool: handle_graph_detectCycles: Fetching edges:\n%s", edge_sql) + + cur.execute(edge_sql) + raw_edges = cur.fetchall() + + # ------------------------------------------------------------------- + # Step 2 — Build adjacency list and WCC components + # ------------------------------------------------------------------- + # adj[src] = [tgt, ...] 
— directed: src → tgt means tgt DEPENDS ON src + adj: dict[str, list[str]] = defaultdict(list) + uf = _UnionFind() + + for src_fq, tgt_fq in raw_edges: + adj[src_fq].append(tgt_fq) + uf.union(src_fq, tgt_fq) + + edge_count = len(raw_edges) + logger.debug("Tool: handle_graph_detectCycles: Loaded %d edges", edge_count) + + if edge_count == 0: + # No edges in scope — no cycles possible + return create_response( + { + "cycle_details": [], + "cycle_summaries": [], + "summary_stats": _build_summary_stats([], 0, 0), + }, + { + "tool_name": tool_name or "graph_detectCycles", + "container_pattern": container_pattern, + "exclude_objects": exclude_objects, + "edge_repository": edge_repository, + "result_set_counts": { + "cycle_details": 0, + "cycle_summaries": 0, + "summary_stats": 1, + }, + "status": "success", + "message": "No edges found in scope — no cycles possible.", + }, + ) + + # Assign integer component IDs from the Union-Find roots + comp_map = uf.component_map() + unique_roots = list(set(comp_map.values())) + root_to_id = {r: i + 1 for i, r in enumerate(unique_roots)} + component_id_map: dict[str, int] = {n: root_to_id[r] for n, r in comp_map.items()} + + # Group nodes by component + components: dict[str, set[str]] = defaultdict(set) + for node, comp_root in comp_map.items(): + components[comp_root].add(node) + + component_count = len(components) + logger.debug("Tool: handle_graph_detectCycles: %d components identified", component_count) + + # ------------------------------------------------------------------- + # Step 3 — Run iterative DFS within each component + # ------------------------------------------------------------------- + all_cycles: list[list[str]] = [] + + for _comp_root, comp_nodes in components.items(): + cycles_in_comp = _detect_cycles_in_subgraph(comp_nodes, adj) + all_cycles.extend(cycles_in_comp) + + logger.debug("Tool: handle_graph_detectCycles: %d cycle(s) detected", len(all_cycles)) + + # 
------------------------------------------------------------------- + # Step 4 — Assemble response structures + # ------------------------------------------------------------------- + cycle_details = _build_cycle_details(all_cycles, component_id_map) + cycle_summaries = _build_cycle_summaries(all_cycles, component_id_map) + summary_stats = _build_summary_stats(all_cycles, edge_count, component_count) + + response_data = { + "cycle_details": cycle_details, + "cycle_summaries": cycle_summaries, + "summary_stats": summary_stats, + } + + metadata = { + "tool_name": tool_name or "graph_detectCycles", + "container_pattern": container_pattern, + "exclude_objects": exclude_objects, + "edge_repository": edge_repository, + "result_set_counts": { + "cycle_details": len(cycle_details), + "cycle_summaries": len(cycle_summaries), + "summary_stats": len(summary_stats), + }, + "status": "success", + "message": summary_stats[0]["Summary_Message"], + } + + logger.debug("Tool: handle_graph_detectCycles: metadata: %s", metadata) + return create_response(response_data, metadata) + + except Exception as e: + logger.error("Tool: handle_graph_detectCycles: Error: %s", e, exc_info=True) + return create_response( + {"error": str(e)}, + { + "tool_name": tool_name or "graph_detectCycles", + "container_pattern": container_pattern, + "status": "error", + }, + ) + + +# --------------------------------------------------------------------------- +# Tool registration descriptor +# --------------------------------------------------------------------------- +GRAPH_DETECT_CYCLES_TOOL = { + "name": "graph_detectCycles", + "handler": handle_graph_detectCycles, + "description": ( + "Detect circular references (cycles) in the dependency graph. " + "Pure-Python implementation — no stored procedure required. " + "Fetches the scoped edge set in one SQL SELECT, partitions into Weakly " + "Connected Components via Union-Find, then runs iterative DFS cycle " + "detection within each component. 
" + "Returns each cycle as an ordered list of nodes with a human-readable " + "path string. Use to validate graph integrity, find stub-then-replace " + "patterns, or identify objects that will cause topological sort to hang. " + "Requires an edge repository conforming to the Graph Edge Contract. " + "For AI-Native Data Products use '{ProductName}_Semantic.lineage_graph'. " + "Call graph_edgeContractDDL to generate a new edge repository." + ), + "parameters": { + "container_pattern": { + "type": "string", + "description": ( + "CSV LIKE patterns for containers (databases/schemas) to scan. " + "Supports wildcards: 'DFJ%' or '%WBC%,%StGeo%' for multiple." + ), + "required": True, + }, + "exclude_objects": { + "type": "string", + "description": ( + "CSV LIKE patterns to exclude from the scan. " + "Matches against container name (or DB.Object if pattern contains a dot). " + "Example: 'DFJ%,C_D02%'. Default: '' (no exclusions)." + ), + "default": "", + }, + "edge_repository": { + "type": "string", + "description": ( + "Edge repository table or view conforming to the Graph Edge Contract. " + "For AI-Native Data Products use '{ProductName}_Semantic.lineage_graph'. " + "Call graph_edgeContractDDL to generate one if needed. " + "Required — no default." + ), + "required": True, + }, + }, +} diff --git a/src/teradata_mcp_server/tools/graph/graph_edge_contract.py b/src/teradata_mcp_server/tools/graph/graph_edge_contract.py new file mode 100644 index 0000000..7af3128 --- /dev/null +++ b/src/teradata_mcp_server/tools/graph/graph_edge_contract.py @@ -0,0 +1,622 @@ +# ------------------------------------------------------------------------------- # +# File: graph_edge_contract.py # +# # +# Description: # +# Graph Edge Contract — schema abstraction for the graph analysis tools. # +# # +# Provides: # +# 1. GRAPH_EDGE_CONTRACT constant — canonical contract text, served as an # +# MCP Resource via app.py registration. # +# 2. 
handle_graph_edgeContractDDL() — MCP Tool that generates ready-to-run # +# Teradata DDL for a contract-conforming edge table or view. # +# # +# The graph analysis tools (findRootObjects, traceLineage, # +# connectedComponents, detectCycles, bfsLevels, analyseDatabase) all # +# require an edge repository — a table or view conforming to this contract. # +# Users supply its fully-qualified name via the edge_repository parameter. # +# # +# Column names are deliberately platform-agnostic: # +# Src_Container_Name / Tgt_Container_Name (not DatabaseName) # +# Src_Object_Name / Tgt_Object_Name (not ObjectName) # +# Src_Kind / Tgt_Kind (not Object_Kind) # +# # +# Optional enrichment columns (present in lineage_graph; ignored by tools # +# that don't use them — safe to omit from custom edge repositories): # +# Edge_Relationship — nature of the edge (e.g. ETL_INPUT, ETL_OUTPUT) # +# Transformation_Type — process type (e.g. ETL, FEATURE_ENG, AGGREGATION) # +# # +# "Container" generalises across platforms: a Teradata database, a script # +# directory, an Informatica workflow folder, a dbt project, etc. # +# # +# AI-Native Data Product shortcut: # +# {ProductName}_Semantic.lineage_graph (Observability Module v1.5) already # +# conforms to this contract and can be used directly as edge_repository. # +# # +# Contract Version: 1.1 # +# ------------------------------------------------------------------------------- # + +import logging +from typing import Any + +logger = logging.getLogger("teradata_mcp_server") + + +# ──────────────────────────────────────────────────────────────────────────────── # +# GRAPH EDGE CONTRACT — Canonical Text # +# # +# Registered as an MCP Resource in app.py (URI: graph://edge-contract). # +# AI agents retrieve this to understand the edge_repository schema required # +# by all graph_* tools. 
# +# ──────────────────────────────────────────────────────────────────────────────── # + +GRAPH_EDGE_CONTRACT = """ +Graph Edge Contract — Teradata MCP Server (Community Edition) +============================================================= + +Version: 1.1 +Status: Stable +Applies: All graph_* tools in the Teradata MCP Server + + +PURPOSE +------- +The graph analysis tools operate on a directed dependency graph stored as an +edge list. The edge repository is any Teradata table or view that conforms to +this contract. Users supply its fully-qualified name via the edge_repository +parameter on each graph tool. + + +REQUIRED COLUMNS +---------------- + Column Name Type Nullable Description + ────────────────── ────────────── ──────── ────────────────────────────────── + Src_Container_Name VARCHAR(128) No Container of the source (upstream) + object. Platform-agnostic: a + Teradata database, a script + directory, an ETL workflow folder, + a dbt project, etc. + + Src_Object_Name VARCHAR(128) No Name of the source object. + + Src_Kind VARCHAR(30) No Object type of the source. + Recommended: T=Table, V=View, + P=Procedure, M=Macro, J=JoinIndex, + H=HashIndex, G=Trigger, + A=AggregateUDF, F=UDF, S=Script, + E=ETL Mapping. + Custom values permitted. + + Tgt_Container_Name VARCHAR(128) No Container of the target + (downstream) object. Same + semantics as Src_Container_Name. + + Tgt_Object_Name VARCHAR(128) No Name of the target object. + + Tgt_Kind VARCHAR(30) No Object type of the target. + Same value domain as Src_Kind. + + +EDGE SEMANTICS +-------------- +Each row represents one directed dependency edge: + + Source (Src) ──is referenced by──▶ Target (Tgt) + +The TARGET object depends on the SOURCE object. + - SOURCE is upstream: a prerequisite, a referenced table or script. + - TARGET is downstream: a consumer, a dependent view or mapping. 
+ +Example: + Src_Container_Name='PROD_STD_T' Src_Object_Name='CUSTOMER' Src_Kind='Table' + Tgt_Container_Name='PROD_STD_V' Tgt_Object_Name='CUST_ACTIVE' Tgt_Kind='View' + Edge_Relationship='DIRECT' Transformation_Type='ETL' + + Meaning: View PROD_STD_V.CUST_ACTIVE depends on table PROD_STD_T.CUSTOMER + via an ETL transformation. + + +OPTIONAL COLUMNS +---------------- +The following columns are recognised by the contract but not required by the +graph analysis tools. They are ignored by tools that do not use them, so +omitting them from a custom edge repository does not break conformance. + + Column Name Type Nullable Description + ───────────────────── ───────────── ──────── ────────────────────────────────── + Edge_Relationship VARCHAR(50) Yes Nature of the dependency edge. + Recommended values: + DIRECT — object-to-object + dependency + ETL_INPUT — source table to + ETL job + ETL_OUTPUT — ETL job to target + table + JOIN — join dependency + TRANSFORM — general + transformation + Custom values permitted. + Produced by lineage_graph view. + + Transformation_Type VARCHAR(50) Yes Process or transformation category. + Recommended values: + ETL FEATURE_ENG + AGGREGATION JOIN + EMBEDDING_GEN FILTER + PIVOT + Custom values permitted. + Sourced from data_lineage table. + +These columns are present in the {ProductName}_Semantic.lineage_graph view +(Observability Module v1.5) and can be used by graph visualisation tools for +edge labelling and filtering. The graph_* analysis tools (findRootObjects, +bfsLevels, traceLineage, detectCycles, connectedComponents, analyseDatabase) +do not read these columns — they operate on node identity only. + + +NODE IDENTITY +------------- +Nodes are identified by fully-qualified name: Container.Object + +The graph tools construct this internally as: + Src_Container_Name || '.' || Src_Object_Name (source node) + Tgt_Container_Name || '.' 
|| Tgt_Object_Name (target node) + + +WHY "CONTAINER" NOT "DATABASE" +------------------------------ +The column names are deliberately platform-agnostic. "Container" generalises +across platforms and technologies: + + Platform Container means + ──────────────── ──────────────────────────────────────── + Teradata Database name + Oracle Schema name + SQL Server Database.Schema + Informatica Workflow or folder path + Shell scripts Directory path + dbt Project or schema + Tableau/Power BI Workbook or workspace + +This allows a single edge repository to hold cross-platform lineage — +e.g., a Teradata table consumed by an Informatica mapping that feeds a +Tableau dashboard — all in one graph. + + +ADDITIONAL COLUMNS +------------------ +The edge repository may contain additional columns beyond the required and +optional columns defined in this contract. They will be ignored by the graph +tools. + + +CONTAINER SCOPING +----------------- +All graph tools accept container_pattern or include_containers parameters +that filter edges using SQL LIKE against Src_Container_Name and Tgt_Container_Name. +The edge repository should contain edges across ALL relevant containers — +cross-container dependencies are the primary use case for graph analysis. + + +DUPLICATE EDGES +--------------- +The graph tools tolerate duplicate edges (same Src->Tgt pair appearing more +than once). Duplicates are deduplicated in memory during adjacency list +construction. For performance, it is recommended that the edge repository +contains no duplicates. + + +DDL GENERATION +-------------- +Use the graph_edgeContractDDL tool to generate a ready-to-run CREATE TABLE +or CREATE VIEW statement for a conforming edge repository. +""".strip() + + +# ──────────────────────────────────────────────────────────────────────────────── # +# DDL GENERATOR — Tool Handler # +# # +# Generates Teradata DDL for a contract-conforming edge table or view. # +# No database connection required — pure template generation. 
# ──────────────────────────────────────────────────────────────────────────────── #

# Contract version stamped into every generated script and response payload.
# Keep in step with the "Version:" line of the GRAPH_EDGE_CONTRACT text.
_CONTRACT_VERSION = "1.1"

# Maximum length accepted for a database / object name (matches the
# VARCHAR(128) name columns defined by the contract).
_MAX_IDENTIFIER_LEN = 128


def _clean_identifier(value: Any) -> str:
    """
    Normalise a database/object name for unquoted use in generated DDL.

    The templates interpolate names without quoting, so only plain
    identifiers are safe: letters, digits, '_', '$' and '#', not starting
    with a digit, at most 128 characters.

    Args:
        value: Candidate identifier supplied by the caller.

    Returns:
        str: The identifier with surrounding whitespace removed, or ''
        when the value is not a usable plain identifier.
    """
    if not isinstance(value, str):
        return ""
    candidate = value.strip()
    if not candidate or len(candidate) > _MAX_IDENTIFIER_LEN or candidate[0].isdigit():
        return ""
    if not all(ch.isalnum() or ch in "_$#" for ch in candidate):
        return ""
    return candidate


def handle_graph_edgeContractDDL(
    conn: Any,
    target_database: str,
    object_name: str = "EdgeRepository",
    output_type: str = "TABLE",
    **kwargs: Any,
) -> list[dict[str, Any]]:
    """
    Generate DDL for a Graph Edge Contract-conforming table or view.

    This tool does NOT require a database connection — it generates DDL
    text from templates. No SQL is executed. The conn parameter is
    accepted for ModuleLoader calling convention compatibility but is
    not used.

    Required columns in the generated schema (6):
        Src_Container_Name, Src_Object_Name, Src_Kind,
        Tgt_Container_Name, Tgt_Object_Name, Tgt_Kind

    Optional enrichment columns (2):
        Edge_Relationship   — nature of the edge (ETL_INPUT, ETL_OUTPUT, DIRECT…)
        Transformation_Type — process category (ETL, FEATURE_ENG, AGGREGATION…)
        These are ignored by graph analysis tools but useful for visualisation.

    AI-Native Data Product shortcut:
        If you are working within an AI-Native Data Product, the view
        {ProductName}_Semantic.lineage_graph (Observability Module v1.5)
        already conforms to this contract. You do not need to generate DDL
        — pass that view's fully-qualified name directly as edge_repository
        on any graph_* tool. Example:
            edge_repository='StGeoMortgage_Semantic.lineage_graph'

    Arguments:
        conn:            TeradataConnection (unused — accepted for
                         ModuleLoader compatibility).
        target_database: Database in which to create the edge repository.
                         Must be a plain identifier because it is written
                         unquoted into the script.
                         Example: 'StGeoMortgage_Semantic'
        object_name:     Name for the edge table/view (plain identifier).
                         Default: 'EdgeRepository'
        output_type:     'TABLE' or 'VIEW' (case-insensitive).
                         TABLE: CREATE TABLE DDL + separate sample DML,
                                with all 6 required + 2 optional columns.
                         VIEW:  REPLACE VIEW template for mapping an
                                existing lineage source to all 8 columns.
                         Default: 'TABLE'

    Returns:
        list[dict]: Single-element list. On success the dict contains:
            - ddl:              DDL script (CREATE TABLE / REPLACE VIEW + COMMENTs)
            - output_type:      'TABLE' or 'VIEW'
            - contract_version: Contract version string
            - sample_dml:       Sample INSERTs + validation query
                                (TABLE only; absent for VIEW)
        On invalid input the dict contains a single 'error' message.
    """
    logger.debug(
        "Tool: handle_graph_edgeContractDDL: Args: target_database=%s, object_name=%s, output_type=%s",
        target_database,
        object_name,
        output_type,
    )

    # ── Validate output_type ──────────────────────────────────────────────────
    # str() keeps a non-string argument (e.g. None) on the error path below
    # instead of raising AttributeError.
    output_type = str(output_type).upper().strip()
    if output_type not in ("TABLE", "VIEW"):
        logger.warning("Tool: handle_graph_edgeContractDDL: Invalid output_type '%s'", output_type)
        return [{"error": f"Invalid output_type '{output_type}'. Must be 'TABLE' or 'VIEW'."}]

    # ── Validate identifiers ──────────────────────────────────────────────────
    # Names are written unquoted into a script advertised as ready to run, so
    # anything that is not a plain identifier would break (or subvert) the DDL.
    cleaned: dict[str, str] = {}
    for label, raw in (("target_database", target_database), ("object_name", object_name)):
        identifier = _clean_identifier(raw)
        if not identifier:
            logger.warning("Tool: handle_graph_edgeContractDDL: Invalid %s '%s'", label, raw)
            return [
                {
                    "error": (
                        f"Invalid {label} '{raw}'. Must be a non-empty identifier of at most "
                        f"{_MAX_IDENTIFIER_LEN} characters using letters, digits, '_', '$' or '#', "
                        "and not starting with a digit."
                    )
                }
            ]
        cleaned[label] = identifier
    db = cleaned["target_database"]
    name = cleaned["object_name"]

    # ── Generate DDL (and sample DML for TABLE variant) ───────────────────────
    if output_type == "TABLE":
        ddl = _generate_table_ddl(db, name)
        sample_dml = _generate_sample_dml(db, name)
    else:
        ddl = _generate_view_ddl(db, name)
        sample_dml = None

    logger.info("Tool: handle_graph_edgeContractDDL: Generated %s DDL for %s.%s", output_type, db, name)

    result: dict[str, Any] = {
        "ddl": ddl,
        "output_type": output_type,
        "contract_version": _CONTRACT_VERSION,
    }
    if sample_dml is not None:
        result["sample_dml"] = sample_dml

    return [result]


# ──────────────────────────────────────────────────────────────────────────────── #
#  Internal DDL Templates                                                          #
# ──────────────────────────────────────────────────────────────────────────────── #


def _generate_table_ddl(db: str, name: str) -> str:
    """
    Generate CREATE TABLE DDL with column comments (DDL only — no DML).

    Follows the Teradata Engineering Discipline: DDL files contain only
    structural statements (CREATE, COMMENT, GRANT). Sample DML is
    returned separately by _generate_sample_dml().

    Args:
        db:   Target database name (already validated by the caller).
        name: Target table name (already validated by the caller).

    Returns:
        str: Teradata DDL script (CREATE TABLE + COMMENTs).
    """
    return f"""-- ================================================================
-- Graph Edge Contract — Edge Repository
-- Generated by: Teradata MCP Server (Community Edition)
-- Contract Version: {_CONTRACT_VERSION}
-- ================================================================

CREATE SET TABLE {db}.{name}
    ,NO FALLBACK
    ,NO BEFORE JOURNAL
    ,NO AFTER JOURNAL
    ,CHECKSUM = DEFAULT
    ,DEFAULT MERGEBLOCKRATIO
(
    -- ── Required columns (6) ─────────────────────────────────────
     Src_Container_Name   VARCHAR(128) CHARACTER SET UNICODE NOT CASESPECIFIC NOT NULL
    ,Src_Object_Name      VARCHAR(128) CHARACTER SET UNICODE NOT CASESPECIFIC NOT NULL
    ,Src_Kind             VARCHAR(30) CHARACTER SET UNICODE NOT CASESPECIFIC NOT NULL
        COMPRESS ('T','V','P','M','J','H','G','A','F','S','E','R',
                  'Table','View','Procedure','Macro','Job','Script')
    ,Tgt_Container_Name   VARCHAR(128) CHARACTER SET UNICODE NOT CASESPECIFIC NOT NULL
    ,Tgt_Object_Name      VARCHAR(128) CHARACTER SET UNICODE NOT CASESPECIFIC NOT NULL
    ,Tgt_Kind             VARCHAR(30) CHARACTER SET UNICODE NOT CASESPECIFIC NOT NULL
        COMPRESS ('T','V','P','M','J','H','G','A','F','S','E','R',
                  'Table','View','Procedure','Macro','Job','Script')
    -- ── Optional enrichment columns (2) ──────────────────────────
    -- Ignored by graph analysis tools; used by visualisation clients.
    ,Edge_Relationship    VARCHAR(50) CHARACTER SET UNICODE NOT CASESPECIFIC
        COMPRESS ('DIRECT','ETL_INPUT','ETL_OUTPUT',
                  'JOIN','TRANSFORM','FILTER')
    ,Transformation_Type  VARCHAR(50) CHARACTER SET UNICODE NOT CASESPECIFIC
        COMPRESS ('ETL','FEATURE_ENG','AGGREGATION','JOIN',
                  'EMBEDDING_GEN','FILTER','PIVOT')
)
UNIQUE PRIMARY INDEX (Src_Container_Name, Src_Object_Name, Tgt_Container_Name, Tgt_Object_Name)
;

-- ================================================================
-- NOTE: Multi-Value Compression (MVC) on kind and optional columns
-- ================================================================
-- Src_Kind / Tgt_Kind COMPRESS lists cover both single-letter codes
-- (legacy: T, V, P…) and full-word values (Table, View, Procedure…)
-- used by the lineage_graph view. Remove unused values for optimal
-- compression. Non-listed values store correctly but uncompressed.
--
-- Edge_Relationship and Transformation_Type COMPRESS lists cover the
-- standard values from the Observability Module. Extend as needed for
-- custom edge types in your edge repository.
-- ================================================================

COMMENT ON TABLE {db}.{name}
    AS 'Graph Edge Contract v{_CONTRACT_VERSION} - edge repository for Teradata MCP Server graph tools. Each row is a directed dependency: Target depends on Source. Required: 6 columns. Optional enrichment: Edge_Relationship, Transformation_Type.'
;

COMMENT ON COLUMN {db}.{name}.Src_Container_Name
    AS 'Source (upstream) container. Platform-agnostic: Teradata database, script directory, ETL workflow folder, etc.'
;

COMMENT ON COLUMN {db}.{name}.Src_Object_Name
    AS 'Source (upstream) object name.'
;

COMMENT ON COLUMN {db}.{name}.Src_Kind
    AS 'Source object type. Single-letter codes (T=Table, V=View, P=Procedure, M=Macro, J=JoinIndex, H=HashIndex, G=Trigger, S=Script, E=ETL Mapping) or full words (Table, View, Job). Custom values permitted.'
;

COMMENT ON COLUMN {db}.{name}.Tgt_Container_Name
    AS 'Target (downstream) container. Same semantics as Src_Container_Name.'
;

COMMENT ON COLUMN {db}.{name}.Tgt_Object_Name
    AS 'Target (downstream) object name.'
;

COMMENT ON COLUMN {db}.{name}.Tgt_Kind
    AS 'Target object type. Same value domain as Src_Kind.'
;

COMMENT ON COLUMN {db}.{name}.Edge_Relationship
    AS 'Optional. Nature of the dependency edge. Standard values: DIRECT (object dependency), ETL_INPUT (source to job), ETL_OUTPUT (job to target), JOIN, TRANSFORM, FILTER. Custom values permitted. Ignored by graph analysis tools.'
;

COMMENT ON COLUMN {db}.{name}.Transformation_Type
    AS 'Optional. Process or transformation category. Standard values: ETL, FEATURE_ENG, AGGREGATION, JOIN, EMBEDDING_GEN, FILTER, PIVOT. Sourced from data_lineage.transformation_type. Ignored by graph analysis tools.'
;"""


def _generate_sample_dml(db: str, name: str) -> str:
    """
    Generate sample INSERT statements and a validation query for a
    Graph Edge Contract table.

    Separated from the DDL to follow the Teradata Engineering Discipline:
    DDL files (.tbl) must never contain INSERT/SELECT statements.

    Args:
        db:   Target database name (already validated by the caller).
        name: Target table name (already validated by the caller).

    Returns:
        str: Sample DML script (four INSERTs + one validation SELECT).
    """
    return f"""-- ================================================================
-- Sample data — two edges forming a simple dependency chain:
--   CUSTOMER (table) <- CUSTOMER_ACTIVE (view) <- CUSTOMER_REPORT (view)
-- Optional columns omitted — they are not required for conformance.
-- ================================================================

INSERT INTO {db}.{name}
(    Src_Container_Name, Src_Object_Name, Src_Kind
    ,Tgt_Container_Name, Tgt_Object_Name, Tgt_Kind)
VALUES
(    'MY_DB_STD_T', 'CUSTOMER', 'Table'
    ,'MY_DB_STD_V', 'CUSTOMER_ACTIVE', 'View')
;

INSERT INTO {db}.{name}
(    Src_Container_Name, Src_Object_Name, Src_Kind
    ,Tgt_Container_Name, Tgt_Object_Name, Tgt_Kind)
VALUES
(    'MY_DB_STD_V', 'CUSTOMER_ACTIVE', 'View'
    ,'MY_DB_STD_V', 'CUSTOMER_REPORT', 'View')
;

-- ================================================================
-- Cross-platform example with optional enrichment columns populated.
-- An ETL job is surfaced as a first-class node (matching lineage_graph):
--   CUSTOMER (table) -> ETL_LOAD (job) -> CUSTOMER_FEATURES (table)
-- ================================================================

INSERT INTO {db}.{name}
(    Src_Container_Name, Src_Object_Name, Src_Kind
    ,Tgt_Container_Name, Tgt_Object_Name, Tgt_Kind
    ,Edge_Relationship, Transformation_Type)
VALUES
(    'MY_DB_STD_T', 'CUSTOMER', 'Table'
    ,'', 'ETL_LOAD', 'Job'
    ,'ETL_INPUT', 'ETL')
;

INSERT INTO {db}.{name}
(    Src_Container_Name, Src_Object_Name, Src_Kind
    ,Tgt_Container_Name, Tgt_Object_Name, Tgt_Kind
    ,Edge_Relationship, Transformation_Type)
VALUES
(    '', 'ETL_LOAD', 'Job'
    ,'MY_PRED_STD_T', 'CUSTOMER_FEATURES', 'Table'
    ,'ETL_OUTPUT', 'FEATURE_ENG')
;

-- ================================================================
-- Validation — confirm the edge repository meets the contract.
-- Only the six required columns must be NOT NULL.
-- Expected result: 0 violations.
-- ================================================================

SELECT 'NULL_CHECK' AS Validation
      ,COUNT(*) AS Violations
FROM {db}.{name}
WHERE Src_Container_Name IS NULL
   OR Src_Object_Name IS NULL
   OR Src_Kind IS NULL
   OR Tgt_Container_Name IS NULL
   OR Tgt_Object_Name IS NULL
   OR Tgt_Kind IS NULL
;"""


def _generate_view_ddl(db: str, name: str) -> str:
    """
    Generate a REPLACE VIEW DDL template for user customisation.

    The view body contains placeholder references that the user must
    replace with their actual lineage source table/view. Every selected
    column is qualified with the single alias (lin) declared in the FROM
    clause, so the template is internally consistent: only the source
    object and its column names need substituting.

    Args:
        db:   Target database name (already validated by the caller).
        name: Target view name (already validated by the caller).

    Returns:
        str: Teradata SQL script with placeholder source references.
    """
    # NOTE: '{{ProductName}}' is the f-string escape for a literal
    # '{ProductName}' placeholder in the generated text.
    return f"""-- ================================================================
-- Graph Edge Contract — Edge Repository (VIEW)
-- Generated by: Teradata MCP Server (Community Edition)
-- Contract Version: {_CONTRACT_VERSION}
--
-- Customise the SELECT below to map your lineage source to the
-- six required columns. The two optional enrichment columns
-- (Edge_Relationship, Transformation_Type) are included as
-- placeholders — map them or return NULL if not available.
-- ================================================================

REPLACE VIEW {db}.{name}
(
     Src_Container_Name
    ,Src_Object_Name
    ,Src_Kind
    ,Tgt_Container_Name
    ,Tgt_Object_Name
    ,Tgt_Kind
    -- Optional enrichment columns (NULL if not available in your source)
    ,Edge_Relationship
    ,Transformation_Type
)
AS
LOCKING ROW FOR ACCESS
SELECT
     lin.SrcContainerName  AS Src_Container_Name
    ,lin.SrcObjectName     AS Src_Object_Name
    ,lin.SrcObjectKind     AS Src_Kind
    ,lin.TgtContainerName  AS Tgt_Container_Name
    ,lin.TgtObjectName     AS Tgt_Object_Name
    ,lin.TgtObjectKind     AS Tgt_Kind
    -- ============================================================
    -- Map these to your actual columns, or use NULL if not available.
    -- Examples:
    --   lin.RelationshipType  AS Edge_Relationship
    --   lin.ProcessCategory   AS Transformation_Type
    -- ============================================================
    ,CAST(NULL AS VARCHAR(50)) AS Edge_Relationship
    ,CAST(NULL AS VARCHAR(50)) AS Transformation_Type
FROM
    -- ============================================================
    -- Replace this with your actual lineage source.
    -- Examples:
    --   Your_DB.Your_Lineage_Table
    --   A join across metadata tables
    --   A UNION ALL of multiple lineage sources
    --   {{ProductName}}_Observability.data_lineage (AI-Native Data Product)
    -- ============================================================
    YOUR_DATABASE.YOUR_LINEAGE_TABLE AS lin
    -- Map your source columns to the contract column aliases above.
;

COMMENT ON VIEW {db}.{name}
    AS 'Graph Edge Contract v{_CONTRACT_VERSION} - edge repository view for Teradata MCP Server graph tools. 6 required columns + 2 optional enrichment columns (Edge_Relationship, Transformation_Type). Customise the source query to map your lineage data.'
;"""


# ──────────────────────────────────────────────────────────────────────────────── #
#  Tool registration descriptor                                                    #
# ──────────────────────────────────────────────────────────────────────────────── #

# Descriptor dict for the graph_edgeContractDDL tool: name, handler, the
# description shown to agents, and the parameter schema (target_database is
# required; object_name and output_type carry defaults).
GRAPH_EDGE_CONTRACT_DDL_TOOL = {
    "name": "graph_edgeContractDDL",
    "handler": handle_graph_edgeContractDDL,
    "description": (
        "Generate Teradata DDL for a Graph Edge Contract-conforming edge "
        "repository table or view. Call this FIRST if you don't yet have an "
        "edge repository — all other graph_* tools require one. "
        "No database connection is used; DDL is returned as text ready to run. "
        "TABLE output includes separate sample DML. "
        "VIEW output generates a customisable template covering all 8 contract "
        "columns: 6 required (Src_Container_Name, Src_Object_Name, Src_Kind, "
        "Tgt_Container_Name, Tgt_Object_Name, Tgt_Kind) and 2 optional "
        "enrichment columns (Edge_Relationship, Transformation_Type) for use "
        "by graph visualisation tools. "
        "AI-Native Data Product shortcut: if you have an Observability Module "
        "(v1.5+), pass {ProductName}_Semantic.lineage_graph directly as "
        "edge_repository — it already conforms to this contract. "
        "Contract Version: 1.1."
    ),
    "parameters": {
        "target_database": {
            "type": "string",
            "description": (
                "Database in which to create the edge repository. "
                "For AI-Native Data Products this is typically "
                "{ProductName}_Semantic. "
                "Example: 'StGeoMortgage_Semantic'."
            ),
            "required": True,
        },
        "object_name": {
            "type": "string",
            "description": ("Name for the edge table or view. Default: 'EdgeRepository'."),
            "default": "EdgeRepository",
        },
        "output_type": {
            "type": "string",
            "description": (
                "'TABLE' (default): CREATE TABLE DDL + separate sample DML. "
                "'VIEW': CREATE VIEW template for mapping an existing lineage source."
            ),
            "default": "TABLE",
        },
    },
}
+ +Author: Paul Dancer — Teradata Global Field Tech +""" + +import logging +import time + +from teradatasql import TeradataConnection + +from teradata_mcp_server.tools.graph._graph_utils import parse_csv_patterns +from teradata_mcp_server.tools.utils import create_response, rows_to_json + +logger = logging.getLogger("teradata_mcp_server") + + +def handle_graph_findRootObjects( + conn: TeradataConnection, + container_pattern: str, + exclude_objects: str = "", + edge_repository: str = "", + object_types: str = "", + return_format: str = "detailed", + tool_name: str | None = None, + *args, + **kwargs, +): + """ + Find root objects (objects with no upstream dependencies) in specified containers. + + Root objects are ideal starting points for downstream impact analysis as they + represent the foundational data sources that themselves depend on nothing else. + + Use this for: + - Finding starting points for downstream impact analysis + - Identifying source tables and base objects in data pipelines + - Discovering independent objects that can be safely analysed in isolation + - Understanding data flow origins in a schema or database + - Planning migration or refactoring by identifying foundation objects + + Arguments: + container_pattern - str: Database/schema pattern(s) to search. SUPPORTS WILDCARDS (%) and CSV. + + IMPORTANT: This is a STRING parameter (type: str), not an array. + Pass multiple patterns as a single comma-separated string. 
+ + SINGLE CONTAINER: + 'DEV01_StGeo_STD_T' - Specific database + + WILDCARDS (%): + '%WBC%' - All databases containing WBC + 'DEV01_%' - All databases starting with DEV01_ + '%_STD_T' - All databases ending with _STD_T + + MULTIPLE CONTAINERS (CSV format): + '%WBC%,%StGeo%' - All WBC and StGeo databases + 'DEV01_StGeo_STD_T,DEV02_WBC_STD_T' - Specific databases + 'DEV01_%,DEV02_%' - All DEV01 and DEV02 databases + + WHITESPACE HANDLING: + Whitespace is automatically trimmed, so these are equivalent: + ✅ '%WBC%,%StGeo%' (no spaces) + ✅ '%WBC%, %StGeo%' (spaces after commas - OK) + + HOW TO PASS IN CODE: + Python: container_pattern="%WBC%,%StGeo%" + JSON: {"container_pattern": "%WBC%,%StGeo%"} + + CRITICAL: This is a STRING type parameter. + ✅ CORRECT: Pass as string: container_pattern="%WBC%,%StGeo%" + ❌ WRONG: Pass as array: container_pattern=["%WBC%", "%StGeo%"] + + exclude_objects - str: Comma-separated list of patterns to exclude (SERVER-SIDE filter). + Matches against DatabaseName.ObjectName format. + + Common exclusion patterns: + 'PRD_%,PROD_%' - Exclude production databases + '%.temp_%,%.bak_%' - Exclude temporary and backup objects + 'DFJ%,C_D02%' - Exclude personal/sandbox schemas + + Performance: Reduces result set and improves query time + Default: '' (empty string = no exclusions) + + edge_repository - str: Edge repository table/view conforming to the + Graph Edge Contract. + Required parameter — no default. + + object_types - str: Comma-separated list of object types to include (optional filter). 
+ Examples: 'T' (tables), 'V' (views), 'P' (procedures), 'M' (macros) + Multiple: 'T,V' (tables and views only) + Empty = all object types included + Default: '' (all types) + + return_format - str: Output format: 'detailed' or 'summary' + 'detailed' (default): Full object list with metadata + 'summary': High-level statistics and counts only + Default: 'detailed' + + Returns: + ResponseType: formatted response with root objects + metadata + + Example queries that trigger this tool: + - "Which objects in WBC and StGeo databases have no dependencies?" + - "Find root objects in DEV01 databases" + - "What are the starting points for impact analysis in StGeo?" + - "Show me base tables with no upstream dependencies" + - "Which objects should I start analysing for downstream impact?" + + Example calls: + # Find root objects in WBC and StGeo databases + handle_graph_findRootObjects( + conn=connection, + container_pattern="%WBC%,%StGeo%" + ) + + # Find only root tables (no views/procedures) + handle_graph_findRootObjects( + conn=connection, + container_pattern="DEV01_%", + object_types="T" + ) + + # Find root objects excluding production and temporary objects + handle_graph_findRootObjects( + conn=connection, + container_pattern="%WBC%,%StGeo%", + exclude_objects="PRD_%,%.temp_%,%.bak_%" + ) + + # Quick summary of root objects + handle_graph_findRootObjects( + conn=connection, + container_pattern="DEV01_StGeo_STD_T", + return_format="summary" + ) + + Technical Implementation: + - Queries the edge repository to find all objects in specified containers + - Identifies objects that appear as sources but never as targets + - These are "root" objects - they have no upstream dependencies + - Results are filtered by exclude_objects and object_types parameters + - Returns list of root objects suitable for downstream impact analysis + """ + logger.debug( + "Tool: handle_graph_findRootObjects: Args: " + "container_pattern=%s, exclude_objects=%s, edge_repository=%s, " + 
"object_types=%s, return_format=%s", + container_pattern, + exclude_objects, + edge_repository, + object_types, + return_format, + ) + + if not edge_repository: + return create_response( + {"error": "edge_repository is required. Call graph_edgeContractDDL to generate one."}, + { + "tool_name": tool_name or "graph_findRootObjects", + "status": "error", + }, + ) + + try: + with conn.cursor() as cur: + # Build the SQL query to find root objects using NOT EXISTS + # Root objects are those that appear as sources but never as targets + # (i.e., they have no upstream dependencies) + + # Parse container patterns (CSV support) + container_patterns = parse_csv_patterns(container_pattern) + + # Build LIKE clauses for container patterns - used in main WHERE and NOT EXISTS + container_conditions = [] + for pattern in container_patterns: + container_conditions.append(f"Src_Container_Name LIKE '{pattern}'") + + container_where = " OR ".join(container_conditions) + + # Build exclusion conditions if provided + exclusion_where = "" + if exclude_objects: + exclude_patterns = parse_csv_patterns(exclude_objects) + exclusion_conditions = [] + for pattern in exclude_patterns: + # Check if pattern contains a dot (fully qualified) or just database pattern + if "." 
in pattern: + # Fully qualified pattern like 'DB.Object' + db_part, obj_part = pattern.split(".", 1) + exclusion_conditions.append( + f"(o1.Src_Container_Name LIKE '{db_part}' AND o1.Src_Object_Name LIKE '{obj_part}')" + ) + else: + # Database-only pattern like 'PRD_%' + exclusion_conditions.append(f"o1.Src_Container_Name LIKE '{pattern}'") + + if exclusion_conditions: + exclusion_where = " AND NOT (" + " OR ".join(exclusion_conditions) + ")" + + # Build object type filter if provided + type_where = "" + if object_types: + type_list = [f"'{t.strip()}'" for t in object_types.split(",") if t.strip()] + if type_list: + type_where = f" AND o1.Src_Kind IN ({','.join(type_list)})" + + import time + + start_time = time.time() + # Main query to find root objects using NOT EXISTS + # This is more efficient than NOT IN for large datasets + # The query finds objects that exist as sources but never as targets + sql = f""" +LOCKING ROW FOR ACCESS +SELECT DISTINCT + o1.Src_Container_Name AS DatabaseName, + o1.Src_Object_Name AS ObjectName, + TRIM(o1.Src_Container_Name) || '.' || TRIM(o1.Src_Object_Name) AS FullyQualifiedName, + o1.Src_Kind AS ObjectType, + COUNT(DISTINCT o1.Tgt_Container_Name || '.' 
|| o1.Tgt_Object_Name) AS DownstreamDependentCount +FROM {edge_repository} o1 +WHERE ({container_where}) + {exclusion_where} + {type_where} + AND NOT EXISTS ( + SELECT 1 + FROM {edge_repository} o2 + WHERE o2.Tgt_Container_Name = o1.Src_Container_Name + AND o2.Tgt_Object_Name = o1.Src_Object_Name + AND ({container_where.replace("Src_Container_Name", "o2.Src_Container_Name")}) + ) +GROUP BY + o1.Src_Container_Name, + o1.Src_Object_Name, + o1.Src_Kind +ORDER BY + DownstreamDependentCount DESC, + o1.Src_Container_Name, + o1.Src_Object_Name + """ + + logger.debug("Tool: handle_graph_findRootObjects: Executing SQL:\n%s", sql) + + # Execute query + cur.execute(sql) + + query_time = time.time() - start_time + logger.debug("Tool: handle_graph_findRootObjects: Query execution took %.2fs", query_time) + + # Fetch all results and convert to list of dictionaries + # NOTE: rows_to_json takes (description, rows) - description FIRST! + root_objects = rows_to_json(cur.description, cur.fetchall()) + + logger.debug("Tool: handle_graph_findRootObjects: Found %d root objects", len(root_objects)) + if root_objects: + logger.debug("Tool: handle_graph_findRootObjects: First object: %s", root_objects[0]) + + # Safety check: ensure root_objects is a list of dicts, not a string + if not isinstance(root_objects, list): + logger.error( + "Tool: handle_graph_findRootObjects: root_objects is not a list — type: %s", type(root_objects) + ) + root_objects = [] + + # Format results based on return_format + if return_format == "summary": + formatted_data = _format_root_summary(root_objects, container_pattern) + else: # detailed + formatted_data = { + "root_objects": root_objects, + "summary": _create_root_summary_stats(root_objects, container_pattern), + } + + # Build metadata + metadata = { + "tool_name": tool_name if tool_name else "graph_findRootObjects", + "container_pattern": container_pattern, + "exclude_objects": exclude_objects, + "object_types": object_types, + "edge_repository": 
edge_repository, + "return_format": return_format, + "sql": sql, + "columns": [{"name": desc[0], "type": "str"} for desc in cur.description], + "row_count": len(root_objects), + "status": "success", + } + + logger.debug("Tool: handle_graph_findRootObjects: metadata: %s", metadata) + return create_response(formatted_data, metadata) + + except Exception as e: + logger.error("Tool: handle_graph_findRootObjects: Error: %s", e, exc_info=True) + return create_response( + {"error": str(e)}, + { + "tool_name": tool_name if tool_name else "graph_findRootObjects", + "container_pattern": container_pattern, + "status": "error", + }, + ) + + +def _create_root_summary_stats(root_objects: list, container_pattern: str) -> dict: + """ + Create summary statistics for root objects analysis. + + Arguments: + root_objects - List of root object dictionaries + container_pattern - Container pattern(s) searched + + Returns: + Dictionary with summary statistics + """ + # Count by object type + type_counts: dict[str, int] = {} + for obj in root_objects: + obj_type = obj.get("ObjectType", "Unknown") + type_counts[obj_type] = type_counts.get(obj_type, 0) + 1 + + # Count by database + db_counts: dict[str, int] = {} + for obj in root_objects: + db_name = obj.get("DatabaseName", "Unknown") + db_counts[db_name] = db_counts.get(db_name, 0) + 1 + + # Calculate total downstream dependencies + total_downstream = sum( + int(obj.get("DownstreamDependentCount", 0)) + if isinstance(obj.get("DownstreamDependentCount"), str) + else obj.get("DownstreamDependentCount", 0) + for obj in root_objects + ) + + # Find objects with most downstream dependencies + top_objects = sorted( + root_objects, + key=lambda x: ( + int(x.get("DownstreamDependentCount", 0)) + if isinstance(x.get("DownstreamDependentCount"), str) + else x.get("DownstreamDependentCount", 0) + ), + reverse=True, + )[:10] + + return { + "total_root_objects": len(root_objects), + "container_pattern": container_pattern, + "object_type_counts": 
type_counts, + "database_counts": db_counts, + "total_downstream_dependencies": total_downstream, + "average_downstream_per_root": round(total_downstream / len(root_objects), 2) if root_objects else 0, + "top_impact_objects": [ + { + "name": obj.get("FullyQualifiedName"), + "type": obj.get("ObjectType"), + "downstream_count": obj.get("DownstreamDependentCount"), + } + for obj in top_objects + ], + } + + +def _format_root_summary(root_objects: list, container_pattern: str) -> dict: + """ + Format a concise summary of root objects analysis. + + Arguments: + root_objects - List of root object dictionaries + container_pattern - Container pattern(s) searched + + Returns: + Dictionary with formatted summary + """ + stats = _create_root_summary_stats(root_objects, container_pattern) + + summary_text = f""" +ROOT OBJECTS ANALYSIS SUMMARY +{"=" * 60} + +Container Pattern(s): {container_pattern} + +OVERVIEW + Total Root Objects Found: {stats["total_root_objects"]} + Total Downstream Impact: {stats["total_downstream_dependencies"]} objects + Avg Downstream per Root: {stats["average_downstream_per_root"]} + +DEFINITION + Root objects are objects with NO upstream dependencies. + They represent foundational data sources and are ideal + starting points for downstream impact analysis. +""" + + if stats["object_type_counts"]: + summary_text += "\nBY OBJECT TYPE\n" + for obj_type, count in sorted(stats["object_type_counts"].items(), key=lambda x: x[1], reverse=True): + summary_text += f" {obj_type:20s} {count:3d}\n" + + if stats["database_counts"]: + summary_text += "\nBY DATABASE\n" + for db_name, count in sorted(stats["database_counts"].items(), key=lambda x: x[1], reverse=True)[:10]: + summary_text += f" {db_name:40s} {count:3d}\n" + + if len(stats["database_counts"]) > 10: + summary_text += f" ... 
# ------------------------------------------------------------------
# Tool registration descriptor
# ------------------------------------------------------------------
# Descriptor dict for the graph_findRootObjects tool. graph_tools.py imports
# this constant and lists it in GRAPH_TOOLS. It bundles the tool name, the
# handler callable, the agent-facing description and the parameter schema.
GRAPH_FIND_ROOT_OBJECTS_TOOL = {
    "name": "graph_findRootObjects",
    "handler": handle_graph_findRootObjects,
    "description": (
        "Find root objects — objects with no upstream dependencies — in the "
        "specified containers. Root objects are foundational data sources and "
        "ideal starting points for downstream impact analysis or migration wave "
        "planning. Results are ordered by downstream dependent count descending. "
        "Use graph_bfsLevels after this tool to compute hop distances from the "
        "identified root objects. "
        "Requires an edge repository conforming to the Graph Edge Contract. "
        "If you don't have one yet, call graph_edgeContractDDL first to "
        "generate the CREATE TABLE or CREATE VIEW DDL."
    ),
    "parameters": {
        # Passed as ONE comma-separated string, never as a list; the handler
        # splits it and ORs one LIKE clause per pattern.
        "container_pattern": {
            "type": "string",
            "description": (
                "CSV LIKE patterns for databases/schemas to search. Supports wildcards: '%WBC%' or '%WBC%,%StGeo%'."
            ),
            "required": True,
        },
        # In the handler, a pattern containing a dot is split into container and
        # object parts; a pattern without a dot is matched against the container
        # name only.
        "exclude_objects": {
            "type": "string",
            "description": ("CSV of FQ object name LIKE patterns to exclude. Example: 'PRD_%,%.temp_%'. Default: ''."),
            "default": "",
        },
        # The handler returns an error response when this is empty.
        "edge_repository": {
            "type": "string",
            "description": (
                "Edge repository table or view conforming to the Graph Edge Contract. "
                "Call graph_edgeContractDDL to generate one if needed. "
                "Required parameter — no default."
            ),
            "required": True,
        },
        # The values are compared verbatim against Src_Kind with IN (...).
        # NOTE(review): the example here uses 'Table'/'View' while the handler
        # docstring uses single-letter codes ('T', 'V', 'P', 'M') — confirm which
        # form the edge repository actually stores and align the two.
        "object_types": {
            "type": "string",
            "description": (
                "CSV of object type codes to include. Example: 'Table' or 'Table,View'. Default: '' (all types)."
            ),
            "default": "",
        },
        # Any value other than 'summary' is treated as 'detailed' by the handler.
        "return_format": {
            "type": "string",
            "description": "Output format: 'detailed' (default) or 'summary'.",
            "default": "detailed",
        },
    },
}
PARALLEL DEVELOPMENT + Multiple engineers can work on different tools simultaneously without + merge conflicts. Separate files eliminate the constant collision source + that a shared monolith creates. + +4. TESTABILITY + Each tool file can be unit-tested in isolation. A test for + graph_bfsLevels only needs to import that one module and mock the + connection — it does not pull in other tools, their imports, or their + dependencies. + +5. SEPARATION OF CONCERNS + Tool logic, shared utilities, and server registration are three + distinct concerns. They now live in three distinct places: + graph/graph_{tool_name}.py — handler logic + descriptor + graph/_graph_utils.py — shared BFS helpers (internal, not a tool) + graph_tools.py — this file: registration only + +────────────────────────────────────────────────────────────────────── +PACKAGE STRUCTURE +────────────────────────────────────────────────────────────────────── + + teradata_mcp_server/tools/ + ├── graph_tools.py ← YOU ARE HERE (hub only) + ├── graph/ + │ ├── __init__.py + │ ├── _graph_utils.py ← shared helpers (bfs_safe_int, + │ │ create_bfs_summary, + │ │ extract_cycle_candidates) + │ ├── graph_trace_lineage.py ← hybrid: Python CTEs, server-side traversal + │ ├── graph_find_root_objects.py ← SQL-only root object discovery + │ ├── graph_detect_cycles.py ← Python: Union-Find + iterative DFS + │ ├── graph_connected_components.py ← Python: Union-Find WCC analysis + │ └── graph_bfs_levels.py ← Python BFS (no SP dependency) + └── utils.py ← shared MCP utilities (create_response etc.) + +────────────────────────────────────────────────────────────────────── +ADDING A NEW TOOL +────────────────────────────────────────────────────────────────────── + + 1. Create graph/graph_{tool_name}.py following the existing module + pattern (module docstring, imports, handler, descriptor constant). + 2. Import the handler and descriptor here (two lines below). + 3. Add the descriptor to GRAPH_TOOLS (one line below). + 4. Create tests/tools/graph/test_graph_{tool_name}.py.
+ +Nothing else changes — the MCP server consumes GRAPH_TOOLS unchanged. + +────────────────────────────────────────────────────────────────────── +SP-FREE ARCHITECTURE — ALL TOOLS +────────────────────────────────────────────────────────────────────── + +All graph tools in this package are free of stored procedure (SP) +dependencies. No Teradata DDL objects are required beyond read access +to the edge repository view/table. The implementation strategies are: + + graph_findRootObjects + Pure SQL SELECT — NOT EXISTS subquery identifies objects with no + upstream dependencies. No Python algorithm required. + + graph_bfsLevels + Pure Python — one bulk edge SELECT, then standard queue-based BFS + (O(V+E)) in the MCP server process. Replaced an SP-based + Bellman-Ford SQL relaxation loop. + + graph_detectCycles + Pure Python — one scoped edge SELECT, then Union-Find WCC + partitioning followed by iterative DFS (grey/black colouring). + Iterative DFS avoids Python's recursion limit on deep graphs. + + graph_connectedComponents + Pure Python — one scoped edge SELECT, then path-compressed + Union-Find assigns every node to a component in O(α·N) time. + + graph_traceLineage + Hybrid — Python constructs Teradata recursive CTEs and executes + them as plain SELECT statements. The recursive traversal runs + entirely in Teradata spool (server-side), returning only the + reachable subgraph across the network. Python owns orchestration, + deduplication, and response assembly. This approach avoids + transferring the full edge table when only a small subgraph is + needed — critical at scale (100 000+ edges). + +The only Teradata privilege required across all tools is SELECT on +the edge_repository view/table. 
+ +────────────────────────────────────────────────────────────────────── +""" + +import logging + +from teradata_mcp_server.tools.graph.graph_analyse_database import ( + GRAPH_ANALYSE_DATABASE_TOOL, + handle_graph_analyseDatabase, +) +from teradata_mcp_server.tools.graph.graph_bfs_levels import ( + GRAPH_BFS_LEVELS_TOOL, + handle_graph_bfsLevels, +) +from teradata_mcp_server.tools.graph.graph_connected_components import ( + GRAPH_CONNECTED_COMPONENTS_TOOL, + handle_graph_connectedComponents, +) +from teradata_mcp_server.tools.graph.graph_detect_cycles import ( + GRAPH_DETECT_CYCLES_TOOL, + handle_graph_detectCycles, +) +from teradata_mcp_server.tools.graph.graph_edge_contract import ( + GRAPH_EDGE_CONTRACT_DDL_TOOL, + handle_graph_edgeContractDDL, +) + +# ── Individual tool imports ──────────────────────────────────────────────── +# +# Each import pair brings in: +# handle_* — the callable handler passed to the MCP framework +# *_TOOL — the descriptor dict (name, handler ref, description, parameters) +# +# Import order matches logical workflow: +# findRootObjects → bfsLevels → traceLineage → detectCycles → connectedComponents → analyseDatabase +from teradata_mcp_server.tools.graph.graph_find_root_objects import ( + GRAPH_FIND_ROOT_OBJECTS_TOOL, + handle_graph_findRootObjects, +) +from teradata_mcp_server.tools.graph.graph_trace_lineage import ( + GRAPH_TRACE_LINEAGE_TOOL, + handle_graph_traceLineage, +) + +logger = logging.getLogger("teradata_mcp_server") + +# ── Tool registry ────────────────────────────────────────────────────────── +# +# GRAPH_TOOLS is the single list consumed by the MCP server at startup. +# The server iterates this list and registers each tool's name, handler, +# and parameter schema with the MCP protocol layer. +# +# Order here controls the order tools appear in MCP tool listings. +# Workflow order (roots → BFS → dependencies → cycles → components) +# makes the listing intuitive for both humans and AI agents. 
#
# To disable a tool temporarily: comment out its entry here.
# To add a new tool: append its descriptor (see ADDING A NEW TOOL above).

# Each entry is a descriptor dict imported from its own tool module. Every
# descriptor must expose a "name" key, because the debug log below reads it.
GRAPH_TOOLS = [
    GRAPH_EDGE_CONTRACT_DDL_TOOL,  # Step 0 — generate edge repository DDL
    GRAPH_FIND_ROOT_OBJECTS_TOOL,  # Step 1 — discover seed objects
    GRAPH_BFS_LEVELS_TOOL,  # Step 2 — wave planning + blast radius
    GRAPH_TRACE_LINEAGE_TOOL,  # Step 3 — full lineage + impact paths
    GRAPH_DETECT_CYCLES_TOOL,  # Step 4 — cycle validation
    GRAPH_CONNECTED_COMPONENTS_TOOL,  # Step 5 — graph partitioning
    GRAPH_ANALYSE_DATABASE_TOOL,  # Step 6 — composite single-fetch analysis
]

# Module-level statement: it runs when this module is imported and is only
# emitted if the "teradata_mcp_server" logger is enabled for DEBUG.
# NOTE(review): the message says "registered", but this module only assembles
# the list — the actual MCP registration presumably happens in the server.
logger.debug("graph_tools: registered %d tools: %s", len(GRAPH_TOOLS), [t["name"] for t in GRAPH_TOOLS])
+ +Recursive CTE direction convention (matches Edge Repository / graph_bfsLevels): + Edge Repository row: Src is REFERENCED BY Tgt. + => Src is the DEPENDENCY (upstream of Tgt). + => Tgt is the DEPENDENT (downstream of Src). + + Upstream CTE — "what does my seed depend on?": + Anchor on seed as Tgt; recurse by following Src side outward. + + Downstream CTE — "what depends on my seed?": + Anchor on seed as Src; recurse by following Tgt side outward. + +Author: Paul Dancer — Teradata Global Field Tech +""" + +import logging + +from teradatasql import TeradataConnection + +from teradata_mcp_server.tools.graph._graph_utils import parse_csv_patterns +from teradata_mcp_server.tools.utils import create_response, rows_to_json + +logger = logging.getLogger("teradata_mcp_server") + + +# --------------------------------------------------------------------------- +# Internal helpers — pattern parsing +# --------------------------------------------------------------------------- +# parse_csv_patterns is imported from _graph_utils. +# _build_or_like is kept local — it covers both Src and Tgt columns +# simultaneously, which is a different pattern from build_like_or. + + +def _build_or_like(patterns: list[str], src_col: str, tgt_col: str) -> str: + """ + Build an OR-joined pair of LIKE clauses covering both Src and Tgt columns. + + Used to scope the recursive CTE anchor and recursion steps so that only + edges touching the requested containers participate. + + Arguments: + patterns - List of LIKE pattern strings for container names + src_col - SQL column name for the source container + tgt_col - SQL column name for the target container + + Returns: + SQL fragment, e.g. + "({src_col} LIKE 'A%' OR {tgt_col} LIKE 'A%' + OR {src_col} LIKE 'B%' OR {tgt_col} LIKE 'B%')" + Returns empty string if patterns is empty (no filtering). 
+ """ + if not patterns: + return "" + clauses = [] + for p in patterns: + clauses.append(f"{src_col} LIKE '{p}'") + clauses.append(f"{tgt_col} LIKE '{p}'") + return "AND (" + " OR ".join(clauses) + ")" + + +def _build_excl_fragment(patterns: list[str], db_col: str, obj_col: str) -> str: + """ + Build a NOT (...) exclusion fragment for object-level filtering. + + A pattern containing a dot is treated as a fully-qualified DB.Object + pattern; a plain pattern is matched against the container/DB column only. + + Arguments: + patterns - List of exclusion LIKE patterns + db_col - SQL column holding the database/schema name + obj_col - SQL column holding the object name + + Returns: + SQL fragment beginning with "AND NOT (...)" or empty string + """ + if not patterns: + return "" + + conditions = [] + for p in patterns: + if "." in p: + db_part, obj_part = p.split(".", 1) + conditions.append(f"({db_col} LIKE '{db_part}' AND {obj_col} LIKE '{obj_part}')") + else: + conditions.append(f"{db_col} LIKE '{p}'") + + return "AND NOT (" + " OR ".join(conditions) + ")" + + +# --------------------------------------------------------------------------- +# CTE builders +# --------------------------------------------------------------------------- + + +def _build_upstream_cte( + seed_pattern: str, + max_depth: int, + edge_table: str, + incl_fragment: str, + excl_fragment: str, +) -> str: + """ + Build a Teradata recursive CTE that traverses upstream from a seed pattern. + + "Upstream" means: what does my seed DEPEND ON? In Edge Repository terms, + when a row has Tgt matching the seed, Src is the upstream dependency. + The anchor selects rows where Tgt matches the seed; recursion follows + the Src side outward (each discovered Src becomes the next Tgt to search). 
+ + Arguments: + seed_pattern - LIKE pattern for the seed object (DB.Object format) + max_depth - Maximum hop count to traverse + edge_table - Fully-qualified edge repository view/table name + incl_fragment - SQL fragment for container inclusion ("AND (...)") or '' + excl_fragment - SQL fragment for object exclusion ("AND NOT (...)") or '' + + Returns: + Complete WITH RECURSIVE ... SELECT statement as a string + """ + return f""" +WITH RECURSIVE UpstreamBFS + ( + Src_DB + ,Src_Obj + ,Src_Kind + ,Tgt_DB + ,Tgt_Obj + ,Tgt_Kind + ,Depth + ,Path_Str + ) AS +( + -- ---------------------------------------------------------------- + -- Anchor: edges where the target matches the seed pattern + -- ---------------------------------------------------------------- + SELECT + TRIM(e.Src_Container_Name) + ,TRIM(e.Src_Object_Name) + ,COALESCE(TRIM(e.Src_Kind), 'Unknown') + ,TRIM(e.Tgt_Container_Name) + ,TRIM(e.Tgt_Object_Name) + ,COALESCE(TRIM(e.Tgt_Kind), 'Unknown') + ,CAST(1 AS INTEGER) + ,CAST( + TRIM(e.Src_Container_Name) || '.' || TRIM(e.Src_Object_Name) + || ' <- ' + || TRIM(e.Tgt_Container_Name) || '.' || TRIM(e.Tgt_Object_Name) + AS VARCHAR(8000) + ) + FROM {edge_table} e + WHERE (TRIM(e.Tgt_Container_Name) || '.' || TRIM(e.Tgt_Object_Name)) + LIKE '{seed_pattern}' + {incl_fragment} + {excl_fragment} + + UNION ALL + + -- ---------------------------------------------------------------- + -- Recursion: follow the Src side of each already-discovered edge + -- ---------------------------------------------------------------- + SELECT + TRIM(e.Src_Container_Name) + ,TRIM(e.Src_Object_Name) + ,COALESCE(TRIM(e.Src_Kind), 'Unknown') + ,TRIM(e.Tgt_Container_Name) + ,TRIM(e.Tgt_Object_Name) + ,COALESCE(TRIM(e.Tgt_Kind), 'Unknown') + ,b.Depth + 1 + ,CAST( + TRIM(e.Src_Container_Name) || '.' 
|| TRIM(e.Src_Object_Name) + || ' <- ' + || b.Path_Str + AS VARCHAR(8000) + ) + FROM {edge_table} e + INNER JOIN UpstreamBFS b + ON TRIM(e.Tgt_Container_Name) = b.Src_DB + AND TRIM(e.Tgt_Object_Name) = b.Src_Obj + WHERE b.Depth < {max_depth} + {incl_fragment} + {excl_fragment} +) +SELECT + Src_DB AS DependentObjectDBName + ,Src_Obj AS DependentObjectName + ,Src_DB || '.' || Src_Obj AS FQDependentObjectName + ,Tgt_DB AS ReferencedObjectDBName + ,Tgt_Obj AS ReferencedObjectName + ,Tgt_DB || '.' || Tgt_Obj AS FQReferencedObjectName + ,Src_Kind AS Src_Kind + ,Tgt_Kind AS Tgt_Kind + ,CAST(Depth * -1 AS INTEGER) AS Depth + ,Path_Str AS DependencyPath +FROM UpstreamBFS +ORDER BY Depth ASC, FQDependentObjectName +""" + + +def _build_downstream_cte( + seed_pattern: str, + max_depth: int, + edge_table: str, + incl_fragment: str, + excl_fragment: str, +) -> str: + """ + Build a Teradata recursive CTE that traverses downstream from a seed pattern. + + "Downstream" means: what DEPENDS ON my seed? In Edge Repository terms, + when a row has Src matching the seed, Tgt is the downstream dependent. + The anchor selects rows where Src matches the seed; recursion follows + the Tgt side outward (each discovered Tgt becomes the next Src to search). + + Arguments: + seed_pattern - LIKE pattern for the seed object (DB.Object format) + max_depth - Maximum hop count to traverse + edge_table - Fully-qualified edge repository view/table name + incl_fragment - SQL fragment for container inclusion ("AND (...)") or '' + excl_fragment - SQL fragment for object exclusion ("AND NOT (...)") or '' + + Returns: + Complete WITH RECURSIVE ... 
SELECT statement as a string + """ + return f""" +WITH RECURSIVE DownstreamBFS + ( + Src_DB + ,Src_Obj + ,Src_Kind + ,Tgt_DB + ,Tgt_Obj + ,Tgt_Kind + ,Depth + ,Path_Str + ) AS +( + -- ---------------------------------------------------------------- + -- Anchor: edges where the source matches the seed pattern + -- ---------------------------------------------------------------- + SELECT + TRIM(e.Src_Container_Name) + ,TRIM(e.Src_Object_Name) + ,COALESCE(TRIM(e.Src_Kind), 'Unknown') + ,TRIM(e.Tgt_Container_Name) + ,TRIM(e.Tgt_Object_Name) + ,COALESCE(TRIM(e.Tgt_Kind), 'Unknown') + ,CAST(1 AS INTEGER) + ,CAST( + TRIM(e.Src_Container_Name) || '.' || TRIM(e.Src_Object_Name) + || ' -> ' + || TRIM(e.Tgt_Container_Name) || '.' || TRIM(e.Tgt_Object_Name) + AS VARCHAR(8000) + ) + FROM {edge_table} e + WHERE (TRIM(e.Src_Container_Name) || '.' || TRIM(e.Src_Object_Name)) + LIKE '{seed_pattern}' + {incl_fragment} + {excl_fragment} + + UNION ALL + + -- ---------------------------------------------------------------- + -- Recursion: follow the Tgt side of each already-discovered edge + -- ---------------------------------------------------------------- + SELECT + TRIM(e.Src_Container_Name) + ,TRIM(e.Src_Object_Name) + ,COALESCE(TRIM(e.Src_Kind), 'Unknown') + ,TRIM(e.Tgt_Container_Name) + ,TRIM(e.Tgt_Object_Name) + ,COALESCE(TRIM(e.Tgt_Kind), 'Unknown') + ,b.Depth + 1 + ,CAST( + b.Path_Str + || ' -> ' + || TRIM(e.Tgt_Container_Name) || '.' || TRIM(e.Tgt_Object_Name) + AS VARCHAR(8000) + ) + FROM {edge_table} e + INNER JOIN DownstreamBFS b + ON TRIM(e.Src_Container_Name) = b.Tgt_DB + AND TRIM(e.Src_Object_Name) = b.Tgt_Obj + WHERE b.Depth < {max_depth} + {incl_fragment} + {excl_fragment} +) +SELECT + Tgt_DB AS DependentObjectDBName + ,Tgt_Obj AS DependentObjectName + ,Tgt_DB || '.' || Tgt_Obj AS FQDependentObjectName + ,Src_DB AS ReferencedObjectDBName + ,Src_Obj AS ReferencedObjectName + ,Src_DB || '.' 
|| Src_Obj AS FQReferencedObjectName + ,Src_Kind AS Src_Kind + ,Tgt_Kind AS Tgt_Kind + ,CAST(Depth AS INTEGER) AS Depth + ,Path_Str AS DependencyPath +FROM DownstreamBFS +ORDER BY Depth ASC, FQDependentObjectName +""" + + +# --------------------------------------------------------------------------- +# Node / summary helpers — identical contract to the SP-based version +# --------------------------------------------------------------------------- + + +def _safe_int(value) -> int: + """ + Safely convert a value to int, returning 0 on failure. + + Arguments: + value - Any value (may be Teradata BYTEINT returned as string) + + Returns: + int + """ + try: + return int(value) if value is not None else 0 + except (ValueError, TypeError): + return 0 + + +def _derive_nodes_from_edges( + edges_up: list[dict], + edges_down: list[dict], +) -> list[dict]: + """ + Derive unique nodes from edge lists. + + Deduplicates by FQDependentObjectName, preferring the upstream record when + a node appears in both directions. 
+ + Arguments: + edges_up - List of upstream edge dicts + edges_down - List of downstream edge dicts + + Returns: + List of unique node dicts + """ + nodes: dict[str, dict] = {} + + for edge in edges_up: + fq = edge.get("FQDependentObjectName") + if fq and fq not in nodes: + nodes[fq] = { + "FQDependentObjectName": fq, + "DependentObjectDBName": edge.get("DependentObjectDBName"), + "DependentObjectName": edge.get("DependentObjectName"), + "Direction": "Upstream", + "Depth": _safe_int(edge.get("Depth", 0)), + "ObjectType": edge.get("Src_Kind") or edge.get("Tgt_Kind"), + } + + for edge in edges_down: + fq = edge.get("FQDependentObjectName") + if fq and fq not in nodes: + nodes[fq] = { + "FQDependentObjectName": fq, + "DependentObjectDBName": edge.get("DependentObjectDBName"), + "DependentObjectName": edge.get("DependentObjectName"), + "Direction": "Downstream", + "Depth": _safe_int(edge.get("Depth", 0)), + "ObjectType": edge.get("Src_Kind") or edge.get("Tgt_Kind"), + } + + return list(nodes.values()) + + +def _create_summary_stats( + nodes: list[dict], + edges_up: list[dict], + edges_down: list[dict], +) -> dict: + """ + Create summary statistics from dependency data. 
+ + Arguments: + nodes - List of node dicts + edges_up - List of upstream edge dicts + edges_down - List of downstream edge dicts + + Returns: + Dictionary of summary statistics + """ + upstream_nodes = [n for n in nodes if n.get("Direction") == "Upstream"] + downstream_nodes = [n for n in nodes if n.get("Direction") == "Downstream"] + + type_counts: dict[str, int] = {} + for node in nodes: + kind = node.get("ObjectType", "Unknown") or "Unknown" + type_counts[kind] = type_counts.get(kind, 0) + 1 + + return { + "total_nodes": len(nodes), + "upstream_nodes": len(upstream_nodes), + "downstream_nodes": len(downstream_nodes), + "total_edges": len(edges_up) + len(edges_down), + "upstream_edges": len(edges_up), + "downstream_edges": len(edges_down), + "max_depth_upstream": max((abs(_safe_int(n.get("Depth", 0))) for n in upstream_nodes), default=0), + "max_depth_downstream": max((_safe_int(n.get("Depth", 0)) for n in downstream_nodes), default=0), + "object_type_counts": type_counts, + } + + +def _format_summary( + nodes: list[dict], + edges_up: list[dict], + edges_down: list[dict], + object_name: str, +) -> dict: + """ + Format a concise summary of dependency analysis. 
+ + Arguments: + nodes - List of node dicts + edges_up - List of upstream edge dicts + edges_down - List of downstream edge dicts + object_name - Object name pattern(s) analysed (may be CSV) + + Returns: + Dictionary with summary_text, statistics, upstream_objects, downstream_objects + """ + stats = _create_summary_stats(nodes, edges_up, edges_down) + upstream_nodes = [n for n in nodes if n.get("Direction") == "Upstream"] + downstream_nodes = [n for n in nodes if n.get("Direction") == "Downstream"] + + summary_text = f""" +DEPENDENCY ANALYSIS SUMMARY +{"=" * 60} + +Object Pattern(s): {object_name} + +OVERVIEW + Total Nodes: {stats["total_nodes"]} + Total Edges: {stats["total_edges"]} + +UPSTREAM (What These Objects Depend On) + Dependencies Found: {stats["upstream_nodes"]} + Edges: {stats["upstream_edges"]} + Max Depth Reached: {stats["max_depth_upstream"]} + +DOWNSTREAM (What Depends On These Objects) + Dependents Found: {stats["downstream_nodes"]} + Edges: {stats["downstream_edges"]} + Max Depth Reached: {stats["max_depth_downstream"]} +""" + + if stats["object_type_counts"]: + summary_text += "\nBY OBJECT TYPE\n" + for obj_type, count in sorted(stats["object_type_counts"].items(), key=lambda x: x[1], reverse=True): + summary_text += f" {obj_type:20s} {count:3d}\n" + + return { + "summary_text": summary_text, + "statistics": stats, + "upstream_objects": [n["FQDependentObjectName"] for n in upstream_nodes], + "downstream_objects": [n["FQDependentObjectName"] for n in downstream_nodes], + } + + +# --------------------------------------------------------------------------- +# Public handler +# --------------------------------------------------------------------------- + + +def handle_graph_traceLineage( + conn: TeradataConnection, + object_name: str, + max_depth_up: int = 3, + max_depth_down: int = 3, + exclude_objects: str = "", + include_containers: str = "", + edge_repository: str = "", + return_format: str = "detailed", + tool_name: str | None = None, + *args, + 
**kwargs, +): + """ + Analyse object dependencies in Teradata. Supports wildcards (%) and CSV patterns. + + Hybrid implementation — no stored procedure required. Python constructs + Teradata recursive CTEs that execute entirely server-side. Only the + reachable subgraph crosses the network — not the full edge table. + + Examples: 'DB.Table' (single), '%WBC%.%' (wildcard), 'DB.T1,DB.T2' (CSV) + + Finds upstream dependencies (what the object depends on) and downstream + dependents (what depends on the object). Returns nodes and edges + representing the dependency subgraph. + + When multiple patterns are provided via CSV, one upstream CTE and one + downstream CTE is executed per pattern. Results are merged and + deduplicated by Python before assembly. + + Use this for: + - Impact analysis: "What breaks if I change or drop this object?" + - Lineage tracing: "Where does this data come from?" + - Dependency discovery: "What does this object use?" + - Pre-deployment validation: checking impacts before making changes + + Arguments: + object_name - str: Object name pattern(s). + Supports wildcards (%) and CSV format. + STRING type — not an array. + + Single: 'DEV01_StGeo_STD_T.mortgage_account' + Wildcard: '%WBC%.%' + Multiple: '%WBC%.%,%StGeo%.%' + + max_depth_up - int: Maximum levels to traverse upstream (0-10). + 0 = no upstream analysis. Default: 3 + + max_depth_down - int: Maximum levels to traverse downstream (0-10). + 0 = no downstream analysis. Default: 3 + + exclude_objects - str: CSV LIKE patterns to exclude. + Matches against DB.Object format. + Example: 'PRD_%,%.temp_%' + Default: '' (no exclusions) + + include_containers - str: CSV of container LIKE patterns to include + (whitelist). Empty = all containers. + Default: '' (all containers) + + edge_repository - str: Edge repository view/table conforming to the + Required parameter — no default. 
+ + return_format - str: 'detailed' (default), 'summary', or 'edges_only' + + Returns: + ResponseType: formatted response with dependency analysis results. + + detailed response structure: + { + "nodes": [...], // Unique nodes (deduplicated) + "upstream_edges": [...], // One row per upstream edge + "downstream_edges":[...], // One row per downstream edge + "summary": {...} // Aggregate statistics + } + + Edge row fields: + DependentObjectDBName, DependentObjectName, FQDependentObjectName, + ReferencedObjectDBName, ReferencedObjectName, FQReferencedObjectName, + Src_Kind, Tgt_Kind, Depth, DependencyPath + """ + logger.debug( + "Tool: handle_graph_traceLineage: Args: " + "object_name=%s, max_depth_up=%s, max_depth_down=%s, " + "exclude_objects=%s, include_containers=%s, " + "edge_repository=%s, return_format=%s", + object_name, + max_depth_up, + max_depth_down, + exclude_objects, + include_containers, + edge_repository, + return_format, + ) + + # ----------------------------------------------------------------------- + # Validate and clamp depth parameters + # ----------------------------------------------------------------------- + max_depth_up = max(0, min(10, int(max_depth_up))) + max_depth_down = max(0, min(10, int(max_depth_down))) + + # ----------------------------------------------------------------------- + # Parse pattern inputs + # ----------------------------------------------------------------------- + seed_patterns = parse_csv_patterns(object_name) + excl_patterns = parse_csv_patterns(exclude_objects) + incl_containers = parse_csv_patterns(include_containers) + + if not seed_patterns: + return create_response( + {"error": "object_name must not be empty"}, + { + "tool_name": tool_name or "graph_traceLineage", + "object_name": object_name, + "status": "error", + }, + ) + + if not edge_repository: + return create_response( + {"error": "edge_repository is required. 
Call graph_edgeContractDDL to generate one."}, + { + "tool_name": tool_name or "graph_traceLineage", + "object_name": object_name, + "status": "error", + }, + ) + + try: + # ----------------------------------------------------------------------- + # Build shared SQL fragments (same for every seed pattern) + # ----------------------------------------------------------------------- + incl_fragment = _build_or_like(incl_containers, "e.Src_Container_Name", "e.Tgt_Container_Name") + excl_fragment = _build_excl_fragment(excl_patterns, "e.Src_Container_Name", "e.Src_Object_Name") + + all_edges_up: list[dict] = [] + all_edges_down: list[dict] = [] + + with conn.cursor() as cur: + for pattern in seed_patterns: + # --------------------------------------------------------------- + # Upstream traversal (skip if max_depth_up == 0) + # --------------------------------------------------------------- + if max_depth_up > 0: + up_sql = _build_upstream_cte( + seed_pattern=pattern, + max_depth=max_depth_up, + edge_table=edge_repository, + incl_fragment=incl_fragment, + excl_fragment=excl_fragment, + ) + logger.debug("Tool: handle_graph_traceLineage: Upstream CTE for pattern '%s':\n%s", pattern, up_sql) + cur.execute(up_sql) + batch = rows_to_json(cur.description, cur.fetchall()) + all_edges_up.extend(batch) + logger.debug( + "Tool: handle_graph_traceLineage: Pattern '%s' upstream: %d edges", pattern, len(batch) + ) + + # --------------------------------------------------------------- + # Downstream traversal (skip if max_depth_down == 0) + # --------------------------------------------------------------- + if max_depth_down > 0: + down_sql = _build_downstream_cte( + seed_pattern=pattern, + max_depth=max_depth_down, + edge_table=edge_repository, + incl_fragment=incl_fragment, + excl_fragment=excl_fragment, + ) + logger.debug( + "Tool: handle_graph_traceLineage: Downstream CTE for pattern '%s':\n%s", pattern, down_sql + ) + cur.execute(down_sql) + batch = rows_to_json(cur.description, 
cur.fetchall()) + all_edges_down.extend(batch) + logger.debug( + "Tool: handle_graph_traceLineage: Pattern '%s' downstream: %d edges", pattern, len(batch) + ) + + # ----------------------------------------------------------------------- + # Deduplicate edges by (FQDependentObjectName, FQReferencedObjectName) + # ----------------------------------------------------------------------- + def _dedup(edges: list[dict]) -> list[dict]: + """Remove duplicate edges, keeping the first occurrence.""" + seen: set[tuple] = set() + out: list[dict] = [] + for e in edges: + key = ( + e.get("FQDependentObjectName"), + e.get("FQReferencedObjectName"), + ) + if key not in seen: + seen.add(key) + out.append(e) + return out + + edges_up = _dedup(all_edges_up) + edges_down = _dedup(all_edges_down) + + # ----------------------------------------------------------------------- + # Derive nodes and assemble response + # ----------------------------------------------------------------------- + nodes_data = _derive_nodes_from_edges(edges_up, edges_down) + + if return_format == "summary": + formatted_data = _format_summary(nodes_data, edges_up, edges_down, object_name) + elif return_format == "edges_only": + formatted_data = { + "upstream_edges": edges_up, + "downstream_edges": edges_down, + } + else: # detailed (default) + formatted_data = { + "nodes": nodes_data, + "upstream_edges": edges_up, + "downstream_edges": edges_down, + "summary": _create_summary_stats(nodes_data, edges_up, edges_down), + } + + metadata = { + "tool_name": tool_name or "graph_traceLineage", + "object_name": object_name, + "max_depth_up": max_depth_up, + "max_depth_down": max_depth_down, + "edge_repository": edge_repository, + "return_format": return_format, + "counts": { + "nodes": len(nodes_data), + "upstream_edges": len(edges_up), + "downstream_edges": len(edges_down), + }, + "status": "success", + "message": ( + f"Dependency analysis complete: " + f"{len(nodes_data)} node(s), " + f"{len(edges_up)} upstream edge(s), 
" + f"{len(edges_down)} downstream edge(s)." + ), + } + + logger.debug("Tool: handle_graph_traceLineage: metadata: %s", metadata) + return create_response(formatted_data, metadata) + + except Exception as e: + logger.error("Tool: handle_graph_traceLineage: Error: %s", e, exc_info=True) + return create_response( + {"error": str(e)}, + { + "tool_name": tool_name or "graph_traceLineage", + "object_name": object_name, + "status": "error", + }, + ) + + +# --------------------------------------------------------------------------- +# Tool registration descriptor +# --------------------------------------------------------------------------- +GRAPH_TRACE_LINEAGE_TOOL = { + "name": "graph_traceLineage", + "handler": handle_graph_traceLineage, + "description": ( + "Analyse object dependencies in Teradata — finds upstream dependencies " + "(what the object depends on) and downstream dependents (what depends " + "on the object). Hybrid implementation: Python constructs Teradata " + "recursive CTEs that execute entirely server-side, so only the reachable " + "subgraph crosses the network. No stored procedure required. " + "Supports wildcards (%) and CSV patterns for object_name. " + "Use for impact analysis, lineage tracing, and pre-deployment validation. " + "Do NOT use for migration wave sequencing — use graph_bfsLevels for that. " + "Requires an edge repository conforming to the Graph Edge Contract. " + "If you don't have one yet, call graph_edgeContractDDL first to " + "generate the CREATE TABLE or CREATE VIEW DDL." + ), + "parameters": { + "object_name": { + "type": "string", + "description": ( + "Object name pattern(s). Supports wildcards (%) and CSV. " + "Single: 'DB.Table'. Wildcard: '%WBC%.%'. " + "Multiple: '%WBC%.%,%StGeo%.%'." + ), + "required": True, + }, + "max_depth_up": { + "type": "integer", + "description": "Maximum upstream levels to traverse (0-10). 
Default: 3.", + "default": 3, + }, + "max_depth_down": { + "type": "integer", + "description": "Maximum downstream levels to traverse (0-10). Default: 3.", + "default": 3, + }, + "exclude_objects": { + "type": "string", + "description": ("CSV of FQ object name LIKE patterns to exclude. Example: 'PRD_%,%.temp_%'. Default: ''."), + "default": "", + }, + "include_containers": { + "type": "string", + "description": ("CSV of container LIKE patterns to include (whitelist). Default: '' (all containers)."), + "default": "", + }, + "edge_repository": { + "type": "string", + "description": ( + "Edge repository table or view conforming to the Graph Edge Contract. " + "Call graph_edgeContractDDL to generate one if needed. " + "Required parameter — no default." + ), + "required": True, + }, + "return_format": { + "type": "string", + "description": ("Output format: 'detailed' (default), 'summary', or 'edges_only'."), + "default": "detailed", + }, + }, +} diff --git a/src/teradata_mcp_server/tools/module_loader.py b/src/teradata_mcp_server/tools/module_loader.py index 9ecf5a4..9314480 100644 --- a/src/teradata_mcp_server/tools/module_loader.py +++ b/src/teradata_mcp_server/tools/module_loader.py @@ -24,6 +24,7 @@ class ModuleLoader: "chat": "teradata_mcp_server.tools.chat", "dba": "teradata_mcp_server.tools.dba", "fs": "teradata_mcp_server.tools.fs", + "graph": "teradata_mcp_server.tools.graph", "qlty": "teradata_mcp_server.tools.qlty", "rag": "teradata_mcp_server.tools.rag", "sql_opt": "teradata_mcp_server.tools.sql_opt", diff --git a/src/teradata_mcp_server/tools/utils/__init__.py b/src/teradata_mcp_server/tools/utils/__init__.py index 520a2de..e8edb29 100644 --- a/src/teradata_mcp_server/tools/utils/__init__.py +++ b/src/teradata_mcp_server/tools/utils/__init__.py @@ -21,8 +21,21 @@ # -------------------- Serialization & response helpers -------------------- # def serialize_teradata_types(obj: Any) -> Any: - """Convert Teradata-specific types to JSON serializable formats.""" - 
if isinstance(obj, date | datetime): + """Convert Teradata-specific types to JSON-serialisable formats. + + Handles None explicitly so that database NULL values are preserved + as Python None (→ JSON null) rather than the string ``"None"``. + + Args: + obj: The value to convert. + + Returns: + A JSON-native type (str, int, float, bool, None) or an + ISO-formatted date string. + """ + if obj is None: + return None + if isinstance(obj, (date, datetime)): return obj.isoformat() if isinstance(obj, Decimal): return float(obj) @@ -40,17 +53,83 @@ def rows_to_json(cursor_description: Any, rows: list[Any]) -> list[dict[str, Any return out -def create_response(data: Any, metadata: dict[str, Any] | None = None, error: dict[str, Any] | None = None) -> str: - """Create a standardized JSON response structure.""" +def _make_serialisable(obj: Any) -> Any: + """Recursively walk an object tree, converting every leaf to a + JSON-native Python type. + + This is the deep-conversion counterpart of + :func:`serialize_teradata_types`. It ensures that nested dicts + and lists produced by tool handlers contain only types that + ``json.dumps`` can serialise without a custom *default* hook, + and — critically — that ``None`` values survive as ``None`` + (JSON ``null``) instead of the string ``"None"``. + + Args: + obj: Any Python object (scalar, dict, list, tuple, etc.). + + Returns: + A recursively sanitised copy whose leaves are all + ``str | int | float | bool | None``. + """ + if obj is None: + return None + if isinstance(obj, (str, int, float, bool)): + return obj + if isinstance(obj, (date, datetime)): + return obj.isoformat() + if isinstance(obj, Decimal): + return float(obj) + if isinstance(obj, dict): + return {k: _make_serialisable(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [_make_serialisable(item) for item in obj] + # Fallback: cast to string (e.g. 
bytes, custom objects) + return str(obj) + + +def create_response( + data: Any, + metadata: dict[str, Any] | None = None, + error: dict[str, Any] | None = None, +) -> dict: + """Create a standardised MCP response structure. + + .. versionchanged:: 1.1.0 + Returns a **dict** instead of a JSON string. The MCP + framework requires ``structured_content`` to be a ``dict`` + (or ``None``); returning a JSON string caused the server to + wrap it in a ``[{"type": "text", ...}]`` list which the + framework rejected. + + All nested values are recursively sanitised via + :func:`_make_serialisable` so that ``None`` / NULL values + are preserved as ``None`` (JSON ``null``) and Teradata- + specific types (``Decimal``, ``datetime``, etc.) are + converted to JSON-native equivalents. + + Args: + data: Payload — typically a list of row-dicts. + metadata: Optional dict of tool metadata (tool_name, sql, etc.). + error: Optional error dict; if supplied the response + status is set to ``"error"``. + + Returns: + dict: A JSON-serialisable dict ready to be used as + MCP ``structured_content``. + """ if error: - resp = {"status": "error", "message": error} + resp: dict[str, Any] = {"status": "error", "message": error} if metadata: - resp["metadata"] = metadata - return json.dumps(resp, default=serialize_teradata_types) - resp = {"status": "success", "results": data} + resp["metadata"] = _make_serialisable(metadata) + return resp + + resp = { + "status": "success", + "results": _make_serialisable(data), + } if metadata: - resp["metadata"] = metadata - return json.dumps(resp, default=serialize_teradata_types) + resp["metadata"] = _make_serialisable(metadata) + return resp # ------------------------------ Auth helpers ------------------------------ #