Skip to content

Commit 20409fd

Browse files
authored
feat(ingestion/dbt): Add include_database_name parameter for dbt core (datahub-project#12411)
1 parent 6098e97 commit 20409fd

File tree

5 files changed

+268
-9
lines changed

5 files changed

+268
-9
lines changed

Diff for: metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py

+5
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,11 @@ class DBTCommonConfig(
357357
default=True,
358358
description="When enabled, includes the compiled code in the emitted metadata.",
359359
)
360+
include_database_name: bool = Field(
361+
default=True,
362+
description="Whether to add database name to the table urn. "
363+
"Set to False to skip it for engines like AWS Athena where it's not required.",
364+
)
360365

361366
@validator("target_platform")
362367
def validate_target_platform_value(cls, target_platform: str) -> str:

Diff for: metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ def extract_dbt_entities(
167167
use_identifiers: bool,
168168
tag_prefix: str,
169169
only_include_if_in_catalog: bool,
170+
include_database_name: bool,
170171
report: DBTSourceReport,
171172
) -> List[DBTNode]:
172173
sources_by_id = {x["unique_id"]: x for x in sources_results}
@@ -267,7 +268,7 @@ def extract_dbt_entities(
267268
dbt_name=key,
268269
dbt_adapter=manifest_adapter,
269270
dbt_package_name=manifest_node.get("package_name"),
270-
database=manifest_node["database"],
271+
database=manifest_node["database"] if include_database_name else None,
271272
schema=manifest_node["schema"],
272273
name=name,
273274
alias=manifest_node.get("alias"),
@@ -543,14 +544,15 @@ def loadManifestAndCatalog(
543544
all_catalog_entities = {**catalog_nodes, **catalog_sources}
544545

545546
nodes = extract_dbt_entities(
546-
all_manifest_entities,
547-
all_catalog_entities,
548-
sources_results,
549-
manifest_adapter,
550-
self.config.use_identifiers,
551-
self.config.tag_prefix,
552-
self.config.only_include_if_in_catalog,
553-
self.report,
547+
all_manifest_entities=all_manifest_entities,
548+
all_catalog_entities=all_catalog_entities,
549+
sources_results=sources_results,
550+
manifest_adapter=manifest_adapter,
551+
use_identifiers=self.config.use_identifiers,
552+
tag_prefix=self.config.tag_prefix,
553+
only_include_if_in_catalog=self.config.only_include_if_in_catalog,
554+
include_database_name=self.config.include_database_name,
555+
report=self.report,
554556
)
555557

556558
return (
+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"metadata": {
3+
"dbt_schema_version": "https://schemas.getdbt.com/dbt/catalog/v1.json",
4+
"dbt_version": "1.9.0",
5+
"generated_at": "2025-01-22T10:56:18.229568Z",
6+
"invocation_id": "385456c0-b0ca-43b6-aa5a-422539d1f142",
7+
"env": {}
8+
},
9+
"nodes": {
10+
"model.tdd.simple": {
11+
"metadata": {
12+
"type": "iceberg_table",
13+
"schema": "sandbox",
14+
"name": "simple",
15+
"database": "awsdatacatalog",
16+
"comment": "This model calculates the count of records in the stg_simple table.",
17+
"owner": null
18+
},
19+
"columns": {
20+
"action_type_id": {
21+
"type": "int",
22+
"index": 0,
23+
"name": "action_type_id",
24+
"comment": null
25+
}
26+
},
27+
"stats": {
28+
"has_stats": {
29+
"id": "has_stats",
30+
"label": "Has Stats?",
31+
"value": false,
32+
"include": false,
33+
"description": "Indicates whether there are statistics for this table"
34+
}
35+
},
36+
"unique_id": "model.tdd.simple"
37+
}
38+
},
39+
"sources": {}
40+
}
+174
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
{
2+
"metadata": {
3+
"dbt_schema_version": "https://schemas.getdbt.com/dbt/manifest/v12.json",
4+
"dbt_version": "1.9.0",
5+
"generated_at": "2024-12-16T14:59:29.364802Z",
6+
"invocation_id": "eb043e5d-3a2a-4fe3-b44b-3bc9f55ba7f0",
7+
"env": {},
8+
"project_name": "tdd",
9+
"project_id": "66d26512fa77cc5ff934201903dd7482",
10+
"user_id": null,
11+
"send_anonymous_usage_stats": false,
12+
"adapter_type": "athena"
13+
},
14+
"nodes": {
15+
"model.tdd.simple": {
16+
"database": "awsdatacatalog",
17+
"schema": "sandbox",
18+
"name": "simple",
19+
"resource_type": "model",
20+
"package_name": "tdd",
21+
"path": "simple.sql",
22+
"original_file_path": "models/simple.sql",
23+
"unique_id": "model.tdd.simple",
24+
"fqn": [
25+
"tdd",
26+
"simple"
27+
],
28+
"alias": "simple",
29+
"checksum": {
30+
"name": "sha256",
31+
"checksum": "67b056428a09fa084c740bfeea17eafa2aa4a0e0ebd3ed100ef52e1bbc04718b"
32+
},
33+
"config": {
34+
"enabled": true,
35+
"alias": null,
36+
"schema": "sandbox",
37+
"database": null,
38+
"tags": [
39+
"dbt__tdd",
40+
"dbt_project_tdd"
41+
],
42+
"meta": {},
43+
"group": null,
44+
"materialized": "table",
45+
"incremental_strategy": null,
46+
"batch_size": null,
47+
"lookback": 1,
48+
"begin": null,
49+
"persist_docs": {
50+
"relation": true,
51+
"columns": true
52+
},
53+
"post-hook": [],
54+
"pre-hook": [],
55+
"quoting": {},
56+
"column_types": {},
57+
"full_refresh": null,
58+
"unique_key": null,
59+
"on_schema_change": "append_new_columns",
60+
"on_configuration_change": "apply",
61+
"grants": {},
62+
"packages": [],
63+
"docs": {
64+
"show": true,
65+
"node_color": null
66+
},
67+
"contract": {
68+
"enforced": true,
69+
"alias_types": true
70+
},
71+
"event_time": null,
72+
"concurrent_batches": null,
73+
"access": "protected",
74+
"lf_tags_config": {
75+
"enabled": true,
76+
"tags": {
77+
"confidentiality": "internal"
78+
}
79+
},
80+
"table_type": "iceberg"
81+
},
82+
"tags": [
83+
"dbt__tdd",
84+
"dbt_project_tdd"
85+
],
86+
"description": "This model calculates the count of records in the stg_simple table.",
87+
"columns": {
88+
"action_type_id": {
89+
"name": "action_type_id",
90+
"description": "The count of records in the stg_simple table",
91+
"meta": {},
92+
"data_type": "integer",
93+
"constraints": [],
94+
"quote": null,
95+
"tags": [],
96+
"granularity": null
97+
},
98+
"mixed": {
99+
"name": "mixed",
100+
"description": "",
101+
"meta": {},
102+
"data_type": "varchar",
103+
"constraints": [],
104+
"quote": null,
105+
"tags": [],
106+
"granularity": null
107+
}
108+
},
109+
"meta": {},
110+
"group": null,
111+
"docs": {
112+
"show": true,
113+
"node_color": null
114+
},
115+
"patch_path": "tdd://models/simple.yml",
116+
"build_path": null,
117+
"unrendered_config": {
118+
"tags": "dbt_project_tdd",
119+
"on_schema_change": "append_new_columns",
120+
"lf_tags_config": {
121+
"enabled": true,
122+
"tags": {
123+
"confidentiality": "internal"
124+
}
125+
},
126+
"persist_docs": {
127+
"relation": true,
128+
"columns": true
129+
},
130+
"table_type": "iceberg",
131+
"schema": "sandbox",
132+
"materialized": "table",
133+
"contract": {
134+
"enforced": true
135+
}
136+
},
137+
"created_at": 1734359717.2746701,
138+
"relation_name": "\"awsdatacatalog\".\"sandbox\".\"simple\"",
139+
"raw_code": "select 1 as action_type_id, 2 as mixed from test.test",
140+
"language": "sql",
141+
"refs": [
142+
{
143+
"name": "stg_simple",
144+
"package": null,
145+
"version": null
146+
}
147+
],
148+
"sources": [],
149+
"metrics": [],
150+
"depends_on": {
151+
"macros": [
152+
"macro.dbt_unit_testing.ref"
153+
],
154+
"nodes": [
155+
"model.tdd.stg_simple"
156+
]
157+
},
158+
"compiled_path": null,
159+
"contract": {
160+
"enforced": true,
161+
"alias_types": true,
162+
"checksum": "a2ea5df63211a33bf954882f96b1f89f1f6f625547d2e119ec10a7e78da3d3ea"
163+
},
164+
"access": "protected",
165+
"constraints": [],
166+
"version": null,
167+
"latest_version": null,
168+
"deprecation_date": null,
169+
"primary_key": [],
170+
"time_spine": null
171+
}
172+
},
173+
"sources": {}
174+
}

Diff for: metadata-ingestion/tests/unit/test_dbt_source.py renamed to metadata-ingestion/tests/unit/dbt/test_dbt_source.py

+38
Original file line numberDiff line numberDiff line change
@@ -493,3 +493,41 @@ def test_get_column_type_redshift():
493493
messages[0].message
494494
== "Got an unexpected column type. The column's parsed field type will not be populated."
495495
)
496+
497+
498+
def test_include_database_name_default():
499+
config_dict = {
500+
"manifest_path": "dummy_path",
501+
"catalog_path": "dummy_path",
502+
"target_platform": "dummy_platform",
503+
}
504+
config = DBTCoreConfig.parse_obj({**config_dict})
505+
assert config.include_database_name is True
506+
507+
508+
@pytest.mark.parametrize(
509+
("include_database_name", "expected"), [("false", False), ("true", True)]
510+
)
511+
def test_include_database_name(include_database_name: str, expected: bool) -> None:
512+
config_dict = {
513+
"manifest_path": "dummy_path",
514+
"catalog_path": "dummy_path",
515+
"target_platform": "dummy_platform",
516+
}
517+
config_dict.update({"include_database_name": include_database_name})
518+
config = DBTCoreConfig.parse_obj({**config_dict})
519+
assert config.include_database_name is expected
520+
521+
522+
def test_extract_dbt_entities():
523+
ctx = PipelineContext(run_id="test-run-id", pipeline_name="dbt-source")
524+
config = DBTCoreConfig(
525+
manifest_path="tests/unit/dbt/artifacts/manifest.json",
526+
catalog_path="tests/unit/dbt/artifacts/catalog.json",
527+
target_platform="dummy",
528+
)
529+
source = DBTCoreSource(config, ctx, "dbt")
530+
assert all(node.database is not None for node in source.loadManifestAndCatalog()[0])
531+
config.include_database_name = False
532+
source = DBTCoreSource(config, ctx, "dbt")
533+
assert all(node.database is None for node in source.loadManifestAndCatalog()[0])

0 commit comments

Comments
 (0)