Skip to content

Commit 5032b73

Browse files
committed
use dialect specific rules
1 parent 222f7db commit 5032b73

File tree

1 file changed

+82
-0
lines changed

1 file changed

+82
-0
lines changed
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
version: "1.0.0"
2+
3+
lakebridge:
4+
description: >
5+
Defines transformations that standardize snowflake data types before comparison to databricks.
6+
transformations:
7+
# ------------------------------------------------------------------------
8+
# TEMPORAL CATEGORY
9+
# ------------------------------------------------------------------------
10+
# Transformation entry for DATE columns. The entry key is a free-form label.
date_to_iso8601:
  description: 'Normalize DATE to ISO 8601 (YYYY-MM-DD).'
  databricks_type:
    - 'DATE'
  # Databricks side: render the value as a yyyy-MM-dd string.
  databricks_transformation: "DATE_FORMAT({}, 'yyyy-MM-dd')"
  rules:
    # Rule keys are free-form labels as well.
    date:
      source_types:
        - 'DATE'
      # A rule can also override the Databricks transformation if needed.
      source_transformation: "TO_VARCHAR({}, 'YYYY-MM-DD')"
    string:
      source_types:
        - 'CHAR'
        - 'VARCHAR'
        - 'STRING'
        - 'TEXT'
      # Identity template: string sources need no transformation.
      source_transformation: '{}'
21+
22+
# Renders timezone-less timestamps as text with millisecond precision on both
# sides so the two strings can be compared directly.
timestamp_ntz_to_iso8601:
  description: "Normalize TIMESTAMP (no tz) to ISO 8601 with milliseconds."
  databricks_type: ["TIMESTAMP_NTZ", "TIMESTAMP WITHOUT TIME ZONE"]
  databricks_transformation: "DATE_FORMAT({}, 'yyyy-MM-dd HH:mm:ss.SSS')"
  rules:
    timestamp_ntz:
      source_types: []  # applies to all types if empty
      # The format is spelled out instead of relying on the session's
      # TIMESTAMP_NTZ_OUTPUT_FORMAT, so the source text always mirrors the
      # Databricks pattern above (FF3 = three fractional-second digits).
      source_transformation: "TO_VARCHAR(TO_TIMESTAMP_NTZ({}), 'YYYY-MM-DD HH24:MI:SS.FF3')"
30+
31+
# Renders timezone-aware timestamps as text with milliseconds and a numeric
# UTC offset, using the same layout on both sides.
timestamp_to_iso8601:
  description: "Normalize TIMESTAMP with time zone to ISO 8601 with milliseconds."
  databricks_type: ["TIMESTAMP"]
  # 'xxx' prints the offset as +HH:mm and keeps '+00:00' for a zero offset
  # (the 'ZZZZZ' form prints 'Z' for zero, which the source side never emits).
  # NOTE(review): Databricks renders TIMESTAMP in the session time zone, so the
  # offsets only agree when both sides render the same zone - confirm whether
  # both values should be converted to UTC before formatting.
  databricks_transformation: "DATE_FORMAT({}, 'yyyy-MM-dd HH:mm:ss.SSS xxx')"
  rules:
    timestamp_tz:
      source_types: []  # applies to all types if empty
      # Explicit format so the result does not depend on the session's
      # TIMESTAMP_TZ_OUTPUT_FORMAT; TZH:TZM matches the +HH:mm offset above.
      source_transformation: "TO_VARCHAR(TO_TIMESTAMP_TZ({}), 'YYYY-MM-DD HH24:MI:SS.FF3 TZH:TZM')"
39+
40+
# ------------------------------------------------------------------------
41+
# SEMI-STRUCTURED AND STRUCTURED CATEGORY
42+
# ------------------------------------------------------------------------
43+
# Flattens an array to a comma-joined string after dropping NULL elements and
# sorting, so element order and NULL entries do not cause mismatches. A NULL
# array is mapped to the sentinel 'null_recon' on both sides.
arrays:
  description: "Normalize Arrays: filter NULLs then sort then join"
  databricks_type: ["ARRAY"]
  # ARRAY_JOIN returns NULL for a NULL array, which lets COALESCE substitute
  # the sentinel. CONCAT_WS skips NULL arguments and would return '' instead,
  # so a NULL array would never match the source-side 'null_recon'.
  databricks_transformation: "COALESCE(ARRAY_JOIN(ARRAY_SORT(FILTER({}, x -> x IS NOT NULL)), ','), 'null_recon')"
  rules:
    array:
      source_types: ["ARRAY"]
      # ARRAY_COMPACT drops NULL elements before sorting and joining.
      source_transformation: "COALESCE(ARRAY_TO_STRING(ARRAY_SORT(ARRAY_COMPACT({})), ','), 'null_recon')"
51+
52+
# Nested / semi-structured values are compared through their JSON text.
# NOTE(review): this assumes TO_JSON produces identical text on both engines
# (key order, spacing) - confirm against real data.
structs:
  description: 'Serialize or flatten nested structures to JSON.'
  databricks_type:
    - 'STRUCT'
    - 'MAP'
    - 'VARIANT'
    - 'OBJECT'
  databricks_transformation: 'TO_JSON({})'
  rules:
    struct:
      source_types:
        - 'OBJECT'
        - 'MAP'
      source_transformation: 'TO_JSON({})'
60+
61+
# ------------------------------------------------------------------------
62+
# NUMERIC CATEGORY
63+
# ------------------------------------------------------------------------
64+
65+
66+
# ------------------------------------------------------------------------
67+
# OTHER CATEGORY
68+
# ------------------------------------------------------------------------
69+
# Maps boolean and boolean-like source columns onto TRUE/FALSE so they can be
# compared with a Databricks BOOLEAN, which is passed through unchanged.
booleans:
  description: "Normalize boolean and boolean-like values to TRUE/FALSE."
  databricks_type: ["BOOLEAN"]
  databricks_transformation: "{}"
  rules:
    boolean:
      source_types: ["BOOLEAN"]
      source_transformation: "{}"  # identity
    integer_as_boolean:
      source_types: ["INT", "INTEGER", "SMALLINT", "TINYINT", "BYTEINT", "NUMBER"]
      # Only 1 is TRUE; every other value (0 included) is FALSE. The former
      # explicit "= 0 THEN FALSE" branch duplicated the ELSE and is dropped,
      # leaving a single placeholder like every other template in this file.
      # NOTE(review): a NULL source value also becomes FALSE here, while the
      # Databricks side stays NULL - confirm this is intended.
      source_transformation: "CASE WHEN {} = 1 THEN TRUE ELSE FALSE END"
    string_as_boolean:
      source_types: ["CHAR", "VARCHAR", "TEXT", "STRING"]
      # Case-insensitive match on the accepted truthy spellings.
      source_transformation: "CASE WHEN UPPER({}) IN ('Y','YES','TRUE','1') THEN TRUE ELSE FALSE END"

0 commit comments

Comments
 (0)