Skip to content

Commit fc55b98

Browse files
feat(snowflake)!: Transpilation of MINHASH functions from Snowflake to DuckDB (#6859)
* transpilation for MINHASH, MINHASH_COMBINE, and APPROXIMATE_SIMILARITY * made approximate behavior more like Snowflake * Added support for star
1 parent bf90b5d commit fc55b98

2 files changed

Lines changed: 102 additions & 0 deletions

File tree

sqlglot/dialects/duckdb.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2077,6 +2077,53 @@ class Generator(generator.Generator):
20772077
""",
20782078
)
20792079

2080+
# Template for MINHASH transpilation
2081+
# Computes k minimum hash values across aggregated data using DuckDB list functions
2082+
# Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1}
2083+
MINHASH_TEMPLATE: exp.Expression = exp.maybe_parse(
2084+
"""
2085+
SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1)
2086+
FROM (
2087+
SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h
2088+
FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed)
2089+
)
2090+
""",
2091+
)
2092+
2093+
# Template for MINHASH_COMBINE transpilation
2094+
# Combines multiple minhash signatures by taking element-wise minimum
2095+
MINHASH_COMBINE_TEMPLATE: exp.Expression = exp.maybe_parse(
2096+
"""
2097+
SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1)
2098+
FROM (
2099+
SELECT
2100+
pos AS idx,
2101+
MIN(val) AS min_h
2102+
FROM
2103+
UNNEST(LIST(:expr)) AS _(sig),
2104+
UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos)
2105+
GROUP BY pos
2106+
)
2107+
""",
2108+
)
2109+
2110+
# Template for APPROXIMATE_SIMILARITY transpilation
2111+
# Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree
2112+
APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse(
2113+
"""
2114+
SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*)
2115+
FROM (
2116+
SELECT pos, COUNT(DISTINCT h) AS num_distinct
2117+
FROM (
2118+
SELECT h, pos
2119+
FROM UNNEST(LIST(:expr)) AS _(sig),
2120+
UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos)
2121+
)
2122+
GROUP BY pos
2123+
)
2124+
""",
2125+
)
2126+
20802127
def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str:
20812128
"""
20822129
Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.
@@ -2829,6 +2876,32 @@ def levenshtein_sql(self, expression: exp.Levenshtein) -> str:
28292876
levenshtein = exp.Levenshtein(this=this, expression=expr)
28302877
return self.sql(exp.Least(this=levenshtein, expressions=[max_dist]))
28312878

2879+
def minhash_sql(self, expression: exp.Minhash) -> str:
2880+
k = expression.this
2881+
exprs = expression.expressions
2882+
2883+
if len(exprs) != 1 or isinstance(exprs[0], exp.Star):
2884+
self.unsupported(
2885+
"MINHASH with multiple expressions or * requires manual query restructuring"
2886+
)
2887+
return self.func("MINHASH", k, *exprs)
2888+
2889+
expr = exprs[0]
2890+
result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k)
2891+
return f"({self.sql(result)})"
2892+
2893+
def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str:
2894+
expr = expression.this
2895+
result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr)
2896+
return f"({self.sql(result)})"
2897+
2898+
def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str:
2899+
expr = expression.this
2900+
result = exp.replace_placeholders(
2901+
self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr
2902+
)
2903+
return f"({self.sql(result)})"
2904+
28322905
def lower_sql(self, expression: exp.Lower) -> str:
28332906
result_sql = self.func("LOWER", _cast_to_varchar(expression.this))
28342907
return _gen_with_cast_to_blob(self, expression, result_sql)

tests/dialects/test_snowflake.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1845,6 +1845,35 @@ def test_snowflake(self):
18451845
"snowflake": "EDITDISTANCE(col1, col2, 3)",
18461846
},
18471847
)
1848+
1849+
self.validate_identity("MINHASH(100, col1)")
1850+
self.validate_identity("MINHASH(100, col1, col2)")
1851+
self.validate_all(
1852+
"MINHASH(4, col1)",
1853+
write={
1854+
"duckdb": "(SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed NULLS FIRST), 'type', 'minhash', 'version', 1) FROM (SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS TEXT) || CAST(seed AS TEXT)))) AS min_h FROM (SELECT LIST(col1) AS vals), RANGE(0, 4) AS t(seed)))",
1855+
"snowflake": "MINHASH(4, col1)",
1856+
},
1857+
)
1858+
1859+
self.validate_identity("MINHASH_COMBINE(sig_col)")
1860+
self.validate_all(
1861+
"MINHASH_COMBINE(sig_col)",
1862+
write={
1863+
"duckdb": "(SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx NULLS FIRST), 'type', 'minhash', 'version', 1) FROM (SELECT pos AS idx, MIN(val) AS min_h FROM UNNEST(LIST(sig_col)) AS _(sig) JOIN UNNEST(CAST(sig -> '$.state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos) ON TRUE GROUP BY pos))",
1864+
"snowflake": "MINHASH_COMBINE(sig_col)",
1865+
},
1866+
)
1867+
1868+
self.validate_identity("APPROXIMATE_SIMILARITY(sig_col)")
1869+
self.validate_all(
1870+
"APPROXIMATE_SIMILARITY(sig_col)",
1871+
write={
1872+
"duckdb": "(SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) FROM (SELECT pos, COUNT(DISTINCT h) AS num_distinct FROM (SELECT h, pos FROM UNNEST(LIST(sig_col)) AS _(sig) JOIN UNNEST(CAST(sig -> '$.state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) ON TRUE) GROUP BY pos))",
1873+
"snowflake": "APPROXIMATE_SIMILARITY(sig_col)",
1874+
},
1875+
)
1876+
18481877
self.validate_identity("SELECT BITNOT(a)")
18491878
self.validate_identity("SELECT BIT_NOT(a)", "SELECT BITNOT(a)")
18501879
self.validate_all(

0 commit comments

Comments
 (0)