Skip to content

Commit b1b37c7

Browse files
made approximate behavior more like Snowflake
1 parent b1facec commit b1b37c7

2 files changed

Lines changed: 10 additions & 9 deletions

File tree

sqlglot/dialects/duckdb.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2108,17 +2108,18 @@ class Generator(generator.Generator):
21082108
)
21092109

21102110
# Template for APPROXIMATE_SIMILARITY transpilation
2111-
# Computes Jaccard similarity across minhash signatures by comparing pairwise
2111+
# Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree
21122112
APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse(
21132113
"""
2114-
SELECT CAST(SUM(match) AS DOUBLE) / COUNT(*)
2114+
SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*)
21152115
FROM (
2116-
SELECT CASE WHEN s1.h = s2.h THEN 1 ELSE 0 END AS match
2117-
FROM UNNEST(LIST(:expr)) WITH ORDINALITY AS sigs1(sig1, n1),
2118-
UNNEST(LIST(:expr)) WITH ORDINALITY AS sigs2(sig2, n2),
2119-
UNNEST(CAST(sig1 -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s1(h, i),
2120-
UNNEST(CAST(sig2 -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s2(h, j)
2121-
WHERE n1 < n2 AND s1.i = s2.j
2116+
SELECT pos, COUNT(DISTINCT h) AS num_distinct
2117+
FROM (
2118+
SELECT h, pos
2119+
FROM UNNEST(LIST(:expr)) AS _(sig),
2120+
UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos)
2121+
)
2122+
GROUP BY pos
21222123
)
21232124
""",
21242125
)

tests/dialects/test_snowflake.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1869,7 +1869,7 @@ def test_snowflake(self):
18691869
self.validate_all(
18701870
"APPROXIMATE_SIMILARITY(sig_col)",
18711871
write={
1872-
"duckdb": "(SELECT CAST(SUM(match) AS DOUBLE) / COUNT(*) FROM (SELECT CASE WHEN s1.h = s2.h THEN 1 ELSE 0 END AS match FROM UNNEST(LIST(sig_col)) WITH ORDINALITY AS sigs1(sig1, n1) JOIN UNNEST(LIST(sig_col)) WITH ORDINALITY AS sigs2(sig2, n2) ON TRUE JOIN UNNEST(CAST(sig1 -> '$.state' AS UBIGINT[])) WITH ORDINALITY AS s1(h, i) ON TRUE JOIN UNNEST(CAST(sig2 -> '$.state' AS UBIGINT[])) WITH ORDINALITY AS s2(h, j) ON TRUE WHERE n1 < n2 AND s1.i = s2.j))",
1872+
"duckdb": "(SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) FROM (SELECT pos, COUNT(DISTINCT h) AS num_distinct FROM (SELECT h, pos FROM UNNEST(LIST(sig_col)) AS _(sig) JOIN UNNEST(CAST(sig -> '$.state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) ON TRUE) GROUP BY pos))",
18731873
"snowflake": "APPROXIMATE_SIMILARITY(sig_col)",
18741874
},
18751875
)

0 commit comments

Comments
 (0)