Skip to content

Commit 8440083

Browse files
Fixed the PG->DuckDB translator to handle TPCH queries
1 parent 6da930a commit 8440083

File tree

2 files changed

+261
-16
lines changed

2 files changed

+261
-16
lines changed

cpp/deeplake_pg/pg_to_duckdb_translator.cpp

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ namespace pg {
77

88
namespace {
99

10+
// Keyword lengths for WHERE clause parsing
11+
constexpr size_t BETWEEN_KEYWORD_LENGTH = 7;
12+
constexpr size_t AND_KEYWORD_LENGTH = 3;
13+
constexpr size_t OR_KEYWORD_LENGTH = 2;
14+
1015
// Helper to check if a position is within single quotes
1116
bool is_within_quotes(const std::string& str, size_t pos) {
1217
size_t quote_count = 0;
@@ -665,6 +670,9 @@ std::string pg_to_duckdb_translator::wrap_where_predicates(const std::string& qu
665670
paren_depth = 0;
666671
in_quotes = false;
667672

673+
// Track if we're inside a BETWEEN clause
674+
bool in_between = false;
675+
668676
// Parse the WHERE clause
669677
while (pos < where_clause.length()) {
670678
char c = where_clause[pos];
@@ -677,28 +685,59 @@ std::string pg_to_duckdb_translator::wrap_where_predicates(const std::string& qu
677685
} else if (c == ')') {
678686
paren_depth--;
679687
} else if (paren_depth == 0) {
688+
// Check for BETWEEN keyword to track if AND is part of BETWEEN
689+
if (pos + BETWEEN_KEYWORD_LENGTH <= where_clause.length() && !in_between) {
690+
std::string next_keyword = where_clause.substr(pos, BETWEEN_KEYWORD_LENGTH);
691+
std::string next_keyword_upper = next_keyword;
692+
for (char& ch : next_keyword_upper) ch = std::toupper(ch);
693+
694+
// Check if character before is a word boundary (space or start of string)
695+
bool before_is_boundary = (pos == 0 || std::isspace(where_clause[pos - 1]));
696+
697+
if (next_keyword_upper == "BETWEEN" &&
698+
before_is_boundary &&
699+
(pos + BETWEEN_KEYWORD_LENGTH >= where_clause.length() ||
700+
std::isspace(where_clause[pos + BETWEEN_KEYWORD_LENGTH]))) {
701+
in_between = true;
702+
}
703+
}
704+
680705
// Check for AND or OR at top level
681-
if (pos + 3 <= where_clause.length()) {
682-
std::string next_3 = where_clause.substr(pos, 3);
683-
std::string next_3_upper = next_3;
684-
for (char& ch : next_3_upper) ch = std::toupper(ch);
685-
686-
bool is_and = (next_3_upper == "AND" &&
687-
(pos + 3 >= where_clause.length() || std::isspace(where_clause[pos + 3])));
688-
bool is_or = (pos + 2 < where_clause.length() &&
689-
next_3_upper.substr(0, 2) == "OR" &&
690-
(pos + 2 >= where_clause.length() || std::isspace(where_clause[pos + 2])));
691-
692-
if (is_and) {
706+
if (pos + AND_KEYWORD_LENGTH <= where_clause.length()) {
707+
std::string next_keyword = where_clause.substr(pos, AND_KEYWORD_LENGTH);
708+
std::string next_keyword_upper = next_keyword;
709+
for (char& ch : next_keyword_upper) ch = std::toupper(ch);
710+
711+
// Check if character before is a word boundary (space or start of string)
712+
bool before_is_boundary = (pos == 0 || std::isspace(where_clause[pos - 1]));
713+
714+
bool is_and = (next_keyword_upper == "AND" &&
715+
before_is_boundary &&
716+
(pos + AND_KEYWORD_LENGTH >= where_clause.length() ||
717+
std::isspace(where_clause[pos + AND_KEYWORD_LENGTH])));
718+
bool is_or = (pos + OR_KEYWORD_LENGTH < where_clause.length() &&
719+
next_keyword_upper.substr(0, OR_KEYWORD_LENGTH) == "OR" &&
720+
before_is_boundary &&
721+
(pos + OR_KEYWORD_LENGTH >= where_clause.length() ||
722+
std::isspace(where_clause[pos + OR_KEYWORD_LENGTH])));
723+
724+
// AND within BETWEEN is not a separator
725+
if (is_and && in_between) {
726+
in_between = false; // BETWEEN...AND completed
727+
pos += AND_KEYWORD_LENGTH; // Skip past the 'AND' keyword
728+
continue;
729+
}
730+
731+
if (is_and && !in_between) {
693732
predicates.push_back(where_clause.substr(start, pos - start));
694733
operators.push_back("AND");
695-
pos += 3;
734+
pos += AND_KEYWORD_LENGTH;
696735
start = pos;
697736
continue;
698737
} else if (is_or) {
699738
predicates.push_back(where_clause.substr(start, pos - start));
700739
operators.push_back("OR");
701-
pos += 2;
740+
pos += OR_KEYWORD_LENGTH;
702741
start = pos;
703742
continue;
704743
}

postgres/tests/py_tests/test_pg_duck_translation.py

Lines changed: 208 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,22 @@
55
PostgreSQL query syntax to DuckDB-compatible syntax for queries targeting
66
deeplake tables.
77
8-
Translation rules tested:
8+
Translation rules tested (15 test functions):
99
1. JSON operators: -> and ->> to DuckDB's ->> with $.path syntax
1010
2. Type casts: ::TYPE to CAST(expr AS TYPE)
1111
3. Timestamp conversions: PostgreSQL epoch arithmetic to TO_TIMESTAMP()
1212
4. Date extraction: EXTRACT() to DuckDB's date part functions
1313
5. Date differences: EXTRACT(EPOCH FROM ...) to date_diff()
1414
6. IN clauses: IN ('a', 'b') to in ['a', 'b']
1515
7. COUNT(*) to count()
16-
8. WHERE predicate wrapping: Add parentheses around each predicate
16+
8. WHERE predicate wrapping: Add parentheses around each predicate (with word boundaries)
1717
9. to_date to strptime
1818
10. DIV() to // operator
1919
11. regexp_substr to regexp_extract
20+
12. BETWEEN clauses: Preserve BETWEEN...AND syntax without splitting
21+
13. Column names with keywords: Handle column names containing 'and', 'or' as substrings
22+
14. Complex combined translations: Multiple rules applied together
23+
15. Regression tests: Position increment bug in BETWEEN handling
2024
"""
2125
import pytest
2226
import asyncpg
@@ -559,3 +563,205 @@ async def test_date_arithmetic_with_casts(db_conn: asyncpg.Connection):
559563

560564
finally:
561565
await db_conn.execute("DROP TABLE IF EXISTS string_data CASCADE")
566+
567+
568+
@pytest.mark.asyncio
569+
async def test_between_clause(db_conn: asyncpg.Connection):
570+
"""
571+
Test BETWEEN clause translation.
572+
573+
BETWEEN uses AND as part of its syntax, not as a logical operator.
574+
The translator must not split "col BETWEEN val1 AND val2" into separate predicates.
575+
576+
PostgreSQL: WHERE col BETWEEN 1 AND 10
577+
DuckDB: WHERE (col BETWEEN 1 AND 10) -- NOT: WHERE (col BETWEEN 1) AND (10)
578+
"""
579+
assertions = Assertions(db_conn)
580+
581+
try:
582+
# Create table similar to TPCH lineitem
583+
await db_conn.execute("""
584+
CREATE TABLE lineitem (
585+
l_discount decimal,
586+
l_quantity decimal,
587+
l_shipdate date,
588+
l_extendedprice decimal
589+
) USING deeplake
590+
""")
591+
592+
await db_conn.execute("""
593+
INSERT INTO lineitem VALUES
594+
(0.05, 10, '1994-01-15', 1000.00),
595+
(0.06, 20, '1994-06-15', 2000.00),
596+
(0.07, 30, '1995-01-15', 3000.00),
597+
(0.10, 40, '1995-06-15', 4000.00)
598+
""")
599+
600+
# Test 1: BETWEEN with arithmetic (from TPCH Query 6)
601+
# l_discount between 0.06 - 0.01 AND 0.06 + 0.01
602+
# Matches rows where l_discount is between 0.05 and 0.07 inclusive
603+
# Test data: 0.05✓, 0.06✓, 0.07✓, 0.10✗ → 3 rows expected
604+
await assertions.assert_query_row_count(
605+
3,
606+
"SELECT * FROM lineitem WHERE l_discount BETWEEN 0.06 - 0.01 AND 0.06 + 0.01"
607+
)
608+
609+
# Test 2: BETWEEN with dates (from TPCH Query 7, 8)
610+
await assertions.assert_query_row_count(
611+
2,
612+
"SELECT * FROM lineitem WHERE l_shipdate BETWEEN '1994-01-01' AND '1994-12-31'"
613+
)
614+
615+
# Test 3: Multiple BETWEEN clauses with AND (complex case)
616+
await assertions.assert_query_row_count(
617+
1,
618+
"""SELECT * FROM lineitem
619+
WHERE l_discount BETWEEN 0.05 AND 0.06
620+
AND l_quantity BETWEEN 15 AND 25"""
621+
)
622+
623+
print("✓ Test passed: BETWEEN clauses translate correctly")
624+
625+
finally:
626+
await db_conn.execute("DROP TABLE IF EXISTS lineitem CASCADE")
627+
628+
629+
@pytest.mark.asyncio
630+
async def test_column_names_with_keywords(db_conn: asyncpg.Connection):
631+
"""
632+
Test column names containing SQL keywords like AND, OR.
633+
634+
Column names like 'p_brand', 'standard', 'portland' contain 'and' or 'or'
635+
as substrings. The translator must use word boundaries to avoid splitting
636+
these column names.
637+
638+
PostgreSQL: WHERE p_brand <> 'Brand#45'
639+
DuckDB: WHERE (p_brand <> 'Brand#45') -- NOT: WHERE (p_br) AND (<> 'Brand#45')
640+
"""
641+
assertions = Assertions(db_conn)
642+
643+
try:
644+
# Create table similar to TPCH part table
645+
await db_conn.execute("""
646+
CREATE TABLE part (
647+
p_partkey int,
648+
p_brand text,
649+
p_type text,
650+
p_size int
651+
) USING deeplake
652+
""")
653+
654+
await db_conn.execute("""
655+
INSERT INTO part VALUES
656+
(1, 'Brand#12', 'STANDARD POLISHED', 10),
657+
(2, 'Brand#45', 'MEDIUM POLISHED', 20),
658+
(3, 'Brand#23', 'LARGE BRUSHED', 30),
659+
(4, 'Brand#45', 'SMALL BURNISHED', 15)
660+
""")
661+
662+
# Test 1: Column name with 'and' substring (from TPCH Query 16)
663+
# p_brand contains 'and', must not split at "br-AND"
664+
# Only Row 1 (Brand#12, size 10) matches both conditions:
665+
# - Row 1: Brand#12, size 10 → brand ≠ Brand#45 ✓, size < 25 ✓
666+
# - Row 2: Brand#45, size 20 → brand ≠ Brand#45 ✗
667+
# - Row 3: Brand#23, size 30 → brand ≠ Brand#45 ✓, size < 25 ✗
668+
# - Row 4: Brand#45, size 15 → brand ≠ Brand#45 ✗
669+
await assertions.assert_query_row_count(
670+
1,
671+
"SELECT * FROM part WHERE p_brand <> 'Brand#45' AND p_size < 25"
672+
)
673+
674+
# Test 2: Column name with 'and' and equals (from TPCH Query 17)
675+
# p_brand = 'Brand#23'
676+
await assertions.assert_query_row_count(
677+
1,
678+
"SELECT * FROM part WHERE p_brand = 'Brand#23' AND p_size = 30"
679+
)
680+
681+
# Test 3: String value with 'and' in LIKE pattern
682+
# STANDARD contains 'and'
683+
await assertions.assert_query_row_count(
684+
1,
685+
"SELECT * FROM part WHERE p_type LIKE 'STANDARD%' AND p_size = 10"
686+
)
687+
688+
# Test 4: Multiple conditions with column names containing keywords
689+
await assertions.assert_query_row_count(
690+
3,
691+
"""SELECT * FROM part
692+
WHERE p_brand IN ('Brand#12', 'Brand#23', 'Brand#45')
693+
AND p_type NOT LIKE 'MEDIUM%'"""
694+
)
695+
696+
print("✓ Test passed: Column names with SQL keywords handle correctly")
697+
698+
finally:
699+
await db_conn.execute("DROP TABLE IF EXISTS part CASCADE")
700+
701+
702+
@pytest.mark.asyncio
703+
async def test_between_position_increment_bug(db_conn: asyncpg.Connection):
704+
"""
705+
Regression test for the position increment bug in BETWEEN clause handling.
706+
707+
The bug: When detecting AND within BETWEEN, the code was incrementing pos by 1
708+
instead of 3, causing it to re-check the same position on the next iteration.
709+
710+
This test specifically uses a query where the bug would cause incorrect parsing.
711+
Without the fix (pos += 3), the translator would incorrectly parse the BETWEEN
712+
clause, potentially leading to malformed output or infinite loops.
713+
"""
714+
assertions = Assertions(db_conn)
715+
716+
try:
717+
# Create test table
718+
await db_conn.execute("""
719+
CREATE TABLE test_data (
720+
id int,
721+
value int,
722+
category text
723+
) USING deeplake
724+
""")
725+
726+
await db_conn.execute("""
727+
INSERT INTO test_data VALUES
728+
(1, 5, 'A'),
729+
(2, 15, 'B'),
730+
(3, 25, 'C'),
731+
(4, 35, 'D')
732+
""")
733+
734+
# Test 1: Simple BETWEEN that would trigger the bug
735+
# The AND in "BETWEEN 10 AND 30" should not be treated as a logical separator
736+
await assertions.assert_query_row_count(
737+
2,
738+
"SELECT * FROM test_data WHERE value BETWEEN 10 AND 30"
739+
)
740+
741+
# Test 2: BETWEEN followed by a logical AND
742+
# This tests that after handling BETWEEN...AND, we correctly parse the next AND
743+
await assertions.assert_query_row_count(
744+
1,
745+
"SELECT * FROM test_data WHERE value BETWEEN 10 AND 30 AND category = 'B'"
746+
)
747+
748+
# Test 3: Multiple consecutive BETWEENs with logical ANDs
749+
# This would definitely expose the position increment bug
750+
await assertions.assert_query_row_count(
751+
0,
752+
"""SELECT * FROM test_data
753+
WHERE value BETWEEN 10 AND 20
754+
AND id BETWEEN 5 AND 10"""
755+
)
756+
757+
# Test 4: BETWEEN with expressions (from TPCH Query 6 pattern)
758+
# This is the actual pattern that failed in TPCH tests
759+
await assertions.assert_query_row_count(
760+
2,
761+
"SELECT * FROM test_data WHERE value BETWEEN 5 + 5 AND 15 + 15"
762+
)
763+
764+
print("✓ Test passed: BETWEEN position increment bug regression test")
765+
766+
finally:
767+
await db_conn.execute("DROP TABLE IF EXISTS test_data CASCADE")

0 commit comments

Comments
 (0)