|
5 | 5 | PostgreSQL query syntax to DuckDB-compatible syntax for queries targeting |
6 | 6 | deeplake tables. |
7 | 7 |
|
8 | | -Translation rules tested: |
| 8 | +Translation rules tested (15 test functions): |
9 | 9 | 1. JSON operators: -> and ->> to DuckDB's ->> with $.path syntax |
10 | 10 | 2. Type casts: ::TYPE to CAST(expr AS TYPE) |
11 | 11 | 3. Timestamp conversions: PostgreSQL epoch arithmetic to TO_TIMESTAMP() |
12 | 12 | 4. Date extraction: EXTRACT() to DuckDB's date part functions |
13 | 13 | 5. Date differences: EXTRACT(EPOCH FROM ...) to date_diff() |
14 | 14 | 6. IN clauses: IN ('a', 'b') to in ['a', 'b'] |
15 | 15 | 7. COUNT(*) to count() |
16 | | -8. WHERE predicate wrapping: Add parentheses around each predicate |
| 16 | +8. WHERE predicate wrapping: Add parentheses around each predicate (with word boundaries) |
17 | 17 | 9. to_date to strptime |
18 | 18 | 10. DIV() to // operator |
19 | 19 | 11. regexp_substr to regexp_extract |
| 20 | +12. BETWEEN clauses: Preserve BETWEEN...AND syntax without splitting |
| 21 | +13. Column names with keywords: Handle column names containing 'and', 'or' as substrings |
| 22 | +14. Complex combined translations: Multiple rules applied together |
| 23 | +15. Regression tests: Position increment bug in BETWEEN handling |
20 | 24 | """ |
21 | 25 | import pytest |
22 | 26 | import asyncpg |
@@ -559,3 +563,205 @@ async def test_date_arithmetic_with_casts(db_conn: asyncpg.Connection): |
559 | 563 |
|
560 | 564 | finally: |
561 | 565 | await db_conn.execute("DROP TABLE IF EXISTS string_data CASCADE") |
| 566 | + |
| 567 | + |
| 568 | +@pytest.mark.asyncio |
| 569 | +async def test_between_clause(db_conn: asyncpg.Connection): |
| 570 | + """ |
| 571 | + Test BETWEEN clause translation. |
| 572 | +
|
| 573 | + BETWEEN uses AND as part of its syntax, not as a logical operator. |
| 574 | + The translator must not split "col BETWEEN val1 AND val2" into separate predicates. |
| 575 | +
|
| 576 | + PostgreSQL: WHERE col BETWEEN 1 AND 10 |
| 577 | + DuckDB: WHERE (col BETWEEN 1 AND 10) -- NOT: WHERE (col BETWEEN 1) AND (10) |
| 578 | + """ |
| 579 | + assertions = Assertions(db_conn) |
| 580 | + |
| 581 | + try: |
| 582 | + # Create table similar to TPCH lineitem |
| 583 | + await db_conn.execute(""" |
| 584 | + CREATE TABLE lineitem ( |
| 585 | + l_discount decimal, |
| 586 | + l_quantity decimal, |
| 587 | + l_shipdate date, |
| 588 | + l_extendedprice decimal |
| 589 | + ) USING deeplake |
| 590 | + """) |
| 591 | + |
| 592 | + await db_conn.execute(""" |
| 593 | + INSERT INTO lineitem VALUES |
| 594 | + (0.05, 10, '1994-01-15', 1000.00), |
| 595 | + (0.06, 20, '1994-06-15', 2000.00), |
| 596 | + (0.07, 30, '1995-01-15', 3000.00), |
| 597 | + (0.10, 40, '1995-06-15', 4000.00) |
| 598 | + """) |
| 599 | + |
| 600 | + # Test 1: BETWEEN with arithmetic (from TPCH Query 6) |
| 601 | + # l_discount between 0.06 - 0.01 AND 0.06 + 0.01 |
| 602 | + # Matches rows where l_discount is between 0.05 and 0.07 inclusive |
| 603 | + # Test data: 0.05✓, 0.06✓, 0.07✓, 0.10✗ → 3 rows expected |
| 604 | + await assertions.assert_query_row_count( |
| 605 | + 3, |
| 606 | + "SELECT * FROM lineitem WHERE l_discount BETWEEN 0.06 - 0.01 AND 0.06 + 0.01" |
| 607 | + ) |
| 608 | + |
| 609 | + # Test 2: BETWEEN with dates (from TPCH Query 7, 8) |
| 610 | + await assertions.assert_query_row_count( |
| 611 | + 2, |
| 612 | + "SELECT * FROM lineitem WHERE l_shipdate BETWEEN '1994-01-01' AND '1994-12-31'" |
| 613 | + ) |
| 614 | + |
| 615 | + # Test 3: Multiple BETWEEN clauses with AND (complex case) |
| 616 | + await assertions.assert_query_row_count( |
| 617 | + 1, |
| 618 | + """SELECT * FROM lineitem |
| 619 | + WHERE l_discount BETWEEN 0.05 AND 0.06 |
| 620 | + AND l_quantity BETWEEN 15 AND 25""" |
| 621 | + ) |
| 622 | + |
| 623 | + print("✓ Test passed: BETWEEN clauses translate correctly") |
| 624 | + |
| 625 | + finally: |
| 626 | + await db_conn.execute("DROP TABLE IF EXISTS lineitem CASCADE") |
| 627 | + |
| 628 | + |
| 629 | +@pytest.mark.asyncio |
| 630 | +async def test_column_names_with_keywords(db_conn: asyncpg.Connection): |
| 631 | + """ |
| 632 | + Test column names containing SQL keywords like AND, OR. |
| 633 | +
|
| 634 | + Column names like 'p_brand', 'standard', 'portland' contain 'and' or 'or' |
| 635 | + as substrings. The translator must use word boundaries to avoid splitting |
| 636 | + these column names. |
| 637 | +
|
| 638 | + PostgreSQL: WHERE p_brand <> 'Brand#45' |
| 639 | + DuckDB: WHERE (p_brand <> 'Brand#45') -- NOT: WHERE (p_br) AND (<> 'Brand#45') |
| 640 | + """ |
| 641 | + assertions = Assertions(db_conn) |
| 642 | + |
| 643 | + try: |
| 644 | + # Create table similar to TPCH part table |
| 645 | + await db_conn.execute(""" |
| 646 | + CREATE TABLE part ( |
| 647 | + p_partkey int, |
| 648 | + p_brand text, |
| 649 | + p_type text, |
| 650 | + p_size int |
| 651 | + ) USING deeplake |
| 652 | + """) |
| 653 | + |
| 654 | + await db_conn.execute(""" |
| 655 | + INSERT INTO part VALUES |
| 656 | + (1, 'Brand#12', 'STANDARD POLISHED', 10), |
| 657 | + (2, 'Brand#45', 'MEDIUM POLISHED', 20), |
| 658 | + (3, 'Brand#23', 'LARGE BRUSHED', 30), |
| 659 | + (4, 'Brand#45', 'SMALL BURNISHED', 15) |
| 660 | + """) |
| 661 | + |
| 662 | + # Test 1: Column name with 'and' substring (from TPCH Query 16) |
| 663 | + # p_brand contains 'and', must not split at "br-AND" |
| 664 | + # Only Row 1 (Brand#12, size 10) matches both conditions: |
| 665 | + # - Row 1: Brand#12, size 10 → brand ≠ Brand#45 ✓, size < 25 ✓ |
| 666 | + # - Row 2: Brand#45, size 20 → brand ≠ Brand#45 ✗ |
| 667 | + # - Row 3: Brand#23, size 30 → brand ≠ Brand#45 ✓, size < 25 ✗ |
| 668 | + # - Row 4: Brand#45, size 15 → brand ≠ Brand#45 ✗ |
| 669 | + await assertions.assert_query_row_count( |
| 670 | + 1, |
| 671 | + "SELECT * FROM part WHERE p_brand <> 'Brand#45' AND p_size < 25" |
| 672 | + ) |
| 673 | + |
| 674 | + # Test 2: Column name with 'and' and equals (from TPCH Query 17) |
| 675 | + # p_brand = 'Brand#23' |
| 676 | + await assertions.assert_query_row_count( |
| 677 | + 1, |
| 678 | + "SELECT * FROM part WHERE p_brand = 'Brand#23' AND p_size = 30" |
| 679 | + ) |
| 680 | + |
| 681 | + # Test 3: String value with 'and' in LIKE pattern |
| 682 | + # STANDARD contains 'and' |
| 683 | + await assertions.assert_query_row_count( |
| 684 | + 1, |
| 685 | + "SELECT * FROM part WHERE p_type LIKE 'STANDARD%' AND p_size = 10" |
| 686 | + ) |
| 687 | + |
| 688 | + # Test 4: Multiple conditions with column names containing keywords |
| 689 | + await assertions.assert_query_row_count( |
| 690 | + 3, |
| 691 | + """SELECT * FROM part |
| 692 | + WHERE p_brand IN ('Brand#12', 'Brand#23', 'Brand#45') |
| 693 | + AND p_type NOT LIKE 'MEDIUM%'""" |
| 694 | + ) |
| 695 | + |
| 696 | + print("✓ Test passed: Column names with SQL keywords handle correctly") |
| 697 | + |
| 698 | + finally: |
| 699 | + await db_conn.execute("DROP TABLE IF EXISTS part CASCADE") |
| 700 | + |
| 701 | + |
| 702 | +@pytest.mark.asyncio |
| 703 | +async def test_between_position_increment_bug(db_conn: asyncpg.Connection): |
| 704 | + """ |
| 705 | + Regression test for the position increment bug in BETWEEN clause handling. |
| 706 | +
|
| 707 | + The bug: When detecting AND within BETWEEN, the code was incrementing pos by 1 |
| 708 | + instead of 3, causing it to re-check the same position on the next iteration. |
| 709 | +
|
| 710 | + This test specifically uses a query where the bug would cause incorrect parsing. |
| 711 | + Without the fix (pos += 3), the translator would incorrectly parse the BETWEEN |
| 712 | + clause, potentially leading to malformed output or infinite loops. |
| 713 | + """ |
| 714 | + assertions = Assertions(db_conn) |
| 715 | + |
| 716 | + try: |
| 717 | + # Create test table |
| 718 | + await db_conn.execute(""" |
| 719 | + CREATE TABLE test_data ( |
| 720 | + id int, |
| 721 | + value int, |
| 722 | + category text |
| 723 | + ) USING deeplake |
| 724 | + """) |
| 725 | + |
| 726 | + await db_conn.execute(""" |
| 727 | + INSERT INTO test_data VALUES |
| 728 | + (1, 5, 'A'), |
| 729 | + (2, 15, 'B'), |
| 730 | + (3, 25, 'C'), |
| 731 | + (4, 35, 'D') |
| 732 | + """) |
| 733 | + |
| 734 | + # Test 1: Simple BETWEEN that would trigger the bug |
| 735 | + # The AND in "BETWEEN 10 AND 30" should not be treated as a logical separator |
| 736 | + await assertions.assert_query_row_count( |
| 737 | + 2, |
| 738 | + "SELECT * FROM test_data WHERE value BETWEEN 10 AND 30" |
| 739 | + ) |
| 740 | + |
| 741 | + # Test 2: BETWEEN followed by a logical AND |
| 742 | + # This tests that after handling BETWEEN...AND, we correctly parse the next AND |
| 743 | + await assertions.assert_query_row_count( |
| 744 | + 1, |
| 745 | + "SELECT * FROM test_data WHERE value BETWEEN 10 AND 30 AND category = 'B'" |
| 746 | + ) |
| 747 | + |
| 748 | + # Test 3: Multiple consecutive BETWEENs with logical ANDs |
| 749 | + # This would definitely expose the position increment bug |
| 750 | + await assertions.assert_query_row_count( |
| 751 | + 0, |
| 752 | + """SELECT * FROM test_data |
| 753 | + WHERE value BETWEEN 10 AND 20 |
| 754 | + AND id BETWEEN 5 AND 10""" |
| 755 | + ) |
| 756 | + |
| 757 | + # Test 4: BETWEEN with expressions (from TPCH Query 6 pattern) |
| 758 | + # This is the actual pattern that failed in TPCH tests |
| 759 | + await assertions.assert_query_row_count( |
| 760 | + 2, |
| 761 | + "SELECT * FROM test_data WHERE value BETWEEN 5 + 5 AND 15 + 15" |
| 762 | + ) |
| 763 | + |
| 764 | + print("✓ Test passed: BETWEEN position increment bug regression test") |
| 765 | + |
| 766 | + finally: |
| 767 | + await db_conn.execute("DROP TABLE IF EXISTS test_data CASCADE") |
0 commit comments