@@ -21,9 +21,11 @@ def df_with_all_column_types(num_rows=100):
21
21
return pd .DataFrame (data = data , index = index )
22
22
23
23
24
- def compare_against_pyarrow (pyarrow_expr_str , expected_adb_expr , lib , function_map = None , expect_equal = True ):
24
+ def compare_against_pyarrow (pyarrow_expr_str , expected_adb_qb , lib , function_map = None , expect_equal = True ):
25
25
adb_expr = ExpressionNode .from_pyarrow_expression_str (pyarrow_expr_str , function_map )
26
- assert str (adb_expr ) == str (expected_adb_expr )
26
+ q = QueryBuilder ()
27
+ q = q [adb_expr ]
28
+ assert q == expected_adb_qb
27
29
pa_expr = eval (pyarrow_expr_str )
28
30
29
31
# Setup
@@ -33,8 +35,6 @@ def compare_against_pyarrow(pyarrow_expr_str, expected_adb_expr, lib, function_m
33
35
pa_table = pa .Table .from_pandas (df )
34
36
35
37
# Apply filter to adb
36
- q = QueryBuilder ()
37
- q = q [adb_expr ]
38
38
adb_result = lib .read (sym , query_builder = q ).data
39
39
40
40
# Apply filter to pyarrow
@@ -48,76 +48,86 @@ def compare_against_pyarrow(pyarrow_expr_str, expected_adb_expr, lib, function_m
48
48
49
49
def test_basic_filters(lmdb_version_store_v1):
    """Check single-feature pyarrow filter strings against hand-built QueryBuilder filters.

    Each case pairs a pyarrow expression string with the QueryBuilder expected
    from converting it, and hands both to ``compare_against_pyarrow``.
    """
    lib = lmdb_version_store_v1
    q = QueryBuilder()

    def check(pa_filter, wanted, **options):
        # Thin wrapper so each case only spells out what differs between cases.
        compare_against_pyarrow(pa_filter, wanted, lib, **options)

    # Filter by boolean column
    check(f"pc.field('bool_col')", q[q['bool_col']])

    # Filter by comparison
    for symbol in ("<", "<=", "==", ">=", ">"):
        pa_filter = f"pc.field('int_col') {symbol} 50"
        condition = eval(f"q['int_col'] {symbol} 50", {"q": q})
        check(pa_filter, q[condition])

    # Filter with unary operators
    check("~pc.field('bool_col')", q[~q['bool_col']])

    # Filter with binary operators
    for symbol in ("+", "-", "*", "/"):
        pa_filter = f"pc.field('float_col') {symbol} 5.0 < 50.0"
        condition = eval(f"q['float_col'] {symbol} 5.0 < 50.0", {"q": q})
        check(pa_filter, q[condition])

    for symbol in ("&", "|"):
        pa_filter = f"pc.field('bool_col') {symbol} (pc.field('int_col') < 50)"
        condition = eval(f"q['bool_col'] {symbol} (q['int_col'] < 50)", {"q": q})
        check(pa_filter, q[condition])

    # Filter with expression method calls: isin given a list, a tuple and a set
    isin_cases = (
        ("pc.field('str_col').isin(['str_0', 'str_10', 'str_20'])", ['str_0', 'str_10', 'str_20']),
        ("pc.field('str_col').isin(('str_0', 'str_10', 'str_20'))", ('str_0', 'str_10', 'str_20')),
        ("pc.field('str_col').isin({'str_0', 'str_10', 'str_20'})", {'str_0', 'str_10', 'str_20'}),
    )
    for pa_filter, members in isin_cases:
        check(pa_filter, q[q['str_col'].isin(members)])

    # We expect a different result between adb and pyarrow because of the different nan/null handling
    check("pc.field('float_col').is_nan()", q[q['float_col'].isnull()], expect_equal=False)

    check("pc.field('float_col').is_null()", q[q['float_col'].isnull()])

    check("pc.field('float_col').is_valid()", q[q['float_col'].notnull()])
96
105
97
106
def test_complex_filters(lmdb_version_store_v1):
    """Check compound pyarrow filter strings against hand-built QueryBuilder filters.

    Covers nested boolean combinations and expressions that contain function
    calls, the latter resolved through an explicit ``function_map``.
    """
    lib = lmdb_version_store_v1
    q = QueryBuilder()

    # Nested complex filters
    pa_filter = "((pc.field('float_col') * 2) > 20.0) & (pc.field('int_col') <= pc.scalar(60)) | pc.field('bool_col')"
    doubled_above = q['float_col'] * 2 > 20.0
    int_at_most = q['int_col'] <= 60
    wanted = q[doubled_above & int_at_most | q['bool_col']]
    compare_against_pyarrow(pa_filter, wanted, lib)

    pa_filter = "((pc.field('float_col') / 2) > 20.0) & (pc.field('float_col') <= pc.scalar(60)) & pc.field('str_col').isin(['str_30', 'str_41', 'str_42', 'str_53', 'str_99'])"
    halved_above = q['float_col'] / 2 > 20.0
    float_at_most = q['float_col'] <= 60
    in_names = q['str_col'].isin(['str_30', 'str_41', 'str_42', 'str_53', 'str_99'])
    wanted = q[halved_above & float_at_most & in_names]
    compare_against_pyarrow(pa_filter, wanted, lib)

    # Filters with function calls
    function_map = {
        "datetime.datetime": datetime.datetime,
        "abs": abs,
    }

    pa_filter = "pc.field('datetime_col') < datetime.datetime(2025, 1, 20)"
    wanted = q[q['datetime_col'] < datetime.datetime(2025, 1, 20)]
    compare_against_pyarrow(pa_filter, wanted, lib, function_map)

    pa_filter = "(pc.field('datetime_col') < datetime.datetime(2025, 1, abs(-20))) & (pc.field('int_col') >= abs(-5))"
    before_cutoff = q['datetime_col'] < datetime.datetime(2025, 1, abs(-20))
    int_at_least = q['int_col'] >= abs(-5)
    wanted = q[before_cutoff & int_at_least]
    compare_against_pyarrow(pa_filter, wanted, lib, function_map)
121
131
122
132
def test_broken_filters ():
123
133
# ill-formatted filter
0 commit comments