Skip to content

Commit f2500d1

Browse files
authored
Strip Parquet file_group information from expected plans (#16)
* ignore Parquet paths in tests * remove file group info
1 parent 2c8b8b8 commit f2500d1

24 files changed

+168
-443
lines changed

README.md

+16-5
Original file line numberDiff line numberDiff line change
@@ -120,18 +120,29 @@ python -m pip install -r requirements-in.txt
120120

121121
Whenever rust code changes (your changes or via `git pull`):
122122

123-
````bash
123+
```bash
124124
# make sure you activate the venv using "source venv/bin/activate" first
125-
maturin develop python -m pytest ```
126-
125+
maturin develop
python -m pytest
126+
```
127127

128128
## Testing
129129

130130
Running local Rust tests requires generating the tpch-data. This can be done
131-
by running the following command:
131+
by running the following commands:
132132

133133
```bash
134-
./scripts/generate_tpch_data.sh
134+
export TPCH_TEST_PARTITIONS=1
135+
export TPCH_SCALING_FACTOR=1
136+
./scripts/gen-test-data.sh
137+
```
138+
139+
This will generate data into a top-level `data` directory.
140+
141+
Tests can be run with:
142+
143+
```shell
144+
export TPCH_DATA_PATH=`pwd`/data
145+
cargo test
135146
```
136147

137148
Tests compare plans with expected plans, which unfortunately contain the

scripts/replace-expected-plans-paths.sh

-44
This file was deleted.

src/planner.rs

+10-9
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,9 @@ mod test {
389389
}
390390

391391
async fn do_test(n: u8) -> TestResult<()> {
392-
let data_path = env::var("TPCH_DATA_PATH")?;
392+
let tpch_path_env_var = "TPCH_DATA_PATH";
393+
let data_path = env::var(tpch_path_env_var).expect(&format!("Environment variable {} not found", tpch_path_env_var));
394+
393395
let file = format!("testdata/queries/q{n}.sql");
394396
let sql = fs::read_to_string(&file)?;
395397
let config = SessionConfig::new().with_target_partitions(1);
@@ -432,19 +434,18 @@ mod test {
432434
displayable(query_stage.plan.as_ref()).indent(false)
433435
));
434436
}
437+
438+
// Remove Parquet file group information since it will vary between CI/CD and local
439+
let re = Regex::new(r"file_groups=\{.*}")?;
440+
let cleaned_output = re.replace_all(output.as_str(), "file_groups={ ... }");
441+
435442
let expected_file = format!("testdata/expected-plans/q{n}.txt");
436443
if !Path::new(&expected_file).exists() {
437-
fs::write(&expected_file, &output)?;
444+
fs::write(&expected_file, &*cleaned_output)?;
438445
}
439446
let expected_plan = fs::read_to_string(&expected_file)?;
440447

441-
let re = Regex::new(r":[^]]*]")?;
442-
443-
// Remove the byte offsets from the plans, seems non repeatable
444-
// between CI/CD and local
445-
let cleaned_expected_plan = re.replace_all(&expected_plan, "]");
446-
let cleaned_output = re.replace_all(&output, "]");
447-
assert_eq!(cleaned_expected_plan, cleaned_output);
448+
assert_eq!(expected_plan, cleaned_output);
448449
Ok(())
449450
}
450451
}

testdata/expected-plans/q1.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ SortExec: expr=[l_returnflag@0 ASC NULLS LAST,l_linestatus@1 ASC NULLS LAST], pr
1717
ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus]
1818
CoalesceBatchesExec: target_batch_size=8192
1919
FilterExec: l_shipdate@6 <= 1998-09-24
20-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/lineitem.parquet:0..208317955]]}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[]
20+
ParquetExec: file_groups={ ... }, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[]
2121

2222
RaySQL Plan
2323
===========
@@ -29,5 +29,5 @@ SortExec: expr=[l_returnflag@0 ASC NULLS LAST,l_linestatus@1 ASC NULLS LAST], pr
2929
ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus]
3030
CoalesceBatchesExec: target_batch_size=8192
3131
FilterExec: l_shipdate@6 <= 1998-09-24
32-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/lineitem.parquet:0..208317955]]}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[]
32+
ParquetExec: file_groups={ ... }, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[]
3333

testdata/expected-plans/q10.txt

+8-8
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,20 @@ SortExec: TopK(fetch=20), expr=[revenue@2 DESC], preserve_partitioning=[false]
2929
ProjectionExec: expr=[c_custkey@1 as c_custkey, c_name@2 as c_name, c_address@3 as c_address, c_phone@4 as c_phone, c_acctbal@5 as c_acctbal, c_comment@6 as c_comment, l_extendedprice@7 as l_extendedprice, l_discount@8 as l_discount, n_name@0 as n_name]
3030
CoalesceBatchesExec: target_batch_size=8192
3131
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], projection=[n_name@1, c_custkey@2, c_name@3, c_address@4, c_phone@6, c_acctbal@7, c_comment@8, l_extendedprice@9, l_discount@10]
32-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/nation.parquet]]}, projection=[n_nationkey, n_name]
32+
ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name]
3333
CoalesceBatchesExec: target_batch_size=8192
3434
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10]
3535
CoalesceBatchesExec: target_batch_size=8192
3636
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7]
37-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/customer.parquet:0..13211178]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment]
37+
ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment]
3838
ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey]
3939
CoalesceBatchesExec: target_batch_size=8192
4040
FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01
41-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/orders.parquet:0..54530383]]}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[]
41+
ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[]
4242
ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount]
4343
CoalesceBatchesExec: target_batch_size=8192
4444
FilterExec: l_returnflag@3 = R
45-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/lineitem.parquet:0..208317955]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)]
45+
ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)]
4646

4747
RaySQL Plan
4848
===========
@@ -54,18 +54,18 @@ SortExec: TopK(fetch=20), expr=[revenue@2 DESC], preserve_partitioning=[false]
5454
ProjectionExec: expr=[c_custkey@1 as c_custkey, c_name@2 as c_name, c_address@3 as c_address, c_phone@4 as c_phone, c_acctbal@5 as c_acctbal, c_comment@6 as c_comment, l_extendedprice@7 as l_extendedprice, l_discount@8 as l_discount, n_name@0 as n_name]
5555
CoalesceBatchesExec: target_batch_size=8192
5656
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], projection=[n_name@1, c_custkey@2, c_name@3, c_address@4, c_phone@6, c_acctbal@7, c_comment@8, l_extendedprice@9, l_discount@10]
57-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/nation.parquet]]}, projection=[n_nationkey, n_name]
57+
ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name]
5858
CoalesceBatchesExec: target_batch_size=8192
5959
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10]
6060
CoalesceBatchesExec: target_batch_size=8192
6161
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7]
62-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/customer.parquet:0..13211178]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment]
62+
ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment]
6363
ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey]
6464
CoalesceBatchesExec: target_batch_size=8192
6565
FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01
66-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/orders.parquet:0..54530383]]}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[]
66+
ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[]
6767
ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount]
6868
CoalesceBatchesExec: target_batch_size=8192
6969
FilterExec: l_returnflag@3 = R
70-
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/lineitem.parquet:0..208317955]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)]
70+
ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)]
7171

0 commit comments

Comments
 (0)