
Commit 3338af1

chore: setup dat test scaffolding
This commit was modified from delta-io#3137 to enable an independent merge, bringing some of the structural changes needed for delta-kernel-rs integration piecemeal.

Signed-off-by: Robert Pack <[email protected]>
Signed-off-by: R. Tyler Croy <[email protected]>
1 parent 89ddf07 commit 3338af1

File tree

8 files changed: +249 -2 lines changed

.github/actions/load_dat/action.yaml (+26)

@@ -0,0 +1,26 @@
+name: Delta Acceptance Tests
+description: Load Delta Lake acceptance test data
+
+inputs:
+  version:
+    description: "The DAT release version to download"
+    required: false
+    default: "0.0.3"
+
+  target-directory:
+    description: target directory for acceptance test data
+    required: false
+    default: ${{ github.workspace }}/dat
+
+runs:
+  using: composite
+
+  steps:
+    - name: load DAT
+      shell: bash
+      run: |
+        rm -rf ${{ inputs.target-directory }}
+        curl -OL https://github.com/delta-incubator/dat/releases/download/v${{ inputs.version }}/deltalake-dat-v${{ inputs.version }}.tar.gz
+        mkdir -p ${{ inputs.target-directory }}
+        tar --no-same-permissions -xzf deltalake-dat-v${{ inputs.version }}.tar.gz --directory ${{ inputs.target-directory }}
+        rm deltalake-dat-v${{ inputs.version }}.tar.gz
.github/actions/setup-env/action.yml (+2 -2)

@@ -4,12 +4,12 @@ description: "Set up Python, virtual environment, and Rust toolchain"
 inputs:
   python-version:
     description: "The Python version to set up"
-    required: true
+    required: false
     default: "3.10"

   rust-toolchain:
     description: "The Rust toolchain to set up"
-    required: true
+    required: false
     default: "stable"

 runs:
.gitignore (+2)

@@ -23,6 +23,7 @@ __blobstorage__
 .githubchangeloggenerator.cache/
 .githubchangeloggenerator*
 data
+.zed/

 # Add all Cargo.lock files except for those in binary crates
 Cargo.lock
@@ -35,3 +36,4 @@ site
 __pycache__
 .zed
 .zed/
+dat/
crates/test/Cargo.toml (+10)

@@ -5,8 +5,16 @@ edition = "2021"
 publish = false

 [dependencies]
+arrow-array = { workspace = true, features = ["chrono-tz"] }
+arrow-cast = { workspace = true }
+arrow-ord = { workspace = true }
+arrow-schema = { workspace = true, features = ["serde"] }
+arrow-select = { workspace = true }
+parquet = { workspace = true, features = ["async", "object_store"] }
+
 bytes = { workspace = true }
 chrono = { workspace = true, default-features = false, features = ["clock"] }
+delta_kernel = { workspace = true }
 deltalake-core = { version = "0.26.0", path = "../core" }
 dotenvy = "0"
 fs_extra = "1.3.0"
@@ -16,7 +24,9 @@ rand = "0.8"
 serde = { workspace = true, features = ["derive"] }
 serde_json = { workspace = true }
 tempfile = "3"
+thiserror = { workspace = true }
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
+url = { workspace = true }

 [features]
 default = []

crates/test/src/acceptance/data.rs (+130)

@@ -0,0 +1,130 @@
+use std::{path::Path, sync::Arc};
+
+use arrow_array::{Array, RecordBatch};
+use arrow_ord::sort::{lexsort_to_indices, SortColumn};
+use arrow_schema::{DataType, Schema};
+use arrow_select::{concat::concat_batches, take::take};
+use delta_kernel::DeltaResult;
+use futures::{stream::TryStreamExt, StreamExt};
+use object_store::{local::LocalFileSystem, ObjectStore};
+use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};
+
+use super::TestCaseInfo;
+use crate::TestResult;
+
+pub async fn read_golden(path: &Path, _version: Option<&str>) -> DeltaResult<RecordBatch> {
+    let expected_root = path.join("expected").join("latest").join("table_content");
+    let store = Arc::new(LocalFileSystem::new_with_prefix(&expected_root)?);
+    let files: Vec<_> = store.list(None).try_collect().await?;
+    let mut batches = vec![];
+    let mut schema = None;
+    for meta in files.into_iter() {
+        if let Some(ext) = meta.location.extension() {
+            if ext == "parquet" {
+                let reader = ParquetObjectReader::new(store.clone(), meta.location);
+                let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
+                if schema.is_none() {
+                    schema = Some(builder.schema().clone());
+                }
+                let mut stream = builder.build()?;
+                while let Some(batch) = stream.next().await {
+                    batches.push(batch?);
+                }
+            }
+        }
+    }
+    let all_data = concat_batches(&schema.unwrap(), &batches)?;
+    Ok(all_data)
+}
+
+pub fn sort_record_batch(batch: RecordBatch) -> DeltaResult<RecordBatch> {
+    // Sort by all columns
+    let mut sort_columns = vec![];
+    for col in batch.columns() {
+        match col.data_type() {
+            DataType::Struct(_) | DataType::List(_) | DataType::Map(_, _) => {
+                // can't sort structs, lists, or maps
+            }
+            _ => sort_columns.push(SortColumn {
+                values: col.clone(),
+                options: None,
+            }),
+        }
+    }
+    let indices = lexsort_to_indices(&sort_columns, None)?;
+    let columns = batch
+        .columns()
+        .iter()
+        .map(|c| take(c, &indices, None).unwrap())
+        .collect();
+    Ok(RecordBatch::try_new(batch.schema(), columns)?)
+}
+
+// Ensure that two schemas have the same field names, and dict_id/ordering.
+// We ignore:
+//  - data type: This is checked already in `assert_columns_match`
+//  - nullability: parquet marks many things as nullable that we don't in our schema
+//  - metadata: because that diverges from the real data to the golden table data
+fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
+    for (schema_field, golden_field) in schema.fields.iter().zip(golden.fields.iter()) {
+        assert!(
+            schema_field.name() == golden_field.name(),
+            "Field names don't match"
+        );
+        assert!(
+            schema_field.dict_id() == golden_field.dict_id(),
+            "Field dict_id doesn't match"
+        );
+        assert!(
+            schema_field.dict_is_ordered() == golden_field.dict_is_ordered(),
+            "Field dict_is_ordered doesn't match"
+        );
+    }
+}
+
+// some things are equivalent, but don't show up as equivalent for `==`, so we normalize here
+fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
+    if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
+        if **zone == *"+00:00" {
+            arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into())))
+                .expect("Could not cast to UTC")
+        } else {
+            col
+        }
+    } else {
+        col
+    }
+}
+
+fn assert_columns_match(actual: &[Arc<dyn Array>], expected: &[Arc<dyn Array>]) {
+    for (actual, expected) in actual.iter().zip(expected) {
+        let actual = normalize_col(actual.clone());
+        let expected = normalize_col(expected.clone());
+        // note that array equality includes data_type equality
+        // See: https://arrow.apache.org/rust/arrow_data/equal/fn.equal.html
+        assert_eq!(
+            &actual, &expected,
+            "Column data didn't match. Got {actual:?}, expected {expected:?}"
+        );
+    }
+}
+
+pub async fn assert_scan_data(
+    all_data: Vec<RecordBatch>,
+    test_case: &TestCaseInfo,
+) -> TestResult<()> {
+    let all_data = concat_batches(&all_data[0].schema(), all_data.iter()).unwrap();
+    let all_data = sort_record_batch(all_data)?;
+
+    let golden = read_golden(test_case.root_dir(), None).await?;
+    let golden = sort_record_batch(golden)?;
+
+    assert_columns_match(all_data.columns(), golden.columns());
+    assert_schema_fields_match(all_data.schema().as_ref(), golden.schema().as_ref());
+    assert!(
+        all_data.num_rows() == golden.num_rows(),
+        "Didn't have same number of rows"
+    );
+
+    Ok(())
+}

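For context, a minimal sketch (not part of this commit) of how the row-order normalization in data.rs behaves: sort_record_batch orders a batch lexicographically by every sortable column so that scan output and golden data can be compared regardless of row order. The crate name deltalake_test used below is an assumption, not taken from this diff.

use std::sync::Arc;

use arrow_array::{Array, ArrayRef, Int32Array, RecordBatch, StringArray};
use deltalake_test::acceptance::sort_record_batch; // assumed crate/module path

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build a small unordered batch with one sortable int column and one string column.
    let batch = RecordBatch::try_from_iter(vec![
        ("id", Arc::new(Int32Array::from(vec![3, 1, 2])) as ArrayRef),
        ("name", Arc::new(StringArray::from(vec!["c", "a", "b"])) as ArrayRef),
    ])?;

    // Rows come back ordered lexicographically across the sortable columns.
    let sorted = sort_record_batch(batch)?;
    let ids: Vec<i32> = sorted
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap()
        .values()
        .to_vec();
    assert_eq!(ids, vec![1, 2, 3]);
    Ok(())
}
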
crates/test/src/acceptance/meta.rs (+73)

@@ -0,0 +1,73 @@
+use std::collections::HashMap;
+use std::fs::File;
+use std::path::{Path, PathBuf};
+
+use delta_kernel::{Error, Version};
+use serde::{Deserialize, Serialize};
+use url::Url;
+
+#[derive(Debug, thiserror::Error)]
+pub enum AssertionError {
+    #[error("Invalid test case data")]
+    InvalidTestCase,
+
+    #[error("Kernel error: {0}")]
+    KernelError(#[from] Error),
+}
+
+pub type TestResult<T, E = AssertionError> = std::result::Result<T, E>;
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+struct TestCaseInfoJson {
+    name: String,
+    description: String,
+}
+
+#[derive(PartialEq, Eq, Debug)]
+pub struct TestCaseInfo {
+    name: String,
+    description: String,
+    root_dir: PathBuf,
+}
+
+impl TestCaseInfo {
+    /// Root path for this test case's Delta table.
+    pub fn table_root(&self) -> TestResult<Url> {
+        let table_root = self.root_dir.join("delta");
+        Url::from_directory_path(table_root).map_err(|_| AssertionError::InvalidTestCase)
+    }
+
+    pub fn root_dir(&self) -> &PathBuf {
+        &self.root_dir
+    }
+
+    pub fn table_summary(&self) -> TestResult<TableVersionMetaData> {
+        let info_path = self
+            .root_dir()
+            .join("expected/latest/table_version_metadata.json");
+        let file = File::open(info_path).map_err(|_| AssertionError::InvalidTestCase)?;
+        let info: TableVersionMetaData =
+            serde_json::from_reader(file).map_err(|_| AssertionError::InvalidTestCase)?;
+        Ok(info)
+    }
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct TableVersionMetaData {
+    pub version: Version,
+    pub properties: HashMap<String, String>,
+    pub min_reader_version: i32,
+    pub min_writer_version: i32,
+}
+
+pub fn read_dat_case(case_root: impl AsRef<Path>) -> TestResult<TestCaseInfo> {
+    let info_path = case_root.as_ref().join("test_case_info.json");
+    let file = File::open(info_path).map_err(|_| AssertionError::InvalidTestCase)?;
+    let info: TestCaseInfoJson =
+        serde_json::from_reader(file).map_err(|_| AssertionError::InvalidTestCase)?;
+    Ok(TestCaseInfo {
+        root_dir: case_root.as_ref().into(),
+        name: info.name,
+        description: info.description,
+    })
+}

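For context, a minimal sketch (not part of this commit) of how a test might consume a DAT case through this module: read_dat_case loads test_case_info.json, then table_root() and table_summary() expose the table location and expected metadata. The crate name deltalake_test and the dat/out/reader_tests/generated/... layout under the load_dat target directory are assumptions, not taken from this diff.

use deltalake_test::acceptance::read_dat_case; // assumed crate name

#[test]
fn read_case_metadata() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical case directory; adjust to wherever the load_dat action unpacked the data.
    let case_root = std::fs::canonicalize("dat/out/reader_tests/generated/basic_append")?;
    let case = read_dat_case(&case_root)?;

    // table_root() yields a file:// URL for the `delta` directory inside the case.
    let table_url = case.table_root()?;
    let summary = case.table_summary()?;
    println!("table {} is at version {}", table_url, summary.version);
    assert!(summary.min_reader_version >= 1);
    Ok(())
}
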
crates/test/src/acceptance/mod.rs (+5)

@@ -0,0 +1,5 @@
+pub mod data;
+pub mod meta;
+
+pub use data::*;
+pub use meta::*;

crates/test/src/lib.rs (+1)

@@ -14,6 +14,7 @@ use deltalake_core::DeltaTableBuilder;
 use deltalake_core::{ObjectStore, Path};
 use tempfile::TempDir;

+pub mod acceptance;
 pub mod clock;
 pub mod concurrent;
 #[cfg(feature = "datafusion")]
