use std::{path::Path, sync::Arc};

use arrow_array::{Array, RecordBatch};
use arrow_ord::sort::{lexsort_to_indices, SortColumn};
use arrow_schema::{DataType, Schema};
use arrow_select::{concat::concat_batches, take::take};
use delta_kernel::DeltaResult;
use futures::{stream::TryStreamExt, StreamExt};
use object_store::{local::LocalFileSystem, ObjectStore};
use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};

use super::TestCaseInfo;
use crate::TestResult;

/// Read the "golden" expected data for a test case by concatenating every parquet file
/// under its `expected/latest/table_content` directory into a single record batch.
pub async fn read_golden(path: &Path, _version: Option<&str>) -> DeltaResult<RecordBatch> {
    let expected_root = path.join("expected").join("latest").join("table_content");
    let store = Arc::new(LocalFileSystem::new_with_prefix(&expected_root)?);
    let files: Vec<_> = store.list(None).try_collect().await?;
    let mut batches = vec![];
    let mut schema = None;
    for meta in files.into_iter() {
        if let Some(ext) = meta.location.extension() {
            if ext == "parquet" {
                let reader = ParquetObjectReader::new(store.clone(), meta.location);
                let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
                if schema.is_none() {
                    schema = Some(builder.schema().clone());
                }
                let mut stream = builder.build()?;
                while let Some(batch) = stream.next().await {
                    batches.push(batch?);
                }
            }
        }
    }
    let all_data = concat_batches(&schema.unwrap(), &batches)?;
    Ok(all_data)
}

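// For reference, the golden-table layout that `read_golden` (above) expects. This is
// illustrative only, reconstructed from the path handling in that function; the actual
// parquet file names will differ per test case:
//
//   <test_case_root>/
//     expected/
//       latest/
//         table_content/
//           *.parquet
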
/// Sort a record batch lexicographically by all of its sortable columns.
pub fn sort_record_batch(batch: RecordBatch) -> DeltaResult<RecordBatch> {
    // Use every column with a defined sort order as a sort key
    let mut sort_columns = vec![];
    for col in batch.columns() {
        match col.data_type() {
            DataType::Struct(_) | DataType::List(_) | DataType::Map(_, _) => {
                // can't sort structs, lists, or maps, so skip them as sort keys
            }
            _ => sort_columns.push(SortColumn {
                values: col.clone(),
                options: None,
            }),
        }
    }
    // Reorder every column by the same indices so rows stay aligned
    let indices = lexsort_to_indices(&sort_columns, None)?;
    let columns = batch
        .columns()
        .iter()
        .map(|c| take(c, &indices, None).unwrap())
        .collect();
    Ok(RecordBatch::try_new(batch.schema(), columns)?)
}
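
// A minimal sketch (not part of the original change) of how `sort_record_batch` above
// behaves on a tiny two-column batch: rows are reordered by the lexicographic order of
// the sortable columns, and all columns are permuted together so rows stay intact.
// Column names and values here are illustrative only.
#[cfg(test)]
mod sort_record_batch_example {
    use super::*;
    use arrow_array::{ArrayRef, Int32Array, StringArray};

    #[test]
    fn sorts_rows_by_all_sortable_columns() {
        let batch = RecordBatch::try_from_iter(vec![
            ("id", Arc::new(Int32Array::from(vec![3, 1, 2])) as ArrayRef),
            ("name", Arc::new(StringArray::from(vec!["c", "a", "b"])) as ArrayRef),
        ])
        .expect("failed to build batch");

        let sorted = sort_record_batch(batch).expect("failed to sort batch");

        // Rows are now ordered by (id, name); both columns were permuted together.
        let expected_ids: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
        let expected_names: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
        assert_eq!(sorted.column(0), &expected_ids);
        assert_eq!(sorted.column(1), &expected_names);
    }
}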

// Ensure that two schemas have the same field names and the same dict_id/ordering.
// We ignore:
// - data type: this is checked already in `assert_columns_match`
// - nullability: parquet marks many things as nullable that we don't in our schema
// - metadata: because it diverges between the real data and the golden table data
fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
    for (schema_field, golden_field) in schema.fields.iter().zip(golden.fields.iter()) {
        assert!(
            schema_field.name() == golden_field.name(),
            "Field names don't match"
        );
        assert!(
            schema_field.dict_id() == golden_field.dict_id(),
            "Field dict_id doesn't match"
        );
        assert!(
            schema_field.dict_is_ordered() == golden_field.dict_is_ordered(),
            "Field dict_is_ordered doesn't match"
        );
    }
}

// Some things are equivalent but don't show up as equivalent for `==`, so normalize them
// here. Currently this only rewrites the "+00:00" timestamp timezone into the equivalent
// "UTC" spelling.
fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
    if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
        if **zone == *"+00:00" {
            arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into())))
                .expect("Could not cast to UTC")
        } else {
            col
        }
    } else {
        col
    }
}
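
// A small illustration (not part of the original change) of why the normalization above
// matters: the same instant tagged with the "+00:00" offset and with the "UTC" timezone
// only compares equal once the timezone spelling has been normalized.
#[cfg(test)]
mod normalize_col_example {
    use super::*;
    use arrow_array::TimestampMicrosecondArray;

    #[test]
    fn plus_zero_offset_normalizes_to_utc() {
        let offset: Arc<dyn Array> =
            Arc::new(TimestampMicrosecondArray::from(vec![0i64]).with_timezone("+00:00"));
        let utc: Arc<dyn Array> =
            Arc::new(TimestampMicrosecondArray::from(vec![0i64]).with_timezone("UTC"));

        // The raw arrays differ because their timezone strings (and hence data types) differ...
        assert_ne!(&offset, &utc);
        // ...but after normalization both carry "UTC" and compare equal.
        assert_eq!(&normalize_col(offset), &utc);
    }
}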

fn assert_columns_match(actual: &[Arc<dyn Array>], expected: &[Arc<dyn Array>]) {
    for (actual, expected) in actual.iter().zip(expected) {
        let actual = normalize_col(actual.clone());
        let expected = normalize_col(expected.clone());
        // note that array equality includes data_type equality
        // See: https://arrow.apache.org/rust/arrow_data/equal/fn.equal.html
        assert_eq!(
            &actual, &expected,
            "Column data didn't match. Got {actual:?}, expected {expected:?}"
        );
    }
}

/// Assert that the scanned data matches the golden data for the given test case:
/// same column contents, matching schema fields, and the same number of rows.
pub async fn assert_scan_data(
    all_data: Vec<RecordBatch>,
    test_case: &TestCaseInfo,
) -> TestResult<()> {
    let all_data = concat_batches(&all_data[0].schema(), all_data.iter()).unwrap();
    let all_data = sort_record_batch(all_data)?;

    let golden = read_golden(test_case.root_dir(), None).await?;
    let golden = sort_record_batch(golden)?;

    assert_columns_match(all_data.columns(), golden.columns());
    assert_schema_fields_match(all_data.schema().as_ref(), golden.schema().as_ref());
    assert!(
        all_data.num_rows() == golden.num_rows(),
        "Didn't have same number of rows"
    );

    Ok(())
}