diff --git a/.github/workflows/tpcdsgen-conformance.yml b/.github/workflows/tpcdsgen-conformance.yml
index 88c94d4..e4bcbb9 100644
--- a/.github/workflows/tpcdsgen-conformance.yml
+++ b/.github/workflows/tpcdsgen-conformance.yml
@@ -3,17 +3,23 @@ name: TPC-DS Conformance
on:
push:
branches: [ main, master ]
+ paths:
+ - 'tpcdsgen/**'
+ - '.github/**'
pull_request:
branches: [ main, master ]
+ paths:
+ - 'tpcdsgen/**'
+ - '.github/**'
env:
CARGO_TERM_COLOR: always
RUST_BACKTRACE: 1
jobs:
- # Conformance testing against Java implementation
+ # Conformance testing against the Java / Trino reference implementation.
conformance-tests:
- name: Conformance Tests
+ name: Conformance Tests (Java)
runs-on: ubuntu-latest
steps:
@@ -65,7 +71,60 @@ jobs:
if: failure() # Upload fixtures if tests fail for debugging
uses: actions/upload-artifact@v7
with:
- name: test-fixtures
+ name: test-fixtures-java
+ path: tpcdsgen/tests/fixtures/
+ retention-days: 7
+
+ # Conformance testing against the C dsdgen reference implementation.
+ #
+ # Reference data is pre-generated and lives in
+ # https://github.com/alamb/tpcds-data (branch sf1).
+ # `generate-fixtures.sh --compat c` clones it with --depth 1 and extracts
+ # into tpcdsgen/tests/fixtures/scale-1-c/. Rust is then run in
+ # --compat c mode and the .dat output is compared byte-for-byte (MD5/diff).
+ conformance-tests-c:
+ name: Conformance Tests (C dsdgen)
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v6
+
+ - name: Install Rust toolchain
+ uses: dtolnay/rust-toolchain@stable
+
+ - name: Cache Rust dependencies
+ uses: actions/cache@v5
+ with:
+ path: |
+ ~/.cargo/bin/
+ ~/.cargo/registry/index/
+ ~/.cargo/registry/cache/
+ ~/.cargo/git/db/
+ target/
+ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+ restore-keys: |
+ ${{ runner.os }}-cargo-
+
+ - name: Download C dsdgen reference data
+ run: |
+ cd tpcdsgen
+ ./scripts/generate-fixtures.sh --compat c --scale 1
+
+ - name: Build Rust table generators
+ run: |
+ cargo build --release -p tpcdsgen
+
+ - name: Run conformance tests (Rust --compat c vs C dsdgen)
+ run: |
+ cd tpcdsgen
+ ./scripts/test-all-tables.sh --compat c
+
+ - name: Upload test fixtures as artifacts
+ if: failure()  # Upload fixtures only if an earlier step failed, for debugging
+ uses: actions/upload-artifact@v7
+ with:
+ name: test-fixtures-c
path: tpcdsgen/tests/fixtures/
retention-days: 7
diff --git a/tpcdsgen/.gitignore b/tpcdsgen/.gitignore
index 233832c..adcd66c 100644
--- a/tpcdsgen/.gitignore
+++ b/tpcdsgen/.gitignore
@@ -8,9 +8,6 @@
# Test fixtures (generated).
#/tests/fixtures/
-# Python cache.
-scripts/__pycache__/
-
# Stuff I need to remember
NEXT_STEPS.md
ISSUES.md
diff --git a/tpcdsgen/.python-version b/tpcdsgen/.python-version
deleted file mode 100644
index 6324d40..0000000
--- a/tpcdsgen/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.14
diff --git a/tpcdsgen/README.md b/tpcdsgen/README.md
index 623dff0..8eb1174 100644
--- a/tpcdsgen/README.md
+++ b/tpcdsgen/README.md
@@ -29,61 +29,47 @@ Fixtures are pre-generated TPC-DS data files used for conformance testing.
```
tests/fixtures/
-├── java/ # Java reference implementation output
-│ ├── scale-1/ # 25 tables, ~1.2GB
-│ └── scale-10/ # 25 tables, ~11GB
-└── rust/ # Rust implementation output
- ├── scale-1/ # 25 tables, ~1.2GB
- └── scale-10/ # 25 tables, ~11GB
+├── scale-1-java/ # Java reference fixtures (`--compat trino`)
+├── scale-1-c/ # C dsdgen reference fixtures (`--compat c`)
+└── scale-10-java/ # higher scale factors as needed
```
-### Generating Java Fixtures
-
-Requires the Java TPC-DS implementation to be built:
+### Conformance Testing
-```bash
-# Build Java implementation (if not already built)
-cd ../tpcds && mvn clean package -DskipTests && cd -
-
-# Generate Java fixtures for scale 1
-java -jar ../tpcds/target/tpcds-1.5-SNAPSHOT-jar-with-dependencies.jar \
- --scale 1 \
- --directory tests/fixtures/java/scale-1 \
- --overwrite
-
-# Generate Java fixtures for scale 10
-java -jar ../tpcds/target/tpcds-1.5-SNAPSHOT-jar-with-dependencies.jar \
- --scale 10 \
- --directory tests/fixtures/java/scale-10 \
- --overwrite
-```
+`tpcdsgen` ships with two conformance suites, both implemented as shell
+scripts that do byte-for-byte (MD5) comparison of `.dat` output. See
+[scripts/README.md](scripts/README.md) for full details.
-### Generating Rust Fixtures
+**vs. Java / Trino reference (default, `--compat trino`):**
```bash
-# Build Rust implementation
-cargo build --release
+# One-time: clone & build the Java TPC-DS implementation.
+./scripts/bootstrap-java.sh
-# Generate Rust fixtures for scale 1
-./target/release/tpcdsgen --scale 1 --directory tests/fixtures/rust/scale-1
+# Generate Java reference fixtures into tests/fixtures/scale-N-java/.
+./scripts/generate-fixtures.sh
-# Generate Rust fixtures for scale 10
-./target/release/tpcdsgen --scale 10 --directory tests/fixtures/rust/scale-10
+# Compare Rust output byte-for-byte against the Java fixtures.
+./scripts/test-all-tables.sh --scale 1
```
-### Conformance Testing
-
-To verify Rust output matches Java byte-for-byte:
+**vs. C dsdgen reference (`--compat c`):**
```bash
-# Run conformance tests at scale 1
-./scripts/test-all-tables.sh --scale 1
+# One-time: download pre-generated C dsdgen data from
+# https://github.com/alamb/tpcds-data into tests/fixtures/scale-N-c/.
+./scripts/generate-fixtures.sh --compat c --scale 1
-# Run conformance tests at scale 10
-./scripts/test-all-tables.sh --scale 10
+# Compare Rust --compat c output byte-for-byte against the C fixtures.
+./scripts/test-all-tables.sh --compat c --scale 1
```
-See [HASHES.md](HASHES.md) for the canonical MD5 hashes.
+Both suites also support comparing a single table:
+
+```bash
+./scripts/compare-table.sh reason # vs. Java
+./scripts/compare-table.sh reason --compat c # vs. C dsdgen
+```
### Verifying Fixtures with MD5SUMS
@@ -91,13 +77,13 @@ Each fixture directory contains an `MD5SUMS` file for verification.
**On Linux:**
```bash
-cd tests/fixtures/java/scale-1
+cd tests/fixtures/scale-1-java
md5sum -c MD5SUMS
```
**On macOS:**
```bash
-cd tests/fixtures/java/scale-1
+cd tests/fixtures/scale-1-java
while read hash file; do
[[ $(md5 -q "$file") == "$hash" ]] && echo "$file: OK" || echo "$file: FAILED"
done < MD5SUMS
diff --git a/tpcdsgen/main.py b/tpcdsgen/main.py
deleted file mode 100644
index b59da6e..0000000
--- a/tpcdsgen/main.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def main():
- print("Hello from tpcdsgen!")
-
-
-if __name__ == "__main__":
- main()
diff --git a/tpcdsgen/pyproject.toml b/tpcdsgen/pyproject.toml
deleted file mode 100644
index 2fa8884..0000000
--- a/tpcdsgen/pyproject.toml
+++ /dev/null
@@ -1,10 +0,0 @@
-[project]
-name = "tpcdsgen"
-version = "0.1.0"
-description = "Add your description here"
-readme = "README.md"
-requires-python = ">=3.14"
-dependencies = [
- "datafusion>=53.0.0",
- "pyarrow>=24.0.0",
-]
diff --git a/tpcdsgen/scripts/README.md b/tpcdsgen/scripts/README.md
index 3c71f6a..bc3629a 100644
--- a/tpcdsgen/scripts/README.md
+++ b/tpcdsgen/scripts/README.md
@@ -1,285 +1,143 @@
# TPC-DS Test Scripts
-This directory contains scripts for testing the Rust TPC-DS implementation against the Java reference implementation.
+This directory contains scripts for testing the Rust TPC-DS implementation
+against two reference implementations:
-## Overview
+1. **Java / Trino** (default, `--compat trino`) — the Java port of `dsdgen`
+ used by Trino. The Rust port was originally derived from this and is
+ expected to be byte-for-byte identical.
+2. **C `dsdgen`** (`--compat c`) — the original TPC-supplied reference
+ implementation. The `--compat c` mode corrects bugs in the Java port to
+ match the C reference (see [BUGS.md](../BUGS.md) and the parent
+ [README](../README.md)).
-The testing infrastructure validates that the Rust port generates **byte-for-byte identical** output to the Java
-implementation (which itself maintains bug-for-bug compatibility with the original C dsdgen).
-
-## Prerequisites
-
-You need the Java TPC-DS implementation for conformance testing. Use the bootstrap script to set it up:
-
-```bash
-./scripts/bootstrap-java.sh
-```
-
-This will clone and build the Java implementation automatically. See the bootstrap section below for details.
+Both conformance suites validate **byte-for-byte identical** output via
+MD5/`diff` comparison.
## Directory Structure
```
tpcdsgen/
├── tests/
-│ └── fixtures/ # Generated reference data (gitignored)
-│ └── scale-1/ # Scale factor 1 reference data
+│ └── fixtures/ # Reference data (gitignored)
+│ ├── scale-1-java/ # Java reference (`--compat trino`)
+│ │ ├── call_center.dat
+│ │ ├── warehouse.dat
+│ │ └── ... (all 25 tables)
+│ └── scale-1-c/ # C dsdgen reference (`--compat c`)
│ ├── call_center.dat
│ ├── warehouse.dat
│ └── ... (all 25 tables)
└── scripts/
- ├── bootstrap-java.sh # Setup Java TPC-DS implementation
- ├── generate-fixtures.sh # Generate Java reference data
- ├── compare-table.sh # Compare one table
- ├── test-all-tables.sh # Test all ported tables
- ├── clean-fixtures.sh # Clean up fixtures
- └── README.md # This file
+ ├── bootstrap-java.sh # Clone + build the Java TPC-DS impl
+ ├── generate-fixtures.sh # Generate/download reference fixtures
+ │ # (Java via --compat trino; C via --compat c)
+ ├── compare-table.sh # Compare one table
+ ├── test-all-tables.sh # Compare all ported tables
+ ├── clean-fixtures.sh # Clean fixtures
+ └── README.md # This file
```
-## Quick Start
+## Quick Start — Java conformance (`--compat trino`)
```bash
# 1. Bootstrap Java implementation (first time only)
./scripts/bootstrap-java.sh
-# 2. Generate reference fixtures
+# 2. Generate Java reference fixtures into tests/fixtures/scale-N-java/.
./scripts/generate-fixtures.sh
-# 3. Test all ported tables
+# 3. Test all ported tables against the Java reference.
./scripts/test-all-tables.sh
```
-## Scripts
-
-### 0. `bootstrap-java.sh` - Setup Java TPC-DS Implementation
-
-**⚠️ Run this first!** Sets up the Java TPC-DS implementation needed for conformance testing.
-
-**Usage:**
-```bash
-# First time setup (clone and build)
-./scripts/bootstrap-java.sh
-
-# Force rebuild
-./scripts/bootstrap-java.sh --rebuild
-
-# Verify existing installation
-./scripts/bootstrap-java.sh --verify
-
-# Show help
-./scripts/bootstrap-java.sh --help
-```
-
-**What it does:**
-1. Checks if Java and Maven are installed
-2. Clones the Java TPC-DS repository from GitHub (if needed)
-3. Builds the Java implementation with Maven
-4. Runs a smoke test to verify it works
-
-**Requirements:**
-- Java 11+ (e.g., `brew install openjdk@11`)
-- Maven (e.g., `brew install maven`)
-- Git
-
-**Environment Variables:**
-- `TPCDS_JAVA_REPO` - Override the Java repo URL (default: https://github.com/trinodb/tpcds.git)
-
-**Output:**
-- Clones to `../tpcds/` (parallel to this repo)
-- Creates `../tpcds/target/tpcds-*-jar-with-dependencies.jar`
-
-**Time:** ~2-3 minutes (first run)
-
-### 1. `generate-fixtures.sh` - Generate Reference Data
-
-Generates TPC-DS tables using the Java implementation. This creates the "golden reference" data that Rust output is compared against.
-
-**Usage:**
-```bash
-# Generate all 25 tables (recommended first run)
-./scripts/generate-fixtures.sh
-
-# Generate specific tables
-./scripts/generate-fixtures.sh call_center warehouse
-
-# Quiet mode (minimal output)
-./scripts/generate-fixtures.sh --quiet
-
-# Show help
-./scripts/generate-fixtures.sh --help
-```
-
-**What it does:**
-1. Checks if Java implementation is built (builds if needed)
-2. Creates `tests/fixtures/scale-1/` directory
-3. Generates each table using Java TPC-DS generator
-4. Reports progress and statistics
-
-**Output:**
-- Generates `.dat` files in `tests/fixtures/scale-1/`
-- Each file contains pipe-delimited rows with trailing pipe: `value1|value2|value3|`
-- Files are gitignored (regenerate as needed)
-
-**Time:** ~2-5 minutes for all 25 tables at scale 1
-
----
-
-### 2. `compare-table.sh` - Compare Single Table
-
-Compares Rust-generated output for a single table against the Java reference fixture.
-
-**Usage:**
-```bash
-# Compare a table
-./scripts/compare-table.sh call_center
-
-# Quiet mode
-./scripts/compare-table.sh customer_demographics --quiet
-
-# Show help
-./scripts/compare-table.sh --help
-```
-
-**What it does:**
-1. Checks that Java fixture exists
-2. Generates table using Rust implementation
-3. Performs byte-for-byte comparison with `diff`
-4. Reports results
-
-**Exit codes:**
-- `0` - Tables match exactly ✓
-- `1` - Tables differ or error occurred ✗
-
-**Output example:**
-```
-[INFO] =========================================
-[INFO] Table Comparison: call_center
-[INFO] =========================================
-[INFO] Java fixture: tests/fixtures/scale-1/call_center.dat
-[INFO] Generating call_center with Rust...
-[INFO] Using binary: target/release/tpcdsgen --table call_center
-[INFO] Comparing outputs...
-[INFO] Java fixture: 6 rows, 4.0K
-[INFO] Rust output: 6 rows, 4.0K
-[SUCCESS] ✓ call_center: MD5 match (6 rows, cc9aabc63eb8603bd7330b6735ed0961)
-[INFO] =========================================
-```
-
----
+## Quick Start — C dsdgen conformance (`--compat c`)
-### 3. `test-all-tables.sh` - Test All Ported Tables
+The C reference data is pre-generated and published in
+[alamb/tpcds-data](https://github.com/alamb/tpcds-data), one branch per
+scale factor (`sf1`, `sf2`, ...). `generate-fixtures.sh --compat c` clones
+the requested branch with `--depth 1` and extracts it into
+`tests/fixtures/scale-N-c/`.
-Runs comparison tests for all tables that have been ported to Rust. This is the main test suite.
-
-**Usage:**
```bash
-# Test all ported tables (verbose)
-./scripts/test-all-tables.sh
-
-# Quiet mode (show only summary)
-./scripts/test-all-tables.sh --quiet
-
-# Show help
-./scripts/test-all-tables.sh --help
-```
-
-**What it does:**
-1. Tests all 24 TPC-DS tables (dbgen_version excluded - has timestamps)
-2. Builds the unified Rust generator (`tpcdsgen`)
-3. Compares each table against Java fixture using `compare-table.sh`
-4. Prints comprehensive summary
+# 1. Download the C dsdgen reference data (default scale 1).
+./scripts/generate-fixtures.sh --compat c # sf1
+./scripts/generate-fixtures.sh --compat c --scale 2 # sf2
-**Exit codes:**
-- `0` - All tables match ✓
-- `1` - One or more tables differ ✗
+# 2. Test all ported tables against the C reference.
+./scripts/test-all-tables.sh --compat c
-**Output example:**
-```
-[INFO] =========================================
-[INFO] TPC-DS Table Test Suite
-[INFO] =========================================
-[INFO] Testing 24 tables:
-[INFO] - call_center
-[INFO] - catalog_page
-[INFO] - catalog_returns
-[INFO] ... (all 24 tables)
-[INFO] =========================================
-[INFO] Building Rust TPC-DS generator...
-[SUCCESS] Generator built successfully
-[INFO] =========================================
-
-[INFO] Testing: call_center
-...
-[SUCCESS] ✓ call_center: MD5 match (6 rows, cc9aabc63eb8603bd7330b6735ed0961)
-...
-
-[INFO] =========================================
-[INFO] Test Summary
-[INFO] =========================================
-[INFO] Total tables tested: 24
-[SUCCESS] Passed: 24
-
-[INFO] Total time: 45s
-[INFO] =========================================
+# Or compare a single table.
+./scripts/compare-table.sh reason --compat c
```
----
+### Tables excluded from automated checks
-### 4. `clean-fixtures.sh` - Clean Up Fixtures
+The following tables are excluded from automated MD5 comparison; the
+exclusion lists live in `test-all-tables.sh`.
-Removes all generated fixtures to free up disk space or force regeneration.
+- **Always:** `dbgen_version.dat` — contains a generation timestamp.
+- **`--compat c` only:** `customer.dat` — the reference data in
+ `alamb/tpcds-data` was generated through a pipeline that double-UTF-8
+ encodes the non-ASCII country names (`CÔTE D'IVOIRE`, `RÉUNION`). The
+ Rust `--compat c` output uses raw Latin-1, which is what unmodified C
+ `dsdgen` produces. Once the reference data is regenerated without the
+ `iconv ISO-8859-14 -> UTF-8` step in `alamb/tpcds-data`'s `Dockerfile`,
+ this exclusion can be removed.
-**Usage:**
-```bash
-# Clean with confirmation prompt
-./scripts/clean-fixtures.sh
+## Scripts
-# Clean without confirmation
-./scripts/clean-fixtures.sh --yes
+Each script is self-documenting — open it and read the header comment for
+full usage, flags, environment variables, output, and exit codes. The
+table below is just a roadmap.
-# Show help
-./scripts/clean-fixtures.sh --help
-```
+| Script | Purpose |
+|---------------------------|---------------------------------------------------------------------------------------------------------------------------------|
+| `bootstrap-java.sh` | Clone and build the Java / Trino reference implementation into `../tpcds/`. Run once before Java conformance. |
+| `generate-fixtures.sh` | Populate `tests/fixtures/scale-N-{java,c}/` with reference data. `--compat trino` (default) runs the Java impl; `--compat c` downloads pre-generated C `dsdgen` data from [alamb/tpcds-data](https://github.com/alamb/tpcds-data). |
+| `compare-table.sh` | Compare one table's Rust output against the selected reference (`--compat trino` or `--compat c`) via MD5 + diff. |
+| `test-all-tables.sh` | Run the full conformance suite for one compat mode (the main CI entry point). Honors per-mode skip lists at the top of the script. |
+| `clean-fixtures.sh` | Remove all generated fixtures under `tests/fixtures/`. |
-**What it does:**
-1. Counts fixture files and reports total size
-2. Asks for confirmation (unless `--yes` provided)
-3. Deletes entire `tests/fixtures/` directory
+Run any script with `--help` to print its usage block.
---
## Typical Workflow
-### Initial Setup
+### Java conformance
```bash
-# 1. Generate all reference fixtures (one-time, or when Java changes)
+# 1. Generate Java reference fixtures (one-time, or when Java changes).
./scripts/generate-fixtures.sh
-# This creates tests/fixtures/scale-1/*.dat files
+# 2. Run the comparison.
+./scripts/compare-table.sh TABLE_NAME   # one table
+./scripts/test-all-tables.sh # all tables
```
-### During Development
+### C dsdgen conformance
```bash
-# 2. After implementing a new table, compare it
-./scripts/compare-table.sh new_table_name
+# 1. Download the C reference data (one-time, or to refresh).
+./scripts/generate-fixtures.sh --compat c
-# 3. Or test all ported tables at once
-./scripts/test-all-tables.sh
+# 2. Run the comparison in C-compat mode.
+./scripts/compare-table.sh TABLE_NAME --compat c
+./scripts/test-all-tables.sh --compat c
```
### Cleanup
```bash
-# 4. Remove fixtures if needed (can regenerate anytime)
-./scripts/clean-fixtures.sh --yes
+./scripts/clean-fixtures.sh --yes # remove all fixtures
```
---
## Requirements
-- **Java:** Maven-built TPC-DS JAR at `../tpcds/target/tpcds-*-jar-with-dependencies.jar`
-- **Rust:** Cargo-built `tpcdsgen` binary at `target/debug/tpcdsgen` or `target/release/tpcdsgen`
-- **Disk space:** ~500MB-1GB for scale 1 fixtures
+- **Java:** Maven-built TPC-DS JAR at `../tpcds/target/tpcds-*-jar-with-dependencies.jar` (`bootstrap-java.sh` handles this).
+- **C dsdgen reference:** `git`, `tar`, `bzip2` for `generate-fixtures.sh --compat c`. No C compiler required — data is pre-generated.
+- **Rust:** Cargo-built `tpcdsgen` binary at `target/debug/tpcdsgen` or `target/release/tpcdsgen`.
+- **Disk space:** ~1 GB for SF1 Java fixtures; ~2.4 GB for SF1 C fixtures.
---
@@ -296,16 +154,21 @@ mvn clean package
cargo build --release
```
-**Problem:** `Fixture not found`
+**Problem:** `Fixture not found` (Java path)
```bash
./scripts/generate-fixtures.sh X
```
+**Problem:** `Fixture not found` (C path)
+```bash
+./scripts/generate-fixtures.sh --compat c --scale N
+```
+
**Problem:** Tables don't match
-1. Check if both implementations use same seed (should be deterministic)
-2. Verify Rust port logic against Java source
-3. Use `diff` output to find first difference
-4. Debug specific row/column that differs
+1. Check that the right compat mode is selected (`--compat trino` vs `--compat c`).
+2. Verify both sides use the same seed (the Rust generator is deterministic).
+3. Use the `diff` output to find the first difference.
+4. Debug the specific row/column that differs.
---
@@ -314,12 +177,14 @@ cargo build --release
These scripts are designed to be CI-friendly:
```yaml
-# Example GitHub Actions workflow
-- name: Generate fixtures
- run: ./scripts/generate-fixtures.sh --quiet
-
-- name: Test all tables
- run: ./scripts/test-all-tables.sh --quiet
+# Java conformance
+- run: ./scripts/bootstrap-java.sh
+- run: ./scripts/generate-fixtures.sh --quiet
+- run: ./scripts/test-all-tables.sh --quiet
+
+# C dsdgen conformance
+- run: ./scripts/generate-fixtures.sh --compat c
+- run: ./scripts/test-all-tables.sh --compat c --quiet
```
Exit codes make it easy to fail CI on mismatches.
diff --git a/tpcdsgen/scripts/bootstrap-java.sh b/tpcdsgen/scripts/bootstrap-java.sh
index 301926a..4f29183 100755
--- a/tpcdsgen/scripts/bootstrap-java.sh
+++ b/tpcdsgen/scripts/bootstrap-java.sh
@@ -1,19 +1,50 @@
#!/usr/bin/env bash
#
-# Bootstrap the Java TPC-DS implementation for conformance testing
+# bootstrap-java.sh — Set up the Java / Trino TPC-DS reference
+# implementation used by `--compat trino` conformance testing.
#
-# This script:
-# 1. Clones the Java TPC-DS repository (if needed)
-# 2. Builds the Java implementation
-# 3. Verifies the build succeeded
-#
-# Usage:
-# ./scripts/bootstrap-java.sh # Clone and build
-# ./scripts/bootstrap-java.sh --rebuild # Force rebuild even if exists
-# ./scripts/bootstrap-java.sh --verify # Just verify, don't clone/build
+# Please see print_usage() below for details.
set -euo pipefail
+print_usage() {
+ cat << 'EOF'
+bootstrap-java.sh — Set up the Java / Trino TPC-DS reference implementation.
+
+What it does:
+ 1. Checks that Java 11+ and Maven are installed.
+ 2. Clones the Java TPC-DS repository into ../tpcds/ (if not present).
+ 3. Builds the Java implementation with `mvn clean package -DskipTests`.
+ 4. Runs a small smoke test to confirm the JAR works.
+
+Usage:
+ bootstrap-java.sh [OPTIONS]
+
+Options:
+ --rebuild Force rebuild even if the JAR already exists.
+ --verify Only verify the existing installation; do not clone/build.
+ --help Show this help message.
+
+Environment variables:
+ TPCDS_JAVA_REPO Git URL for Java TPC-DS repo.
+ Default: https://github.com/trinodb/tpcds.git
+
+Requirements: Java 11+, Maven, git.
+
+Output:
+ Clones to ../tpcds/ (parallel to this repo) and produces
+ ../tpcds/target/tpcds-*-jar-with-dependencies.jar.
+
+Examples:
+ bootstrap-java.sh # Clone and build if needed.
+ bootstrap-java.sh --rebuild # Force clean rebuild.
+ bootstrap-java.sh --verify # Just check existing install.
+
+See scripts/README.md for the full conformance-testing workflow.
+EOF
+ exit 0
+}
+
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -49,32 +80,6 @@ log_error() {
echo -e "${RED}[ERROR]${NC} $*" >&2
}
-# Print usage
-usage() {
- cat << EOF
-Bootstrap the Java TPC-DS implementation for conformance testing
-
-Usage:
- $(basename "$0") [OPTIONS]
-
-Options:
- --rebuild Force rebuild even if JAR exists
- --verify Only verify installation, don't clone/build
- --help Show this help message
-
-Environment Variables:
- TPCDS_JAVA_REPO Git URL for Java TPC-DS repo
- Default: https://github.com/trinodb/tpcds.git
-
-Examples:
- $(basename "$0") # Clone and build if needed
- $(basename "$0") --rebuild # Force clean rebuild
- $(basename "$0") --verify # Just check if everything works
-
-EOF
- exit 0
-}
-
# Check if Java/Maven are installed
check_prerequisites() {
log_info "Checking prerequisites..."
@@ -275,7 +280,7 @@ main() {
shift
;;
--help)
- usage
+ print_usage
;;
*)
log_error "Unknown option: $1"
diff --git a/tpcdsgen/scripts/clean-fixtures.sh b/tpcdsgen/scripts/clean-fixtures.sh
index fa4df72..51e7f3c 100755
--- a/tpcdsgen/scripts/clean-fixtures.sh
+++ b/tpcdsgen/scripts/clean-fixtures.sh
@@ -1,12 +1,41 @@
#!/usr/bin/env bash
#
-# Clean up generated test fixtures
+# clean-fixtures.sh — Remove all generated reference fixtures.
#
-# Usage:
-# ./scripts/clean-fixtures.sh [--yes]
+# Please see print_usage() below for details.
set -euo pipefail
+print_usage() {
+ cat << 'EOF'
+clean-fixtures.sh — Remove all generated reference fixtures.
+
+Deletes the entire tests/fixtures/ tree (Java fixtures in scale-N-java/
+and C dsdgen fixtures in scale-N-c/). Fixtures are git-ignored generated
+artifacts and can be re-created with
+./scripts/generate-fixtures.sh (with or without --compat c).
+
+What it does:
+ 1. Counts existing .dat fixture files and reports total size.
+ 2. Asks for confirmation (unless --yes is passed).
+ 3. Removes tests/fixtures/ entirely.
+
+Usage:
+ clean-fixtures.sh [OPTIONS]
+
+Options:
+ --yes Skip confirmation prompt.
+ --help Show this help message.
+
+Examples:
+ clean-fixtures.sh # Clean with confirmation.
+ clean-fixtures.sh --yes # Clean without confirmation.
+
+See scripts/README.md for the full conformance-testing workflow.
+EOF
+ exit 0
+}
+
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -34,25 +63,6 @@ log_warn() {
echo -e "${YELLOW}[WARN]${NC} $*"
}
-# Print usage
-usage() {
- cat << EOF
-Clean up generated test fixtures
-
-Usage:
- $(basename "$0") [--yes]
-
-Options:
- --yes Skip confirmation prompt
-
-Examples:
- $(basename "$0") # Clean with confirmation
- $(basename "$0") --yes # Clean without confirmation
-
-EOF
- exit 0
-}
-
# Main function
main() {
# Parse arguments
@@ -63,11 +73,11 @@ main() {
shift
;;
--help)
- usage
+ print_usage
;;
*)
log_warn "Unknown option: $1"
- usage
+ print_usage
;;
esac
done
diff --git a/tpcdsgen/scripts/compare-c.py b/tpcdsgen/scripts/compare-c.py
deleted file mode 100644
index 54a223a..0000000
--- a/tpcdsgen/scripts/compare-c.py
+++ /dev/null
@@ -1,227 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compare Rust .dat output against C dsdgen Parquet reference.
-
-Typical workflow:
- # 1. Obtain C dsdgen Parquet reference data, e.g. from datafusion-benchmarks:
- # git clone https://github.com/apache/datafusion-benchmarks
- # # the sf1 Parquet files are at datafusion-benchmarks/tpcds/data/sf1/
-
- # 2. Generate Rust output in C-compat mode:
- # cargo run -p tpcdsgen -- --compat c --directory /tmp/tpcds-c
-
- # 3. Compare:
- # uv run scripts/compare-c.py --dat-dir /tmp/tpcds-c --parquet-dir path/to/sf1 [TABLE...]
-
-Usage:
- uv run scripts/compare-c.py --dat-dir DIR --parquet-dir DIR [TABLE...] [--verbose]
-"""
-
-import sys
-import argparse
-from decimal import Decimal
-from pathlib import Path
-
-import pyarrow as pa
-import pyarrow.parquet as pq
-
-SCRIPT_DIR = Path(__file__).parent.resolve()
-sys.path.insert(0, str(SCRIPT_DIR))
-from tpcds_schemas import all_schemas
-
-MAX_DIFF_ROWS = 10 # Max differing rows to show per table
-
-
-# ---------------------------------------------------------------------------
-# Value normalization
-# ---------------------------------------------------------------------------
-
-def normalize_dat_value(raw: str, field: pa.Field):
- """Parse a raw .dat string into a typed, comparable Python value."""
- if raw == "":
- return None
- t = field.type
- if pa.types.is_integer(t):
- return int(raw)
- if pa.types.is_decimal(t):
- return Decimal(raw).normalize()
- if pa.types.is_date(t):
- return raw # Keep as YYYY-MM-DD string
- return raw # string: keep as-is (C dsdgen doesn't pad)
-
-
-def fix_mojibake(s: str) -> str:
- """Fix strings where UTF-8 bytes were stored as individual Latin-1 characters.
-
- The C dsdgen Parquet reference was generated by tpcdsgen.py without
- specifying an encoding, so DataFusion read the raw UTF-8 bytes and stored
- each byte as a Latin-1 character. This re-encodes as Latin-1 then decodes
- as UTF-8, recovering the original Unicode string.
- """
- try:
- return s.encode("latin-1").decode("utf-8")
- except (UnicodeDecodeError, UnicodeEncodeError):
- return s
-
-
-def normalize_parquet_value(scalar, field: pa.Field):
- """Normalize a pyarrow scalar into the same comparable Python value."""
- py = scalar.as_py()
- if py is None:
- return None
- t = field.type
- if pa.types.is_integer(t):
- return int(py)
- if pa.types.is_decimal(t):
- return Decimal(str(py)).normalize()
- if pa.types.is_date(t):
- return str(py) # date → "YYYY-MM-DD"
- if pa.types.is_string(t) or pa.types.is_large_string(t):
- return fix_mojibake(py)
- return py
-
-
-# ---------------------------------------------------------------------------
-# Loaders
-# ---------------------------------------------------------------------------
-
-def load_dat(table: str, fields: list[pa.Field], dat_dir: Path) -> list[tuple]:
- path = dat_dir / f"{table}.dat"
- if not path.exists():
- raise FileNotFoundError(
- f".dat file not found: {path}\n"
- f"Generate with: cargo run -p tpcdsgen -- --compat c --directory {dat_dir}"
- )
-
- rows = []
- with open(path, "r", encoding="latin-1") as f:
- for lineno, line in enumerate(f, 1):
- line = line.rstrip("\n")
- parts = line.split("|")
- # Drop trailing empty field produced by the trailing |
- if parts and parts[-1] == "":
- parts = parts[:-1]
- if len(parts) != len(fields):
- raise ValueError(
- f"{table}.dat line {lineno}: expected {len(fields)} fields, "
- f"got {len(parts)}: {line[:120]!r}"
- )
- rows.append(tuple(normalize_dat_value(p, fields[i]) for i, p in enumerate(parts)))
- return rows
-
-
-def load_parquet(table: str, fields: list[pa.Field], parquet_dir: Path) -> list[tuple]:
- path = parquet_dir / f"{table}.parquet"
- if not path.exists():
- raise FileNotFoundError(f"Parquet reference not found: {path}")
-
- col_names = [f.name for f in fields]
- tbl = pq.read_table(path, columns=col_names)
-
- rows = []
- for batch in tbl.to_batches():
- for row_idx in range(batch.num_rows):
- rows.append(tuple(
- normalize_parquet_value(batch.column(i)[row_idx], fields[i])
- for i in range(len(fields))
- ))
- return rows
-
-
-# ---------------------------------------------------------------------------
-# Comparison
-# ---------------------------------------------------------------------------
-
-def compare_table(table: str, verbose: bool, dat_dir: Path, parquet_dir: Path) -> bool:
- fields = all_schemas[table]
- col_names = [f.name for f in fields]
-
- try:
- dat_rows = load_dat(table, fields, dat_dir)
- pq_rows = load_parquet(table, fields, parquet_dir)
- except Exception as e:
- print(f" ERROR: {e}")
- return False
-
- if len(dat_rows) != len(pq_rows):
- print(f" ROW COUNT MISMATCH rust={len(dat_rows)} c={len(pq_rows)}")
- return False
-
- # Sort both by all columns for a canonical order.
- # This is O(n log n) but handles any ordering differences gracefully.
- dat_rows.sort(key=lambda r: tuple((v is None, v) if not isinstance(v, Decimal) else (False, float(v)) for v in r))
- pq_rows.sort(key=lambda r: tuple((v is None, v) if not isinstance(v, Decimal) else (False, float(v)) for v in r))
-
- diff_rows = 0
- first_diff_at = None
- for i, (dr, pr) in enumerate(zip(dat_rows, pq_rows)):
- if dr != pr:
- if diff_rows == 0:
- first_diff_at = i + 1
- diff_rows += 1
- if verbose and diff_rows <= MAX_DIFF_ROWS:
- print(f" row {i+1}:")
- for j, (dv, pv) in enumerate(zip(dr, pr)):
- if dv != pv:
- print(f" {col_names[j]}: rust={dv!r} c={pv!r}")
-
- if diff_rows == 0:
- print(f" \033[32m✓ MATCH\033[0m {len(dat_rows)} rows")
- return True
- else:
- pct = 100.0 * diff_rows / len(dat_rows)
- print(f" \033[31m✗ {diff_rows}/{len(dat_rows)} rows differ ({pct:.2f}%) first diff at row {first_diff_at}\033[0m")
- if not verbose:
- print(f" (re-run with --verbose to see column diffs)")
- return False
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main():
- parser = argparse.ArgumentParser(description="Compare Rust .dat output vs C dsdgen Parquet reference")
- parser.add_argument("tables", nargs="*", help="Tables to compare (default: all)")
- parser.add_argument("--dat-dir", required=True,
- help="Directory containing .dat files generated by: tpcdsgen --compat c --directory DIR")
- parser.add_argument("--parquet-dir", required=True,
- help="Directory containing C dsdgen Parquet files (e.g. datafusion-benchmarks/tpcds/data/sf1)")
- parser.add_argument("--verbose", "-v", action="store_true", help="Show differing column values")
- args = parser.parse_args()
-
- dat_dir = Path(args.dat_dir)
- parquet_dir = Path(args.parquet_dir)
-
- available = sorted(all_schemas.keys())
- tables = args.tables if args.tables else available
-
- unknown = [t for t in tables if t not in all_schemas]
- if unknown:
- print(f"Unknown tables: {', '.join(unknown)}")
- print(f"Available: {', '.join(available)}")
- sys.exit(1)
-
- print(f"Comparing {len(tables)} table(s): Rust .dat vs C dsdgen Parquet\n")
- print(f" dat-dir : {dat_dir}")
- print(f" parquet-dir: {parquet_dir}\n")
-
- results: dict[str, bool] = {}
- for table in tables:
- print(f"{table}:")
- results[table] = compare_table(table, args.verbose, dat_dir, parquet_dir)
-
- passed = [t for t, ok in results.items() if ok]
- failed = [t for t, ok in results.items() if not ok]
-
- print(f"\n{'='*50}")
- print(f"Passed : {len(passed)}/{len(results)}")
- if failed:
- print(f"Failed : {', '.join(failed)}")
- print("="*50)
-
- sys.exit(0 if not failed else 1)
-
-
-if __name__ == "__main__":
- main()
diff --git a/tpcdsgen/scripts/compare-table.sh b/tpcdsgen/scripts/compare-table.sh
index a0a2630..4e02c34 100755
--- a/tpcdsgen/scripts/compare-table.sh
+++ b/tpcdsgen/scripts/compare-table.sh
@@ -1,16 +1,61 @@
#!/usr/bin/env bash
#
-# Compare Rust-generated table output with Java reference fixture
+# compare-table.sh — Compare a single table's Rust output to a reference
+# fixture byte-for-byte (MD5 + diff).
#
-# Usage:
-# ./scripts/compare-table.sh TABLE_NAME [--quiet]
-#
-# Exit codes:
-# 0 - Tables match exactly
-# 1 - Tables differ or error occurred
+# Please see print_usage() below for details.
set -euo pipefail
+# Print the help text for compare-table.sh and terminate the script.
+#
+# Arguments:
+#   $1 - optional exit status (default: 0). With no argument the function
+#        exits 0, which is what --help needs; a caller that prints the usage
+#        after reporting an error can pass 1 so the "Exit codes" section of
+#        the help text holds.
+print_usage() {
+    cat << 'EOF'
+compare-table.sh — Compare a single table's Rust output to a reference
+fixture byte-for-byte (MD5 + diff).
+
+Two reference implementations are supported, selected by --compat:
+ --compat trino (default) Java / Trino fixtures in
+ tests/fixtures/scale-N-java/
+ (generate with
+ ./scripts/generate-fixtures.sh)
+ --compat c C dsdgen fixtures in
+ tests/fixtures/scale-N-c/
+ (download with
+ ./scripts/generate-fixtures.sh --compat c)
+
+Usage:
+ compare-table.sh TABLE_NAME [OPTIONS]
+
+Arguments:
+ TABLE_NAME Name of the table to compare (e.g. call_center).
+
+Options:
+ --scale N Scale factor (default: 1).
+ --compat trino|c Reference implementation (default: trino).
+ --quiet Quiet mode (minimal output).
+ --help Show this help message.
+
+Examples:
+ compare-table.sh call_center # vs. Java, scale 1
+ compare-table.sh reason --compat c # vs. C dsdgen, scale 1
+ compare-table.sh inventory --scale 10 # vs. Java, scale 10
+ compare-table.sh customer_demographics --quiet
+
+Output example:
+ [INFO] Table Comparison: call_center
+ [INFO] Java fixture: tests/fixtures/scale-1-java/call_center.dat
+ [INFO] Java fixture: 6 rows, 4.0K
+ [INFO] Rust output: 6 rows, 4.0K
+ [SUCCESS] ✓ call_center: MD5 match (6 rows, cc9aab...)
+
+Exit codes:
+ 0 - Tables match exactly.
+ 1 - Tables differ or an error occurred.
+
+See scripts/README.md for the full conformance-testing workflow.
+EOF
+    # `${1:-0}` keeps the no-argument call safe under `set -u`.
+    exit "${1:-0}"
+}
+
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -24,6 +69,7 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# Configuration (can be overridden by --scale)
SCALE_FACTOR=${TPCDS_SCALE:-1}
+COMPAT=${TPCDS_COMPAT:-trino}
QUIET=0
# Logging functions
@@ -45,8 +91,8 @@ log_diff() {
echo -e "${YELLOW}[DIFF]${NC} $*"
}
-# Returns tables are generated by their parent sales generators
-# Map returns table -> parent table (the one to generate)
+# Returns tables are generated by their parent sales generators.
+# Map returns table -> parent table (the one to generate).
get_generator_for_table() {
local table=$1
case $table in
@@ -57,34 +103,6 @@ get_generator_for_table() {
esac
}
-# Print usage
-usage() {
- cat << EOF
-Compare Rust-generated table output with Java reference fixture
-
-Usage:
- $(basename "$0") TABLE_NAME [OPTIONS]
-
-Arguments:
- TABLE_NAME Name of the table to compare (e.g., call_center)
-
-Options:
- --scale N Scale factor (default: 1)
- --quiet Quiet mode (minimal output)
-
-Examples:
- $(basename "$0") call_center
- $(basename "$0") customer_demographics --quiet
- $(basename "$0") inventory --scale 10
-
-Exit codes:
- 0 - Tables match exactly
- 1 - Tables differ or error occurred
-
-EOF
- exit 0
-}
-
# Find the unified tpcdsgen binary
find_rust_binary() {
local target_dir
@@ -139,7 +157,7 @@ generate_rust_table() {
fi
log_info "Generating $table with Rust..."
- log_info "Using binary: $binary --table $generator --scale $SCALE_FACTOR"
+ log_info "Using binary: $binary --compat $COMPAT --table $generator --scale $SCALE_FACTOR"
if [[ "$generator" != "$table" ]]; then
log_info "Note: $table is generated alongside $generator"
fi
@@ -148,8 +166,8 @@ generate_rust_table() {
local temp_dir
temp_dir=$(mktemp -d)
- # Run Rust generator with --table, --scale, and --directory flags
- if ! "$binary" --table "$generator" --scale "$SCALE_FACTOR" --directory "$temp_dir" >/dev/null 2>&1; then
+ # Run Rust generator with --compat, --table, --scale, and --directory flags
+ if ! "$binary" --compat "$COMPAT" --table "$generator" --scale "$SCALE_FACTOR" --directory "$temp_dir" >/dev/null 2>&1; then
log_error "Failed to generate $table with Rust"
rm -rf "$temp_dir"
return 1
@@ -182,54 +200,51 @@ compute_md5() {
# Compare two files
+#
+# Arguments:
+#   $1 - reference fixture file
+#   $2 - Rust-generated file
+#   $3 - table name (used only in log messages)
+#   $4 - label for the reference implementation, shown in log messages
+#
+# Returns 0 when the row counts and the MD5 hashes both match, 1 otherwise.
+# On an MD5 mismatch the first 30 lines of a unified diff are printed.
compare_files() {
- local java_file=$1
+ local ref_file=$1
local rust_file=$2
local table=$3
+ local ref_label=$4
log_info "Comparing outputs..."
# Get file sizes
- local java_size
- local rust_size
- local java_rows
- local rust_rows
+ local ref_size rust_size ref_rows rust_rows
- java_size=$(du -h "$java_file" | cut -f1)
+ ref_size=$(du -h "$ref_file" | cut -f1)
rust_size=$(du -h "$rust_file" | cut -f1)
- java_rows=$(wc -l < "$java_file" | tr -d ' ')
+ ref_rows=$(wc -l < "$ref_file" | tr -d ' ')
rust_rows=$(wc -l < "$rust_file" | tr -d ' ')
- log_info "Java fixture: $java_rows rows, $java_size"
- log_info "Rust output: $rust_rows rows, $rust_size"
+ log_info "$ref_label fixture: $ref_rows rows, $ref_size"
+ log_info "Rust output: $rust_rows rows, $rust_size"
# Quick check: row count must match
- if [[ "$java_rows" != "$rust_rows" ]]; then
+ if [[ "$ref_rows" != "$rust_rows" ]]; then
log_error "Row count mismatch!"
- log_error " Java: $java_rows rows"
+ log_error " $ref_label: $ref_rows rows"
log_error " Rust: $rust_rows rows"
return 1
fi
# Compute MD5 hashes
log_info "Computing MD5 hashes..."
- local java_md5
- local rust_md5
- java_md5=$(compute_md5 "$java_file")
+ local ref_md5 rust_md5
+ ref_md5=$(compute_md5 "$ref_file")
rust_md5=$(compute_md5 "$rust_file")
- log_info "Java MD5: $java_md5"
+ log_info "$ref_label MD5: $ref_md5"
log_info "Rust MD5: $rust_md5"
# Compare MD5 hashes
- if [[ "$java_md5" == "$rust_md5" ]]; then
- log_success "✓ $table: MD5 match ($java_rows rows, $java_md5)"
+ if [[ "$ref_md5" == "$rust_md5" ]]; then
+ log_success "✓ $table: MD5 match ($ref_rows rows, $ref_md5)"
return 0
else
log_error "✗ $table: MD5 mismatch!"
- log_error " Java: $java_md5"
- log_error " Rust: $rust_md5"
+ log_error " $ref_label: $ref_md5"
+ log_error " Rust: $rust_md5"
log_diff "Showing first differences:"
+ # diff exits non-zero when the files differ; `|| true` keeps that from
+ # aborting the script, which runs under `set -euo pipefail`.
- diff -u "$java_file" "$rust_file" | head -30 || true
+ diff -u "$ref_file" "$rust_file" | head -30 || true
return 1
fi
}
@@ -245,32 +260,50 @@ main() {
SCALE_FACTOR="$2"
shift 2
;;
+ --compat)
+ COMPAT="$2"
+ shift 2
+ ;;
--quiet)
QUIET=1
shift
;;
--help)
- usage
+ print_usage
;;
*)
if [[ -z "$table" ]]; then
table=$1
else
log_error "Too many arguments"
- usage
+ print_usage
fi
shift
;;
esac
done
- # Set fixture directory based on scale factor
- FIXTURE_DIR="$PROJECT_ROOT/tests/fixtures/scale-$SCALE_FACTOR"
+ # Resolve compat mode -> fixture directory and reference label
+ local ref_label
+ case $COMPAT in
+ trino)
+ FIXTURE_DIR="$PROJECT_ROOT/tests/fixtures/scale-${SCALE_FACTOR}-java"
+ ref_label="Java"
+ ;;
+ c)
+ FIXTURE_DIR="$PROJECT_ROOT/tests/fixtures/scale-${SCALE_FACTOR}-c"
+ ref_label="C dsdgen"
+ ;;
+ *)
+ log_error "Unknown --compat value: $COMPAT (expected: trino, c)"
+ exit 1
+ ;;
+ esac
# Validate table argument
if [[ -z "$table" ]]; then
log_error "Table name required"
- usage
+ print_usage
fi
log_info "========================================="
@@ -281,11 +314,15 @@ main() {
local fixture_file="$FIXTURE_DIR/${table}.dat"
if [[ ! -f "$fixture_file" ]]; then
log_error "Fixture not found: $fixture_file"
- log_error "Generate fixtures first: ./scripts/generate-fixtures.sh $table"
+ if [[ "$COMPAT" == "c" ]]; then
+ log_error "Download C reference data first: ./scripts/generate-fixtures.sh --compat c --scale $SCALE_FACTOR"
+ else
+ log_error "Generate fixtures first: ./scripts/generate-fixtures.sh $table"
+ fi
exit 1
fi
- log_info "Java fixture: $fixture_file"
+ log_info "$ref_label fixture: $fixture_file"
# Generate Rust output
local rust_output
@@ -300,7 +337,7 @@ main() {
# Compare files
local result=0
- if ! compare_files "$fixture_file" "$rust_output" "$table"; then
+ if ! compare_files "$fixture_file" "$rust_output" "$table" "$ref_label"; then
result=1
fi
diff --git a/tpcdsgen/scripts/generate-fixtures.sh b/tpcdsgen/scripts/generate-fixtures.sh
index 749126b..8810ce1 100755
--- a/tpcdsgen/scripts/generate-fixtures.sh
+++ b/tpcdsgen/scripts/generate-fixtures.sh
@@ -1,32 +1,106 @@
#!/usr/bin/env bash
#
-# Generate TPC-DS reference fixtures using the Java implementation
+# generate-fixtures.sh — Generate reference TPC-DS fixtures used by the
+# conformance suite (compare-table.sh / test-all-tables.sh).
#
-# Usage:
-# ./scripts/generate-fixtures.sh # Generate all tables
-# ./scripts/generate-fixtures.sh --quiet # Generate all tables (quiet mode)
-# ./scripts/generate-fixtures.sh table1 ... # Generate specific tables
-# ./scripts/generate-fixtures.sh --help # Show help
+# Please see print_usage() below for details.
set -euo pipefail
-# Colors for output
+# Print the help text for generate-fixtures.sh and terminate the script.
+#
+# Arguments:
+#   $1 - optional exit status (default: 0). With no argument the function
+#        exits 0, which is what --help needs; a caller that prints the usage
+#        after reporting an error can pass a non-zero status instead.
+print_usage() {
+    cat << 'EOF'
+generate-fixtures.sh — Generate reference TPC-DS fixtures.
+
+Two reference implementations are supported, selected by --compat:
+
+ --compat trino (default)
+ Runs the Java / Trino TPC-DS implementation (set up by
+ ./scripts/bootstrap-java.sh) and writes the resulting *.dat files
+ into tests/fixtures/scale-N-java/. These are the "golden reference"
+ the Rust port targets byte-for-byte.
+
+ --compat c
+ Downloads pre-generated C `dsdgen` reference data from
+ https://github.com/alamb/tpcds-data (branch sfN; one branch per
+ scale factor). The branch is cloned with --depth 1, re-assembled
+ from split bzip2 tarballs, and extracted into
+ tests/fixtures/scale-N-c/. No local C toolchain needed.
+
+Usage:
+ generate-fixtures.sh [OPTIONS] [TABLES...]
+
+Options:
+ --compat trino|c Reference implementation (default: trino).
+ --scale N Scale factor (default: 1).
+ --quiet Quiet mode (minimal output).
+ --rebuild --compat c only: re-download and re-extract even
+ if fixtures already exist.
+ --verify --compat c only: only check that fixtures look
+ sane; do not download.
+ --help Show this help message.
+
+Arguments:
+ TABLES --compat trino only: space-separated list of table
+ names to generate. If omitted, generates all 25.
+ Not meaningful for --compat c (the published
+ archive includes all 25 tables together).
+
+Environment variables:
+ TPCDS_C_DATA_REPO Override the C reference data repo URL.
+ Default: https://github.com/alamb/tpcds-data.git
+ TPCDS_SCALE Default scale factor (overridden by --scale).
+ TPCDS_COMPAT Default compat mode (overridden by --compat).
+
+Requirements:
+ --compat trino: Java 11+, Maven (a built tpcds-*.jar; see bootstrap-java.sh)
+ --compat c : git, tar, bzip2
+
+Output:
+ tests/fixtures/scale-N-java/<table>.dat — pipe-delimited, trailing |.
+ tests/fixtures/scale-N-c/<table>.dat — same format, C dsdgen origin.
+ Files are gitignored; regenerate as needed.
+
+Examples:
+ # Java reference, all 25 tables at scale 1 (default).
+ ./scripts/generate-fixtures.sh
+
+ # Java reference, scale 10, two specific tables.
+ ./scripts/generate-fixtures.sh --scale 10 call_center warehouse
+
+ # C dsdgen reference, scale 1.
+ ./scripts/generate-fixtures.sh --compat c
+
+ # C dsdgen reference, scale 2, force re-download.
+ ./scripts/generate-fixtures.sh --compat c --scale 2 --rebuild
+
+See scripts/README.md for the full conformance-testing workflow.
+EOF
+    # `${1:-0}` keeps the no-argument call safe under `set -u`.
+    exit "${1:-0}"
+}
+
+# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
-NC='\033[0m' # No Color
+NC='\033[0m'
# Script directory and project root
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
JAVA_DIR="$PROJECT_ROOT/../tpcds"
-# Configuration (can be overridden by --scale)
+# Configuration (overridable by flags / env vars)
SCALE_FACTOR=${TPCDS_SCALE:-1}
+COMPAT=${TPCDS_COMPAT:-trino}
QUIET=0
+FORCE_REBUILD=0
+VERIFY_ONLY=0
+
+# alamb/tpcds-data repository (for --compat c).
+TPCDS_DATA_REPO="${TPCDS_C_DATA_REPO:-https://github.com/alamb/tpcds-data.git}"
-# All TPC-DS tables (25 tables)
+# All TPC-DS tables (25 tables).
ALL_TABLES=(
"call_center"
"catalog_page"
@@ -55,7 +129,7 @@ ALL_TABLES=(
"web_site"
)
-# Logging functions
+# Logging
log_info() {
if [[ $QUIET -eq 0 ]]; then
echo -e "${BLUE}[INFO]${NC} $*"
@@ -76,104 +150,62 @@ log_error() {
echo -e "${RED}[ERROR]${NC} $*" >&2
}
-# Print usage
-usage() {
- cat << EOF
-Generate TPC-DS reference fixtures using the Java implementation
-
-Usage:
- $(basename "$0") [OPTIONS] [TABLES...]
-
-Options:
- --scale N Scale factor (default: 1)
- --quiet Quiet mode (minimal output)
- --help Show this help message
-
-Arguments:
- TABLES Space-separated list of table names to generate
- If omitted, generates all 25 tables
-
-Examples:
- $(basename "$0") # Generate all tables at scale 1
- $(basename "$0") --scale 10 # Generate all tables at scale 10
- $(basename "$0") --quiet # Generate all tables (quiet)
- $(basename "$0") call_center warehouse # Generate specific tables
+# -----------------------------------------------------------------------------
+# --compat trino (Java) helpers
+# -----------------------------------------------------------------------------
-EOF
- exit 0
-}
-
-# Find Java JAR file
find_java_jar() {
- local jar_pattern="$JAVA_DIR/target/tpcds-*-jar-with-dependencies.jar"
local jar_file
-
jar_file=$(find "$JAVA_DIR/target" -name "tpcds-*-jar-with-dependencies.jar" 2>/dev/null | head -1)
-
if [[ -z "$jar_file" ]]; then
return 1
fi
-
echo "$jar_file"
- return 0
}
-# Build Java implementation if needed
ensure_java_build() {
log_info "Checking Java implementation..."
-
if ! find_java_jar >/dev/null 2>&1; then
log_warn "Java JAR not found. Building Java implementation..."
-
cd "$JAVA_DIR"
if ! mvn -q clean package -DskipTests; then
log_error "Failed to build Java implementation"
exit 1
fi
cd - >/dev/null
-
log_success "Java implementation built successfully"
else
log_info "Java JAR found: $(find_java_jar)"
fi
}
-# Generate a single table
-generate_table() {
+generate_java_table() {
local table=$1
+ local fixture_dir=$2
local jar_file
jar_file=$(find_java_jar)
log_info "Generating $table..."
- # Create a temporary directory for generation
local temp_dir
temp_dir=$(mktemp -d)
- # Generate table in temp directory
- #
- # Run Java generator, filter out DEBUG lines but capture errors
- local output
- if output=$(java -jar "$jar_file" \
+ if java -jar "$jar_file" \
--table "$table" \
--scale "$SCALE_FACTOR" \
--overwrite \
--directory "$temp_dir" \
- 2>&1); then
+ >/dev/null 2>&1; then
- # Move generated file to fixture directory
local output_file="$temp_dir/${table}.dat"
-
if [[ -f "$output_file" ]]; then
- mv "$output_file" "$FIXTURE_DIR/"
-
- # Get file info
- local file_size
- local row_count
- file_size=$(du -h "$FIXTURE_DIR/${table}.dat" | cut -f1)
- row_count=$(wc -l < "$FIXTURE_DIR/${table}.dat" | tr -d ' ')
-
+ mv "$output_file" "$fixture_dir/"
+ local file_size row_count
+ file_size=$(du -h "$fixture_dir/${table}.dat" | cut -f1)
+ row_count=$(wc -l < "$fixture_dir/${table}.dat" | tr -d ' ')
log_success "$table generated: $row_count rows, $file_size"
+ rm -rf "$temp_dir"
+ return 0
else
log_error "Expected output file not found: $output_file"
rm -rf "$temp_dir"
@@ -184,69 +216,31 @@ generate_table() {
rm -rf "$temp_dir"
return 1
fi
-
- # Clean up temp directory
- rm -rf "$temp_dir"
- return 0
}
-# Main function
-main() {
- local tables_to_generate=()
- local start_time
- local end_time
- local success_count=0
- local fail_count=0
-
- # Parse arguments
- while [[ $# -gt 0 ]]; do
- case $1 in
- --scale)
- SCALE_FACTOR="$2"
- shift 2
- ;;
- --quiet)
- QUIET=1
- shift
- ;;
- --help)
- usage
- ;;
- *)
- tables_to_generate+=("$1")
- shift
- ;;
- esac
- done
-
- # Set fixture directory based on scale factor
- FIXTURE_DIR="$PROJECT_ROOT/tests/fixtures/scale-$SCALE_FACTOR"
-
- # If no tables specified, generate all
- if [[ ${#tables_to_generate[@]} -eq 0 ]]; then
- tables_to_generate=("${ALL_TABLES[@]}")
- fi
+generate_trino_fixtures() {
+ local fixture_dir=$1
+ shift
+ local tables_to_generate=("$@")
log_info "========================================="
- log_info "TPC-DS Fixture Generator"
+ log_info "Java TPC-DS Fixture Generator"
log_info "========================================="
- log_info "Scale Factor: $SCALE_FACTOR"
+ log_info "Scale Factor: $SCALE_FACTOR"
log_info "Tables to generate: ${#tables_to_generate[@]}"
- log_info "Fixture directory: $FIXTURE_DIR"
+ log_info "Fixture directory: $fixture_dir"
log_info "========================================="
- # Ensure Java build exists
ensure_java_build
- # Create fixture directory
- mkdir -p "$FIXTURE_DIR"
- log_info "Created fixture directory: $FIXTURE_DIR"
+ mkdir -p "$fixture_dir"
+ log_info "Created fixture directory: $fixture_dir"
- # Generate tables
+ local success_count=0 fail_count=0 start_time end_time
start_time=$(date +%s)
for table in "${tables_to_generate[@]}"; do
- if generate_table "$table"; then
+ if generate_java_table "$table" "$fixture_dir"; then
success_count=$((success_count + 1))
else
fail_count=$((fail_count + 1))
@@ -256,25 +250,244 @@ main() {
end_time=$(date +%s)
local duration=$((end_time - start_time))
- # Print summary
echo ""
log_info "========================================="
log_info "Generation Complete"
log_info "========================================="
log_success "Successfully generated: $success_count tables"
-
if [[ $fail_count -gt 0 ]]; then
log_error "Failed to generate: $fail_count tables"
fi
-
log_info "Total time: ${duration}s"
- log_info "Fixtures saved to: $FIXTURE_DIR"
+ log_info "Fixtures saved to: $fixture_dir"
log_info "========================================="
- # Exit with error if any tables failed
if [[ $fail_count -gt 0 ]]; then
exit 1
fi
}
+# -----------------------------------------------------------------------------
+# --compat c (C dsdgen) helpers
+# -----------------------------------------------------------------------------
+
+# Confirm that every external tool the --compat c path shells out to
+# (git, bzip2, tar) is on PATH.
+#
+# Returns 0 when all are available; otherwise logs the absent ones in a
+# single error line and returns 1.
+check_c_prerequisites() {
+    local tool
+    local absent=()
+    for tool in git bzip2 tar; do
+        if ! command -v "$tool" >/dev/null 2>&1; then
+            absent+=("$tool")
+        fi
+    done
+
+    if (( ${#absent[@]} == 0 )); then
+        return 0
+    fi
+    log_error "Missing required tool(s) for --compat c: ${absent[*]}"
+    return 1
+}
+
+# Lightweight sanity check of an extracted C fixture directory.
+#
+# Arguments:
+#   $1 - fixture directory to inspect.
+#
+# The directory must exist and a small set of representative tables must be
+# present and non-empty. Returns 0 (after logging how many .dat files were
+# found) when the check passes, 1 on the first problem encountered.
+verify_c_fixtures() {
+    local fixture_dir=$1
+    local required=(store_sales.dat catalog_sales.dat web_sales.dat reason.dat call_center.dat)
+
+    [[ -d "$fixture_dir" ]] || {
+        log_error "Fixture directory does not exist: $fixture_dir"
+        return 1
+    }
+
+    for f in "${required[@]}"; do
+        # -s is true only for a file that exists and has a size above zero.
+        [[ -s "$fixture_dir/$f" ]] && continue
+        log_error "Missing or empty fixture: $fixture_dir/$f"
+        return 1
+    done
+
+    local count
+    count=$(find "$fixture_dir" -maxdepth 1 -name "*.dat" -type f | wc -l | tr -d ' ')
+    log_success "Found $count .dat fixtures in $fixture_dir"
+    return 0
+}
+
+# Clone one branch of the reference-data repository, reassemble the split
+# bzip2 tarball it contains, and place the extracted .dat files directly in
+# the fixture directory.
+#
+# Arguments:
+#   $1 - branch to clone (e.g. sf1)
+#   $2 - destination fixture directory (created if missing)
+#
+# Returns 0 on success, 1 on any failure. The temporary clone directory is
+# removed on every path once it has been created.
+#
+# The caller invokes this function as `if ! download_and_extract_c ...`,
+# which suppresses `set -e` inside the function, so every step that can fail
+# is checked explicitly.
+download_and_extract_c() {
+    local branch=$1
+    local fixture_dir=$2
+    local clone_dir
+    if ! clone_dir=$(mktemp -d -t tpcds-data-XXXXXX); then
+        log_error "Failed to create a temporary directory for the clone"
+        return 1
+    fi
+
+    # Cleanup helper. Called both on the success and failure paths below
+    # rather than via `trap RETURN`, which under `set -u` causes the trap
+    # to fire from later functions (e.g. `main`) where `$clone_dir` is no
+    # longer in scope.
+    _cleanup_clone_dir() {
+        if [[ -n "${clone_dir:-}" && -d "$clone_dir" ]]; then
+            rm -rf "$clone_dir"
+        fi
+    }
+
+    log_info "Cloning $TPCDS_DATA_REPO branch '$branch' (depth 1) ..."
+    if ! git clone --depth 1 --single-branch --branch "$branch" \
+        "$TPCDS_DATA_REPO" "$clone_dir/tpcds-data"; then
+        log_error "Failed to clone $TPCDS_DATA_REPO branch '$branch'"
+        log_error "Confirm the branch exists (sf1, sf2, ...)"
+        _cleanup_clone_dir
+        return 1
+    fi
+
+    if ! ls "$clone_dir/tpcds-data"/data.tar.bz2.* >/dev/null 2>&1; then
+        log_error "No data.tar.bz2.* parts found in cloned branch '$branch'"
+        _cleanup_clone_dir
+        return 1
+    fi
+
+    log_info "Extracting reference data into $fixture_dir ..."
+    if ! mkdir -p "$fixture_dir"; then
+        log_error "Failed to create fixture directory: $fixture_dir"
+        _cleanup_clone_dir
+        return 1
+    fi
+
+    # The archive expands as data/<table>.dat. Extract into a temp dir,
+    # then flatten one level so the result is fixture_dir/<table>.dat.
+    local extract_dir="$clone_dir/extract"
+    mkdir -p "$extract_dir"
+    if ! cat "$clone_dir/tpcds-data"/data.tar.bz2.* | bzip2 -d | tar -x -C "$extract_dir"; then
+        log_error "Failed to extract data.tar.bz2.* parts"
+        _cleanup_clone_dir
+        return 1
+    fi
+
+    if [[ ! -d "$extract_dir/data" ]]; then
+        log_error "Unexpected archive layout: $extract_dir/data not found"
+        _cleanup_clone_dir
+        return 1
+    fi
+
+    # An unmatched glob (no .dat files in the archive) or a failed move must
+    # not be reported as success.
+    if ! mv "$extract_dir/data"/*.dat "$fixture_dir/"; then
+        log_error "Failed to move extracted .dat files into $fixture_dir"
+        _cleanup_clone_dir
+        return 1
+    fi
+
+    _cleanup_clone_dir
+    return 0
+}
+
+# Driver for --compat c: make sure C dsdgen reference fixtures exist in the
+# given directory, downloading them when necessary.
+#
+# Arguments:
+#   $1 - destination fixture directory.
+#
+# Reads SCALE_FACTOR (to derive the branch name), VERIFY_ONLY, FORCE_REBUILD
+# and TPCDS_DATA_REPO. Terminates the script via `exit` for --verify, for
+# the "already present" short-circuit, and on any failure; it returns
+# normally only after a download whose result passes verification.
+generate_c_fixtures() {
+    local fixture_dir=$1
+    local branch="sf${SCALE_FACTOR}"
+    local rule="========================================="
+
+    log_info "$rule"
+    log_info "C dsdgen Reference Data Bootstrap"
+    log_info "$rule"
+    log_info "Repository: $TPCDS_DATA_REPO"
+    log_info "Branch: $branch"
+    log_info "Fixture directory: $fixture_dir"
+    log_info "$rule"
+
+    check_c_prerequisites || exit 1
+
+    # --verify: report on what is already on disk and stop.
+    if [[ $VERIFY_ONLY -eq 1 ]]; then
+        verify_c_fixtures "$fixture_dir" && exit 0
+        exit 1
+    fi
+
+    if [[ $FORCE_REBUILD -eq 0 ]]; then
+        # Skip download if fixtures already look complete.
+        if verify_c_fixtures "$fixture_dir" >/dev/null 2>&1; then
+            log_success "C reference fixtures already present at $fixture_dir"
+            log_info "Use --rebuild to force re-download"
+            exit 0
+        fi
+    elif [[ $FORCE_REBUILD -eq 1 && -d "$fixture_dir" ]]; then
+        log_info "Removing existing fixture directory: $fixture_dir"
+        rm -rf "$fixture_dir"
+    fi
+
+    local started finished
+    started=$(date +%s)
+    download_and_extract_c "$branch" "$fixture_dir" || exit 1
+    finished=$(date +%s)
+
+    if ! verify_c_fixtures "$fixture_dir"; then
+        log_error "Bootstrap completed but verification failed"
+        exit 1
+    fi
+
+    echo ""
+    log_info "$rule"
+    log_success "C dsdgen reference data ready"
+    log_info "Time: $((finished - started))s"
+    log_info "$rule"
+}
+
+# -----------------------------------------------------------------------------
+# main
+# -----------------------------------------------------------------------------
+
+# Entry point: parse the command line, validate the flag combination, and
+# dispatch to the generator for the selected compat mode.
+#
+# Arguments:
+#   "$@" - command-line arguments (see print_usage).
+#
+# Exits 1 when an option is missing its value, when an option or --compat
+# value is unknown, or when a flag is not valid for the chosen mode.
+main() {
+    local tables_to_generate=()
+
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --compat|--scale)
+                # Without this check a trailing "--scale" aborts under
+                # `set -u` with a bare "unbound variable" message.
+                if [[ $# -lt 2 ]]; then
+                    log_error "Option $1 requires a value"
+                    exit 1
+                fi
+                if [[ "$1" == "--compat" ]]; then
+                    COMPAT="$2"
+                else
+                    SCALE_FACTOR="$2"
+                fi
+                shift 2
+                ;;
+            --quiet)
+                QUIET=1
+                shift
+                ;;
+            --rebuild)
+                FORCE_REBUILD=1
+                shift
+                ;;
+            --verify)
+                VERIFY_ONLY=1
+                shift
+                ;;
+            --help)
+                print_usage
+                ;;
+            -*)
+                # A mistyped flag must not be mistaken for a table name.
+                log_error "Unknown option: $1"
+                exit 1
+                ;;
+            *)
+                tables_to_generate+=("$1")
+                shift
+                ;;
+        esac
+    done
+
+    case $COMPAT in
+        trino|c) ;;
+        *)
+            log_error "Unknown --compat value: $COMPAT (expected: trino, c)"
+            exit 1
+            ;;
+    esac
+
+    if [[ "$COMPAT" == "c" && ${#tables_to_generate[@]} -gt 0 ]]; then
+        log_error "Per-table selection is not supported with --compat c"
+        log_error "The published archive bundles all 25 tables together."
+        exit 1
+    fi
+
+    if [[ "$COMPAT" == "trino" && ( $FORCE_REBUILD -eq 1 || $VERIFY_ONLY -eq 1 ) ]]; then
+        log_error "--rebuild and --verify are only valid with --compat c"
+        exit 1
+    fi
+
+    local fixture_dir
+    case $COMPAT in
+        trino)
+            fixture_dir="$PROJECT_ROOT/tests/fixtures/scale-${SCALE_FACTOR}-java"
+            # No tables named on the command line means "all of them".
+            if [[ ${#tables_to_generate[@]} -eq 0 ]]; then
+                tables_to_generate=("${ALL_TABLES[@]}")
+            fi
+            generate_trino_fixtures "$fixture_dir" "${tables_to_generate[@]}"
+            ;;
+        c)
+            fixture_dir="$PROJECT_ROOT/tests/fixtures/scale-${SCALE_FACTOR}-c"
+            generate_c_fixtures "$fixture_dir"
+            ;;
+    esac
+}
+
main "$@"
diff --git a/tpcdsgen/scripts/test-all-tables.sh b/tpcdsgen/scripts/test-all-tables.sh
index 9495148..091d503 100755
--- a/tpcdsgen/scripts/test-all-tables.sh
+++ b/tpcdsgen/scripts/test-all-tables.sh
@@ -1,16 +1,61 @@
#!/usr/bin/env bash
#
-# Test all ported Rust tables against Java reference fixtures
+# test-all-tables.sh — Run the full conformance suite for one compat
+# mode, byte-for-byte (MD5 + diff) comparing Rust output against
+# reference fixtures. Main entry point used by CI.
#
-# Usage:
-# ./scripts/test-all-tables.sh [--quiet]
-#
-# Exit codes:
-# 0 - All tables match
-# 1 - One or more tables differ
+# Please see print_usage() below for details.
set -euo pipefail
+# Print the help text for test-all-tables.sh and terminate the script.
+#
+# Arguments:
+#   $1 - optional exit status (default: 0). With no argument the function
+#        exits 0, which is what --help needs; a caller that prints the usage
+#        after reporting an error (e.g. an unknown option) can pass 1 so
+#        the failure is visible to CI.
+print_usage() {
+    cat << 'EOF'
+test-all-tables.sh — Run the full conformance suite for one compat mode.
+
+Iterates all 24 TPC-DS tables (dbgen_version is always excluded because
+it contains a generation timestamp), builds the Rust generator in release
+mode, delegates each per-table comparison to ./scripts/compare-table.sh,
+and prints a pass/fail summary. Exits non-zero if any table differs.
+
+Two reference implementations are supported, selected by --compat:
+ --compat trino (default) Java / Trino fixtures in
+ tests/fixtures/scale-N-java/
+ (generate with
+ ./scripts/generate-fixtures.sh)
+ --compat c C dsdgen fixtures in
+ tests/fixtures/scale-N-c/
+ (download with
+ ./scripts/generate-fixtures.sh --compat c)
+
+Per-compat skip lists live near the top of the script. As of this
+writing, --compat c additionally skips `customer` until
+alamb/tpcds-data is regenerated without the iconv ISO-8859-14 -> UTF-8
+step that double-encodes non-ASCII country names.
+
+Usage:
+ test-all-tables.sh [OPTIONS]
+
+Options:
+ --scale N Scale factor (default: 1).
+ --compat trino|c Reference implementation (default: trino).
+ --quiet Quiet mode (show only summary).
+ --help Show this help message.
+
+Examples:
+ test-all-tables.sh # All tables at scale 1 vs Java.
+ test-all-tables.sh --scale 10 # All tables at scale 10 vs Java.
+ test-all-tables.sh --compat c # All tables at scale 1 vs C dsdgen.
+ test-all-tables.sh --quiet # Summary-only output.
+
+Exit codes:
+ 0 - All tested tables match.
+ 1 - One or more tables differ.
+
+See scripts/README.md for the full conformance-testing workflow.
+EOF
+    # `${1:-0}` keeps the no-argument call safe under `set -u`.
+    exit "${1:-0}"
+}
+
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -24,6 +69,7 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# Configuration (can be overridden by --scale)
SCALE_FACTOR=${TPCDS_SCALE:-1}
+COMPAT=${TPCDS_COMPAT:-trino}
QUIET=0
# Logging functions
@@ -45,31 +91,6 @@ log_warn() {
echo -e "${YELLOW}[WARN]${NC} $*"
}
-# Print usage
-usage() {
- cat << EOF
-Test all ported Rust tables against Java reference fixtures
-
-Usage:
- $(basename "$0") [--scale N] [--quiet]
-
-Options:
- --scale N Scale factor (default: 1)
- --quiet Quiet mode (show only summary)
-
-Examples:
- $(basename "$0") # Test all tables at scale 1
- $(basename "$0") --scale 10 # Test all tables at scale 10
- $(basename "$0") --quiet # Test all tables (quiet)
-
-Exit codes:
- 0 - All tables match exactly
- 1 - One or more tables differ
-
-EOF
- exit 0
-}
-
# All TPC-DS tables to test (24 tables - excludes dbgen_version which has timestamps)
# Note: dbgen_version is excluded because it contains timestamps that will never match
ALL_TABLES=(
@@ -99,9 +120,38 @@ ALL_TABLES=(
"web_site"
)
-# Get list of tables to test
+# Tables to skip per compat mode (in addition to dbgen_version, which is
+# always skipped because it contains a generation timestamp).
+#
+# --compat c: customer.dat is skipped because the reference data in
+# https://github.com/alamb/tpcds-data was generated through a pipeline that
+# accidentally double-UTF-8-encodes the non-ASCII country names (`CÔTE
+# D'IVOIRE`, `RÉUNION`). The Rust --compat c output uses raw Latin-1, which
+# is what unmodified C dsdgen produces. Once the reference data is
+# regenerated without the iconv ISO-8859-14 -> UTF-8 step, this entry can
+# be removed.
+# TODO(alamb): re-include customer once alamb/tpcds-data has been regenerated.
+C_COMPAT_SKIP_TABLES=("customer")
+
+# Get list of tables to test, applying per-compat skip lists.
get_tables_to_test() {
- echo "${ALL_TABLES[@]}"
+    # Prints, space-separated on one line, every entry of ALL_TABLES that is
+    # not in the skip list for the active compat mode. Only --compat c has a
+    # skip list (C_COMPAT_SKIP_TABLES).
+    local skip_list=()
+    [[ "$COMPAT" != "c" ]] || skip_list=("${C_COMPAT_SKIP_TABLES[@]}")
+
+    local result=()
+    for t in "${ALL_TABLES[@]}"; do
+        local skip=0
+        # `:-` keeps the expansion of an empty skip list safe under `set -u`.
+        for s in "${skip_list[@]:-}"; do
+            [[ "$t" == "$s" ]] || continue
+            skip=1
+            break
+        done
+        if [[ $skip -eq 0 ]]; then
+            result+=("$t")
+        fi
+    done
+    echo "${result[@]}"
}
# Build the unified Rust table generator
@@ -123,9 +173,9 @@ test_table() {
local compare_script="$SCRIPT_DIR/compare-table.sh"
if [[ $QUIET -eq 1 ]]; then
- "$compare_script" "$table" --scale "$SCALE_FACTOR" --quiet
+ "$compare_script" "$table" --scale "$SCALE_FACTOR" --compat "$COMPAT" --quiet
else
- "$compare_script" "$table" --scale "$SCALE_FACTOR"
+ "$compare_script" "$table" --scale "$SCALE_FACTOR" --compat "$COMPAT"
fi
}
@@ -143,23 +193,36 @@ main() {
SCALE_FACTOR="$2"
shift 2
;;
+ --compat)
+ COMPAT="$2"
+ shift 2
+ ;;
--quiet)
QUIET=1
shift
;;
--help)
- usage
+ print_usage
;;
*)
log_error "Unknown option: $1"
- usage
+ print_usage
;;
esac
done
+ case $COMPAT in
+ trino|c) ;;
+ *)
+ log_error "Unknown --compat value: $COMPAT (expected: trino, c)"
+ exit 1
+ ;;
+ esac
+
log_info "========================================="
log_info "TPC-DS Table Test Suite"
log_info "Scale Factor: $SCALE_FACTOR"
+ log_info "Compat Mode: $COMPAT"
log_info "========================================="
# Get tables to test
diff --git a/tpcdsgen/scripts/tpcds_schemas.py b/tpcdsgen/scripts/tpcds_schemas.py
deleted file mode 100644
index dec1aa2..0000000
--- a/tpcdsgen/scripts/tpcds_schemas.py
+++ /dev/null
@@ -1,506 +0,0 @@
-"""TPC-DS table schemas as PyArrow fields.
-
-Extracted from apache/datafusion-benchmarks (Apache-2.0).
-Used by compare-c.py to parse .dat files and read Parquet reference data.
-"""
-
-import pyarrow
-
-all_schemas = {}
-
-all_schemas['customer_address'] = [
- pyarrow.field("ca_address_sk", pyarrow.int32()),
- pyarrow.field("ca_address_id", pyarrow.string()),
- pyarrow.field("ca_street_number", pyarrow.string()),
- pyarrow.field("ca_street_name", pyarrow.string()),
- pyarrow.field("ca_street_type", pyarrow.string()),
- pyarrow.field("ca_suite_number", pyarrow.string()),
- pyarrow.field("ca_city", pyarrow.string()),
- pyarrow.field("ca_county", pyarrow.string()),
- pyarrow.field("ca_state", pyarrow.string()),
- pyarrow.field("ca_zip", pyarrow.string()),
- pyarrow.field("ca_country", pyarrow.string()),
- pyarrow.field("ca_gmt_offset", pyarrow.decimal128(5, 2)),
- pyarrow.field("ca_location_type", pyarrow.string())
-]
-
-all_schemas['customer_demographics'] = [
- pyarrow.field("cd_demo_sk", pyarrow.int32()),
- pyarrow.field("cd_gender", pyarrow.string()),
- pyarrow.field("cd_marital_status", pyarrow.string()),
- pyarrow.field("cd_education_status", pyarrow.string()),
- pyarrow.field("cd_purchase_estimate", pyarrow.int32()),
- pyarrow.field("cd_credit_rating", pyarrow.string()),
- pyarrow.field("cd_dep_count", pyarrow.int32()),
- pyarrow.field("cd_dep_employed_count", pyarrow.int32()),
- pyarrow.field("cd_dep_college_count", pyarrow.int32())
-]
-
-all_schemas['date_dim'] = [
- pyarrow.field("d_date_sk", pyarrow.int32()),
- pyarrow.field("d_date_id", pyarrow.string()),
- pyarrow.field("d_date", pyarrow.date32()),
- pyarrow.field("d_month_seq", pyarrow.int32()),
- pyarrow.field("d_week_seq", pyarrow.int32()),
- pyarrow.field("d_quarter_seq", pyarrow.int32()),
- pyarrow.field("d_year", pyarrow.int32()),
- pyarrow.field("d_dow", pyarrow.int32()),
- pyarrow.field("d_moy", pyarrow.int32()),
- pyarrow.field("d_dom", pyarrow.int32()),
- pyarrow.field("d_qoy", pyarrow.int32()),
- pyarrow.field("d_fy_year", pyarrow.int32()),
- pyarrow.field("d_fy_quarter_seq", pyarrow.int32()),
- pyarrow.field("d_fy_week_seq", pyarrow.int32()),
- pyarrow.field("d_day_name", pyarrow.string()),
- pyarrow.field("d_quarter_name", pyarrow.string()),
- pyarrow.field("d_holiday", pyarrow.string()),
- pyarrow.field("d_weekend", pyarrow.string()),
- pyarrow.field("d_following_holiday", pyarrow.string()),
- pyarrow.field("d_first_dom", pyarrow.int32()),
- pyarrow.field("d_last_dom", pyarrow.int32()),
- pyarrow.field("d_same_day_ly", pyarrow.int32()),
- pyarrow.field("d_same_day_lq", pyarrow.int32()),
- pyarrow.field("d_current_day", pyarrow.string()),
- pyarrow.field("d_current_week", pyarrow.string()),
- pyarrow.field("d_current_month", pyarrow.string()),
- pyarrow.field("d_current_quarter", pyarrow.string()),
- pyarrow.field("d_current_year", pyarrow.string()),
-]
-
-all_schemas["warehouse"] = [
- pyarrow.field("w_warehouse_sk", pyarrow.int32()),
- pyarrow.field("w_warehouse_id", pyarrow.string()),
- pyarrow.field("w_warehouse_name", pyarrow.string()),
- pyarrow.field("w_warehouse_sq_ft", pyarrow.int32()),
- pyarrow.field("w_street_number", pyarrow.string()),
- pyarrow.field("w_street_name", pyarrow.string()),
- pyarrow.field("w_street_type", pyarrow.string()),
- pyarrow.field("w_suite_number", pyarrow.string()),
- pyarrow.field("w_city", pyarrow.string()),
- pyarrow.field("w_county", pyarrow.string()),
- pyarrow.field("w_state", pyarrow.string()),
- pyarrow.field("w_zip", pyarrow.string()),
- pyarrow.field("w_country", pyarrow.string()),
- pyarrow.field("w_gmt_offset", pyarrow.decimal128(5, 2)),
-]
-
-all_schemas["ship_mode"] = [
- pyarrow.field("sm_ship_mode_sk", pyarrow.int32()),
- pyarrow.field("sm_ship_mode_id", pyarrow.string()),
- pyarrow.field("sm_type", pyarrow.string()),
- pyarrow.field("sm_code", pyarrow.string()),
- pyarrow.field("sm_carrier", pyarrow.string()),
- pyarrow.field("sm_contract", pyarrow.string()),
-]
-
-all_schemas["time_dim"] = [
- pyarrow.field("t_time_sk", pyarrow.int32()),
- pyarrow.field("t_time_id", pyarrow.string()),
- pyarrow.field("t_time", pyarrow.int32()),
- pyarrow.field("t_hour", pyarrow.int32()),
- pyarrow.field("t_minute", pyarrow.int32()),
- pyarrow.field("t_second", pyarrow.int32()),
- pyarrow.field("t_am_pm", pyarrow.string()),
- pyarrow.field("t_shift", pyarrow.string()),
- pyarrow.field("t_sub_shift", pyarrow.string()),
- pyarrow.field("t_meal_time", pyarrow.string()),
-]
-
-all_schemas["reason"] = [
- pyarrow.field("r_reason_sk", pyarrow.int32()),
- pyarrow.field("r_reason_id", pyarrow.string()),
- pyarrow.field("r_reason_desc", pyarrow.string()),
-]
-
-all_schemas["income_band"] = [
- pyarrow.field("ib_income_band_sk", pyarrow.int32()),
- pyarrow.field("ib_lower_bound", pyarrow.int32()),
- pyarrow.field("ib_upper_bound", pyarrow.int32()),
-]
-
-all_schemas["item"] = [
- pyarrow.field("i_item_sk", pyarrow.int32()),
- pyarrow.field("i_item_id", pyarrow.string()),
- pyarrow.field("i_rec_start_date", pyarrow.date32()),
- pyarrow.field("i_rec_end_date", pyarrow.date32()),
- pyarrow.field("i_item_desc", pyarrow.string()),
- pyarrow.field("i_current_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("i_wholesale_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("i_brand_id", pyarrow.int32()),
- pyarrow.field("i_brand", pyarrow.string()),
- pyarrow.field("i_class_id", pyarrow.int32()),
- pyarrow.field("i_class", pyarrow.string()),
- pyarrow.field("i_category_id", pyarrow.int32()),
- pyarrow.field("i_category", pyarrow.string()),
- pyarrow.field("i_manufact_id", pyarrow.int32()),
- pyarrow.field("i_manufact", pyarrow.string()),
- pyarrow.field("i_size", pyarrow.string()),
- pyarrow.field("i_formulation", pyarrow.string()),
- pyarrow.field("i_color", pyarrow.string()),
- pyarrow.field("i_units", pyarrow.string()),
- pyarrow.field("i_container", pyarrow.string()),
- pyarrow.field("i_manager_id", pyarrow.int32()),
- pyarrow.field("i_product_name", pyarrow.string()),
-]
-
-all_schemas["store"] = [
- pyarrow.field("s_store_sk", pyarrow.int32()),
- pyarrow.field("s_store_id", pyarrow.string()),
- pyarrow.field("s_rec_start_date", pyarrow.date32()),
- pyarrow.field("s_rec_end_date", pyarrow.date32()),
- pyarrow.field("s_closed_date_sk", pyarrow.int32()),
- pyarrow.field("s_store_name", pyarrow.string()),
- pyarrow.field("s_number_employees", pyarrow.int32()),
- pyarrow.field("s_floor_space", pyarrow.int32()),
- pyarrow.field("s_hours", pyarrow.string()),
- pyarrow.field("s_manager", pyarrow.string()),
- pyarrow.field("s_market_id", pyarrow.int32()),
- pyarrow.field("s_geography_class", pyarrow.string()),
- pyarrow.field("s_market_desc", pyarrow.string()),
- pyarrow.field("s_market_manager", pyarrow.string()),
- pyarrow.field("s_division_id", pyarrow.int32()),
- pyarrow.field("s_division_name", pyarrow.string()),
- pyarrow.field("s_company_id", pyarrow.int32()),
- pyarrow.field("s_company_name", pyarrow.string()),
- pyarrow.field("s_street_number", pyarrow.string()),
- pyarrow.field("s_street_name", pyarrow.string()),
- pyarrow.field("s_street_type", pyarrow.string()),
- pyarrow.field("s_suite_number", pyarrow.string()),
- pyarrow.field("s_city", pyarrow.string()),
- pyarrow.field("s_county", pyarrow.string()),
- pyarrow.field("s_state", pyarrow.string()),
- pyarrow.field("s_zip", pyarrow.string()),
- pyarrow.field("s_country", pyarrow.string()),
- pyarrow.field("s_gmt_offset", pyarrow.decimal128(5, 2)),
- pyarrow.field("s_tax_precentage", pyarrow.decimal128(5, 2)),
-]
-
-all_schemas["call_center"] = [
- pyarrow.field("cc_call_center_sk", pyarrow.int32()),
- pyarrow.field("cc_call_center_id", pyarrow.string()),
- pyarrow.field("cc_rec_start_date", pyarrow.date32()),
- pyarrow.field("cc_rec_end_date", pyarrow.date32()),
- pyarrow.field("cc_closed_date_sk", pyarrow.int32()),
- pyarrow.field("cc_open_date_sk", pyarrow.int32()),
- pyarrow.field("cc_name", pyarrow.string()),
- pyarrow.field("cc_class", pyarrow.string()),
- pyarrow.field("cc_employees", pyarrow.int32()),
- pyarrow.field("cc_sq_ft", pyarrow.int32()),
- pyarrow.field("cc_hours", pyarrow.string()),
- pyarrow.field("cc_manager", pyarrow.string()),
- pyarrow.field("cc_mkt_id", pyarrow.int32()),
- pyarrow.field("cc_mkt_class", pyarrow.string()),
- pyarrow.field("cc_mkt_desc", pyarrow.string()),
- pyarrow.field("cc_market_manager", pyarrow.string()),
- pyarrow.field("cc_division", pyarrow.int32()),
- pyarrow.field("cc_division_name", pyarrow.string()),
- pyarrow.field("cc_company", pyarrow.int32()),
- pyarrow.field("cc_company_name", pyarrow.string()),
- pyarrow.field("cc_street_number", pyarrow.string()),
- pyarrow.field("cc_street_name", pyarrow.string()),
- pyarrow.field("cc_street_type", pyarrow.string()),
- pyarrow.field("cc_suite_number", pyarrow.string()),
- pyarrow.field("cc_city", pyarrow.string()),
- pyarrow.field("cc_county", pyarrow.string()),
- pyarrow.field("cc_state", pyarrow.string()),
- pyarrow.field("cc_zip", pyarrow.string()),
- pyarrow.field("cc_country", pyarrow.string()),
- pyarrow.field("cc_gmt_offset", pyarrow.decimal128(5, 2)),
- pyarrow.field("cc_tax_percentage", pyarrow.decimal128(5, 2)),
-]
-
-all_schemas["customer"] = [
- pyarrow.field("c_customer_sk", pyarrow.int32()),
- pyarrow.field("c_customer_id", pyarrow.string()),
- pyarrow.field("c_current_cdemo_sk", pyarrow.int32()),
- pyarrow.field("c_current_hdemo_sk", pyarrow.int32()),
- pyarrow.field("c_current_addr_sk", pyarrow.int32()),
- pyarrow.field("c_first_shipto_date_sk", pyarrow.int32()),
- pyarrow.field("c_first_sales_date_sk", pyarrow.int32()),
- pyarrow.field("c_salutation", pyarrow.string()),
- pyarrow.field("c_first_name", pyarrow.string()),
- pyarrow.field("c_last_name", pyarrow.string()),
- pyarrow.field("c_preferred_cust_flag", pyarrow.string()),
- pyarrow.field("c_birth_day", pyarrow.int32()),
- pyarrow.field("c_birth_month", pyarrow.int32()),
- pyarrow.field("c_birth_year", pyarrow.int32()),
- pyarrow.field("c_birth_country", pyarrow.string()),
- pyarrow.field("c_login", pyarrow.string()),
- pyarrow.field("c_email_address", pyarrow.string()),
- pyarrow.field("c_last_review_date_sk", pyarrow.string()),
-]
-
-all_schemas["web_site"] = [
- pyarrow.field("web_site_sk", pyarrow.int32()),
- pyarrow.field("web_site_id", pyarrow.string()),
- pyarrow.field("web_rec_start_date", pyarrow.date32()),
- pyarrow.field("web_rec_end_date", pyarrow.date32()),
- pyarrow.field("web_name", pyarrow.string()),
- pyarrow.field("web_open_date_sk", pyarrow.int32()),
- pyarrow.field("web_close_date_sk", pyarrow.int32()),
- pyarrow.field("web_class", pyarrow.string()),
- pyarrow.field("web_manager", pyarrow.string()),
- pyarrow.field("web_mkt_id", pyarrow.int32()),
- pyarrow.field("web_mkt_class", pyarrow.string()),
- pyarrow.field("web_mkt_desc", pyarrow.string()),
- pyarrow.field("web_market_manager", pyarrow.string()),
- pyarrow.field("web_company_id", pyarrow.int32()),
- pyarrow.field("web_company_name", pyarrow.string()),
- pyarrow.field("web_street_number", pyarrow.string()),
- pyarrow.field("web_street_name", pyarrow.string()),
- pyarrow.field("web_street_type", pyarrow.string()),
- pyarrow.field("web_suite_number", pyarrow.string()),
- pyarrow.field("web_city", pyarrow.string()),
- pyarrow.field("web_county", pyarrow.string()),
- pyarrow.field("web_state", pyarrow.string()),
- pyarrow.field("web_zip", pyarrow.string()),
- pyarrow.field("web_country", pyarrow.string()),
- pyarrow.field("web_gmt_offset", pyarrow.decimal128(5, 2)),
- pyarrow.field("web_tax_percentage", pyarrow.decimal128(5, 2)),
-]
-
-all_schemas["store_returns"] = [
- pyarrow.field("sr_returned_date_sk", pyarrow.int32()),
- pyarrow.field("sr_return_time_sk", pyarrow.int32()),
- pyarrow.field("sr_item_sk", pyarrow.int32()),
- pyarrow.field("sr_customer_sk", pyarrow.int32()),
- pyarrow.field("sr_cdemo_sk", pyarrow.int32()),
- pyarrow.field("sr_hdemo_sk", pyarrow.int32()),
- pyarrow.field("sr_addr_sk", pyarrow.int32()),
- pyarrow.field("sr_store_sk", pyarrow.int32()),
- pyarrow.field("sr_reason_sk", pyarrow.int32()),
- pyarrow.field("sr_ticket_number", pyarrow.int32()),
- pyarrow.field("sr_return_quantity", pyarrow.int32()),
- pyarrow.field("sr_return_amt", pyarrow.decimal128(7, 2)),
- pyarrow.field("sr_return_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("sr_return_amt_inc_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("sr_fee", pyarrow.decimal128(7, 2)),
- pyarrow.field("sr_return_ship_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("sr_refunded_cash", pyarrow.decimal128(7, 2)),
- pyarrow.field("sr_reversed_charge", pyarrow.decimal128(7, 2)),
- pyarrow.field("sr_store_credit", pyarrow.decimal128(7, 2)),
- pyarrow.field("sr_net_loss", pyarrow.decimal128(7, 2)),
-]
-
-all_schemas["household_demographics"] = [
- pyarrow.field("hd_demo_sk", pyarrow.int32()),
- pyarrow.field("hd_income_band_sk", pyarrow.int32()),
- pyarrow.field("hd_buy_potential", pyarrow.string()),
- pyarrow.field("hd_dep_count", pyarrow.int32()),
- pyarrow.field("hd_vehicle_count", pyarrow.int32()),
-]
-
-all_schemas["web_page"] = [
- pyarrow.field("wp_web_page_sk", pyarrow.int32()),
- pyarrow.field("wp_web_page_id", pyarrow.string()),
- pyarrow.field("wp_rec_start_date", pyarrow.date32()),
- pyarrow.field("wp_rec_end_date", pyarrow.date32()),
- pyarrow.field("wp_creation_date_sk", pyarrow.int32()),
- pyarrow.field("wp_access_date_sk", pyarrow.int32()),
- pyarrow.field("wp_autogen_flag", pyarrow.string()),
- pyarrow.field("wp_customer_sk", pyarrow.int32()),
- pyarrow.field("wp_url", pyarrow.string()),
- pyarrow.field("wp_type", pyarrow.string()),
- pyarrow.field("wp_char_count", pyarrow.int32()),
- pyarrow.field("wp_link_count", pyarrow.int32()),
- pyarrow.field("wp_image_count", pyarrow.int32()),
- pyarrow.field("wp_max_ad_count", pyarrow.int32()),
-]
-
-all_schemas["promotion"] = [
- pyarrow.field("p_promo_sk", pyarrow.int32()),
- pyarrow.field("p_promo_id", pyarrow.string()),
- pyarrow.field("p_start_date_sk", pyarrow.int32()),
- pyarrow.field("p_end_date_sk", pyarrow.int32()),
- pyarrow.field("p_item_sk", pyarrow.int32()),
- pyarrow.field("p_cost", pyarrow.decimal128(15, 2)),
- pyarrow.field("p_response_target", pyarrow.int32()),
- pyarrow.field("p_promo_name", pyarrow.string()),
- pyarrow.field("p_channel_dmail", pyarrow.string()),
- pyarrow.field("p_channel_email", pyarrow.string()),
- pyarrow.field("p_channel_catalog", pyarrow.string()),
- pyarrow.field("p_channel_tv", pyarrow.string()),
- pyarrow.field("p_channel_radio", pyarrow.string()),
- pyarrow.field("p_channel_press", pyarrow.string()),
- pyarrow.field("p_channel_event", pyarrow.string()),
- pyarrow.field("p_channel_demo", pyarrow.string()),
- pyarrow.field("p_channel_details", pyarrow.string()),
- pyarrow.field("p_purpose", pyarrow.string()),
- pyarrow.field("p_discount_active", pyarrow.string()),
-]
-
-all_schemas["catalog_page"] = [
- pyarrow.field("cp_catalog_page_sk", pyarrow.int32()),
- pyarrow.field("cp_catalog_page_id", pyarrow.string()),
- pyarrow.field("cp_start_date_sk", pyarrow.int32()),
- pyarrow.field("cp_end_date_sk", pyarrow.int32()),
- pyarrow.field("cp_department", pyarrow.string()),
- pyarrow.field("cp_catalog_number", pyarrow.int32()),
- pyarrow.field("cp_catalog_page_number", pyarrow.int32()),
- pyarrow.field("cp_description", pyarrow.string()),
- pyarrow.field("cp_type", pyarrow.string()),
-]
-
-all_schemas["inventory"] = [
- pyarrow.field("inv_date_sk", pyarrow.int32()),
- pyarrow.field("inv_item_sk", pyarrow.int32()),
- pyarrow.field("inv_warehouse_sk", pyarrow.int32()),
- pyarrow.field("inv_quantity_on_hand", pyarrow.int32()),
-]
-
-all_schemas["catalog_returns"] = [
- pyarrow.field("cr_returned_date_sk", pyarrow.int32()),
- pyarrow.field("cr_returned_time_sk", pyarrow.int32()),
- pyarrow.field("cr_item_sk", pyarrow.int32()),
- pyarrow.field("cr_refunded_customer_sk", pyarrow.int32()),
- pyarrow.field("cr_refunded_cdemo_sk", pyarrow.int32()),
- pyarrow.field("cr_refunded_hdemo_sk", pyarrow.int32()),
- pyarrow.field("cr_refunded_addr_sk", pyarrow.int32()),
- pyarrow.field("cr_returning_customer_sk", pyarrow.int32()),
- pyarrow.field("cr_returning_cdemo_sk", pyarrow.int32()),
- pyarrow.field("cr_returning_hdemo_sk", pyarrow.int32()),
- pyarrow.field("cr_returning_addr_sk", pyarrow.int32()),
- pyarrow.field("cr_call_center_sk", pyarrow.int32()),
- pyarrow.field("cr_catalog_page_sk", pyarrow.int32()),
- pyarrow.field("cr_ship_mode_sk", pyarrow.int32()),
- pyarrow.field("cr_warehouse_sk", pyarrow.int32()),
- pyarrow.field("cr_reason_sk", pyarrow.int32()),
- pyarrow.field("cr_order_number", pyarrow.int32()),
- pyarrow.field("cr_return_quantity", pyarrow.int32()),
- pyarrow.field("cr_return_amount", pyarrow.decimal128(7, 2)),
- pyarrow.field("cr_return_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("cr_return_amt_inc_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("cr_fee", pyarrow.decimal128(7, 2)),
- pyarrow.field("cr_return_ship_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("cr_refunded_cash", pyarrow.decimal128(7, 2)),
- pyarrow.field("cr_reversed_charge", pyarrow.decimal128(7, 2)),
- pyarrow.field("cr_store_credit", pyarrow.decimal128(7, 2)),
- pyarrow.field("cr_net_loss", pyarrow.decimal128(7, 2)),
-]
-
-all_schemas["web_returns"] = [
- pyarrow.field("wr_returned_date_sk", pyarrow.int32()),
- pyarrow.field("wr_returned_time_sk", pyarrow.int32()),
- pyarrow.field("wr_item_sk", pyarrow.int32()),
- pyarrow.field("wr_refunded_customer_sk", pyarrow.int32()),
- pyarrow.field("wr_refunded_cdemo_sk", pyarrow.int32()),
- pyarrow.field("wr_refunded_hdemo_sk", pyarrow.int32()),
- pyarrow.field("wr_refunded_addr_sk", pyarrow.int32()),
- pyarrow.field("wr_returning_customer_sk", pyarrow.int32()),
- pyarrow.field("wr_returning_cdemo_sk", pyarrow.int32()),
- pyarrow.field("wr_returning_hdemo_sk", pyarrow.int32()),
- pyarrow.field("wr_returning_addr_sk", pyarrow.int32()),
- pyarrow.field("wr_web_page_sk", pyarrow.int32()),
- pyarrow.field("wr_reason_sk", pyarrow.int32()),
- pyarrow.field("wr_order_number", pyarrow.int32()),
- pyarrow.field("wr_return_quantity", pyarrow.int32()),
- pyarrow.field("wr_return_amt", pyarrow.decimal128(7, 2)),
- pyarrow.field("wr_return_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("wr_return_amt_inc_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("wr_fee", pyarrow.decimal128(7, 2)),
- pyarrow.field("wr_return_ship_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("wr_refunded_cash", pyarrow.decimal128(7, 2)),
- pyarrow.field("wr_reversed_charge", pyarrow.decimal128(7, 2)),
- pyarrow.field("wr_account_credit", pyarrow.decimal128(7, 2)),
- pyarrow.field("wr_net_loss", pyarrow.decimal128(7, 2)),
-]
-
-all_schemas["web_sales"] = [
- pyarrow.field("ws_sold_date_sk", pyarrow.int32()),
- pyarrow.field("ws_sold_time_sk", pyarrow.int32()),
- pyarrow.field("ws_ship_date_sk", pyarrow.int32()),
- pyarrow.field("ws_item_sk", pyarrow.int32()),
- pyarrow.field("ws_bill_customer_sk", pyarrow.int32()),
- pyarrow.field("ws_bill_cdemo_sk", pyarrow.int32()),
- pyarrow.field("ws_bill_hdemo_sk", pyarrow.int32()),
- pyarrow.field("ws_bill_addr_sk", pyarrow.int32()),
- pyarrow.field("ws_ship_customer_sk", pyarrow.int32()),
- pyarrow.field("ws_ship_cdemo_sk", pyarrow.int32()),
- pyarrow.field("ws_ship_hdemo_sk", pyarrow.int32()),
- pyarrow.field("ws_ship_addr_sk", pyarrow.int32()),
- pyarrow.field("ws_web_page_sk", pyarrow.int32()),
- pyarrow.field("ws_web_site_sk", pyarrow.int32()),
- pyarrow.field("ws_ship_mode_sk", pyarrow.int32()),
- pyarrow.field("ws_warehouse_sk", pyarrow.int32()),
- pyarrow.field("ws_promo_sk", pyarrow.int32()),
- pyarrow.field("ws_order_number", pyarrow.int32()),
- pyarrow.field("ws_quantity", pyarrow.int32()),
- pyarrow.field("ws_wholesale_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_list_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_sales_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_ext_discount_amt", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_ext_sales_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_ext_wholesale_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_ext_list_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_ext_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_coupon_amt", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_ext_ship_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_net_paid", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_net_paid_inc_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_net_paid_inc_ship", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_net_paid_inc_ship_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("ws_net_profit", pyarrow.decimal128(7, 2)),
-]
-
-all_schemas["catalog_sales"] = [
- pyarrow.field("cs_sold_date_sk", pyarrow.int32()),
- pyarrow.field("cs_sold_time_sk", pyarrow.int32()),
- pyarrow.field("cs_ship_date_sk", pyarrow.int32()),
- pyarrow.field("cs_bill_customer_sk", pyarrow.int32()),
- pyarrow.field("cs_bill_cdemo_sk", pyarrow.int32()),
- pyarrow.field("cs_bill_hdemo_sk", pyarrow.int32()),
- pyarrow.field("cs_bill_addr_sk", pyarrow.int32()),
- pyarrow.field("cs_ship_customer_sk", pyarrow.int32()),
- pyarrow.field("cs_ship_cdemo_sk", pyarrow.int32()),
- pyarrow.field("cs_ship_hdemo_sk", pyarrow.int32()),
- pyarrow.field("cs_ship_addr_sk", pyarrow.int32()),
- pyarrow.field("cs_call_center_sk", pyarrow.int32()),
- pyarrow.field("cs_catalog_page_sk", pyarrow.int32()),
- pyarrow.field("cs_ship_mode_sk", pyarrow.int32()),
- pyarrow.field("cs_warehouse_sk", pyarrow.int32()),
- pyarrow.field("cs_item_sk", pyarrow.int32()),
- pyarrow.field("cs_promo_sk", pyarrow.int32()),
- pyarrow.field("cs_order_number", pyarrow.int32()),
- pyarrow.field("cs_quantity", pyarrow.int32()),
- pyarrow.field("cs_wholesale_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_list_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_sales_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_ext_discount_amt", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_ext_sales_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_ext_wholesale_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_ext_list_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_ext_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_coupon_amt", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_ext_ship_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_net_paid", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_net_paid_inc_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_net_paid_inc_ship", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_net_paid_inc_ship_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("cs_net_profit", pyarrow.decimal128(7, 2)),
-]
-
-all_schemas["store_sales"] = [
- pyarrow.field("ss_sold_date_sk", pyarrow.int32()),
- pyarrow.field("ss_sold_time_sk", pyarrow.int32()),
- pyarrow.field("ss_item_sk", pyarrow.int32()),
- pyarrow.field("ss_customer_sk", pyarrow.int32()),
- pyarrow.field("ss_cdemo_sk", pyarrow.int32()),
- pyarrow.field("ss_hdemo_sk", pyarrow.int32()),
- pyarrow.field("ss_addr_sk", pyarrow.int32()),
- pyarrow.field("ss_store_sk", pyarrow.int32()),
- pyarrow.field("ss_promo_sk", pyarrow.int32()),
- pyarrow.field("ss_ticket_number", pyarrow.int32()),
- pyarrow.field("ss_quantity", pyarrow.int32()),
- pyarrow.field("ss_wholesale_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_list_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_sales_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_ext_discount_amt", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_ext_sales_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_ext_wholesale_cost", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_ext_list_price", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_ext_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_coupon_amt", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_net_paid", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_net_paid_inc_tax", pyarrow.decimal128(7, 2)),
- pyarrow.field("ss_net_profit", pyarrow.decimal128(7, 2)),
-]
diff --git a/tpcdsgen/uv.lock b/tpcdsgen/uv.lock
deleted file mode 100644
index 0f3a8e7..0000000
--- a/tpcdsgen/uv.lock
+++ /dev/null
@@ -1,56 +0,0 @@
-version = 1
-revision = 3
-requires-python = ">=3.14"
-
-[[package]]
-name = "datafusion"
-version = "53.0.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "pyarrow" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/58/2b/0f96f12b70839c93930c4e17d767fc32b6c77d548c78784128049e944701/datafusion-53.0.0.tar.gz", hash = "sha256:ba9a5ec06b5453fbd8710d6aeeb515a8bcac4b6c140e254409bb53a5f322ef22", size = 224267, upload-time = "2026-04-13T00:45:02.686Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/af/4c/60e052813d81f1ffe3123ead013dbdd2cf961daa576cb9056cbb80228e6b/datafusion-53.0.0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a0bd1a98d736571321416dc4ed361a9d1225da1ec9f6c5fad818d75f547697a7", size = 35774913, upload-time = "2026-04-13T00:44:46.235Z" },
- { url = "https://files.pythonhosted.org/packages/6e/59/beabe5301df3338d8206446cd624079e43bdad46e20377a6336017fb6ccf/datafusion-53.0.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ce186a8d2405afd67e11e2fb75715019f16b00d070b8d0da89d8aa61cc74c8b5", size = 32667118, upload-time = "2026-04-13T00:44:50.269Z" },
- { url = "https://files.pythonhosted.org/packages/ae/94/636ab61ade98395daea6e733e225e9c7beef111c7c5b575ac851513e203c/datafusion-53.0.0-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:288a00a7ef03e2807a4667683f7560efd80d60ed1d41696ac15ca9ded14c8251", size = 35585824, upload-time = "2026-04-13T00:44:53.683Z" },
- { url = "https://files.pythonhosted.org/packages/34/80/b9f4889209af02f8d14bccb0e6f0519c329b072bc4d2595025a1303f144c/datafusion-53.0.0-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:8fef0004f0161fcfc556c025a7201f9cc3169aa3adb97a86419ebb34182d9efb", size = 38083690, upload-time = "2026-04-13T00:44:57.188Z" },
- { url = "https://files.pythonhosted.org/packages/4b/1a/ea4831fc6aeefedbcf186c9f6a273d507b1787c03cbb905bded7e1149a6a/datafusion-53.0.0-cp310-abi3-win_amd64.whl", hash = "sha256:4c8410f5f659b926677be6c7d443bbc05d825c078c970b7d8cf977ebcf948314", size = 38120687, upload-time = "2026-04-13T00:45:00.633Z" },
-]
-
-[[package]]
-name = "pyarrow"
-version = "24.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/ad/80/d022a34ff05d2cbedd8ccf841fc1f532ecfa9eb5ed1711b56d0e0ea71fc9/pyarrow-24.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:1cc9057f0319e26333b357e17f3c2c022f1a83739b48a88b25bfd5fa2dc18838", size = 35007997, upload-time = "2026-04-21T10:49:48.796Z" },
- { url = "https://files.pythonhosted.org/packages/1a/ff/f01485fda6f4e5d441afb8dd5e7681e4db18826c1e271852f5d3957d6a80/pyarrow-24.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e6f1278ee4785b6db21229374a1c9e54ec7c549de5d1efc9630b6207de7e170b", size = 36678720, upload-time = "2026-04-21T10:49:55.858Z" },
- { url = "https://files.pythonhosted.org/packages/9e/c2/2d2d5fea814237923f71b36495211f20b43a1576f9a4d6da7e751a64ec6f/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:adbbedc55506cbdabb830890444fb856bfb0060c46c6f8026c6c2f2cf86ae795", size = 45741852, upload-time = "2026-04-21T10:50:04.624Z" },
- { url = "https://files.pythonhosted.org/packages/8e/3a/28ba9c1c1ebdbb5f1b94dfebb46f207e52e6a554b7fe4132540fde29a3a0/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ae8a1145af31d903fa9bb166824d7abe9b4681a000b0159c9fb99c11bc11ad26", size = 48889852, upload-time = "2026-04-21T10:50:12.293Z" },
- { url = "https://files.pythonhosted.org/packages/df/51/4a389acfd31dca009f8fb82d7f510bb4130f2b3a8e18cf00194d0687d8ac/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d7027eba1df3b2069e2e8d80f644fa0918b68c46432af3d088ddd390d063ecde", size = 49445207, upload-time = "2026-04-21T10:50:20.677Z" },
- { url = "https://files.pythonhosted.org/packages/19/4b/0bab2b23d2ae901b1b9a03c0efd4b2d070256f8ce3fc43f6e58c167b2081/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e56a1ffe9bf7b727432b89104cc0849c21582949dd7bdcb34f17b2001a351a76", size = 51954117, upload-time = "2026-04-21T10:50:29.14Z" },
- { url = "https://files.pythonhosted.org/packages/29/88/f4e9145da0417b3d2c12035a8492b35ff4a3dbc653e614fcfb51d9dedb38/pyarrow-24.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:38be1808cdd068605b787e6ca9119b27eb275a0234e50212c3492331680c3b1e", size = 28001155, upload-time = "2026-04-21T10:51:22.337Z" },
- { url = "https://files.pythonhosted.org/packages/79/4f/46a49a63f43526da895b1a45bbb51d5baf8e4d77159f8528fc3e5490007f/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:418e48ce50a45a6a6c73c454677203a9c75c966cb1e92ca3370959185f197a05", size = 35250387, upload-time = "2026-04-21T10:50:35.552Z" },
- { url = "https://files.pythonhosted.org/packages/a0/da/d5e0cd5ef00796922404806d5f00325cdadc3441ce2c13fe7115f2df9a64/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:2f16197705a230a78270cdd4ea8a1d57e86b2fdcbc34a1f6aebc72e65c986f9a", size = 36797102, upload-time = "2026-04-21T10:50:42.417Z" },
- { url = "https://files.pythonhosted.org/packages/34/c7/5904145b0a593a05236c882933d439b5720f0a145381179063722fbfc123/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fb24ac194bfc5e86839d7dcd52092ee31e5fe6733fe11f5e3b06ef0812b20072", size = 45745118, upload-time = "2026-04-21T10:50:49.324Z" },
- { url = "https://files.pythonhosted.org/packages/13/d3/cca42fe166d1c6e4d5b80e530b7949104d10e17508a90ae202dac205ce2a/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9700ebd9a51f5895ce75ff4ac4b3c47a7d4b42bc618be8e713e5d56bacf5f931", size = 48844765, upload-time = "2026-04-21T10:50:55.579Z" },
- { url = "https://files.pythonhosted.org/packages/b0/49/942c3b79878ba928324d1e17c274ed84581db8c0a749b24bcf4cbdf15bd3/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d8ddd2768da81d3ee08cfea9b597f4abb4e8e1dc8ae7e204b608d23a0d3ab699", size = 49471890, upload-time = "2026-04-21T10:51:02.439Z" },
- { url = "https://files.pythonhosted.org/packages/76/97/ff71431000a75d84135a1ace5ca4ba11726a231a8007bbb320a4c54075d5/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:61a3d7eaa97a14768b542f3d284dc6400dd2470d9f080708b13cd46b6ae18136", size = 51932250, upload-time = "2026-04-21T10:51:10.576Z" },
- { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282, upload-time = "2026-04-21T10:51:16.815Z" },
-]
-
-[[package]]
-name = "tpcdsgen"
-version = "0.1.0"
-source = { virtual = "." }
-dependencies = [
- { name = "datafusion" },
- { name = "pyarrow" },
-]
-
-[package.metadata]
-requires-dist = [
- { name = "datafusion", specifier = ">=53.0.0" },
- { name = "pyarrow", specifier = ">=24.0.0" },
-]