diff --git a/.github/workflows/tpcdsgen-conformance.yml b/.github/workflows/tpcdsgen-conformance.yml index 88c94d4..e4bcbb9 100644 --- a/.github/workflows/tpcdsgen-conformance.yml +++ b/.github/workflows/tpcdsgen-conformance.yml @@ -3,17 +3,23 @@ name: TPC-DS Conformance on: push: branches: [ main, master ] + paths: + - 'tpcdsgen/**' + - '.github/**' pull_request: branches: [ main, master ] + paths: + - 'tpcdsgen/**' + - '.github/**' env: CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 jobs: - # Conformance testing against Java implementation + # Conformance testing against the Java / Trino reference implementation. conformance-tests: - name: Conformance Tests + name: Conformance Tests (Java) runs-on: ubuntu-latest steps: @@ -65,7 +71,60 @@ jobs: if: failure() # Upload fixtures if tests fail for debugging uses: actions/upload-artifact@v7 with: - name: test-fixtures + name: test-fixtures-java + path: tpcdsgen/tests/fixtures/ + retention-days: 7 + + # Conformance testing against the C dsdgen reference implementation. + # + # Reference data is pre-generated and lives in + # https://github.com/alamb/tpcds-data (branch sf1). + # `generate-fixtures.sh --compat c` clones it with --depth 1 and extracts + # into tpcdsgen/tests/fixtures/scale-1-c/. Rust is then run in + # --compat c mode and the .dat output is compared byte-for-byte (MD5/diff). 
+ conformance-tests-c: + name: Conformance Tests (C dsdgen) + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache Rust dependencies + uses: actions/cache@v5 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo- + + - name: Download C dsdgen reference data + run: | + cd tpcdsgen + ./scripts/generate-fixtures.sh --compat c --scale 1 + + - name: Build Rust table generators + run: | + cargo build --release -p tpcdsgen + + - name: Run conformance tests (Rust --compat c vs C dsdgen) + run: | + cd tpcdsgen + ./scripts/test-all-tables.sh --compat c + + - name: Upload test fixtures as artifacts + if: failure() + uses: actions/upload-artifact@v7 + with: + name: test-fixtures-c path: tpcdsgen/tests/fixtures/ retention-days: 7 diff --git a/tpcdsgen/.gitignore b/tpcdsgen/.gitignore index 233832c..adcd66c 100644 --- a/tpcdsgen/.gitignore +++ b/tpcdsgen/.gitignore @@ -8,9 +8,6 @@ # Test fixtures (generated). #/tests/fixtures/ -# Python cache. -scripts/__pycache__/ - # Stuff I need to remember NEXT_STEPS.md ISSUES.md diff --git a/tpcdsgen/.python-version b/tpcdsgen/.python-version deleted file mode 100644 index 6324d40..0000000 --- a/tpcdsgen/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.14 diff --git a/tpcdsgen/README.md b/tpcdsgen/README.md index 623dff0..8eb1174 100644 --- a/tpcdsgen/README.md +++ b/tpcdsgen/README.md @@ -29,61 +29,47 @@ Fixtures are pre-generated TPC-DS data files used for conformance testing. 
``` tests/fixtures/ -├── java/ # Java reference implementation output -│ ├── scale-1/ # 25 tables, ~1.2GB -│ └── scale-10/ # 25 tables, ~11GB -└── rust/ # Rust implementation output - ├── scale-1/ # 25 tables, ~1.2GB - └── scale-10/ # 25 tables, ~11GB +├── scale-1-java/ # Java reference fixtures (`--compat trino`) +├── scale-1-c/ # C dsdgen reference fixtures (`--compat c`) +└── scale-10-java/ # higher scale factors as needed ``` -### Generating Java Fixtures - -Requires the Java TPC-DS implementation to be built: +### Conformance Testing -```bash -# Build Java implementation (if not already built) -cd ../tpcds && mvn clean package -DskipTests && cd - - -# Generate Java fixtures for scale 1 -java -jar ../tpcds/target/tpcds-1.5-SNAPSHOT-jar-with-dependencies.jar \ - --scale 1 \ - --directory tests/fixtures/java/scale-1 \ - --overwrite - -# Generate Java fixtures for scale 10 -java -jar ../tpcds/target/tpcds-1.5-SNAPSHOT-jar-with-dependencies.jar \ - --scale 10 \ - --directory tests/fixtures/java/scale-10 \ - --overwrite -``` +`tpcdsgen` ships with two conformance suites, both implemented as shell +scripts that do byte-for-byte (MD5) comparison of `.dat` output. See +[scripts/README.md](scripts/README.md) for full details. -### Generating Rust Fixtures +**vs. Java / Trino reference (default, `--compat trino`):** ```bash -# Build Rust implementation -cargo build --release +# One-time: clone & build the Java TPC-DS implementation. +./scripts/bootstrap-java.sh -# Generate Rust fixtures for scale 1 -./target/release/tpcdsgen --scale 1 --directory tests/fixtures/rust/scale-1 +# Generate Java reference fixtures into tests/fixtures/scale-N-java/. +./scripts/generate-fixtures.sh -# Generate Rust fixtures for scale 10 -./target/release/tpcdsgen --scale 10 --directory tests/fixtures/rust/scale-10 +# Compare Rust output byte-for-byte against the Java fixtures. 
+./scripts/test-all-tables.sh --scale 1 ``` -### Conformance Testing - -To verify Rust output matches Java byte-for-byte: +**vs. C dsdgen reference (`--compat c`):** ```bash -# Run conformance tests at scale 1 -./scripts/test-all-tables.sh --scale 1 +# One-time: download pre-generated C dsdgen data from +# https://github.com/alamb/tpcds-data into tests/fixtures/scale-N-c/. +./scripts/generate-fixtures.sh --compat c --scale 1 -# Run conformance tests at scale 10 -./scripts/test-all-tables.sh --scale 10 +# Compare Rust --compat c output byte-for-byte against the C fixtures. +./scripts/test-all-tables.sh --compat c --scale 1 ``` -See [HASHES.md](HASHES.md) for the canonical MD5 hashes. +Both suites also support comparing a single table: + +```bash +./scripts/compare-table.sh reason # vs. Java +./scripts/compare-table.sh reason --compat c # vs. C dsdgen +``` ### Verifying Fixtures with MD5SUMS @@ -91,13 +77,13 @@ Each fixture directory contains an `MD5SUMS` file for verification. **On Linux:** ```bash -cd tests/fixtures/java/scale-1 +cd tests/fixtures/scale-1-java md5sum -c MD5SUMS ``` **On macOS:** ```bash -cd tests/fixtures/java/scale-1 +cd tests/fixtures/scale-1-java while read hash file; do [[ $(md5 -q "$file") == "$hash" ]] && echo "$file: OK" || echo "$file: FAILED" done < MD5SUMS diff --git a/tpcdsgen/main.py b/tpcdsgen/main.py deleted file mode 100644 index b59da6e..0000000 --- a/tpcdsgen/main.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello from tpcdsgen!") - - -if __name__ == "__main__": - main() diff --git a/tpcdsgen/pyproject.toml b/tpcdsgen/pyproject.toml deleted file mode 100644 index 2fa8884..0000000 --- a/tpcdsgen/pyproject.toml +++ /dev/null @@ -1,10 +0,0 @@ -[project] -name = "tpcdsgen" -version = "0.1.0" -description = "Add your description here" -readme = "README.md" -requires-python = ">=3.14" -dependencies = [ - "datafusion>=53.0.0", - "pyarrow>=24.0.0", -] diff --git a/tpcdsgen/scripts/README.md b/tpcdsgen/scripts/README.md index 
3c71f6a..bc3629a 100644 --- a/tpcdsgen/scripts/README.md +++ b/tpcdsgen/scripts/README.md @@ -1,285 +1,143 @@ # TPC-DS Test Scripts -This directory contains scripts for testing the Rust TPC-DS implementation against the Java reference implementation. +This directory contains scripts for testing the Rust TPC-DS implementation +against two reference implementations: -## Overview +1. **Java / Trino** (default, `--compat trino`) — the Java port of `dsdgen` + used by Trino. The Rust port was originally derived from this and is + expected to be byte-for-byte identical. +2. **C `dsdgen`** (`--compat c`) — the original TPC-supplied reference + implementation. The `--compat c` mode corrects bugs in the Java port to + match the C reference (see [BUGS.md](../BUGS.md) and the parent + [README](../README.md)). -The testing infrastructure validates that the Rust port generates **byte-for-byte identical** output to the Java -implementation (which itself maintains bug-for-bug compatibility with the original C dsdgen). - -## Prerequisites - -You need the Java TPC-DS implementation for conformance testing. Use the bootstrap script to set it up: - -```bash -./scripts/bootstrap-java.sh -``` - -This will clone and build the Java implementation automatically. See the bootstrap section below for details. +Both conformance suites validate **byte-for-byte identical** output via +MD5/`diff` comparison. ## Directory Structure ``` tpcdsgen/ ├── tests/ -│ └── fixtures/ # Generated reference data (gitignored) -│ └── scale-1/ # Scale factor 1 reference data +│ └── fixtures/ # Reference data (gitignored) +│ ├── scale-1-java/ # Java reference (`--compat trino`) +│ │ ├── call_center.dat +│ │ ├── warehouse.dat +│ │ └── ... (all 25 tables) +│ └── scale-1-c/ # C dsdgen reference (`--compat c`) │ ├── call_center.dat │ ├── warehouse.dat │ └── ... 
(all 25 tables) └── scripts/ - ├── bootstrap-java.sh # Setup Java TPC-DS implementation - ├── generate-fixtures.sh # Generate Java reference data - ├── compare-table.sh # Compare one table - ├── test-all-tables.sh # Test all ported tables - ├── clean-fixtures.sh # Clean up fixtures - └── README.md # This file + ├── bootstrap-java.sh # Clone + build the Java TPC-DS impl + ├── generate-fixtures.sh # Generate/download reference fixtures + │ # (Java via --compat trino; C via --compat c) + ├── compare-table.sh # Compare one table + ├── test-all-tables.sh # Compare all ported tables + ├── clean-fixtures.sh # Clean fixtures + └── README.md # This file ``` -## Quick Start +## Quick Start — Java conformance (`--compat trino`) ```bash # 1. Bootstrap Java implementation (first time only) ./scripts/bootstrap-java.sh -# 2. Generate reference fixtures +# 2. Generate Java reference fixtures into tests/fixtures/scale-N-java/. ./scripts/generate-fixtures.sh -# 3. Test all ported tables +# 3. Test all ported tables against the Java reference. ./scripts/test-all-tables.sh ``` -## Scripts - -### 0. `bootstrap-java.sh` - Setup Java TPC-DS Implementation - -**⚠️ Run this first!** Sets up the Java TPC-DS implementation needed for conformance testing. - -**Usage:** -```bash -# First time setup (clone and build) -./scripts/bootstrap-java.sh - -# Force rebuild -./scripts/bootstrap-java.sh --rebuild - -# Verify existing installation -./scripts/bootstrap-java.sh --verify - -# Show help -./scripts/bootstrap-java.sh --help -``` - -**What it does:** -1. Checks if Java and Maven are installed -2. Clones the Java TPC-DS repository from GitHub (if needed) -3. Builds the Java implementation with Maven -4. 
Runs a smoke test to verify it works - -**Requirements:** -- Java 11+ (e.g., `brew install openjdk@11`) -- Maven (e.g., `brew install maven`) -- Git - -**Environment Variables:** -- `TPCDS_JAVA_REPO` - Override the Java repo URL (default: https://github.com/trinodb/tpcds.git) - -**Output:** -- Clones to `../tpcds/` (parallel to this repo) -- Creates `../tpcds/target/tpcds-*-jar-with-dependencies.jar` - -**Time:** ~2-3 minutes (first run) - -### 1. `generate-fixtures.sh` - Generate Reference Data - -Generates TPC-DS tables using the Java implementation. This creates the "golden reference" data that Rust output is compared against. - -**Usage:** -```bash -# Generate all 25 tables (recommended first run) -./scripts/generate-fixtures.sh - -# Generate specific tables -./scripts/generate-fixtures.sh call_center warehouse - -# Quiet mode (minimal output) -./scripts/generate-fixtures.sh --quiet - -# Show help -./scripts/generate-fixtures.sh --help -``` - -**What it does:** -1. Checks if Java implementation is built (builds if needed) -2. Creates `tests/fixtures/scale-1/` directory -3. Generates each table using Java TPC-DS generator -4. Reports progress and statistics - -**Output:** -- Generates `.dat` files in `tests/fixtures/scale-1/` -- Each file contains pipe-delimited rows with trailing pipe: `value1|value2|value3|` -- Files are gitignored (regenerate as needed) - -**Time:** ~2-5 minutes for all 25 tables at scale 1 - ---- - -### 2. `compare-table.sh` - Compare Single Table - -Compares Rust-generated output for a single table against the Java reference fixture. - -**Usage:** -```bash -# Compare a table -./scripts/compare-table.sh call_center - -# Quiet mode -./scripts/compare-table.sh customer_demographics --quiet - -# Show help -./scripts/compare-table.sh --help -``` - -**What it does:** -1. Checks that Java fixture exists -2. Generates table using Rust implementation -3. Performs byte-for-byte comparison with `diff` -4. 
Reports results - -**Exit codes:** -- `0` - Tables match exactly ✓ -- `1` - Tables differ or error occurred ✗ - -**Output example:** -``` -[INFO] ========================================= -[INFO] Table Comparison: call_center -[INFO] ========================================= -[INFO] Java fixture: tests/fixtures/scale-1/call_center.dat -[INFO] Generating call_center with Rust... -[INFO] Using binary: target/release/tpcdsgen --table call_center -[INFO] Comparing outputs... -[INFO] Java fixture: 6 rows, 4.0K -[INFO] Rust output: 6 rows, 4.0K -[SUCCESS] ✓ call_center: MD5 match (6 rows, cc9aabc63eb8603bd7330b6735ed0961) -[INFO] ========================================= -``` - ---- +## Quick Start — C dsdgen conformance (`--compat c`) -### 3. `test-all-tables.sh` - Test All Ported Tables +The C reference data is pre-generated and published in +[alamb/tpcds-data](https://github.com/alamb/tpcds-data), one branch per +scale factor (`sf1`, `sf2`, ...). `generate-fixtures.sh --compat c` clones +the requested branch with `--depth 1` and extracts it into +`tests/fixtures/scale-N-c/`. -Runs comparison tests for all tables that have been ported to Rust. This is the main test suite. - -**Usage:** ```bash -# Test all ported tables (verbose) -./scripts/test-all-tables.sh - -# Quiet mode (show only summary) -./scripts/test-all-tables.sh --quiet - -# Show help -./scripts/test-all-tables.sh --help -``` - -**What it does:** -1. Tests all 24 TPC-DS tables (dbgen_version excluded - has timestamps) -2. Builds the unified Rust generator (`tpcdsgen`) -3. Compares each table against Java fixture using `compare-table.sh` -4. Prints comprehensive summary +# 1. Download the C dsdgen reference data (default scale 1). +./scripts/generate-fixtures.sh --compat c # sf1 +./scripts/generate-fixtures.sh --compat c --scale 2 # sf2 -**Exit codes:** -- `0` - All tables match ✓ -- `1` - One or more tables differ ✗ +# 2. Test all ported tables against the C reference. 
+./scripts/test-all-tables.sh --compat c -**Output example:** -``` -[INFO] ========================================= -[INFO] TPC-DS Table Test Suite -[INFO] ========================================= -[INFO] Testing 24 tables: -[INFO] - call_center -[INFO] - catalog_page -[INFO] - catalog_returns -[INFO] ... (all 24 tables) -[INFO] ========================================= -[INFO] Building Rust TPC-DS generator... -[SUCCESS] Generator built successfully -[INFO] ========================================= - -[INFO] Testing: call_center -... -[SUCCESS] ✓ call_center: MD5 match (6 rows, cc9aabc63eb8603bd7330b6735ed0961) -... - -[INFO] ========================================= -[INFO] Test Summary -[INFO] ========================================= -[INFO] Total tables tested: 24 -[SUCCESS] Passed: 24 - -[INFO] Total time: 45s -[INFO] ========================================= +# Or compare a single table. +./scripts/compare-table.sh reason --compat c ``` ---- +### Tables excluded from automated checks -### 4. `clean-fixtures.sh` - Clean Up Fixtures +The following tables are excluded from automated MD5 comparison; the +exclusion lists live in `test-all-tables.sh`. -Removes all generated fixtures to free up disk space or force regeneration. +- **Always:** `dbgen_version.dat` — contains a generation timestamp. +- **`--compat c` only:** `customer.dat` — the reference data in + `alamb/tpcds-data` was generated through a pipeline that double-UTF-8 + encodes the non-ASCII country names (`CÔTE D'IVOIRE`, `RÉUNION`). The + Rust `--compat c` output uses raw Latin-1, which is what unmodified C + `dsdgen` produces. Once the reference data is regenerated without the + `iconv ISO-8859-14 -> UTF-8` step in `alamb/tpcds-data`'s `Dockerfile`, + this exclusion can be removed. 
-**Usage:** -```bash -# Clean with confirmation prompt -./scripts/clean-fixtures.sh +## Scripts -# Clean without confirmation -./scripts/clean-fixtures.sh --yes +Each script is self-documenting — open it and read the header comment for +full usage, flags, environment variables, output, and exit codes. The +table below is just a roadmap. -# Show help -./scripts/clean-fixtures.sh --help -``` +| Script | Purpose | +|---------------------------|---------------------------------------------------------------------------------------------------------------------------------| +| `bootstrap-java.sh` | Clone and build the Java / Trino reference implementation into `../tpcds/`. Run once before Java conformance. | +| `generate-fixtures.sh` | Populate `tests/fixtures/scale-N-{java,c}/` with reference data. `--compat trino` (default) runs the Java impl; `--compat c` downloads pre-generated C `dsdgen` data from [alamb/tpcds-data](https://github.com/alamb/tpcds-data). | +| `compare-table.sh` | Compare one table's Rust output against the selected reference (`--compat trino` or `--compat c`) via MD5 + diff. | +| `test-all-tables.sh` | Run the full conformance suite for one compat mode (the main CI entry point). Honors per-mode skip lists at the top of the script. | +| `clean-fixtures.sh` | Remove all generated fixtures under `tests/fixtures/`. | -**What it does:** -1. Counts fixture files and reports total size -2. Asks for confirmation (unless `--yes` provided) -3. Deletes entire `tests/fixtures/` directory +Run any script with `--help` to print its usage block. --- ## Typical Workflow -### Initial Setup +### Java conformance ```bash -# 1. Generate all reference fixtures (one-time, or when Java changes) +# 1. Generate Java reference fixtures (one-time, or when Java changes). ./scripts/generate-fixtures.sh -# This creates tests/fixtures/scale-1/*.dat files +# 2. Run the comparison. 
+./scripts/compare-table.sh TABLE_NAME # one table +./scripts/test-all-tables.sh # all tables ``` -### During Development +### C dsdgen conformance ```bash -# 2. After implementing a new table, compare it -./scripts/compare-table.sh new_table_name +# 1. Download the C reference data (one-time, or to refresh). +./scripts/generate-fixtures.sh --compat c -# 3. Or test all ported tables at once -./scripts/test-all-tables.sh +# 2. Run the comparison in C-compat mode. +./scripts/compare-table.sh TABLE_NAME
--compat c +./scripts/test-all-tables.sh --compat c ``` ### Cleanup ```bash -# 4. Remove fixtures if needed (can regenerate anytime) -./scripts/clean-fixtures.sh --yes +./scripts/clean-fixtures.sh --yes # remove all fixtures ``` --- ## Requirements -- **Java:** Maven-built TPC-DS JAR at `../tpcds/target/tpcds-*-jar-with-dependencies.jar` -- **Rust:** Cargo-built `tpcdsgen` binary at `target/debug/tpcdsgen` or `target/release/tpcdsgen` -- **Disk space:** ~500MB-1GB for scale 1 fixtures +- **Java:** Maven-built TPC-DS JAR at `../tpcds/target/tpcds-*-jar-with-dependencies.jar` (`bootstrap-java.sh` handles this). +- **C dsdgen reference:** `git`, `tar`, `bzip2` for `generate-fixtures.sh --compat c`. No C compiler required — data is pre-generated. +- **Rust:** Cargo-built `tpcdsgen` binary at `target/debug/tpcdsgen` or `target/release/tpcdsgen`. +- **Disk space:** ~1 GB for SF1 Java fixtures; ~2.4 GB for SF1 C fixtures. --- @@ -296,16 +154,21 @@ mvn clean package cargo build --release ``` -**Problem:** `Fixture not found` +**Problem:** `Fixture not found` (Java path) ```bash ./scripts/generate-fixtures.sh X ``` +**Problem:** `Fixture not found` (C path) +```bash +./scripts/generate-fixtures.sh --compat c --scale N +``` + **Problem:** Tables don't match -1. Check if both implementations use same seed (should be deterministic) -2. Verify Rust port logic against Java source -3. Use `diff` output to find first difference -4. Debug specific row/column that differs +1. Check that the right compat mode is selected (`--compat trino` vs `--compat c`). +2. Verify both sides use the same seed (the Rust generator is deterministic). +3. Use the `diff` output to find the first difference. +4. Debug the specific row/column that differs. 
--- @@ -314,12 +177,14 @@ cargo build --release These scripts are designed to be CI-friendly: ```yaml -# Example GitHub Actions workflow -- name: Generate fixtures - run: ./scripts/generate-fixtures.sh --quiet - -- name: Test all tables - run: ./scripts/test-all-tables.sh --quiet +# Java conformance +- run: ./scripts/bootstrap-java.sh +- run: ./scripts/generate-fixtures.sh --quiet +- run: ./scripts/test-all-tables.sh --quiet + +# C dsdgen conformance +- run: ./scripts/generate-fixtures.sh --compat c +- run: ./scripts/test-all-tables.sh --compat c --quiet ``` Exit codes make it easy to fail CI on mismatches. diff --git a/tpcdsgen/scripts/bootstrap-java.sh b/tpcdsgen/scripts/bootstrap-java.sh index 301926a..4f29183 100755 --- a/tpcdsgen/scripts/bootstrap-java.sh +++ b/tpcdsgen/scripts/bootstrap-java.sh @@ -1,19 +1,50 @@ #!/usr/bin/env bash # -# Bootstrap the Java TPC-DS implementation for conformance testing +# bootstrap-java.sh — Set up the Java / Trino TPC-DS reference +# implementation used by `--compat trino` conformance testing. # -# This script: -# 1. Clones the Java TPC-DS repository (if needed) -# 2. Builds the Java implementation -# 3. Verifies the build succeeded -# -# Usage: -# ./scripts/bootstrap-java.sh # Clone and build -# ./scripts/bootstrap-java.sh --rebuild # Force rebuild even if exists -# ./scripts/bootstrap-java.sh --verify # Just verify, don't clone/build +# Please see print_usage() below for details. set -euo pipefail +print_usage() { + cat << 'EOF' +bootstrap-java.sh — Set up the Java / Trino TPC-DS reference implementation. + +What it does: + 1. Checks that Java 11+ and Maven are installed. + 2. Clones the Java TPC-DS repository into ../tpcds/ (if not present). + 3. Builds the Java implementation with `mvn clean package -DskipTests`. + 4. Runs a small smoke test to confirm the JAR works. + +Usage: + bootstrap-java.sh [OPTIONS] + +Options: + --rebuild Force rebuild even if the JAR already exists. 
+ --verify Only verify the existing installation; do not clone/build. + --help Show this help message. + +Environment variables: + TPCDS_JAVA_REPO Git URL for Java TPC-DS repo. + Default: https://github.com/trinodb/tpcds.git + +Requirements: Java 11+, Maven, git. + +Output: + Clones to ../tpcds/ (parallel to this repo) and produces + ../tpcds/target/tpcds-*-jar-with-dependencies.jar. + +Examples: + bootstrap-java.sh # Clone and build if needed. + bootstrap-java.sh --rebuild # Force clean rebuild. + bootstrap-java.sh --verify # Just check existing install. + +See scripts/README.md for the full conformance-testing workflow. +EOF + exit 0 +} + # Colors RED='\033[0;31m' GREEN='\033[0;32m' @@ -49,32 +80,6 @@ log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2 } -# Print usage -usage() { - cat << EOF -Bootstrap the Java TPC-DS implementation for conformance testing - -Usage: - $(basename "$0") [OPTIONS] - -Options: - --rebuild Force rebuild even if JAR exists - --verify Only verify installation, don't clone/build - --help Show this help message - -Environment Variables: - TPCDS_JAVA_REPO Git URL for Java TPC-DS repo - Default: https://github.com/trinodb/tpcds.git - -Examples: - $(basename "$0") # Clone and build if needed - $(basename "$0") --rebuild # Force clean rebuild - $(basename "$0") --verify # Just check if everything works - -EOF - exit 0 -} - # Check if Java/Maven are installed check_prerequisites() { log_info "Checking prerequisites..." @@ -275,7 +280,7 @@ main() { shift ;; --help) - usage + print_usage ;; *) log_error "Unknown option: $1" diff --git a/tpcdsgen/scripts/clean-fixtures.sh b/tpcdsgen/scripts/clean-fixtures.sh index fa4df72..51e7f3c 100755 --- a/tpcdsgen/scripts/clean-fixtures.sh +++ b/tpcdsgen/scripts/clean-fixtures.sh @@ -1,12 +1,41 @@ #!/usr/bin/env bash # -# Clean up generated test fixtures +# clean-fixtures.sh — Remove all generated reference fixtures. 
# -# Usage: -# ./scripts/clean-fixtures.sh [--yes] +# Please see print_usage() below for details. set -euo pipefail +print_usage() { + cat << 'EOF' +clean-fixtures.sh — Remove all generated reference fixtures. + +Deletes the entire tests/fixtures/ tree (Java fixtures in scale-N-java/ +and C dsdgen fixtures in scale-N-c/). Fixtures are git-ignored generated +artifacts and can be re-created with +./scripts/generate-fixtures.sh (with or without --compat c). + +What it does: + 1. Counts existing .dat fixture files and reports total size. + 2. Asks for confirmation (unless --yes is passed). + 3. Removes tests/fixtures/ entirely. + +Usage: + clean-fixtures.sh [OPTIONS] + +Options: + --yes Skip confirmation prompt. + --help Show this help message. + +Examples: + clean-fixtures.sh # Clean with confirmation. + clean-fixtures.sh --yes # Clean without confirmation. + +See scripts/README.md for the full conformance-testing workflow. +EOF + exit 0 +} + # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' @@ -34,25 +63,6 @@ log_warn() { echo -e "${YELLOW}[WARN]${NC} $*" } -# Print usage -usage() { - cat << EOF -Clean up generated test fixtures - -Usage: - $(basename "$0") [--yes] - -Options: - --yes Skip confirmation prompt - -Examples: - $(basename "$0") # Clean with confirmation - $(basename "$0") --yes # Clean without confirmation - -EOF - exit 0 -} - # Main function main() { # Parse arguments @@ -63,11 +73,11 @@ main() { shift ;; --help) - usage + print_usage ;; *) log_warn "Unknown option: $1" - usage + print_usage ;; esac done diff --git a/tpcdsgen/scripts/compare-c.py b/tpcdsgen/scripts/compare-c.py deleted file mode 100644 index 54a223a..0000000 --- a/tpcdsgen/scripts/compare-c.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python3 -""" -Compare Rust .dat output against C dsdgen Parquet reference. - -Typical workflow: - # 1. Obtain C dsdgen Parquet reference data, e.g. 
from datafusion-benchmarks: - # git clone https://github.com/apache/datafusion-benchmarks - # # the sf1 Parquet files are at datafusion-benchmarks/tpcds/data/sf1/ - - # 2. Generate Rust output in C-compat mode: - # cargo run -p tpcdsgen -- --compat c --directory /tmp/tpcds-c - - # 3. Compare: - # uv run scripts/compare-c.py --dat-dir /tmp/tpcds-c --parquet-dir path/to/sf1 [TABLE...] - -Usage: - uv run scripts/compare-c.py --dat-dir DIR --parquet-dir DIR [TABLE...] [--verbose] -""" - -import sys -import argparse -from decimal import Decimal -from pathlib import Path - -import pyarrow as pa -import pyarrow.parquet as pq - -SCRIPT_DIR = Path(__file__).parent.resolve() -sys.path.insert(0, str(SCRIPT_DIR)) -from tpcds_schemas import all_schemas - -MAX_DIFF_ROWS = 10 # Max differing rows to show per table - - -# --------------------------------------------------------------------------- -# Value normalization -# --------------------------------------------------------------------------- - -def normalize_dat_value(raw: str, field: pa.Field): - """Parse a raw .dat string into a typed, comparable Python value.""" - if raw == "": - return None - t = field.type - if pa.types.is_integer(t): - return int(raw) - if pa.types.is_decimal(t): - return Decimal(raw).normalize() - if pa.types.is_date(t): - return raw # Keep as YYYY-MM-DD string - return raw # string: keep as-is (C dsdgen doesn't pad) - - -def fix_mojibake(s: str) -> str: - """Fix strings where UTF-8 bytes were stored as individual Latin-1 characters. - - The C dsdgen Parquet reference was generated by tpcdsgen.py without - specifying an encoding, so DataFusion read the raw UTF-8 bytes and stored - each byte as a Latin-1 character. This re-encodes as Latin-1 then decodes - as UTF-8, recovering the original Unicode string. 
- """ - try: - return s.encode("latin-1").decode("utf-8") - except (UnicodeDecodeError, UnicodeEncodeError): - return s - - -def normalize_parquet_value(scalar, field: pa.Field): - """Normalize a pyarrow scalar into the same comparable Python value.""" - py = scalar.as_py() - if py is None: - return None - t = field.type - if pa.types.is_integer(t): - return int(py) - if pa.types.is_decimal(t): - return Decimal(str(py)).normalize() - if pa.types.is_date(t): - return str(py) # date → "YYYY-MM-DD" - if pa.types.is_string(t) or pa.types.is_large_string(t): - return fix_mojibake(py) - return py - - -# --------------------------------------------------------------------------- -# Loaders -# --------------------------------------------------------------------------- - -def load_dat(table: str, fields: list[pa.Field], dat_dir: Path) -> list[tuple]: - path = dat_dir / f"{table}.dat" - if not path.exists(): - raise FileNotFoundError( - f".dat file not found: {path}\n" - f"Generate with: cargo run -p tpcdsgen -- --compat c --directory {dat_dir}" - ) - - rows = [] - with open(path, "r", encoding="latin-1") as f: - for lineno, line in enumerate(f, 1): - line = line.rstrip("\n") - parts = line.split("|") - # Drop trailing empty field produced by the trailing | - if parts and parts[-1] == "": - parts = parts[:-1] - if len(parts) != len(fields): - raise ValueError( - f"{table}.dat line {lineno}: expected {len(fields)} fields, " - f"got {len(parts)}: {line[:120]!r}" - ) - rows.append(tuple(normalize_dat_value(p, fields[i]) for i, p in enumerate(parts))) - return rows - - -def load_parquet(table: str, fields: list[pa.Field], parquet_dir: Path) -> list[tuple]: - path = parquet_dir / f"{table}.parquet" - if not path.exists(): - raise FileNotFoundError(f"Parquet reference not found: {path}") - - col_names = [f.name for f in fields] - tbl = pq.read_table(path, columns=col_names) - - rows = [] - for batch in tbl.to_batches(): - for row_idx in range(batch.num_rows): - rows.append(tuple( 
- normalize_parquet_value(batch.column(i)[row_idx], fields[i]) - for i in range(len(fields)) - )) - return rows - - -# --------------------------------------------------------------------------- -# Comparison -# --------------------------------------------------------------------------- - -def compare_table(table: str, verbose: bool, dat_dir: Path, parquet_dir: Path) -> bool: - fields = all_schemas[table] - col_names = [f.name for f in fields] - - try: - dat_rows = load_dat(table, fields, dat_dir) - pq_rows = load_parquet(table, fields, parquet_dir) - except Exception as e: - print(f" ERROR: {e}") - return False - - if len(dat_rows) != len(pq_rows): - print(f" ROW COUNT MISMATCH rust={len(dat_rows)} c={len(pq_rows)}") - return False - - # Sort both by all columns for a canonical order. - # This is O(n log n) but handles any ordering differences gracefully. - dat_rows.sort(key=lambda r: tuple((v is None, v) if not isinstance(v, Decimal) else (False, float(v)) for v in r)) - pq_rows.sort(key=lambda r: tuple((v is None, v) if not isinstance(v, Decimal) else (False, float(v)) for v in r)) - - diff_rows = 0 - first_diff_at = None - for i, (dr, pr) in enumerate(zip(dat_rows, pq_rows)): - if dr != pr: - if diff_rows == 0: - first_diff_at = i + 1 - diff_rows += 1 - if verbose and diff_rows <= MAX_DIFF_ROWS: - print(f" row {i+1}:") - for j, (dv, pv) in enumerate(zip(dr, pr)): - if dv != pv: - print(f" {col_names[j]}: rust={dv!r} c={pv!r}") - - if diff_rows == 0: - print(f" \033[32m✓ MATCH\033[0m {len(dat_rows)} rows") - return True - else: - pct = 100.0 * diff_rows / len(dat_rows) - print(f" \033[31m✗ {diff_rows}/{len(dat_rows)} rows differ ({pct:.2f}%) first diff at row {first_diff_at}\033[0m") - if not verbose: - print(f" (re-run with --verbose to see column diffs)") - return False - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - -def main(): - parser 
= argparse.ArgumentParser(description="Compare Rust .dat output vs C dsdgen Parquet reference") - parser.add_argument("tables", nargs="*", help="Tables to compare (default: all)") - parser.add_argument("--dat-dir", required=True, - help="Directory containing .dat files generated by: tpcdsgen --compat c --directory DIR") - parser.add_argument("--parquet-dir", required=True, - help="Directory containing C dsdgen Parquet files (e.g. datafusion-benchmarks/tpcds/data/sf1)") - parser.add_argument("--verbose", "-v", action="store_true", help="Show differing column values") - args = parser.parse_args() - - dat_dir = Path(args.dat_dir) - parquet_dir = Path(args.parquet_dir) - - available = sorted(all_schemas.keys()) - tables = args.tables if args.tables else available - - unknown = [t for t in tables if t not in all_schemas] - if unknown: - print(f"Unknown tables: {', '.join(unknown)}") - print(f"Available: {', '.join(available)}") - sys.exit(1) - - print(f"Comparing {len(tables)} table(s): Rust .dat vs C dsdgen Parquet\n") - print(f" dat-dir : {dat_dir}") - print(f" parquet-dir: {parquet_dir}\n") - - results: dict[str, bool] = {} - for table in tables: - print(f"{table}:") - results[table] = compare_table(table, args.verbose, dat_dir, parquet_dir) - - passed = [t for t, ok in results.items() if ok] - failed = [t for t, ok in results.items() if not ok] - - print(f"\n{'='*50}") - print(f"Passed : {len(passed)}/{len(results)}") - if failed: - print(f"Failed : {', '.join(failed)}") - print("="*50) - - sys.exit(0 if not failed else 1) - - -if __name__ == "__main__": - main() diff --git a/tpcdsgen/scripts/compare-table.sh b/tpcdsgen/scripts/compare-table.sh index a0a2630..4e02c34 100755 --- a/tpcdsgen/scripts/compare-table.sh +++ b/tpcdsgen/scripts/compare-table.sh @@ -1,16 +1,61 @@ #!/usr/bin/env bash # -# Compare Rust-generated table output with Java reference fixture +# compare-table.sh — Compare a single table's Rust output to a reference +# fixture byte-for-byte (MD5 + 
diff). # -# Usage: -# ./scripts/compare-table.sh TABLE_NAME [--quiet] -# -# Exit codes: -# 0 - Tables match exactly -# 1 - Tables differ or error occurred +# Please see print_usage() below for details. set -euo pipefail +print_usage() { + cat << 'EOF' +compare-table.sh — Compare a single table's Rust output to a reference +fixture byte-for-byte (MD5 + diff). + +Two reference implementations are supported, selected by --compat: + --compat trino (default) Java / Trino fixtures in + tests/fixtures/scale-N-java/ + (generate with + ./scripts/generate-fixtures.sh) + --compat c C dsdgen fixtures in + tests/fixtures/scale-N-c/ + (download with + ./scripts/generate-fixtures.sh --compat c) + +Usage: + compare-table.sh TABLE_NAME [OPTIONS] + +Arguments: + TABLE_NAME Name of the table to compare (e.g. call_center). + +Options: + --scale N Scale factor (default: 1). + --compat trino|c Reference implementation (default: trino). + --quiet Quiet mode (minimal output). + --help Show this help message. + +Examples: + compare-table.sh call_center # vs. Java, scale 1 + compare-table.sh reason --compat c # vs. C dsdgen, scale 1 + compare-table.sh inventory --scale 10 # vs. Java, scale 10 + compare-table.sh customer_demographics --quiet + +Output example: + [INFO] Table Comparison: call_center + [INFO] Java fixture: tests/fixtures/scale-1-java/call_center.dat + [INFO] Java fixture: 6 rows, 4.0K + [INFO] Rust output: 6 rows, 4.0K + [SUCCESS] ✓ call_center: MD5 match (6 rows, cc9aab...) + +Exit codes: + 0 - Tables match exactly. + 1 - Tables differ or an error occurred. + +See scripts/README.md for the full conformance-testing workflow. +EOF + exit 0 +} + # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' @@ -24,6 +69,7 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" # Configuration (can be overridden by --scale) SCALE_FACTOR=${TPCDS_SCALE:-1} +COMPAT=${TPCDS_COMPAT:-trino} QUIET=0 # Logging functions @@ -45,8 +91,8 @@ log_diff() { echo -e "${YELLOW}[DIFF]${NC} $*" } -# Returns tables are generated by their parent sales generators -# Map returns table -> parent table (the one to generate) +# Returns tables are generated by their parent sales generators. +# Map returns table -> parent table (the one to generate). get_generator_for_table() { local table=$1 case $table in @@ -57,34 +103,6 @@ get_generator_for_table() { esac } -# Print usage -usage() { - cat << EOF -Compare Rust-generated table output with Java reference fixture - -Usage: - $(basename "$0") TABLE_NAME [OPTIONS] - -Arguments: - TABLE_NAME Name of the table to compare (e.g., call_center) - -Options: - --scale N Scale factor (default: 1) - --quiet Quiet mode (minimal output) - -Examples: - $(basename "$0") call_center - $(basename "$0") customer_demographics --quiet - $(basename "$0") inventory --scale 10 - -Exit codes: - 0 - Tables match exactly - 1 - Tables differ or error occurred - -EOF - exit 0 -} - # Find the unified tpcdsgen binary find_rust_binary() { local target_dir @@ -139,7 +157,7 @@ generate_rust_table() { fi log_info "Generating $table with Rust..." - log_info "Using binary: $binary --table $generator --scale $SCALE_FACTOR" + log_info "Using binary: $binary --compat $COMPAT --table $generator --scale $SCALE_FACTOR" if [[ "$generator" != "$table" ]]; then log_info "Note: $table is generated alongside $generator" fi @@ -148,8 +166,8 @@ generate_rust_table() { local temp_dir temp_dir=$(mktemp -d) - # Run Rust generator with --table, --scale, and --directory flags - if ! "$binary" --table "$generator" --scale "$SCALE_FACTOR" --directory "$temp_dir" >/dev/null 2>&1; then + # Run Rust generator with --compat, --table, --scale, and --directory flags + if ! 
"$binary" --compat "$COMPAT" --table "$generator" --scale "$SCALE_FACTOR" --directory "$temp_dir" >/dev/null 2>&1; then log_error "Failed to generate $table with Rust" rm -rf "$temp_dir" return 1 @@ -182,54 +200,51 @@ compute_md5() { # Compare two files compare_files() { - local java_file=$1 + local ref_file=$1 local rust_file=$2 local table=$3 + local ref_label=$4 log_info "Comparing outputs..." # Get file sizes - local java_size - local rust_size - local java_rows - local rust_rows + local ref_size rust_size ref_rows rust_rows - java_size=$(du -h "$java_file" | cut -f1) + ref_size=$(du -h "$ref_file" | cut -f1) rust_size=$(du -h "$rust_file" | cut -f1) - java_rows=$(wc -l < "$java_file" | tr -d ' ') + ref_rows=$(wc -l < "$ref_file" | tr -d ' ') rust_rows=$(wc -l < "$rust_file" | tr -d ' ') - log_info "Java fixture: $java_rows rows, $java_size" - log_info "Rust output: $rust_rows rows, $rust_size" + log_info "$ref_label fixture: $ref_rows rows, $ref_size" + log_info "Rust output: $rust_rows rows, $rust_size" # Quick check: row count must match - if [[ "$java_rows" != "$rust_rows" ]]; then + if [[ "$ref_rows" != "$rust_rows" ]]; then log_error "Row count mismatch!" - log_error " Java: $java_rows rows" + log_error " $ref_label: $ref_rows rows" log_error " Rust: $rust_rows rows" return 1 fi # Compute MD5 hashes log_info "Computing MD5 hashes..." - local java_md5 - local rust_md5 - java_md5=$(compute_md5 "$java_file") + local ref_md5 rust_md5 + ref_md5=$(compute_md5 "$ref_file") rust_md5=$(compute_md5 "$rust_file") - log_info "Java MD5: $java_md5" + log_info "$ref_label MD5: $ref_md5" log_info "Rust MD5: $rust_md5" # Compare MD5 hashes - if [[ "$java_md5" == "$rust_md5" ]]; then - log_success "✓ $table: MD5 match ($java_rows rows, $java_md5)" + if [[ "$ref_md5" == "$rust_md5" ]]; then + log_success "✓ $table: MD5 match ($ref_rows rows, $ref_md5)" return 0 else log_error "✗ $table: MD5 mismatch!" 
- log_error " Java: $java_md5" - log_error " Rust: $rust_md5" + log_error " $ref_label: $ref_md5" + log_error " Rust: $rust_md5" log_diff "Showing first differences:" - diff -u "$java_file" "$rust_file" | head -30 || true + diff -u "$ref_file" "$rust_file" | head -30 || true return 1 fi } @@ -245,32 +260,50 @@ main() { SCALE_FACTOR="$2" shift 2 ;; + --compat) + COMPAT="$2" + shift 2 + ;; --quiet) QUIET=1 shift ;; --help) - usage + print_usage ;; *) if [[ -z "$table" ]]; then table=$1 else log_error "Too many arguments" - usage + print_usage fi shift ;; esac done - # Set fixture directory based on scale factor - FIXTURE_DIR="$PROJECT_ROOT/tests/fixtures/scale-$SCALE_FACTOR" + # Resolve compat mode -> fixture directory and reference label + local ref_label + case $COMPAT in + trino) + FIXTURE_DIR="$PROJECT_ROOT/tests/fixtures/scale-${SCALE_FACTOR}-java" + ref_label="Java" + ;; + c) + FIXTURE_DIR="$PROJECT_ROOT/tests/fixtures/scale-${SCALE_FACTOR}-c" + ref_label="C dsdgen" + ;; + *) + log_error "Unknown --compat value: $COMPAT (expected: trino, c)" + exit 1 + ;; + esac # Validate table argument if [[ -z "$table" ]]; then log_error "Table name required" - usage + print_usage fi log_info "=========================================" @@ -281,11 +314,15 @@ main() { local fixture_file="$FIXTURE_DIR/${table}.dat" if [[ ! -f "$fixture_file" ]]; then log_error "Fixture not found: $fixture_file" - log_error "Generate fixtures first: ./scripts/generate-fixtures.sh $table" + if [[ "$COMPAT" == "c" ]]; then + log_error "Download C reference data first: ./scripts/generate-fixtures.sh --compat c --scale $SCALE_FACTOR" + else + log_error "Generate fixtures first: ./scripts/generate-fixtures.sh $table" + fi exit 1 fi - log_info "Java fixture: $fixture_file" + log_info "$ref_label fixture: $fixture_file" # Generate Rust output local rust_output @@ -300,7 +337,7 @@ main() { # Compare files local result=0 - if ! compare_files "$fixture_file" "$rust_output" "$table"; then + if ! 
compare_files "$fixture_file" "$rust_output" "$table" "$ref_label"; then result=1 fi diff --git a/tpcdsgen/scripts/generate-fixtures.sh b/tpcdsgen/scripts/generate-fixtures.sh index 749126b..8810ce1 100755 --- a/tpcdsgen/scripts/generate-fixtures.sh +++ b/tpcdsgen/scripts/generate-fixtures.sh @@ -1,32 +1,106 @@ #!/usr/bin/env bash # -# Generate TPC-DS reference fixtures using the Java implementation +# generate-fixtures.sh — Generate reference TPC-DS fixtures used by the +# conformance suite (compare-table.sh / test-all-tables.sh). # -# Usage: -# ./scripts/generate-fixtures.sh # Generate all tables -# ./scripts/generate-fixtures.sh --quiet # Generate all tables (quiet mode) -# ./scripts/generate-fixtures.sh table1 ... # Generate specific tables -# ./scripts/generate-fixtures.sh --help # Show help +# Please see print_usage() below for details. set -euo pipefail -# Colors for output +print_usage() { + cat << 'EOF' +generate-fixtures.sh — Generate reference TPC-DS fixtures. + +Two reference implementations are supported, selected by --compat: + + --compat trino (default) + Runs the Java / Trino TPC-DS implementation (set up by + ./scripts/bootstrap-java.sh) and writes the resulting *.dat files + into tests/fixtures/scale-N-java/. These are the "golden reference" + the Rust port targets byte-for-byte. + + --compat c + Downloads pre-generated C `dsdgen` reference data from + https://github.com/alamb/tpcds-data (branch sfN; one branch per + scale factor). The branch is cloned with --depth 1, re-assembled + from split bzip2 tarballs, and extracted into + tests/fixtures/scale-N-c/. No local C toolchain needed. + +Usage: + generate-fixtures.sh [OPTIONS] [TABLES...] + +Options: + --compat trino|c Reference implementation (default: trino). + --scale N Scale factor (default: 1). + --quiet Quiet mode (minimal output). + --rebuild --compat c only: re-download and re-extract even + if fixtures already exist. 
+ --verify --compat c only: only check that fixtures look + sane; do not download. + --help Show this help message. + +Arguments: + TABLES --compat trino only: space-separated list of table + names to generate. If omitted, generates all 25. + Not meaningful for --compat c (the published + archive includes all 25 tables together). + +Environment variables: + TPCDS_C_DATA_REPO Override the C reference data repo URL. + Default: https://github.com/alamb/tpcds-data.git + TPCDS_SCALE Default scale factor (overridden by --scale). + TPCDS_COMPAT Default compat mode (overridden by --compat). + +Requirements: + --compat trino: Java 11+, Maven (a built tpcds-*.jar; see bootstrap-java.sh) + --compat c : git, tar, bzip2 + +Output: + tests/fixtures/scale-N-java/
<table>.dat — pipe-delimited, trailing |. + tests/fixtures/scale-N-c/
<table>.dat — same format, C dsdgen origin. + Files are gitignored; regenerate as needed. + +Examples: + # Java reference, all 25 tables at scale 1 (default). + ./scripts/generate-fixtures.sh + + # Java reference, scale 10, two specific tables. + ./scripts/generate-fixtures.sh --scale 10 call_center warehouse + + # C dsdgen reference, scale 1. + ./scripts/generate-fixtures.sh --compat c + + # C dsdgen reference, scale 2, force re-download. + ./scripts/generate-fixtures.sh --compat c --scale 2 --rebuild + +See scripts/README.md for the full conformance-testing workflow. +EOF + exit 0 +} + +# Colors RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' -NC='\033[0m' # No Color +NC='\033[0m' # Script directory and project root SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" JAVA_DIR="$PROJECT_ROOT/../tpcds" -# Configuration (can be overridden by --scale) +# Configuration (overridable by flags / env vars) SCALE_FACTOR=${TPCDS_SCALE:-1} +COMPAT=${TPCDS_COMPAT:-trino} QUIET=0 +FORCE_REBUILD=0 +VERIFY_ONLY=0 + +# alamb/tpcds-data repository (for --compat c). +TPCDS_DATA_REPO="${TPCDS_C_DATA_REPO:-https://github.com/alamb/tpcds-data.git}" -# All TPC-DS tables (25 tables) +# All TPC-DS tables (25 tables). ALL_TABLES=( "call_center" "catalog_page" @@ -55,7 +129,7 @@ ALL_TABLES=( "web_site" ) -# Logging functions +# Logging log_info() { if [[ $QUIET -eq 0 ]]; then echo -e "${BLUE}[INFO]${NC} $*" @@ -76,104 +150,62 @@ log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2 } -# Print usage -usage() { - cat << EOF -Generate TPC-DS reference fixtures using the Java implementation - -Usage: - $(basename "$0") [OPTIONS] [TABLES...] 
- -Options: - --scale N Scale factor (default: 1) - --quiet Quiet mode (minimal output) - --help Show this help message - -Arguments: - TABLES Space-separated list of table names to generate - If omitted, generates all 25 tables - -Examples: - $(basename "$0") # Generate all tables at scale 1 - $(basename "$0") --scale 10 # Generate all tables at scale 10 - $(basename "$0") --quiet # Generate all tables (quiet) - $(basename "$0") call_center warehouse # Generate specific tables +# ----------------------------------------------------------------------------- +# --compat trino (Java) helpers +# ----------------------------------------------------------------------------- -EOF - exit 0 -} - -# Find Java JAR file find_java_jar() { - local jar_pattern="$JAVA_DIR/target/tpcds-*-jar-with-dependencies.jar" local jar_file - jar_file=$(find "$JAVA_DIR/target" -name "tpcds-*-jar-with-dependencies.jar" 2>/dev/null | head -1) - if [[ -z "$jar_file" ]]; then return 1 fi - echo "$jar_file" - return 0 } -# Build Java implementation if needed ensure_java_build() { log_info "Checking Java implementation..." - if ! find_java_jar >/dev/null 2>&1; then log_warn "Java JAR not found. Building Java implementation..." - cd "$JAVA_DIR" if ! mvn -q clean package -DskipTests; then log_error "Failed to build Java implementation" exit 1 fi cd - >/dev/null - log_success "Java implementation built successfully" else log_info "Java JAR found: $(find_java_jar)" fi } -# Generate a single table -generate_table() { +generate_java_table() { local table=$1 + local fixture_dir=$2 local jar_file jar_file=$(find_java_jar) log_info "Generating $table..." 
- # Create a temporary directory for generation local temp_dir temp_dir=$(mktemp -d) - # Generate table in temp directory - # - # Run Java generator, filter out DEBUG lines but capture errors - local output - if output=$(java -jar "$jar_file" \ + if java -jar "$jar_file" \ --table "$table" \ --scale "$SCALE_FACTOR" \ --overwrite \ --directory "$temp_dir" \ - 2>&1); then + >/dev/null 2>&1; then - # Move generated file to fixture directory local output_file="$temp_dir/${table}.dat" - if [[ -f "$output_file" ]]; then - mv "$output_file" "$FIXTURE_DIR/" - - # Get file info - local file_size - local row_count - file_size=$(du -h "$FIXTURE_DIR/${table}.dat" | cut -f1) - row_count=$(wc -l < "$FIXTURE_DIR/${table}.dat" | tr -d ' ') - + mv "$output_file" "$fixture_dir/" + local file_size row_count + file_size=$(du -h "$fixture_dir/${table}.dat" | cut -f1) + row_count=$(wc -l < "$fixture_dir/${table}.dat" | tr -d ' ') log_success "$table generated: $row_count rows, $file_size" + rm -rf "$temp_dir" + return 0 else log_error "Expected output file not found: $output_file" rm -rf "$temp_dir" @@ -184,69 +216,31 @@ generate_table() { rm -rf "$temp_dir" return 1 fi - - # Clean up temp directory - rm -rf "$temp_dir" - return 0 } -# Main function -main() { - local tables_to_generate=() - local start_time - local end_time - local success_count=0 - local fail_count=0 - - # Parse arguments - while [[ $# -gt 0 ]]; do - case $1 in - --scale) - SCALE_FACTOR="$2" - shift 2 - ;; - --quiet) - QUIET=1 - shift - ;; - --help) - usage - ;; - *) - tables_to_generate+=("$1") - shift - ;; - esac - done - - # Set fixture directory based on scale factor - FIXTURE_DIR="$PROJECT_ROOT/tests/fixtures/scale-$SCALE_FACTOR" - - # If no tables specified, generate all - if [[ ${#tables_to_generate[@]} -eq 0 ]]; then - tables_to_generate=("${ALL_TABLES[@]}") - fi +generate_trino_fixtures() { + local fixture_dir=$1 + shift + local tables_to_generate=("$@") log_info "=========================================" - 
log_info "TPC-DS Fixture Generator" + log_info "Java TPC-DS Fixture Generator" log_info "=========================================" - log_info "Scale Factor: $SCALE_FACTOR" + log_info "Scale Factor: $SCALE_FACTOR" log_info "Tables to generate: ${#tables_to_generate[@]}" - log_info "Fixture directory: $FIXTURE_DIR" + log_info "Fixture directory: $fixture_dir" log_info "=========================================" - # Ensure Java build exists ensure_java_build - # Create fixture directory - mkdir -p "$FIXTURE_DIR" - log_info "Created fixture directory: $FIXTURE_DIR" + mkdir -p "$fixture_dir" + log_info "Created fixture directory: $fixture_dir" - # Generate tables + local success_count=0 fail_count=0 start_time end_time start_time=$(date +%s) for table in "${tables_to_generate[@]}"; do - if generate_table "$table"; then + if generate_java_table "$table" "$fixture_dir"; then success_count=$((success_count + 1)) else fail_count=$((fail_count + 1)) @@ -256,25 +250,244 @@ main() { end_time=$(date +%s) local duration=$((end_time - start_time)) - # Print summary echo "" log_info "=========================================" log_info "Generation Complete" log_info "=========================================" log_success "Successfully generated: $success_count tables" - if [[ $fail_count -gt 0 ]]; then log_error "Failed to generate: $fail_count tables" fi - log_info "Total time: ${duration}s" - log_info "Fixtures saved to: $FIXTURE_DIR" + log_info "Fixtures saved to: $fixture_dir" log_info "=========================================" - # Exit with error if any tables failed if [[ $fail_count -gt 0 ]]; then exit 1 fi } +# ----------------------------------------------------------------------------- +# --compat c (C dsdgen) helpers +# ----------------------------------------------------------------------------- + +check_c_prerequisites() { + local missing=() + command -v git >/dev/null 2>&1 || missing+=(git) + command -v bzip2 >/dev/null 2>&1 || missing+=(bzip2) + command -v tar 
>/dev/null 2>&1 || missing+=(tar) + if [[ ${#missing[@]} -gt 0 ]]; then + log_error "Missing required tool(s) for --compat c: ${missing[*]}" + return 1 + fi + return 0 +} + +# Sanity check the extracted fixtures. +# At minimum, a handful of expected tables must be present and non-empty. +verify_c_fixtures() { + local fixture_dir=$1 + local required=(store_sales.dat catalog_sales.dat web_sales.dat reason.dat call_center.dat) + + if [[ ! -d "$fixture_dir" ]]; then + log_error "Fixture directory does not exist: $fixture_dir" + return 1 + fi + + for f in "${required[@]}"; do + if [[ ! -s "$fixture_dir/$f" ]]; then + log_error "Missing or empty fixture: $fixture_dir/$f" + return 1 + fi + done + + local count + count=$(find "$fixture_dir" -maxdepth 1 -name "*.dat" -type f | wc -l | tr -d ' ') + log_success "Found $count .dat fixtures in $fixture_dir" + return 0 +} + +download_and_extract_c() { + local branch=$1 + local fixture_dir=$2 + local clone_dir + clone_dir=$(mktemp -d -t tpcds-data-XXXXXX) + + # Cleanup helper. Called both on the success and failure paths below + # rather than via `trap RETURN`, which under `set -u` causes the trap + # to fire from later functions (e.g. `main`) where `$clone_dir` is no + # longer in scope. + _cleanup_clone_dir() { + if [[ -n "${clone_dir:-}" && -d "$clone_dir" ]]; then + rm -rf "$clone_dir" + fi + } + + log_info "Cloning $TPCDS_DATA_REPO branch '$branch' (depth 1) ..." + if ! git clone --depth 1 --single-branch --branch "$branch" \ + "$TPCDS_DATA_REPO" "$clone_dir/tpcds-data"; then + log_error "Failed to clone $TPCDS_DATA_REPO branch '$branch'" + log_error "Confirm the branch exists (sf1, sf2, ...)" + _cleanup_clone_dir + return 1 + fi + + if ! ls "$clone_dir/tpcds-data"/data.tar.bz2.* >/dev/null 2>&1; then + log_error "No data.tar.bz2.* parts found in cloned branch '$branch'" + _cleanup_clone_dir + return 1 + fi + + log_info "Extracting reference data into $fixture_dir ..." 
+ mkdir -p "$fixture_dir" + + # The archive expands as data/
<table>.dat. Extract into a temp dir, + # then flatten one level so the result is fixture_dir/
<table>.dat. + local extract_dir="$clone_dir/extract" + mkdir -p "$extract_dir" + if ! cat "$clone_dir/tpcds-data"/data.tar.bz2.* | bzip2 -d | tar -x -C "$extract_dir"; then + log_error "Failed to extract data.tar.bz2.* parts" + _cleanup_clone_dir + return 1 + fi + + if [[ ! -d "$extract_dir/data" ]]; then + log_error "Unexpected archive layout: $extract_dir/data not found" + _cleanup_clone_dir + return 1 + fi + + mv "$extract_dir/data"/*.dat "$fixture_dir/" + _cleanup_clone_dir + return 0 +} + +generate_c_fixtures() { + local fixture_dir=$1 + local branch="sf${SCALE_FACTOR}" + + log_info "=========================================" + log_info "C dsdgen Reference Data Bootstrap" + log_info "=========================================" + log_info "Repository: $TPCDS_DATA_REPO" + log_info "Branch: $branch" + log_info "Fixture directory: $fixture_dir" + log_info "=========================================" + + if ! check_c_prerequisites; then + exit 1 + fi + + if [[ $VERIFY_ONLY -eq 1 ]]; then + if verify_c_fixtures "$fixture_dir"; then + exit 0 + else + exit 1 + fi + fi + + # Skip download if fixtures already look complete. + if [[ $FORCE_REBUILD -eq 0 ]] && verify_c_fixtures "$fixture_dir" >/dev/null 2>&1; then + log_success "C reference fixtures already present at $fixture_dir" + log_info "Use --rebuild to force re-download" + exit 0 + fi + + if [[ $FORCE_REBUILD -eq 1 && -d "$fixture_dir" ]]; then + log_info "Removing existing fixture directory: $fixture_dir" + rm -rf "$fixture_dir" + fi + + local start_time end_time + start_time=$(date +%s) + if ! download_and_extract_c "$branch" "$fixture_dir"; then + exit 1 + fi + end_time=$(date +%s) + + if ! 
verify_c_fixtures "$fixture_dir"; then + log_error "Bootstrap completed but verification failed" + exit 1 + fi + + echo "" + log_info "=========================================" + log_success "C dsdgen reference data ready" + log_info "Time: $((end_time - start_time))s" + log_info "=========================================" +} + +# ----------------------------------------------------------------------------- +# main +# ----------------------------------------------------------------------------- + +main() { + local tables_to_generate=() + + while [[ $# -gt 0 ]]; do + case $1 in + --compat) + COMPAT="$2" + shift 2 + ;; + --scale) + SCALE_FACTOR="$2" + shift 2 + ;; + --quiet) + QUIET=1 + shift + ;; + --rebuild) + FORCE_REBUILD=1 + shift + ;; + --verify) + VERIFY_ONLY=1 + shift + ;; + --help) + print_usage + ;; + *) + tables_to_generate+=("$1") + shift + ;; + esac + done + + case $COMPAT in + trino|c) ;; + *) + log_error "Unknown --compat value: $COMPAT (expected: trino, c)" + exit 1 + ;; + esac + + if [[ "$COMPAT" == "c" && ${#tables_to_generate[@]} -gt 0 ]]; then + log_error "Per-table selection is not supported with --compat c" + log_error "The published archive bundles all 25 tables together." 
+ exit 1 + fi + + if [[ "$COMPAT" == "trino" && ( $FORCE_REBUILD -eq 1 || $VERIFY_ONLY -eq 1 ) ]]; then + log_error "--rebuild and --verify are only valid with --compat c" + exit 1 + fi + + case $COMPAT in + trino) + local fixture_dir="$PROJECT_ROOT/tests/fixtures/scale-${SCALE_FACTOR}-java" + if [[ ${#tables_to_generate[@]} -eq 0 ]]; then + tables_to_generate=("${ALL_TABLES[@]}") + fi + generate_trino_fixtures "$fixture_dir" "${tables_to_generate[@]}" + ;; + c) + local fixture_dir="$PROJECT_ROOT/tests/fixtures/scale-${SCALE_FACTOR}-c" + generate_c_fixtures "$fixture_dir" + ;; + esac +} + main "$@" diff --git a/tpcdsgen/scripts/test-all-tables.sh b/tpcdsgen/scripts/test-all-tables.sh index 9495148..091d503 100755 --- a/tpcdsgen/scripts/test-all-tables.sh +++ b/tpcdsgen/scripts/test-all-tables.sh @@ -1,16 +1,61 @@ #!/usr/bin/env bash # -# Test all ported Rust tables against Java reference fixtures +# test-all-tables.sh — Run the full conformance suite for one compat +# mode, byte-for-byte (MD5 + diff) comparing Rust output against +# reference fixtures. Main entry point used by CI. # -# Usage: -# ./scripts/test-all-tables.sh [--quiet] -# -# Exit codes: -# 0 - All tables match -# 1 - One or more tables differ +# Please see print_usage() below for details. set -euo pipefail +print_usage() { + cat << 'EOF' +test-all-tables.sh — Run the full conformance suite for one compat mode. + +Iterates all 24 TPC-DS tables (dbgen_version is always excluded because +it contains a generation timestamp), builds the Rust generator in release +mode, delegates each per-table comparison to ./scripts/compare-table.sh, +and prints a pass/fail summary. Exits non-zero if any table differs. 
+ +Two reference implementations are supported, selected by --compat: + --compat trino (default) Java / Trino fixtures in + tests/fixtures/scale-N-java/ + (generate with + ./scripts/generate-fixtures.sh) + --compat c C dsdgen fixtures in + tests/fixtures/scale-N-c/ + (download with + ./scripts/generate-fixtures.sh --compat c) + +Per-compat skip lists live near the top of the script. As of this +writing, --compat c additionally skips `customer` until +alamb/tpcds-data is regenerated without the iconv ISO-8859-14 -> UTF-8 +step that double-encodes non-ASCII country names. + +Usage: + test-all-tables.sh [OPTIONS] + +Options: + --scale N Scale factor (default: 1). + --compat trino|c Reference implementation (default: trino). + --quiet Quiet mode (show only summary). + --help Show this help message. + +Examples: + test-all-tables.sh # All tables at scale 1 vs Java. + test-all-tables.sh --scale 10 # All tables at scale 10 vs Java. + test-all-tables.sh --compat c # All tables at scale 1 vs C dsdgen. + test-all-tables.sh --quiet # Summary-only output. + +Exit codes: + 0 - All tested tables match. + 1 - One or more tables differ. + +See scripts/README.md for the full conformance-testing workflow. +EOF + exit 0 +} + # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' @@ -24,6 +69,7 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" # Configuration (can be overridden by --scale) SCALE_FACTOR=${TPCDS_SCALE:-1} +COMPAT=${TPCDS_COMPAT:-trino} QUIET=0 # Logging functions @@ -45,31 +91,6 @@ log_warn() { echo -e "${YELLOW}[WARN]${NC} $*" } -# Print usage -usage() { - cat << EOF -Test all ported Rust tables against Java reference fixtures - -Usage: - $(basename "$0") [--scale N] [--quiet] - -Options: - --scale N Scale factor (default: 1) - --quiet Quiet mode (show only summary) - -Examples: - $(basename "$0") # Test all tables at scale 1 - $(basename "$0") --scale 10 # Test all tables at scale 10 - $(basename "$0") --quiet # Test all tables (quiet) - -Exit codes: - 0 - All tables match exactly - 1 - One or more tables differ - -EOF - exit 0 -} - # All TPC-DS tables to test (24 tables - excludes dbgen_version which has timestamps) # Note: dbgen_version is excluded because it contains timestamps that will never match ALL_TABLES=( @@ -99,9 +120,38 @@ ALL_TABLES=( "web_site" ) -# Get list of tables to test +# Tables to skip per compat mode (in addition to dbgen_version, which is +# always skipped because it contains a generation timestamp). +# +# --compat c: customer.dat is skipped because the reference data in +# https://github.com/alamb/tpcds-data was generated through a pipeline that +# accidentally double-UTF-8-encodes the non-ASCII country names (`CÔTE +# D'IVOIRE`, `RÉUNION`). The Rust --compat c output uses raw Latin-1, which +# is what unmodified C dsdgen produces. Once the reference data is +# regenerated without the iconv ISO-8859-14 -> UTF-8 step, this entry can +# be removed. +# TODO(alamb): re-include customer once alamb/tpcds-data has been regenerated. +C_COMPAT_SKIP_TABLES=("customer") + +# Get list of tables to test, applying per-compat skip lists. 
get_tables_to_test() { - echo "${ALL_TABLES[@]}" + local skip_list=() + if [[ "$COMPAT" == "c" ]]; then + skip_list=("${C_COMPAT_SKIP_TABLES[@]}") + fi + + local result=() + for t in "${ALL_TABLES[@]}"; do + local skip=0 + for s in "${skip_list[@]:-}"; do + if [[ "$t" == "$s" ]]; then + skip=1 + break + fi + done + [[ $skip -eq 0 ]] && result+=("$t") + done + echo "${result[@]}" } # Build the unified Rust table generator @@ -123,9 +173,9 @@ test_table() { local compare_script="$SCRIPT_DIR/compare-table.sh" if [[ $QUIET -eq 1 ]]; then - "$compare_script" "$table" --scale "$SCALE_FACTOR" --quiet + "$compare_script" "$table" --scale "$SCALE_FACTOR" --compat "$COMPAT" --quiet else - "$compare_script" "$table" --scale "$SCALE_FACTOR" + "$compare_script" "$table" --scale "$SCALE_FACTOR" --compat "$COMPAT" fi } @@ -143,23 +193,36 @@ main() { SCALE_FACTOR="$2" shift 2 ;; + --compat) + COMPAT="$2" + shift 2 + ;; --quiet) QUIET=1 shift ;; --help) - usage + print_usage ;; *) log_error "Unknown option: $1" - usage + print_usage ;; esac done + case $COMPAT in + trino|c) ;; + *) + log_error "Unknown --compat value: $COMPAT (expected: trino, c)" + exit 1 + ;; + esac + log_info "=========================================" log_info "TPC-DS Table Test Suite" log_info "Scale Factor: $SCALE_FACTOR" + log_info "Compat Mode: $COMPAT" log_info "=========================================" # Get tables to test diff --git a/tpcdsgen/scripts/tpcds_schemas.py b/tpcdsgen/scripts/tpcds_schemas.py deleted file mode 100644 index dec1aa2..0000000 --- a/tpcdsgen/scripts/tpcds_schemas.py +++ /dev/null @@ -1,506 +0,0 @@ -"""TPC-DS table schemas as PyArrow fields. - -Extracted from apache/datafusion-benchmarks (Apache-2.0). -Used by compare-c.py to parse .dat files and read Parquet reference data. 
-""" - -import pyarrow - -all_schemas = {} - -all_schemas['customer_address'] = [ - pyarrow.field("ca_address_sk", pyarrow.int32()), - pyarrow.field("ca_address_id", pyarrow.string()), - pyarrow.field("ca_street_number", pyarrow.string()), - pyarrow.field("ca_street_name", pyarrow.string()), - pyarrow.field("ca_street_type", pyarrow.string()), - pyarrow.field("ca_suite_number", pyarrow.string()), - pyarrow.field("ca_city", pyarrow.string()), - pyarrow.field("ca_county", pyarrow.string()), - pyarrow.field("ca_state", pyarrow.string()), - pyarrow.field("ca_zip", pyarrow.string()), - pyarrow.field("ca_country", pyarrow.string()), - pyarrow.field("ca_gmt_offset", pyarrow.decimal128(5, 2)), - pyarrow.field("ca_location_type", pyarrow.string()) -] - -all_schemas['customer_demographics'] = [ - pyarrow.field("cd_demo_sk", pyarrow.int32()), - pyarrow.field("cd_gender", pyarrow.string()), - pyarrow.field("cd_marital_status", pyarrow.string()), - pyarrow.field("cd_education_status", pyarrow.string()), - pyarrow.field("cd_purchase_estimate", pyarrow.int32()), - pyarrow.field("cd_credit_rating", pyarrow.string()), - pyarrow.field("cd_dep_count", pyarrow.int32()), - pyarrow.field("cd_dep_employed_count", pyarrow.int32()), - pyarrow.field("cd_dep_college_count", pyarrow.int32()) -] - -all_schemas['date_dim'] = [ - pyarrow.field("d_date_sk", pyarrow.int32()), - pyarrow.field("d_date_id", pyarrow.string()), - pyarrow.field("d_date", pyarrow.date32()), - pyarrow.field("d_month_seq", pyarrow.int32()), - pyarrow.field("d_week_seq", pyarrow.int32()), - pyarrow.field("d_quarter_seq", pyarrow.int32()), - pyarrow.field("d_year", pyarrow.int32()), - pyarrow.field("d_dow", pyarrow.int32()), - pyarrow.field("d_moy", pyarrow.int32()), - pyarrow.field("d_dom", pyarrow.int32()), - pyarrow.field("d_qoy", pyarrow.int32()), - pyarrow.field("d_fy_year", pyarrow.int32()), - pyarrow.field("d_fy_quarter_seq", pyarrow.int32()), - pyarrow.field("d_fy_week_seq", pyarrow.int32()), - 
pyarrow.field("d_day_name", pyarrow.string()), - pyarrow.field("d_quarter_name", pyarrow.string()), - pyarrow.field("d_holiday", pyarrow.string()), - pyarrow.field("d_weekend", pyarrow.string()), - pyarrow.field("d_following_holiday", pyarrow.string()), - pyarrow.field("d_first_dom", pyarrow.int32()), - pyarrow.field("d_last_dom", pyarrow.int32()), - pyarrow.field("d_same_day_ly", pyarrow.int32()), - pyarrow.field("d_same_day_lq", pyarrow.int32()), - pyarrow.field("d_current_day", pyarrow.string()), - pyarrow.field("d_current_week", pyarrow.string()), - pyarrow.field("d_current_month", pyarrow.string()), - pyarrow.field("d_current_quarter", pyarrow.string()), - pyarrow.field("d_current_year", pyarrow.string()), -] - -all_schemas["warehouse"] = [ - pyarrow.field("w_warehouse_sk", pyarrow.int32()), - pyarrow.field("w_warehouse_id", pyarrow.string()), - pyarrow.field("w_warehouse_name", pyarrow.string()), - pyarrow.field("w_warehouse_sq_ft", pyarrow.int32()), - pyarrow.field("w_street_number", pyarrow.string()), - pyarrow.field("w_street_name", pyarrow.string()), - pyarrow.field("w_street_type", pyarrow.string()), - pyarrow.field("w_suite_number", pyarrow.string()), - pyarrow.field("w_city", pyarrow.string()), - pyarrow.field("w_county", pyarrow.string()), - pyarrow.field("w_state", pyarrow.string()), - pyarrow.field("w_zip", pyarrow.string()), - pyarrow.field("w_country", pyarrow.string()), - pyarrow.field("w_gmt_offset", pyarrow.decimal128(5, 2)), -] - -all_schemas["ship_mode"] = [ - pyarrow.field("sm_ship_mode_sk", pyarrow.int32()), - pyarrow.field("sm_ship_mode_id", pyarrow.string()), - pyarrow.field("sm_type", pyarrow.string()), - pyarrow.field("sm_code", pyarrow.string()), - pyarrow.field("sm_carrier", pyarrow.string()), - pyarrow.field("sm_contract", pyarrow.string()), -] - -all_schemas["time_dim"] = [ - pyarrow.field("t_time_sk", pyarrow.int32()), - pyarrow.field("t_time_id", pyarrow.string()), - pyarrow.field("t_time", pyarrow.int32()), - 
pyarrow.field("t_hour", pyarrow.int32()), - pyarrow.field("t_minute", pyarrow.int32()), - pyarrow.field("t_second", pyarrow.int32()), - pyarrow.field("t_am_pm", pyarrow.string()), - pyarrow.field("t_shift", pyarrow.string()), - pyarrow.field("t_sub_shift", pyarrow.string()), - pyarrow.field("t_meal_time", pyarrow.string()), -] - -all_schemas["reason"] = [ - pyarrow.field("r_reason_sk", pyarrow.int32()), - pyarrow.field("r_reason_id", pyarrow.string()), - pyarrow.field("r_reason_desc", pyarrow.string()), -] - -all_schemas["income_band"] = [ - pyarrow.field("ib_income_band_sk", pyarrow.int32()), - pyarrow.field("ib_lower_bound", pyarrow.int32()), - pyarrow.field("ib_upper_bound", pyarrow.int32()), -] - -all_schemas["item"] = [ - pyarrow.field("i_item_sk", pyarrow.int32()), - pyarrow.field("i_item_id", pyarrow.string()), - pyarrow.field("i_rec_start_date", pyarrow.date32()), - pyarrow.field("i_rec_end_date", pyarrow.date32()), - pyarrow.field("i_item_desc", pyarrow.string()), - pyarrow.field("i_current_price", pyarrow.decimal128(7, 2)), - pyarrow.field("i_wholesale_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("i_brand_id", pyarrow.int32()), - pyarrow.field("i_brand", pyarrow.string()), - pyarrow.field("i_class_id", pyarrow.int32()), - pyarrow.field("i_class", pyarrow.string()), - pyarrow.field("i_category_id", pyarrow.int32()), - pyarrow.field("i_category", pyarrow.string()), - pyarrow.field("i_manufact_id", pyarrow.int32()), - pyarrow.field("i_manufact", pyarrow.string()), - pyarrow.field("i_size", pyarrow.string()), - pyarrow.field("i_formulation", pyarrow.string()), - pyarrow.field("i_color", pyarrow.string()), - pyarrow.field("i_units", pyarrow.string()), - pyarrow.field("i_container", pyarrow.string()), - pyarrow.field("i_manager_id", pyarrow.int32()), - pyarrow.field("i_product_name", pyarrow.string()), -] - -all_schemas["store"] = [ - pyarrow.field("s_store_sk", pyarrow.int32()), - pyarrow.field("s_store_id", pyarrow.string()), - 
pyarrow.field("s_rec_start_date", pyarrow.date32()), - pyarrow.field("s_rec_end_date", pyarrow.date32()), - pyarrow.field("s_closed_date_sk", pyarrow.int32()), - pyarrow.field("s_store_name", pyarrow.string()), - pyarrow.field("s_number_employees", pyarrow.int32()), - pyarrow.field("s_floor_space", pyarrow.int32()), - pyarrow.field("s_hours", pyarrow.string()), - pyarrow.field("s_manager", pyarrow.string()), - pyarrow.field("s_market_id", pyarrow.int32()), - pyarrow.field("s_geography_class", pyarrow.string()), - pyarrow.field("s_market_desc", pyarrow.string()), - pyarrow.field("s_market_manager", pyarrow.string()), - pyarrow.field("s_division_id", pyarrow.int32()), - pyarrow.field("s_division_name", pyarrow.string()), - pyarrow.field("s_company_id", pyarrow.int32()), - pyarrow.field("s_company_name", pyarrow.string()), - pyarrow.field("s_street_number", pyarrow.string()), - pyarrow.field("s_street_name", pyarrow.string()), - pyarrow.field("s_street_type", pyarrow.string()), - pyarrow.field("s_suite_number", pyarrow.string()), - pyarrow.field("s_city", pyarrow.string()), - pyarrow.field("s_county", pyarrow.string()), - pyarrow.field("s_state", pyarrow.string()), - pyarrow.field("s_zip", pyarrow.string()), - pyarrow.field("s_country", pyarrow.string()), - pyarrow.field("s_gmt_offset", pyarrow.decimal128(5, 2)), - pyarrow.field("s_tax_precentage", pyarrow.decimal128(5, 2)), -] - -all_schemas["call_center"] = [ - pyarrow.field("cc_call_center_sk", pyarrow.int32()), - pyarrow.field("cc_call_center_id", pyarrow.string()), - pyarrow.field("cc_rec_start_date", pyarrow.date32()), - pyarrow.field("cc_rec_end_date", pyarrow.date32()), - pyarrow.field("cc_closed_date_sk", pyarrow.int32()), - pyarrow.field("cc_open_date_sk", pyarrow.int32()), - pyarrow.field("cc_name", pyarrow.string()), - pyarrow.field("cc_class", pyarrow.string()), - pyarrow.field("cc_employees", pyarrow.int32()), - pyarrow.field("cc_sq_ft", pyarrow.int32()), - pyarrow.field("cc_hours", pyarrow.string()), - 
pyarrow.field("cc_manager", pyarrow.string()), - pyarrow.field("cc_mkt_id", pyarrow.int32()), - pyarrow.field("cc_mkt_class", pyarrow.string()), - pyarrow.field("cc_mkt_desc", pyarrow.string()), - pyarrow.field("cc_market_manager", pyarrow.string()), - pyarrow.field("cc_division", pyarrow.int32()), - pyarrow.field("cc_division_name", pyarrow.string()), - pyarrow.field("cc_company", pyarrow.int32()), - pyarrow.field("cc_company_name", pyarrow.string()), - pyarrow.field("cc_street_number", pyarrow.string()), - pyarrow.field("cc_street_name", pyarrow.string()), - pyarrow.field("cc_street_type", pyarrow.string()), - pyarrow.field("cc_suite_number", pyarrow.string()), - pyarrow.field("cc_city", pyarrow.string()), - pyarrow.field("cc_county", pyarrow.string()), - pyarrow.field("cc_state", pyarrow.string()), - pyarrow.field("cc_zip", pyarrow.string()), - pyarrow.field("cc_country", pyarrow.string()), - pyarrow.field("cc_gmt_offset", pyarrow.decimal128(5, 2)), - pyarrow.field("cc_tax_percentage", pyarrow.decimal128(5, 2)), -] - -all_schemas["customer"] = [ - pyarrow.field("c_customer_sk", pyarrow.int32()), - pyarrow.field("c_customer_id", pyarrow.string()), - pyarrow.field("c_current_cdemo_sk", pyarrow.int32()), - pyarrow.field("c_current_hdemo_sk", pyarrow.int32()), - pyarrow.field("c_current_addr_sk", pyarrow.int32()), - pyarrow.field("c_first_shipto_date_sk", pyarrow.int32()), - pyarrow.field("c_first_sales_date_sk", pyarrow.int32()), - pyarrow.field("c_salutation", pyarrow.string()), - pyarrow.field("c_first_name", pyarrow.string()), - pyarrow.field("c_last_name", pyarrow.string()), - pyarrow.field("c_preferred_cust_flag", pyarrow.string()), - pyarrow.field("c_birth_day", pyarrow.int32()), - pyarrow.field("c_birth_month", pyarrow.int32()), - pyarrow.field("c_birth_year", pyarrow.int32()), - pyarrow.field("c_birth_country", pyarrow.string()), - pyarrow.field("c_login", pyarrow.string()), - pyarrow.field("c_email_address", pyarrow.string()), - 
pyarrow.field("c_last_review_date_sk", pyarrow.string()), -] - -all_schemas["web_site"] = [ - pyarrow.field("web_site_sk", pyarrow.int32()), - pyarrow.field("web_site_id", pyarrow.string()), - pyarrow.field("web_rec_start_date", pyarrow.date32()), - pyarrow.field("web_rec_end_date", pyarrow.date32()), - pyarrow.field("web_name", pyarrow.string()), - pyarrow.field("web_open_date_sk", pyarrow.int32()), - pyarrow.field("web_close_date_sk", pyarrow.int32()), - pyarrow.field("web_class", pyarrow.string()), - pyarrow.field("web_manager", pyarrow.string()), - pyarrow.field("web_mkt_id", pyarrow.int32()), - pyarrow.field("web_mkt_class", pyarrow.string()), - pyarrow.field("web_mkt_desc", pyarrow.string()), - pyarrow.field("web_market_manager", pyarrow.string()), - pyarrow.field("web_company_id", pyarrow.int32()), - pyarrow.field("web_company_name", pyarrow.string()), - pyarrow.field("web_street_number", pyarrow.string()), - pyarrow.field("web_street_name", pyarrow.string()), - pyarrow.field("web_street_type", pyarrow.string()), - pyarrow.field("web_suite_number", pyarrow.string()), - pyarrow.field("web_city", pyarrow.string()), - pyarrow.field("web_county", pyarrow.string()), - pyarrow.field("web_state", pyarrow.string()), - pyarrow.field("web_zip", pyarrow.string()), - pyarrow.field("web_country", pyarrow.string()), - pyarrow.field("web_gmt_offset", pyarrow.decimal128(5, 2)), - pyarrow.field("web_tax_percentage", pyarrow.decimal128(5, 2)), -] - -all_schemas["store_returns"] = [ - pyarrow.field("sr_returned_date_sk", pyarrow.int32()), - pyarrow.field("sr_return_time_sk", pyarrow.int32()), - pyarrow.field("sr_item_sk", pyarrow.int32()), - pyarrow.field("sr_customer_sk", pyarrow.int32()), - pyarrow.field("sr_cdemo_sk", pyarrow.int32()), - pyarrow.field("sr_hdemo_sk", pyarrow.int32()), - pyarrow.field("sr_addr_sk", pyarrow.int32()), - pyarrow.field("sr_store_sk", pyarrow.int32()), - pyarrow.field("sr_reason_sk", pyarrow.int32()), - pyarrow.field("sr_ticket_number", 
pyarrow.int32()), - pyarrow.field("sr_return_quantity", pyarrow.int32()), - pyarrow.field("sr_return_amt", pyarrow.decimal128(7, 2)), - pyarrow.field("sr_return_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("sr_return_amt_inc_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("sr_fee", pyarrow.decimal128(7, 2)), - pyarrow.field("sr_return_ship_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("sr_refunded_cash", pyarrow.decimal128(7, 2)), - pyarrow.field("sr_reversed_charge", pyarrow.decimal128(7, 2)), - pyarrow.field("sr_store_credit", pyarrow.decimal128(7, 2)), - pyarrow.field("sr_net_loss", pyarrow.decimal128(7, 2)), -] - -all_schemas["household_demographics"] = [ - pyarrow.field("hd_demo_sk", pyarrow.int32()), - pyarrow.field("hd_income_band_sk", pyarrow.int32()), - pyarrow.field("hd_buy_potential", pyarrow.string()), - pyarrow.field("hd_dep_count", pyarrow.int32()), - pyarrow.field("hd_vehicle_count", pyarrow.int32()), -] - -all_schemas["web_page"] = [ - pyarrow.field("wp_web_page_sk", pyarrow.int32()), - pyarrow.field("wp_web_page_id", pyarrow.string()), - pyarrow.field("wp_rec_start_date", pyarrow.date32()), - pyarrow.field("wp_rec_end_date", pyarrow.date32()), - pyarrow.field("wp_creation_date_sk", pyarrow.int32()), - pyarrow.field("wp_access_date_sk", pyarrow.int32()), - pyarrow.field("wp_autogen_flag", pyarrow.string()), - pyarrow.field("wp_customer_sk", pyarrow.int32()), - pyarrow.field("wp_url", pyarrow.string()), - pyarrow.field("wp_type", pyarrow.string()), - pyarrow.field("wp_char_count", pyarrow.int32()), - pyarrow.field("wp_link_count", pyarrow.int32()), - pyarrow.field("wp_image_count", pyarrow.int32()), - pyarrow.field("wp_max_ad_count", pyarrow.int32()), -] - -all_schemas["promotion"] = [ - pyarrow.field("p_promo_sk", pyarrow.int32()), - pyarrow.field("p_promo_id", pyarrow.string()), - pyarrow.field("p_start_date_sk", pyarrow.int32()), - pyarrow.field("p_end_date_sk", pyarrow.int32()), - pyarrow.field("p_item_sk", pyarrow.int32()), - 
pyarrow.field("p_cost", pyarrow.decimal128(15, 2)), - pyarrow.field("p_response_target", pyarrow.int32()), - pyarrow.field("p_promo_name", pyarrow.string()), - pyarrow.field("p_channel_dmail", pyarrow.string()), - pyarrow.field("p_channel_email", pyarrow.string()), - pyarrow.field("p_channel_catalog", pyarrow.string()), - pyarrow.field("p_channel_tv", pyarrow.string()), - pyarrow.field("p_channel_radio", pyarrow.string()), - pyarrow.field("p_channel_press", pyarrow.string()), - pyarrow.field("p_channel_event", pyarrow.string()), - pyarrow.field("p_channel_demo", pyarrow.string()), - pyarrow.field("p_channel_details", pyarrow.string()), - pyarrow.field("p_purpose", pyarrow.string()), - pyarrow.field("p_discount_active", pyarrow.string()), -] - -all_schemas["catalog_page"] = [ - pyarrow.field("cp_catalog_page_sk", pyarrow.int32()), - pyarrow.field("cp_catalog_page_id", pyarrow.string()), - pyarrow.field("cp_start_date_sk", pyarrow.int32()), - pyarrow.field("cp_end_date_sk", pyarrow.int32()), - pyarrow.field("cp_department", pyarrow.string()), - pyarrow.field("cp_catalog_number", pyarrow.int32()), - pyarrow.field("cp_catalog_page_number", pyarrow.int32()), - pyarrow.field("cp_description", pyarrow.string()), - pyarrow.field("cp_type", pyarrow.string()), -] - -all_schemas["inventory"] = [ - pyarrow.field("inv_date_sk", pyarrow.int32()), - pyarrow.field("inv_item_sk", pyarrow.int32()), - pyarrow.field("inv_warehouse_sk", pyarrow.int32()), - pyarrow.field("inv_quantity_on_hand", pyarrow.int32()), -] - -all_schemas["catalog_returns"] = [ - pyarrow.field("cr_returned_date_sk", pyarrow.int32()), - pyarrow.field("cr_returned_time_sk", pyarrow.int32()), - pyarrow.field("cr_item_sk", pyarrow.int32()), - pyarrow.field("cr_refunded_customer_sk", pyarrow.int32()), - pyarrow.field("cr_refunded_cdemo_sk", pyarrow.int32()), - pyarrow.field("cr_refunded_hdemo_sk", pyarrow.int32()), - pyarrow.field("cr_refunded_addr_sk", pyarrow.int32()), - pyarrow.field("cr_returning_customer_sk", 
pyarrow.int32()), - pyarrow.field("cr_returning_cdemo_sk", pyarrow.int32()), - pyarrow.field("cr_returning_hdemo_sk", pyarrow.int32()), - pyarrow.field("cr_returning_addr_sk", pyarrow.int32()), - pyarrow.field("cr_call_center_sk", pyarrow.int32()), - pyarrow.field("cr_catalog_page_sk", pyarrow.int32()), - pyarrow.field("cr_ship_mode_sk", pyarrow.int32()), - pyarrow.field("cr_warehouse_sk", pyarrow.int32()), - pyarrow.field("cr_reason_sk", pyarrow.int32()), - pyarrow.field("cr_order_number", pyarrow.int32()), - pyarrow.field("cr_return_quantity", pyarrow.int32()), - pyarrow.field("cr_return_amount", pyarrow.decimal128(7, 2)), - pyarrow.field("cr_return_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("cr_return_amt_inc_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("cr_fee", pyarrow.decimal128(7, 2)), - pyarrow.field("cr_return_ship_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("cr_refunded_cash", pyarrow.decimal128(7, 2)), - pyarrow.field("cr_reversed_charge", pyarrow.decimal128(7, 2)), - pyarrow.field("cr_store_credit", pyarrow.decimal128(7, 2)), - pyarrow.field("cr_net_loss", pyarrow.decimal128(7, 2)), -] - -all_schemas["web_returns"] = [ - pyarrow.field("wr_returned_date_sk", pyarrow.int32()), - pyarrow.field("wr_returned_time_sk", pyarrow.int32()), - pyarrow.field("wr_item_sk", pyarrow.int32()), - pyarrow.field("wr_refunded_customer_sk", pyarrow.int32()), - pyarrow.field("wr_refunded_cdemo_sk", pyarrow.int32()), - pyarrow.field("wr_refunded_hdemo_sk", pyarrow.int32()), - pyarrow.field("wr_refunded_addr_sk", pyarrow.int32()), - pyarrow.field("wr_returning_customer_sk", pyarrow.int32()), - pyarrow.field("wr_returning_cdemo_sk", pyarrow.int32()), - pyarrow.field("wr_returning_hdemo_sk", pyarrow.int32()), - pyarrow.field("wr_returning_addr_sk", pyarrow.int32()), - pyarrow.field("wr_web_page_sk", pyarrow.int32()), - pyarrow.field("wr_reason_sk", pyarrow.int32()), - pyarrow.field("wr_order_number", pyarrow.int32()), - pyarrow.field("wr_return_quantity", 
pyarrow.int32()), - pyarrow.field("wr_return_amt", pyarrow.decimal128(7, 2)), - pyarrow.field("wr_return_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("wr_return_amt_inc_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("wr_fee", pyarrow.decimal128(7, 2)), - pyarrow.field("wr_return_ship_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("wr_refunded_cash", pyarrow.decimal128(7, 2)), - pyarrow.field("wr_reversed_charge", pyarrow.decimal128(7, 2)), - pyarrow.field("wr_account_credit", pyarrow.decimal128(7, 2)), - pyarrow.field("wr_net_loss", pyarrow.decimal128(7, 2)), -] - -all_schemas["web_sales"] = [ - pyarrow.field("ws_sold_date_sk", pyarrow.int32()), - pyarrow.field("ws_sold_time_sk", pyarrow.int32()), - pyarrow.field("ws_ship_date_sk", pyarrow.int32()), - pyarrow.field("ws_item_sk", pyarrow.int32()), - pyarrow.field("ws_bill_customer_sk", pyarrow.int32()), - pyarrow.field("ws_bill_cdemo_sk", pyarrow.int32()), - pyarrow.field("ws_bill_hdemo_sk", pyarrow.int32()), - pyarrow.field("ws_bill_addr_sk", pyarrow.int32()), - pyarrow.field("ws_ship_customer_sk", pyarrow.int32()), - pyarrow.field("ws_ship_cdemo_sk", pyarrow.int32()), - pyarrow.field("ws_ship_hdemo_sk", pyarrow.int32()), - pyarrow.field("ws_ship_addr_sk", pyarrow.int32()), - pyarrow.field("ws_web_page_sk", pyarrow.int32()), - pyarrow.field("ws_web_site_sk", pyarrow.int32()), - pyarrow.field("ws_ship_mode_sk", pyarrow.int32()), - pyarrow.field("ws_warehouse_sk", pyarrow.int32()), - pyarrow.field("ws_promo_sk", pyarrow.int32()), - pyarrow.field("ws_order_number", pyarrow.int32()), - pyarrow.field("ws_quantity", pyarrow.int32()), - pyarrow.field("ws_wholesale_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_list_price", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_sales_price", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_ext_discount_amt", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_ext_sales_price", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_ext_wholesale_cost", pyarrow.decimal128(7, 2)), - 
pyarrow.field("ws_ext_list_price", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_ext_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_coupon_amt", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_ext_ship_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_net_paid", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_net_paid_inc_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_net_paid_inc_ship", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_net_paid_inc_ship_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("ws_net_profit", pyarrow.decimal128(7, 2)), -] - -all_schemas["catalog_sales"] = [ - pyarrow.field("cs_sold_date_sk", pyarrow.int32()), - pyarrow.field("cs_sold_time_sk", pyarrow.int32()), - pyarrow.field("cs_ship_date_sk", pyarrow.int32()), - pyarrow.field("cs_bill_customer_sk", pyarrow.int32()), - pyarrow.field("cs_bill_cdemo_sk", pyarrow.int32()), - pyarrow.field("cs_bill_hdemo_sk", pyarrow.int32()), - pyarrow.field("cs_bill_addr_sk", pyarrow.int32()), - pyarrow.field("cs_ship_customer_sk", pyarrow.int32()), - pyarrow.field("cs_ship_cdemo_sk", pyarrow.int32()), - pyarrow.field("cs_ship_hdemo_sk", pyarrow.int32()), - pyarrow.field("cs_ship_addr_sk", pyarrow.int32()), - pyarrow.field("cs_call_center_sk", pyarrow.int32()), - pyarrow.field("cs_catalog_page_sk", pyarrow.int32()), - pyarrow.field("cs_ship_mode_sk", pyarrow.int32()), - pyarrow.field("cs_warehouse_sk", pyarrow.int32()), - pyarrow.field("cs_item_sk", pyarrow.int32()), - pyarrow.field("cs_promo_sk", pyarrow.int32()), - pyarrow.field("cs_order_number", pyarrow.int32()), - pyarrow.field("cs_quantity", pyarrow.int32()), - pyarrow.field("cs_wholesale_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_list_price", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_sales_price", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_ext_discount_amt", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_ext_sales_price", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_ext_wholesale_cost", pyarrow.decimal128(7, 2)), - 
pyarrow.field("cs_ext_list_price", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_ext_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_coupon_amt", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_ext_ship_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_net_paid", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_net_paid_inc_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_net_paid_inc_ship", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_net_paid_inc_ship_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("cs_net_profit", pyarrow.decimal128(7, 2)), -] - -all_schemas["store_sales"] = [ - pyarrow.field("ss_sold_date_sk", pyarrow.int32()), - pyarrow.field("ss_sold_time_sk", pyarrow.int32()), - pyarrow.field("ss_item_sk", pyarrow.int32()), - pyarrow.field("ss_customer_sk", pyarrow.int32()), - pyarrow.field("ss_cdemo_sk", pyarrow.int32()), - pyarrow.field("ss_hdemo_sk", pyarrow.int32()), - pyarrow.field("ss_addr_sk", pyarrow.int32()), - pyarrow.field("ss_store_sk", pyarrow.int32()), - pyarrow.field("ss_promo_sk", pyarrow.int32()), - pyarrow.field("ss_ticket_number", pyarrow.int32()), - pyarrow.field("ss_quantity", pyarrow.int32()), - pyarrow.field("ss_wholesale_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_list_price", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_sales_price", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_ext_discount_amt", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_ext_sales_price", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_ext_wholesale_cost", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_ext_list_price", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_ext_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_coupon_amt", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_net_paid", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_net_paid_inc_tax", pyarrow.decimal128(7, 2)), - pyarrow.field("ss_net_profit", pyarrow.decimal128(7, 2)), -] diff --git a/tpcdsgen/uv.lock b/tpcdsgen/uv.lock deleted file mode 100644 index 
0f3a8e7..0000000 --- a/tpcdsgen/uv.lock +++ /dev/null @@ -1,56 +0,0 @@ -version = 1 -revision = 3 -requires-python = ">=3.14" - -[[package]] -name = "datafusion" -version = "53.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyarrow" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/2b/0f96f12b70839c93930c4e17d767fc32b6c77d548c78784128049e944701/datafusion-53.0.0.tar.gz", hash = "sha256:ba9a5ec06b5453fbd8710d6aeeb515a8bcac4b6c140e254409bb53a5f322ef22", size = 224267, upload-time = "2026-04-13T00:45:02.686Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/af/4c/60e052813d81f1ffe3123ead013dbdd2cf961daa576cb9056cbb80228e6b/datafusion-53.0.0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a0bd1a98d736571321416dc4ed361a9d1225da1ec9f6c5fad818d75f547697a7", size = 35774913, upload-time = "2026-04-13T00:44:46.235Z" }, - { url = "https://files.pythonhosted.org/packages/6e/59/beabe5301df3338d8206446cd624079e43bdad46e20377a6336017fb6ccf/datafusion-53.0.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ce186a8d2405afd67e11e2fb75715019f16b00d070b8d0da89d8aa61cc74c8b5", size = 32667118, upload-time = "2026-04-13T00:44:50.269Z" }, - { url = "https://files.pythonhosted.org/packages/ae/94/636ab61ade98395daea6e733e225e9c7beef111c7c5b575ac851513e203c/datafusion-53.0.0-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:288a00a7ef03e2807a4667683f7560efd80d60ed1d41696ac15ca9ded14c8251", size = 35585824, upload-time = "2026-04-13T00:44:53.683Z" }, - { url = "https://files.pythonhosted.org/packages/34/80/b9f4889209af02f8d14bccb0e6f0519c329b072bc4d2595025a1303f144c/datafusion-53.0.0-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:8fef0004f0161fcfc556c025a7201f9cc3169aa3adb97a86419ebb34182d9efb", size = 38083690, upload-time = "2026-04-13T00:44:57.188Z" }, - { url = 
"https://files.pythonhosted.org/packages/4b/1a/ea4831fc6aeefedbcf186c9f6a273d507b1787c03cbb905bded7e1149a6a/datafusion-53.0.0-cp310-abi3-win_amd64.whl", hash = "sha256:4c8410f5f659b926677be6c7d443bbc05d825c078c970b7d8cf977ebcf948314", size = 38120687, upload-time = "2026-04-13T00:45:00.633Z" }, -] - -[[package]] -name = "pyarrow" -version = "24.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ad/80/d022a34ff05d2cbedd8ccf841fc1f532ecfa9eb5ed1711b56d0e0ea71fc9/pyarrow-24.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:1cc9057f0319e26333b357e17f3c2c022f1a83739b48a88b25bfd5fa2dc18838", size = 35007997, upload-time = "2026-04-21T10:49:48.796Z" }, - { url = "https://files.pythonhosted.org/packages/1a/ff/f01485fda6f4e5d441afb8dd5e7681e4db18826c1e271852f5d3957d6a80/pyarrow-24.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e6f1278ee4785b6db21229374a1c9e54ec7c549de5d1efc9630b6207de7e170b", size = 36678720, upload-time = "2026-04-21T10:49:55.858Z" }, - { url = "https://files.pythonhosted.org/packages/9e/c2/2d2d5fea814237923f71b36495211f20b43a1576f9a4d6da7e751a64ec6f/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:adbbedc55506cbdabb830890444fb856bfb0060c46c6f8026c6c2f2cf86ae795", size = 45741852, upload-time = "2026-04-21T10:50:04.624Z" }, - { url = "https://files.pythonhosted.org/packages/8e/3a/28ba9c1c1ebdbb5f1b94dfebb46f207e52e6a554b7fe4132540fde29a3a0/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ae8a1145af31d903fa9bb166824d7abe9b4681a000b0159c9fb99c11bc11ad26", size = 48889852, upload-time = "2026-04-21T10:50:12.293Z" }, - { url = 
"https://files.pythonhosted.org/packages/df/51/4a389acfd31dca009f8fb82d7f510bb4130f2b3a8e18cf00194d0687d8ac/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d7027eba1df3b2069e2e8d80f644fa0918b68c46432af3d088ddd390d063ecde", size = 49445207, upload-time = "2026-04-21T10:50:20.677Z" }, - { url = "https://files.pythonhosted.org/packages/19/4b/0bab2b23d2ae901b1b9a03c0efd4b2d070256f8ce3fc43f6e58c167b2081/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e56a1ffe9bf7b727432b89104cc0849c21582949dd7bdcb34f17b2001a351a76", size = 51954117, upload-time = "2026-04-21T10:50:29.14Z" }, - { url = "https://files.pythonhosted.org/packages/29/88/f4e9145da0417b3d2c12035a8492b35ff4a3dbc653e614fcfb51d9dedb38/pyarrow-24.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:38be1808cdd068605b787e6ca9119b27eb275a0234e50212c3492331680c3b1e", size = 28001155, upload-time = "2026-04-21T10:51:22.337Z" }, - { url = "https://files.pythonhosted.org/packages/79/4f/46a49a63f43526da895b1a45bbb51d5baf8e4d77159f8528fc3e5490007f/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:418e48ce50a45a6a6c73c454677203a9c75c966cb1e92ca3370959185f197a05", size = 35250387, upload-time = "2026-04-21T10:50:35.552Z" }, - { url = "https://files.pythonhosted.org/packages/a0/da/d5e0cd5ef00796922404806d5f00325cdadc3441ce2c13fe7115f2df9a64/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:2f16197705a230a78270cdd4ea8a1d57e86b2fdcbc34a1f6aebc72e65c986f9a", size = 36797102, upload-time = "2026-04-21T10:50:42.417Z" }, - { url = "https://files.pythonhosted.org/packages/34/c7/5904145b0a593a05236c882933d439b5720f0a145381179063722fbfc123/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fb24ac194bfc5e86839d7dcd52092ee31e5fe6733fe11f5e3b06ef0812b20072", size = 45745118, upload-time = "2026-04-21T10:50:49.324Z" }, - { url = 
"https://files.pythonhosted.org/packages/13/d3/cca42fe166d1c6e4d5b80e530b7949104d10e17508a90ae202dac205ce2a/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9700ebd9a51f5895ce75ff4ac4b3c47a7d4b42bc618be8e713e5d56bacf5f931", size = 48844765, upload-time = "2026-04-21T10:50:55.579Z" }, - { url = "https://files.pythonhosted.org/packages/b0/49/942c3b79878ba928324d1e17c274ed84581db8c0a749b24bcf4cbdf15bd3/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d8ddd2768da81d3ee08cfea9b597f4abb4e8e1dc8ae7e204b608d23a0d3ab699", size = 49471890, upload-time = "2026-04-21T10:51:02.439Z" }, - { url = "https://files.pythonhosted.org/packages/76/97/ff71431000a75d84135a1ace5ca4ba11726a231a8007bbb320a4c54075d5/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:61a3d7eaa97a14768b542f3d284dc6400dd2470d9f080708b13cd46b6ae18136", size = 51932250, upload-time = "2026-04-21T10:51:10.576Z" }, - { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282, upload-time = "2026-04-21T10:51:16.815Z" }, -] - -[[package]] -name = "tpcdsgen" -version = "0.1.0" -source = { virtual = "." } -dependencies = [ - { name = "datafusion" }, - { name = "pyarrow" }, -] - -[package.metadata] -requires-dist = [ - { name = "datafusion", specifier = ">=53.0.0" }, - { name = "pyarrow", specifier = ">=24.0.0" }, -]