diff --git a/tpcdsgen/scripts/README.md b/tpcdsgen/scripts/README.md index 320d2f9..eca0fae 100644 --- a/tpcdsgen/scripts/README.md +++ b/tpcdsgen/scripts/README.md @@ -79,20 +79,6 @@ extracts into `tests/fixtures/scale-N-c/`. ./scripts/compare-table.sh reason --compat c --full # byte-for-byte ``` -### Tables excluded from automated checks - -The following tables are excluded from automated MD5 comparison; the -exclusion lists live in `test-all-tables.sh`. - -- **Always:** `dbgen_version.dat` — contains a generation timestamp. -- **`--compat c` only:** `customer.dat` — the reference data in - `alamb/tpcds-data` was generated through a pipeline that double-UTF-8 - encodes the non-ASCII country names (`CÔTE D'IVOIRE`, `RÉUNION`). The - Rust `--compat c` output uses raw Latin-1, which is what unmodified C - `dsdgen` produces. Once the reference data is regenerated without the - `iconv ISO-8859-14 -> UTF-8` step in `alamb/tpcds-data`'s `Dockerfile`, - this exclusion can be removed. - ## Scripts Each script is self-documenting — open it and read the header comment for diff --git a/tpcdsgen/scripts/test-all-tables.sh b/tpcdsgen/scripts/test-all-tables.sh index 73b4f18..619a46d 100755 --- a/tpcdsgen/scripts/test-all-tables.sh +++ b/tpcdsgen/scripts/test-all-tables.sh @@ -38,11 +38,6 @@ Two reference implementations are supported, selected by --compat: ./scripts/generate-fixtures.sh --compat c, --full only) -Per-compat skip lists live near the top of the script. As of this -writing, --compat c additionally skips `customer` until -alamb/tpcds-data is regenerated without the iconv ISO-8859-14 -> UTF-8 -step that double-encodes non-ASCII country names. - Usage: test-all-tables.sh [OPTIONS] @@ -135,40 +130,6 @@ ALL_TABLES=( "web_site" ) -# Tables to skip per compat mode (in addition to dbgen_version, which is -# always skipped because it contains a generation timestamp). 
-# -# --compat c: customer.dat is skipped because the reference data in -# https://github.com/alamb/tpcds-data was generated through a pipeline that -# accidentally double-UTF-8-encodes the non-ASCII country names (`CÔTE -# D'IVOIRE`, `RÉUNION`). The Rust --compat c output uses raw Latin-1, which -# is what unmodified C dsdgen produces. Once the reference data is -# regenerated without the iconv ISO-8859-14 -> UTF-8 step, this entry can -# be removed. -# TODO(alamb): re-include customer once alamb/tpcds-data has been regenerated. -C_COMPAT_SKIP_TABLES=("customer") - -# Get list of tables to test, applying per-compat skip lists. -get_tables_to_test() { - local skip_list=() - if [[ "$COMPAT" == "c" ]]; then - skip_list=("${C_COMPAT_SKIP_TABLES[@]}") - fi - - local result=() - for t in "${ALL_TABLES[@]}"; do - local skip=0 - for s in "${skip_list[@]:-}"; do - if [[ "$t" == "$s" ]]; then - skip=1 - break - fi - done - [[ $skip -eq 0 ]] && result+=("$t") - done - echo "${result[@]}" -} - # Build the unified Rust table generator build_generator() { log_info "Building Rust TPC-DS generator..." 
@@ -248,14 +209,10 @@ main() { log_info "Comparison: $mode_label" log_info "=========================================" - # Get tables to test - local tables_to_test - tables_to_test=$(get_tables_to_test) - local tables_array=($tables_to_test) - local total_count=${#tables_array[@]} + local total_count=${#ALL_TABLES[@]} log_info "Testing $total_count tables:" - for table in "${tables_array[@]}"; do + for table in "${ALL_TABLES[@]}"; do log_info " - $table" done log_info "=========================================" @@ -270,7 +227,7 @@ main() { # Test each table start_time=$(date +%s) - for table in "${tables_array[@]}"; do + for table in "${ALL_TABLES[@]}"; do log_info "" log_info "Testing: $table" log_info "-----------------------------------------" diff --git a/tpcdsgen/src/main.rs b/tpcdsgen/src/main.rs index d84b5b2..a9f31ed 100644 --- a/tpcdsgen/src/main.rs +++ b/tpcdsgen/src/main.rs @@ -23,7 +23,7 @@ use std::path::Path; use std::time::Instant; use tpcdsgen::config::{Options, Session, Table}; -use tpcdsgen::output::Iso8859Writer; +use tpcdsgen::output::CompatWriter; use tpcdsgen::row::*; use tpcdsgen::types::Date; @@ -141,7 +141,7 @@ fn generate_simple(table: Table, session: &Session) -> R let path = get_output_path(table, session); let file = File::create(&path)?; - let mut writer = Iso8859Writer::new(BufWriter::new(file)); + let mut writer = CompatWriter::new(BufWriter::new(file), session.get_compat_mode()); print!("Generating {}... 
", table.get_name()); std::io::stdout().flush()?; @@ -170,8 +170,11 @@ fn generate_store_sales(session: &Session) -> Result<()> { let sales_path = get_output_path(Table::StoreSales, session); let returns_path = get_output_path(Table::StoreReturns, session); - let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?)); - let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?)); + let compat_mode = session.get_compat_mode(); + let mut sales_writer = + CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode); + let mut returns_writer = + CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode); print!("Generating store_sales + store_returns... "); std::io::stdout().flush()?; @@ -223,8 +226,11 @@ fn generate_catalog_sales(session: &Session) -> Result<()> { let sales_path = get_output_path(Table::CatalogSales, session); let returns_path = get_output_path(Table::CatalogReturns, session); - let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?)); - let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?)); + let compat_mode = session.get_compat_mode(); + let mut sales_writer = + CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode); + let mut returns_writer = + CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode); print!("Generating catalog_sales + catalog_returns... 
"); std::io::stdout().flush()?; @@ -276,8 +282,11 @@ fn generate_web_sales(session: &Session) -> Result<()> { let sales_path = get_output_path(Table::WebSales, session); let returns_path = get_output_path(Table::WebReturns, session); - let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?)); - let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?)); + let compat_mode = session.get_compat_mode(); + let mut sales_writer = + CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode); + let mut returns_writer = + CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode); print!("Generating web_sales + web_returns... "); std::io::stdout().flush()?; @@ -333,7 +342,10 @@ fn generate_inventory(session: &Session) -> Result<()> { let num_rows = item_count * warehouse_count * n_weeks as i64; let path = get_output_path(Table::Inventory, session); - let mut writer = Iso8859Writer::new(BufWriter::new(File::create(&path)?)); + let mut writer = CompatWriter::new( + BufWriter::new(File::create(&path)?), + session.get_compat_mode(), + ); print!("Generating inventory... "); std::io::stdout().flush()?; diff --git a/tpcdsgen/src/output.rs b/tpcdsgen/src/output.rs index 7b99cd8..8a8769f 100644 --- a/tpcdsgen/src/output.rs +++ b/tpcdsgen/src/output.rs @@ -14,18 +14,16 @@ //! Output utilities for TPC-DS data generation //! -//! The Java implementation reads distribution files as ISO-8859-1 (Latin-1) and -//! writes output files as ISO-8859-1 (see TableGenerator.java line 80). +//! The Java (Trino) implementation reads distribution files as ISO-8859-1 +//! (Latin-1) and writes output files as ISO-8859-1 (see TableGenerator.java +//! line 80). The C `dsdgen` outputs UTF-8. //! -//! Rust reads ISO-8859-1 bytes and converts them to UTF-8 strings (since Rust -//! strings are UTF-8). For byte-for-byte compatibility with Java output, we must -//! convert back to ISO-8859-1 when writing. -//! -//! 
Since ISO-8859-1 bytes 0x00-0xFF map directly to Unicode code points U+0000-U+00FF, -//! any character from the distribution files can be safely converted back to a single byte. +//! [`CompatWriter`] selects the right behavior based on [`CompatMode`]. use std::io::{self, Write}; +use crate::config::CompatMode; + /// Converts a UTF-8 string to ISO-8859-1 bytes. /// /// This is the inverse of the conversion done in file_loader.rs when reading @@ -54,7 +52,7 @@ pub fn to_iso_8859_1(s: &str) -> io::Result<Vec<u8>> { /// A writer wrapper that converts UTF-8 strings to ISO-8859-1 before writing. /// -/// This matches Java's behavior in TableGenerator.java which writes output +/// This matches Trino's behavior in TableGenerator.java which writes output /// using StandardCharsets.ISO_8859_1. pub struct Iso8859Writer<W: Write> { inner: W, @@ -103,6 +101,41 @@ impl<W: Write> Write for Iso8859Writer<W> { } } +/// Writer that selects the output encoding based on [`CompatMode`]. +/// +/// * `Iso8859`: outputs ISO-8859-1 to match Trino. +/// * `Utf8`: outputs UTF-8 to match unmodified C `dsdgen`. +pub enum CompatWriter<W: Write> { + Iso8859(Iso8859Writer<W>), + Utf8(W), +} + +impl<W: Write> CompatWriter<W> { + /// Build a writer for `compat_mode`. 
+ pub fn new(writer: W, compat_mode: CompatMode) -> Self { + match compat_mode { + CompatMode::Trino => CompatWriter::Iso8859(Iso8859Writer::new(writer)), + CompatMode::C => CompatWriter::Utf8(writer), + } + } +} + +impl<W: Write> Write for CompatWriter<W> { + fn write(&mut self, buf: &[u8]) -> io::Result<usize> { + match self { + CompatWriter::Iso8859(w) => w.write(buf), + CompatWriter::Utf8(w) => w.write(buf), + } + } + + fn flush(&mut self) -> io::Result<()> { + match self { + CompatWriter::Iso8859(w) => w.flush(), + CompatWriter::Utf8(w) => w.flush(), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -142,4 +175,28 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert!(err.to_string().contains("outside ISO-8859-1 range")); } + + #[test] + fn test_compat_writer_trino_emits_iso_8859_1() { + let mut buffer = Vec::new(); + { + let mut writer = CompatWriter::new(&mut buffer, CompatMode::Trino); + write!(writer, "CÔTE D'IVOIRE").unwrap(); + } + // Trino/Java emits a single 0xD4 byte for Ô. + assert_eq!(buffer[1], 0xD4); + assert_eq!(buffer.len(), 13); + } + + #[test] + fn test_compat_writer_c_emits_utf8() { + let mut buffer = Vec::new(); + { + let mut writer = CompatWriter::new(&mut buffer, CompatMode::C); + write!(writer, "CÔTE D'IVOIRE").unwrap(); + } + // C dsdgen passes the UTF-8 bytes through (Ô is 0xC3 0x94). + assert_eq!(&buffer[..3], &[b'C', 0xC3, 0x94]); + assert_eq!(buffer.len(), 14); + } }