Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions tpcdsgen/scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,20 +79,6 @@ extracts into `tests/fixtures/scale-N-c/`.
./scripts/compare-table.sh reason --compat c --full # byte-for-byte
```

### Tables excluded from automated checks

The following tables are excluded from automated MD5 comparison; the
exclusion lists live in `test-all-tables.sh`.

- **Always:** `dbgen_version.dat` — contains a generation timestamp.
- **`--compat c` only:** `customer.dat` — the reference data in
`alamb/tpcds-data` was generated through a pipeline that double-UTF-8
encodes the non-ASCII country names (`CÔTE D'IVOIRE`, `RÉUNION`). The
Rust `--compat c` output uses raw Latin-1, which is what unmodified C
`dsdgen` produces. Once the reference data is regenerated without the
`iconv ISO-8859-14 -> UTF-8` step in `alamb/tpcds-data`'s `Dockerfile`,
this exclusion can be removed.

## Scripts

Each script is self-documenting — open it and read the header comment for
Expand Down
49 changes: 3 additions & 46 deletions tpcdsgen/scripts/test-all-tables.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,6 @@ Two reference implementations are supported, selected by --compat:
./scripts/generate-fixtures.sh --compat c,
--full only)

Per-compat skip lists live near the top of the script. As of this
writing, --compat c additionally skips `customer` until
alamb/tpcds-data is regenerated without the iconv ISO-8859-14 -> UTF-8
step that double-encodes non-ASCII country names.

Usage:
test-all-tables.sh [OPTIONS]

Expand Down Expand Up @@ -135,40 +130,6 @@ ALL_TABLES=(
"web_site"
)

# Tables to skip per compat mode (in addition to dbgen_version, which is
# always skipped because it contains a generation timestamp).
#
# --compat c: customer.dat is skipped because the reference data in
# https://github.com/alamb/tpcds-data was generated through a pipeline that
# accidentally double-UTF-8-encodes the non-ASCII country names (`CÔTE
# D'IVOIRE`, `RÉUNION`). The Rust --compat c output uses raw Latin-1, which
# is what unmodified C dsdgen produces. Once the reference data is
# regenerated without the iconv ISO-8859-14 -> UTF-8 step, this entry can
# be removed.
# TODO(alamb): re-include customer once alamb/tpcds-data has been regenerated.
C_COMPAT_SKIP_TABLES=("customer")

# Get list of tables to test, applying per-compat skip lists.
# Print the names of the tables to test, separated by spaces, on stdout.
#
# Starts from ALL_TABLES and drops every entry that appears in the skip
# list for the current compat mode. Only "$COMPAT" == "c" has a skip list
# (C_COMPAT_SKIP_TABLES); for any other value nothing is dropped.
#
# Reads globals: COMPAT, ALL_TABLES, C_COMPAT_SKIP_TABLES.
get_tables_to_test() {
# Skip list for the active compat mode; stays empty unless COMPAT is "c".
local skip_list=()
if [[ "$COMPAT" == "c" ]]; then
skip_list=("${C_COMPAT_SKIP_TABLES[@]}")
fi

# Collect every table that does not match an entry in skip_list.
local result=()
for t in "${ALL_TABLES[@]}"; do
local skip=0
# The ":-" default makes an empty skip_list expand to a single empty
# string, so the inner loop still runs once; "" equals no table name
# unless ALL_TABLES itself contains an empty entry.
for s in "${skip_list[@]:-}"; do
if [[ "$t" == "$s" ]]; then
skip=1
break
fi
done
[[ $skip -eq 0 ]] && result+=("$t")
done
# Callers capture this output and word-split it back into an array.
echo "${result[@]}"
}

# Build the unified Rust table generator
build_generator() {
log_info "Building Rust TPC-DS generator..."
Expand Down Expand Up @@ -248,14 +209,10 @@ main() {
log_info "Comparison: $mode_label"
log_info "========================================="

# Get tables to test
local tables_to_test
tables_to_test=$(get_tables_to_test)
local tables_array=($tables_to_test)
local total_count=${#tables_array[@]}
local total_count=${#ALL_TABLES[@]}

log_info "Testing $total_count tables:"
for table in "${tables_array[@]}"; do
for table in "${ALL_TABLES[@]}"; do
log_info " - $table"
done
log_info "========================================="
Expand All @@ -270,7 +227,7 @@ main() {
# Test each table
start_time=$(date +%s)

for table in "${tables_array[@]}"; do
for table in "${ALL_TABLES[@]}"; do
log_info ""
log_info "Testing: $table"
log_info "-----------------------------------------"
Expand Down
30 changes: 21 additions & 9 deletions tpcdsgen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use std::path::Path;
use std::time::Instant;

use tpcdsgen::config::{Options, Session, Table};
use tpcdsgen::output::Iso8859Writer;
use tpcdsgen::output::CompatWriter;
use tpcdsgen::row::*;
use tpcdsgen::types::Date;

Expand Down Expand Up @@ -141,7 +141,7 @@ fn generate_simple<G: RowGeneratorFactory>(table: Table, session: &Session) -> R

let path = get_output_path(table, session);
let file = File::create(&path)?;
let mut writer = Iso8859Writer::new(BufWriter::new(file));
let mut writer = CompatWriter::new(BufWriter::new(file), session.get_compat_mode());

print!("Generating {}... ", table.get_name());
std::io::stdout().flush()?;
Expand Down Expand Up @@ -170,8 +170,11 @@ fn generate_store_sales(session: &Session) -> Result<()> {
let sales_path = get_output_path(Table::StoreSales, session);
let returns_path = get_output_path(Table::StoreReturns, session);

let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?));
let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?));
let compat_mode = session.get_compat_mode();
let mut sales_writer =
CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode);
let mut returns_writer =
CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode);

print!("Generating store_sales + store_returns... ");
std::io::stdout().flush()?;
Expand Down Expand Up @@ -223,8 +226,11 @@ fn generate_catalog_sales(session: &Session) -> Result<()> {
let sales_path = get_output_path(Table::CatalogSales, session);
let returns_path = get_output_path(Table::CatalogReturns, session);

let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?));
let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?));
let compat_mode = session.get_compat_mode();
let mut sales_writer =
CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode);
let mut returns_writer =
CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode);

print!("Generating catalog_sales + catalog_returns... ");
std::io::stdout().flush()?;
Expand Down Expand Up @@ -276,8 +282,11 @@ fn generate_web_sales(session: &Session) -> Result<()> {
let sales_path = get_output_path(Table::WebSales, session);
let returns_path = get_output_path(Table::WebReturns, session);

let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?));
let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?));
let compat_mode = session.get_compat_mode();
let mut sales_writer =
CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode);
let mut returns_writer =
CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode);

print!("Generating web_sales + web_returns... ");
std::io::stdout().flush()?;
Expand Down Expand Up @@ -333,7 +342,10 @@ fn generate_inventory(session: &Session) -> Result<()> {
let num_rows = item_count * warehouse_count * n_weeks as i64;

let path = get_output_path(Table::Inventory, session);
let mut writer = Iso8859Writer::new(BufWriter::new(File::create(&path)?));
let mut writer = CompatWriter::new(
BufWriter::new(File::create(&path)?),
session.get_compat_mode(),
);

print!("Generating inventory... ");
std::io::stdout().flush()?;
Expand Down
75 changes: 66 additions & 9 deletions tpcdsgen/src/output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,16 @@

//! Output utilities for TPC-DS data generation
//!
//! The Java implementation reads distribution files as ISO-8859-1 (Latin-1) and
//! writes output files as ISO-8859-1 (see TableGenerator.java line 80).
//! The Java (Trino) implementation reads distribution files as ISO-8859-1
//! (Latin-1) and writes output files as ISO-8859-1 (see TableGenerator.java
//! line 80). The C `dsdgen` outputs UTF-8.
//!
//! Rust reads ISO-8859-1 bytes and converts them to UTF-8 strings (since Rust
//! strings are UTF-8). For byte-for-byte compatibility with Java output, we must
//! convert back to ISO-8859-1 when writing.
//!
//! Since ISO-8859-1 bytes 0x00-0xFF map directly to Unicode code points U+0000-U+00FF,
//! any character from the distribution files can be safely converted back to a single byte.
//! [`CompatWriter`] selects the right behavior based on [`CompatMode`].

use std::io::{self, Write};

use crate::config::CompatMode;

/// Converts a UTF-8 string to ISO-8859-1 bytes.
///
/// This is the inverse of the conversion done in file_loader.rs when reading
Expand Down Expand Up @@ -54,7 +52,7 @@ pub fn to_iso_8859_1(s: &str) -> io::Result<Vec<u8>> {

/// A writer wrapper that converts UTF-8 strings to ISO-8859-1 before writing.
///
/// This matches Java's behavior in TableGenerator.java which writes output
/// This matches Trino's behavior in TableGenerator.java which writes output
/// using StandardCharsets.ISO_8859_1.
pub struct Iso8859Writer<W: Write> {
inner: W,
Expand Down Expand Up @@ -103,6 +101,41 @@ impl<W: Write> Write for Iso8859Writer<W> {
}
}

/// Writer that selects the output encoding based on [`CompatMode`].
Comment thread
alamb marked this conversation as resolved.
///
/// * `Iso8859`: outputs ISO-8859-1 to match Trino.
/// * `Utf8`: outputs UTF-8 to match unmodified C `dsdgen`.
///
/// Build one with `CompatWriter::new`, which picks the variant from the
/// requested [`CompatMode`].
pub enum CompatWriter<W: Write> {
    /// Routes every write through an [`Iso8859Writer`] wrapped around the sink.
    Iso8859(Iso8859Writer<W>),
    /// Holds the sink directly; writes and flushes are forwarded to it as-is.
    Utf8(W),
}

impl<W: Write> CompatWriter<W> {
    /// Wraps `writer` in the variant that corresponds to `compat_mode`.
    ///
    /// * `CompatMode::Trino` routes output through an [`Iso8859Writer`].
    /// * `CompatMode::C` stores `writer` unchanged in the `Utf8` variant.
    pub fn new(writer: W, compat_mode: CompatMode) -> Self {
        match compat_mode {
            CompatMode::C => Self::Utf8(writer),
            CompatMode::Trino => Self::Iso8859(Iso8859Writer::new(writer)),
        }
    }
}

impl<W: Write> Write for CompatWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
match self {
CompatWriter::Iso8859(w) => w.write(buf),
CompatWriter::Utf8(w) => w.write(buf),
}
}

fn flush(&mut self) -> io::Result<()> {
match self {
CompatWriter::Iso8859(w) => w.flush(),
CompatWriter::Utf8(w) => w.flush(),
}
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -142,4 +175,28 @@ mod tests {
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
assert!(err.to_string().contains("outside ISO-8859-1 range"));
}

#[test]
fn test_compat_writer_trino_emits_iso_8859_1() {
    let mut sink: Vec<u8> = Vec::new();

    let mut latin1_writer = CompatWriter::new(&mut sink, CompatMode::Trino);
    write!(latin1_writer, "CÔTE D'IVOIRE").unwrap();
    // Release the mutable borrow of `sink` before inspecting its contents.
    drop(latin1_writer);

    // Trino/Java emits a single 0xD4 byte for Ô.
    assert_eq!(sink[1], 0xD4);
    assert_eq!(sink.len(), 13);
}

#[test]
fn test_compat_writer_c_emits_utf8() {
    let mut sink: Vec<u8> = Vec::new();

    let mut utf8_writer = CompatWriter::new(&mut sink, CompatMode::C);
    write!(utf8_writer, "CÔTE D'IVOIRE").unwrap();
    // Release the mutable borrow of `sink` before inspecting its contents.
    drop(utf8_writer);

    // C dsdgen passes the UTF-8 bytes through (Ô is 0xC3 0x94).
    assert_eq!(&sink[..3], &[b'C', 0xC3, 0x94]);
    assert_eq!(sink.len(), 14);
}
}
Loading