Skip to content

Commit d362ab5

Browse files
alambclaude
andauthored
fix(tpcdsgen): Fix customer.dat to match C dsdgen (#266)
fix(tpcdsgen): emit UTF-8 in --compat c so customer.dat matches C dsdgen Closes #261. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c222bd9 commit d362ab5

4 files changed

Lines changed: 90 additions & 78 deletions

File tree

tpcdsgen/scripts/README.md

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -79,20 +79,6 @@ extracts into `tests/fixtures/scale-N-c/`.
7979
./scripts/compare-table.sh reason --compat c --full # byte-for-byte
8080
```
8181

82-
### Tables excluded from automated checks
83-
84-
The following tables are excluded from automated MD5 comparison; the
85-
exclusion lists live in `test-all-tables.sh`.
86-
87-
- **Always:** `dbgen_version.dat` — contains a generation timestamp.
88-
- **`--compat c` only:** `customer.dat` — the reference data in
89-
`alamb/tpcds-data` was generated through a pipeline that double-UTF-8
90-
encodes the non-ASCII country names (`CÔTE D'IVOIRE`, `RÉUNION`). The
91-
Rust `--compat c` output uses raw Latin-1, which is what unmodified C
92-
`dsdgen` produces. Once the reference data is regenerated without the
93-
`iconv ISO-8859-14 -> UTF-8` step in `alamb/tpcds-data`'s `Dockerfile`,
94-
this exclusion can be removed.
95-
9682
## Scripts
9783

9884
Each script is self-documenting — open it and read the header comment for

tpcdsgen/scripts/test-all-tables.sh

Lines changed: 3 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,6 @@ Two reference implementations are supported, selected by --compat:
3838
./scripts/generate-fixtures.sh --compat c,
3939
--full only)
4040
41-
Per-compat skip lists live near the top of the script. As of this
42-
writing, --compat c additionally skips `customer` until
43-
alamb/tpcds-data is regenerated without the iconv ISO-8859-14 -> UTF-8
44-
step that double-encodes non-ASCII country names.
45-
4641
Usage:
4742
test-all-tables.sh [OPTIONS]
4843
@@ -135,40 +130,6 @@ ALL_TABLES=(
135130
"web_site"
136131
)
137132

138-
# Tables to skip per compat mode (in addition to dbgen_version, which is
139-
# always skipped because it contains a generation timestamp).
140-
#
141-
# --compat c: customer.dat is skipped because the reference data in
142-
# https://github.com/alamb/tpcds-data was generated through a pipeline that
143-
# accidentally double-UTF-8-encodes the non-ASCII country names (`CÔTE
144-
# D'IVOIRE`, `RÉUNION`). The Rust --compat c output uses raw Latin-1, which
145-
# is what unmodified C dsdgen produces. Once the reference data is
146-
# regenerated without the iconv ISO-8859-14 -> UTF-8 step, this entry can
147-
# be removed.
148-
# TODO(alamb): re-include customer once alamb/tpcds-data has been regenerated.
149-
C_COMPAT_SKIP_TABLES=("customer")
150-
151-
# Get list of tables to test, applying per-compat skip lists.
152-
get_tables_to_test() {
153-
local skip_list=()
154-
if [[ "$COMPAT" == "c" ]]; then
155-
skip_list=("${C_COMPAT_SKIP_TABLES[@]}")
156-
fi
157-
158-
local result=()
159-
for t in "${ALL_TABLES[@]}"; do
160-
local skip=0
161-
for s in "${skip_list[@]:-}"; do
162-
if [[ "$t" == "$s" ]]; then
163-
skip=1
164-
break
165-
fi
166-
done
167-
[[ $skip -eq 0 ]] && result+=("$t")
168-
done
169-
echo "${result[@]}"
170-
}
171-
172133
# Build the unified Rust table generator
173134
build_generator() {
174135
log_info "Building Rust TPC-DS generator..."
@@ -248,14 +209,10 @@ main() {
248209
log_info "Comparison: $mode_label"
249210
log_info "========================================="
250211

251-
# Get tables to test
252-
local tables_to_test
253-
tables_to_test=$(get_tables_to_test)
254-
local tables_array=($tables_to_test)
255-
local total_count=${#tables_array[@]}
212+
local total_count=${#ALL_TABLES[@]}
256213

257214
log_info "Testing $total_count tables:"
258-
for table in "${tables_array[@]}"; do
215+
for table in "${ALL_TABLES[@]}"; do
259216
log_info " - $table"
260217
done
261218
log_info "========================================="
@@ -270,7 +227,7 @@ main() {
270227
# Test each table
271228
start_time=$(date +%s)
272229

273-
for table in "${tables_array[@]}"; do
230+
for table in "${ALL_TABLES[@]}"; do
274231
log_info ""
275232
log_info "Testing: $table"
276233
log_info "-----------------------------------------"

tpcdsgen/src/main.rs

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use std::path::Path;
2323
use std::time::Instant;
2424

2525
use tpcdsgen::config::{Options, Session, Table};
26-
use tpcdsgen::output::Iso8859Writer;
26+
use tpcdsgen::output::CompatWriter;
2727
use tpcdsgen::row::*;
2828
use tpcdsgen::types::Date;
2929

@@ -141,7 +141,7 @@ fn generate_simple<G: RowGeneratorFactory>(table: Table, session: &Session) -> R
141141

142142
let path = get_output_path(table, session);
143143
let file = File::create(&path)?;
144-
let mut writer = Iso8859Writer::new(BufWriter::new(file));
144+
let mut writer = CompatWriter::new(BufWriter::new(file), session.get_compat_mode());
145145

146146
print!("Generating {}... ", table.get_name());
147147
std::io::stdout().flush()?;
@@ -170,8 +170,11 @@ fn generate_store_sales(session: &Session) -> Result<()> {
170170
let sales_path = get_output_path(Table::StoreSales, session);
171171
let returns_path = get_output_path(Table::StoreReturns, session);
172172

173-
let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?));
174-
let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?));
173+
let compat_mode = session.get_compat_mode();
174+
let mut sales_writer =
175+
CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode);
176+
let mut returns_writer =
177+
CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode);
175178

176179
print!("Generating store_sales + store_returns... ");
177180
std::io::stdout().flush()?;
@@ -223,8 +226,11 @@ fn generate_catalog_sales(session: &Session) -> Result<()> {
223226
let sales_path = get_output_path(Table::CatalogSales, session);
224227
let returns_path = get_output_path(Table::CatalogReturns, session);
225228

226-
let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?));
227-
let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?));
229+
let compat_mode = session.get_compat_mode();
230+
let mut sales_writer =
231+
CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode);
232+
let mut returns_writer =
233+
CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode);
228234

229235
print!("Generating catalog_sales + catalog_returns... ");
230236
std::io::stdout().flush()?;
@@ -276,8 +282,11 @@ fn generate_web_sales(session: &Session) -> Result<()> {
276282
let sales_path = get_output_path(Table::WebSales, session);
277283
let returns_path = get_output_path(Table::WebReturns, session);
278284

279-
let mut sales_writer = Iso8859Writer::new(BufWriter::new(File::create(&sales_path)?));
280-
let mut returns_writer = Iso8859Writer::new(BufWriter::new(File::create(&returns_path)?));
285+
let compat_mode = session.get_compat_mode();
286+
let mut sales_writer =
287+
CompatWriter::new(BufWriter::new(File::create(&sales_path)?), compat_mode);
288+
let mut returns_writer =
289+
CompatWriter::new(BufWriter::new(File::create(&returns_path)?), compat_mode);
281290

282291
print!("Generating web_sales + web_returns... ");
283292
std::io::stdout().flush()?;
@@ -333,7 +342,10 @@ fn generate_inventory(session: &Session) -> Result<()> {
333342
let num_rows = item_count * warehouse_count * n_weeks as i64;
334343

335344
let path = get_output_path(Table::Inventory, session);
336-
let mut writer = Iso8859Writer::new(BufWriter::new(File::create(&path)?));
345+
let mut writer = CompatWriter::new(
346+
BufWriter::new(File::create(&path)?),
347+
session.get_compat_mode(),
348+
);
337349

338350
print!("Generating inventory... ");
339351
std::io::stdout().flush()?;

tpcdsgen/src/output.rs

Lines changed: 66 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,16 @@
1414

1515
//! Output utilities for TPC-DS data generation
1616
//!
17-
//! The Java implementation reads distribution files as ISO-8859-1 (Latin-1) and
18-
//! writes output files as ISO-8859-1 (see TableGenerator.java line 80).
17+
//! The Java (Trino) implementation reads distribution files as ISO-8859-1
18+
//! (Latin-1) and writes output files as ISO-8859-1 (see TableGenerator.java
19+
//! line 80). The C `dsdgen` outputs UTF-8.
1920
//!
20-
//! Rust reads ISO-8859-1 bytes and converts them to UTF-8 strings (since Rust
21-
//! strings are UTF-8). For byte-for-byte compatibility with Java output, we must
22-
//! convert back to ISO-8859-1 when writing.
23-
//!
24-
//! Since ISO-8859-1 bytes 0x00-0xFF map directly to Unicode code points U+0000-U+00FF,
25-
//! any character from the distribution files can be safely converted back to a single byte.
21+
//! [`CompatWriter`] selects the right behavior based on [`CompatMode`].
2622
2723
use std::io::{self, Write};
2824

25+
use crate::config::CompatMode;
26+
2927
/// Converts a UTF-8 string to ISO-8859-1 bytes.
3028
///
3129
/// This is the inverse of the conversion done in file_loader.rs when reading
@@ -54,7 +52,7 @@ pub fn to_iso_8859_1(s: &str) -> io::Result<Vec<u8>> {
5452

5553
/// A writer wrapper that converts UTF-8 strings to ISO-8859-1 before writing.
5654
///
57-
/// This matches Java's behavior in TableGenerator.java which writes output
55+
/// This matches Trino's behavior in TableGenerator.java which writes output
5856
/// using StandardCharsets.ISO_8859_1.
5957
pub struct Iso8859Writer<W: Write> {
6058
inner: W,
@@ -103,6 +101,41 @@ impl<W: Write> Write for Iso8859Writer<W> {
103101
}
104102
}
105103

104+
/// Writer that selects the output encoding based on [`CompatMode`].
105+
///
106+
/// * `Iso8859`: outputs ISO-8859-1 to match Trino.
107+
/// * `Utf8`: outputs UTF-8 to match unmodified C `dsdgen`.
108+
pub enum CompatWriter<W: Write> {
109+
Iso8859(Iso8859Writer<W>),
110+
Utf8(W),
111+
}
112+
113+
impl<W: Write> CompatWriter<W> {
114+
/// Build a writer for `compat_mode`.
115+
pub fn new(writer: W, compat_mode: CompatMode) -> Self {
116+
match compat_mode {
117+
CompatMode::Trino => CompatWriter::Iso8859(Iso8859Writer::new(writer)),
118+
CompatMode::C => CompatWriter::Utf8(writer),
119+
}
120+
}
121+
}
122+
123+
impl<W: Write> Write for CompatWriter<W> {
124+
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
125+
match self {
126+
CompatWriter::Iso8859(w) => w.write(buf),
127+
CompatWriter::Utf8(w) => w.write(buf),
128+
}
129+
}
130+
131+
fn flush(&mut self) -> io::Result<()> {
132+
match self {
133+
CompatWriter::Iso8859(w) => w.flush(),
134+
CompatWriter::Utf8(w) => w.flush(),
135+
}
136+
}
137+
}
138+
106139
#[cfg(test)]
107140
mod tests {
108141
use super::*;
@@ -142,4 +175,28 @@ mod tests {
142175
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
143176
assert!(err.to_string().contains("outside ISO-8859-1 range"));
144177
}
178+
179+
#[test]
180+
fn test_compat_writer_trino_emits_iso_8859_1() {
181+
let mut buffer = Vec::new();
182+
{
183+
let mut writer = CompatWriter::new(&mut buffer, CompatMode::Trino);
184+
write!(writer, "CÔTE D'IVOIRE").unwrap();
185+
}
186+
// Trino/Java emits a single 0xD4 byte for Ô.
187+
assert_eq!(buffer[1], 0xD4);
188+
assert_eq!(buffer.len(), 13);
189+
}
190+
191+
#[test]
192+
fn test_compat_writer_c_emits_utf8() {
193+
let mut buffer = Vec::new();
194+
{
195+
let mut writer = CompatWriter::new(&mut buffer, CompatMode::C);
196+
write!(writer, "CÔTE D'IVOIRE").unwrap();
197+
}
198+
// C dsdgen passes the UTF-8 bytes through (Ô is 0xC3 0x94).
199+
assert_eq!(&buffer[..3], &[b'C', 0xC3, 0x94]);
200+
assert_eq!(buffer.len(), 14);
201+
}
145202
}

0 commit comments

Comments
 (0)