|
14 | 14 |
|
15 | 15 | //! Output utilities for TPC-DS data generation |
16 | 16 | //! |
17 | | -//! The Java implementation reads distribution files as ISO-8859-1 (Latin-1) and |
18 | | -//! writes output files as ISO-8859-1 (see TableGenerator.java line 80). |
| 17 | +//! The Java (Trino) implementation reads distribution files as ISO-8859-1 |
| 18 | +//! (Latin-1) and writes output files as ISO-8859-1 (see TableGenerator.java |
| 19 | +//! line 80). The C `dsdgen` outputs UTF-8. |
19 | 20 | //! |
20 | | -//! Rust reads ISO-8859-1 bytes and converts them to UTF-8 strings (since Rust |
21 | | -//! strings are UTF-8). For byte-for-byte compatibility with Java output, we must |
22 | | -//! convert back to ISO-8859-1 when writing. |
23 | | -//! |
24 | | -//! Since ISO-8859-1 bytes 0x00-0xFF map directly to Unicode code points U+0000-U+00FF, |
25 | | -//! any character from the distribution files can be safely converted back to a single byte. |
| 21 | +//! [`CompatWriter`] selects the right behavior based on [`CompatMode`]. |
26 | 22 |
|
27 | 23 | use std::io::{self, Write}; |
28 | 24 |
|
| 25 | +use crate::config::CompatMode; |
| 26 | + |
29 | 27 | /// Converts a UTF-8 string to ISO-8859-1 bytes. |
30 | 28 | /// |
31 | 29 | /// This is the inverse of the conversion done in file_loader.rs when reading |
@@ -54,7 +52,7 @@ pub fn to_iso_8859_1(s: &str) -> io::Result<Vec<u8>> { |
54 | 52 |
|
55 | 53 | /// A writer wrapper that converts UTF-8 strings to ISO-8859-1 before writing. |
56 | 54 | /// |
57 | | -/// This matches Java's behavior in TableGenerator.java which writes output |
| 55 | +/// This matches Trino's behavior in TableGenerator.java which writes output |
58 | 56 | /// using StandardCharsets.ISO_8859_1. |
59 | 57 | pub struct Iso8859Writer<W: Write> { |
60 | 58 | inner: W, |
@@ -103,6 +101,41 @@ impl<W: Write> Write for Iso8859Writer<W> { |
103 | 101 | } |
104 | 102 | } |
105 | 103 |
|
| 104 | +/// Writer that selects the output encoding based on [`CompatMode`]. |
| 105 | +/// |
| 106 | +/// * `Iso8859`: outputs ISO-8859-1 to match Trino. |
| 107 | +/// * `Utf8`: outputs UTF-8 to match unmodified C `dsdgen`. |
| 108 | +pub enum CompatWriter<W: Write> { |
| 109 | + Iso8859(Iso8859Writer<W>), |
| 110 | + Utf8(W), |
| 111 | +} |
| 112 | + |
| 113 | +impl<W: Write> CompatWriter<W> { |
| 114 | + /// Build a writer for `compat_mode`. |
| 115 | + pub fn new(writer: W, compat_mode: CompatMode) -> Self { |
| 116 | + match compat_mode { |
| 117 | + CompatMode::Trino => CompatWriter::Iso8859(Iso8859Writer::new(writer)), |
| 118 | + CompatMode::C => CompatWriter::Utf8(writer), |
| 119 | + } |
| 120 | + } |
| 121 | +} |
| 122 | + |
| 123 | +impl<W: Write> Write for CompatWriter<W> { |
| 124 | + fn write(&mut self, buf: &[u8]) -> io::Result<usize> { |
| 125 | + match self { |
| 126 | + CompatWriter::Iso8859(w) => w.write(buf), |
| 127 | + CompatWriter::Utf8(w) => w.write(buf), |
| 128 | + } |
| 129 | + } |
| 130 | + |
| 131 | + fn flush(&mut self) -> io::Result<()> { |
| 132 | + match self { |
| 133 | + CompatWriter::Iso8859(w) => w.flush(), |
| 134 | + CompatWriter::Utf8(w) => w.flush(), |
| 135 | + } |
| 136 | + } |
| 137 | +} |
| 138 | + |
106 | 139 | #[cfg(test)] |
107 | 140 | mod tests { |
108 | 141 | use super::*; |
@@ -142,4 +175,28 @@ mod tests { |
142 | 175 | assert_eq!(err.kind(), io::ErrorKind::InvalidData); |
143 | 176 | assert!(err.to_string().contains("outside ISO-8859-1 range")); |
144 | 177 | } |
| 178 | + |
/// Trino mode must emit ISO-8859-1: `Ô` becomes the single byte 0xD4 and the
/// ASCII characters are written unchanged.
#[test]
fn test_compat_writer_trino_emits_iso_8859_1() {
    let mut buffer = Vec::new();
    {
        // Scope the writer so its mutable borrow of `buffer` ends before the
        // assertion reads the bytes.
        let mut writer = CompatWriter::new(&mut buffer, CompatMode::Trino);
        write!(writer, "CÔTE D'IVOIRE").unwrap();
    }
    // Compare the whole output rather than one index plus the length: this
    // pins every byte (13 in total) and fails with a readable diff instead of
    // an out-of-bounds panic if the output is unexpectedly short.
    assert_eq!(buffer, b"C\xD4TE D'IVOIRE");
}
| 190 | + |
/// C mode must emit UTF-8: `Ô` stays the two-byte sequence 0xC3 0x94 and the
/// ASCII characters are written unchanged.
#[test]
fn test_compat_writer_c_emits_utf8() {
    let mut buffer = Vec::new();
    {
        // Scope the writer so its mutable borrow of `buffer` ends before the
        // assertion reads the bytes.
        let mut writer = CompatWriter::new(&mut buffer, CompatMode::C);
        write!(writer, "CÔTE D'IVOIRE").unwrap();
    }
    // Compare the whole output rather than a three-byte prefix plus the
    // length, so a corrupted byte anywhere in the 14-byte result is caught.
    assert_eq!(buffer, b"C\xC3\x94TE D'IVOIRE");
}
145 | 202 | } |
0 commit comments