From acb664553a6c1ede52dddb2d26a465d15e8b8b88 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 10:15:15 -0700 Subject: [PATCH 01/23] add deprecation notice Co-authored-by: Copilot --- tpchgen-cli/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tpchgen-cli/README.md b/tpchgen-cli/README.md index edcb590..6734bfb 100644 --- a/tpchgen-cli/README.md +++ b/tpchgen-cli/README.md @@ -87,3 +87,15 @@ done Times to create TPCH tables in Parquet format using `tpchgen-cli` and `duckdb` for various scale factors. +## Deprecation Notice + +`--format=parquet`, `--parquet-compression`, and `--parquet-row-group-bytes` are deprecated as of v3.x and will be removed in v4.0.0. Use the `parquet` subcommand instead: + +```shell +# Before +tpchgen-cli --format=parquet --parquet-compression=ZSTD(1) -s 10 + +# After +tpchgen-cli parquet --compression=ZSTD(1) -s 10 +``` + From 1b5fc404acae97474a96dbf81d9a53e9c4782daf Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 10:30:36 -0700 Subject: [PATCH 02/23] add tbl/csv subcommands Co-authored-by: Copilot --- tpchgen-cli/README.md | 5 +- tpchgen-cli/bin/main.rs | 132 +++++++++++++++++++++------ tpchgen-cli/tests/cli_integration.rs | 90 +++++++++++++++++- 3 files changed, 197 insertions(+), 30 deletions(-) diff --git a/tpchgen-cli/README.md b/tpchgen-cli/README.md index 6734bfb..57e21f8 100644 --- a/tpchgen-cli/README.md +++ b/tpchgen-cli/README.md @@ -49,6 +49,7 @@ tpchgen-cli parquet -s 10 # Scale Factor 10, all tables, in `tbl`(csv like) format in the `sf10` directory # (10GB, 8 files, 60M lineitem rows) +# Note: `tpchgen-cli tbl` also works explicitly tpchgen-cli -s 10 --output-dir sf10 # Scale Factor 1000, lineitem table, in Apache Parquet format in sf1000 directory, @@ -89,13 +90,15 @@ Times to create TPCH tables in Parquet format using `tpchgen-cli` and `duckdb` f ## Deprecation Notice -`--format=parquet`, `--parquet-compression`, and `--parquet-row-group-bytes` are deprecated as of v3.x and will be removed in v4.0.0. Use the `parquet` subcommand instead: +`--format`, `--parquet-compression`, `--parquet-row-group-bytes`, and `--delimiter` are deprecated as of v3.x and will be removed in v4.0.0. Use subcommands instead: ```shell # Before tpchgen-cli --format=parquet --parquet-compression=ZSTD(1) -s 10 +tpchgen-cli --format=csv --delimiter='\t' -s 10 # After tpchgen-cli parquet --compression=ZSTD(1) -s 10 +tpchgen-cli csv --delimiter='\t' -s 10 ``` diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index f9c6828..8faeaa2 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -34,17 +34,13 @@ multiple files named //
.. Examples -# Generate all tables at scale factor 1 (1GB) in TBL format to /tmp/tpch directory: +# Generate all tables at scale factor 1 (1GB) in TBL format (default) to /tmp/tpch directory: tpchgen-cli -s 1 --output-dir=/tmp/tpch -# Generate all tables in CSV format: +# Generate all tables in CSV format with tab delimiter: -tpchgen-cli -s 1 --format=csv --output-dir=/tmp/tpch - -# Generate scale factor one in CSV format with tab delimiter: - -tpchgen-cli -s 1 --format=csv --delimiter='\t' --output-dir=/tmp/tpch +tpchgen-cli csv -s 1 --delimiter='\t' --output-dir=/tmp/tpch # Generate the lineitem table at scale factor 100 in 10 Apache Parquet files to # /tmp/tpch/lineitem: @@ -66,6 +62,10 @@ struct Cli { #[derive(clap::Subcommand)] enum Commands { + /// Generate TBL (pipe-delimited) output + Tbl(TblArgs), + /// Generate CSV output with CSV-specific options + Csv(CsvArgs), /// Generate Apache Parquet output with Parquet-specific options Parquet(ParquetArgs), } @@ -117,11 +117,11 @@ struct TopLevelArgs { #[command(flatten)] common: CommonArgs, - /// Output format: tbl, csv, parquet (default: tbl) + /// Output format (deprecated: use subcommands `tbl`, `csv`, or `parquet` instead) /// - /// For Parquet output, prefer using the `parquet` subcommand instead. - /// The --format flag will be replaced by subcommands in v4.0.0. - #[arg(short, long)] + /// The --format flag will be removed in v4.0.0. + #[arg(short, long, hide = true)] + #[deprecated] format: Option, /// Parquet block compression format (deprecated: use 'parquet' subcommand instead) @@ -134,11 +134,28 @@ struct TopLevelArgs { #[deprecated] parquet_row_group_bytes: Option, + /// CSV delimiter character (deprecated: use 'csv' subcommand instead) + #[arg(long, hide = true, default_value = ",", value_parser = parse_delimiter)] + #[deprecated] + delimiter: char, +} + +#[derive(clap::Args)] +struct TblArgs { + #[command(flatten)] + common: CommonArgs, +} + +#[derive(clap::Args)] +struct CsvArgs { + #[command(flatten)] + common: CommonArgs, + /// CSV delimiter character (default: ',') /// /// Specifies the delimiter character to use when generating CSV files. /// - /// Supports escape sequences: \t (tab), \n (newline), \r (carriage return), \\ (backslash) + /// Supports escape sequences: \t (tab), \n (newline), \r (carriage return), \\\ (backslash) /// Common delimiters: ',' (comma), '|' (pipe), '\t' (tab), ';' (semicolon) #[arg(long, default_value = ",", value_parser = parse_delimiter)] delimiter: char, @@ -253,6 +270,7 @@ async fn main() -> io::Result<()> { impl Cli { /// Main function to run the generation + #[allow(deprecated)] async fn main(self) -> io::Result<()> { // Error if both --format and a subcommand are specified if self.args.format.is_some() && self.command.is_some() { @@ -262,6 +280,8 @@ impl Cli { )); } match self.command { + Some(Commands::Tbl(args)) => args.run().await, + Some(Commands::Csv(args)) => args.run().await, Some(Commands::Parquet(args)) => args.run().await, None => self.run().await, } @@ -285,11 +305,19 @@ impl Cli { configure_logging(verbose, quiet); - // Warn about --format=parquet migration to subcommand - if format == OutputFormat::Parquet { - log::warn!( - "The --format=parquet flag will be replaced by the `parquet` subcommand in v4.0.0. Use `tpchgen-cli parquet` instead." - ); + // Warn about --format migration to subcommands (only when explicitly provided) + if self.args.format.is_some() { + match format { + OutputFormat::Parquet => { + log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli parquet` instead."); + } + OutputFormat::Csv => { + log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli csv` instead."); + } + OutputFormat::Tbl => { + log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli tbl` instead."); + } + } } if self.args.parquet_compression.is_some() { @@ -307,17 +335,12 @@ impl Cli { } } - // Validate delimiter usage - if format == OutputFormat::Tbl && delimiter != ',' { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "The --delimiter option cannot be used with --format=tbl. TBL format uses the TPC-H standard pipe delimiter." - )); - } - - // Warn if delimiter is set but not generating CSV - if format != OutputFormat::Csv && delimiter != ',' { - log::warn!("Delimiter option set but not generating CSV"); + if self.args.delimiter != ',' { + if format == OutputFormat::Csv { + log::warn!("The --delimiter flag is deprecated. Use 'tpchgen-cli csv --delimiter=...' instead"); + } else { + log::warn!("Delimiter option set but not generating CSV"); + } } // Build the generator using the library API @@ -346,6 +369,59 @@ impl Cli { } } +impl TblArgs { + async fn run(self) -> io::Result<()> { + configure_logging(self.common.verbose, self.common.quiet); + + let mut builder = TpchGenerator::builder() + .with_scale_factor(self.common.scale_factor) + .with_output_dir(self.common.output_dir) + .with_format(OutputFormat::Tbl) + .with_num_threads(self.common.num_threads) + .with_stdout(self.common.stdout); + + if let Some(tables) = self.common.tables { + builder = builder.with_tables(tables); + } + + if let Some(parts) = self.common.parts { + builder = builder.with_parts(parts); + } + if let Some(part) = self.common.part { + builder = builder.with_part(part); + } + + builder.build().generate().await + } +} + +impl CsvArgs { + async fn run(self) -> io::Result<()> { + configure_logging(self.common.verbose, self.common.quiet); + + let mut builder = TpchGenerator::builder() + .with_scale_factor(self.common.scale_factor) + .with_output_dir(self.common.output_dir) + .with_format(OutputFormat::Csv) + .with_num_threads(self.common.num_threads) + .with_stdout(self.common.stdout) + .with_csv_delimiter(self.delimiter); + + if let Some(tables) = self.common.tables { + builder = builder.with_tables(tables); + } + + if let Some(parts) = self.common.parts { + builder = builder.with_parts(parts); + } + if let Some(part) = self.common.part { + builder = builder.with_part(part); + } + + builder.build().generate().await + } +} + impl ParquetArgs { async fn run(self) -> io::Result<()> { configure_logging(self.common.verbose, self.common.quiet); diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index a9e4095..0d2d2d9 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -720,7 +720,7 @@ async fn test_format_parquet_warns_about_subcommand() { .assert() .success() .stderr(predicates::str::contains( - "will be replaced by the `parquet` subcommand in v4.0.0", + "will be removed in v4.0.0", )); } @@ -768,3 +768,91 @@ fn test_default_format_is_tbl() { expected_file ); } + +/// Test that the `tbl` subcommand generates TBL files +#[test] +fn test_tbl_subcommand() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("tbl") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .success(); + + let expected_file = temp_dir.path().join("part.tbl"); + assert!( + expected_file.exists(), + "Expected TBL file {:?} to exist with `tbl` subcommand", + expected_file + ); +} + +/// Test that the `csv` subcommand generates CSV files +#[test] +fn test_csv_subcommand() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("csv") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .success(); + + let expected_file = temp_dir.path().join("part.csv"); + assert!( + expected_file.exists(), + "Expected CSV file {:?} to exist with `csv` subcommand", + expected_file + ); +} + +/// Test that --format=csv emits a deprecation warning +#[tokio::test] +async fn test_format_csv_warns_about_subcommand() { + let output_dir = tempdir().unwrap(); + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("csv") + .arg("--tables") + .arg("part") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(output_dir.path()) + .assert() + .success() + .stderr(predicates::str::contains( + "will be removed in v4.0.0", + )); +} + +/// Test that --format=tbl emits a deprecation warning +#[tokio::test] +async fn test_format_tbl_warns_about_subcommand() { + let output_dir = tempdir().unwrap(); + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("tbl") + .arg("--tables") + .arg("part") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(output_dir.path()) + .assert() + .success() + .stderr(predicates::str::contains( + "will be removed in v4.0.0", + )); +} From 87e228d236712a34f1d57906f6035a7f6b39ced0 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 10:34:15 -0700 Subject: [PATCH 03/23] move delimiter to csv subcommand Co-authored-by: Copilot --- tpchgen-cli/README.md | 4 +--- tpchgen-cli/bin/main.rs | 17 +---------------- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/tpchgen-cli/README.md b/tpchgen-cli/README.md index 57e21f8..2fb3087 100644 --- a/tpchgen-cli/README.md +++ b/tpchgen-cli/README.md @@ -90,15 +90,13 @@ Times to create TPCH tables in Parquet format using `tpchgen-cli` and `duckdb` f ## Deprecation Notice -`--format`, `--parquet-compression`, `--parquet-row-group-bytes`, and `--delimiter` are deprecated as of v3.x and will be removed in v4.0.0. Use subcommands instead: +`--format`, `--parquet-compression`, and `--parquet-row-group-bytes` are deprecated as of v3.x and will be removed in v4.0.0. Use subcommands instead: ```shell # Before tpchgen-cli --format=parquet --parquet-compression=ZSTD(1) -s 10 -tpchgen-cli --format=csv --delimiter='\t' -s 10 # After tpchgen-cli parquet --compression=ZSTD(1) -s 10 -tpchgen-cli csv --delimiter='\t' -s 10 ``` diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 8faeaa2..e34ce0c 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -133,11 +133,6 @@ struct TopLevelArgs { #[arg(long, hide = true)] #[deprecated] parquet_row_group_bytes: Option, - - /// CSV delimiter character (deprecated: use 'csv' subcommand instead) - #[arg(long, hide = true, default_value = ",", value_parser = parse_delimiter)] - #[deprecated] - delimiter: char, } #[derive(clap::Args)] @@ -296,7 +291,6 @@ impl Cli { let verbose = self.args.common.verbose; let quiet = self.args.common.quiet; let stdout = self.args.common.stdout; - let delimiter = self.args.delimiter; let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY); let parquet_row_group_bytes = self .args @@ -335,14 +329,6 @@ impl Cli { } } - if self.args.delimiter != ',' { - if format == OutputFormat::Csv { - log::warn!("The --delimiter flag is deprecated. Use 'tpchgen-cli csv --delimiter=...' instead"); - } else { - log::warn!("Delimiter option set but not generating CSV"); - } - } - // Build the generator using the library API let mut builder = TpchGenerator::builder() .with_scale_factor(scale_factor) @@ -351,8 +337,7 @@ impl Cli { .with_num_threads(num_threads) .with_parquet_compression(parquet_compression) .with_parquet_row_group_bytes(parquet_row_group_bytes) - .with_stdout(stdout) - .with_csv_delimiter(delimiter); + .with_stdout(stdout); if let Some(tables) = self.args.common.tables { builder = builder.with_tables(tables); From d9c7af3ab8f410d418672d646d9706076e9ad042 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 10:45:42 -0700 Subject: [PATCH 04/23] unnecessary changes Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index e34ce0c..f99abe1 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -38,6 +38,10 @@ Examples tpchgen-cli -s 1 --output-dir=/tmp/tpch +# Generate all tables in CSV format: + +tpchgen-cli csv -s 1 --output-dir=/tmp/tpch + # Generate all tables in CSV format with tab delimiter: tpchgen-cli csv -s 1 --delimiter='\t' --output-dir=/tmp/tpch @@ -150,7 +154,7 @@ struct CsvArgs { /// /// Specifies the delimiter character to use when generating CSV files. /// - /// Supports escape sequences: \t (tab), \n (newline), \r (carriage return), \\\ (backslash) + /// Supports escape sequences: \t (tab), \n (newline), \r (carriage return), \\ (backslash) /// Common delimiters: ',' (comma), '|' (pipe), '\t' (tab), ';' (semicolon) #[arg(long, default_value = ",", value_parser = parse_delimiter)] delimiter: char, From d9f54c159e83cd1ad6b47417b11bf8cfebe71053 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 10:47:17 -0700 Subject: [PATCH 05/23] refactor --- tpchgen-cli/bin/main.rs | 147 ++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 95 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index f99abe1..4c5fc4e 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -14,7 +14,8 @@ use std::io; use std::path::PathBuf; use std::str::FromStr; use tpchgen_cli::{ - Compression, OutputFormat, Table, TpchGenerator, DEFAULT_PARQUET_ROW_GROUP_BYTES, + Compression, OutputFormat, Table, TpchGenerator, TpchGeneratorBuilder, + DEFAULT_PARQUET_ROW_GROUP_BYTES, }; #[derive(Parser)] @@ -116,6 +117,30 @@ struct CommonArgs { stdout: bool, } +impl CommonArgs { + /// Create a [`TpchGeneratorBuilder`] pre-configured with the common options. + fn builder(self, format: OutputFormat) -> TpchGeneratorBuilder { + let mut builder = TpchGenerator::builder() + .with_scale_factor(self.scale_factor) + .with_output_dir(self.output_dir) + .with_format(format) + .with_num_threads(self.num_threads) + .with_stdout(self.stdout); + + if let Some(tables) = self.tables { + builder = builder.with_tables(tables); + } + if let Some(parts) = self.parts { + builder = builder.with_parts(parts); + } + if let Some(part) = self.part { + builder = builder.with_part(part); + } + + builder + } +} + #[derive(clap::Args)] struct TopLevelArgs { #[command(flatten)] @@ -289,19 +314,8 @@ impl Cli { #[allow(deprecated)] async fn run(self) -> io::Result<()> { let format = self.args.format.unwrap_or(OutputFormat::Tbl); - let scale_factor = self.args.common.scale_factor; - let output_dir = self.args.common.output_dir; - let num_threads = self.args.common.num_threads; - let verbose = self.args.common.verbose; - let quiet = self.args.common.quiet; - let stdout = self.args.common.stdout; - let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY); - let parquet_row_group_bytes = self - .args - .parquet_row_group_bytes - .unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES); - configure_logging(verbose, quiet); + configure_logging(self.args.common.verbose, self.args.common.quiet); // Warn about --format migration to subcommands (only when explicitly provided) if self.args.format.is_some() { @@ -318,6 +332,7 @@ impl Cli { } } + let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY); if self.args.parquet_compression.is_some() { if format == OutputFormat::Parquet { log::warn!("The --parquet-compression flag is deprecated. Use 'tpchgen-cli parquet --compression=...' instead"); @@ -325,6 +340,11 @@ impl Cli { log::warn!("Parquet compression option set but not generating Parquet files"); } } + + let parquet_row_group_bytes = self + .args + .parquet_row_group_bytes + .unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES); if self.args.parquet_row_group_bytes.is_some() { if format == OutputFormat::Parquet { log::warn!("The --parquet-row-group-bytes flag is deprecated. Use 'tpchgen-cli parquet --row-group-bytes=...' instead"); @@ -333,109 +353,46 @@ impl Cli { } } - // Build the generator using the library API - let mut builder = TpchGenerator::builder() - .with_scale_factor(scale_factor) - .with_output_dir(output_dir) - .with_format(format) - .with_num_threads(num_threads) + self.args + .common + .builder(format) .with_parquet_compression(parquet_compression) .with_parquet_row_group_bytes(parquet_row_group_bytes) - .with_stdout(stdout); - - if let Some(tables) = self.args.common.tables { - builder = builder.with_tables(tables); - } - - if let Some(parts) = self.args.common.parts { - builder = builder.with_parts(parts); - } - if let Some(part) = self.args.common.part { - builder = builder.with_part(part); - } - - builder.build().generate().await + .build() + .generate() + .await } } impl TblArgs { async fn run(self) -> io::Result<()> { configure_logging(self.common.verbose, self.common.quiet); - - let mut builder = TpchGenerator::builder() - .with_scale_factor(self.common.scale_factor) - .with_output_dir(self.common.output_dir) - .with_format(OutputFormat::Tbl) - .with_num_threads(self.common.num_threads) - .with_stdout(self.common.stdout); - - if let Some(tables) = self.common.tables { - builder = builder.with_tables(tables); - } - - if let Some(parts) = self.common.parts { - builder = builder.with_parts(parts); - } - if let Some(part) = self.common.part { - builder = builder.with_part(part); - } - - builder.build().generate().await + self.common.builder(OutputFormat::Tbl).build().generate().await } } impl CsvArgs { async fn run(self) -> io::Result<()> { configure_logging(self.common.verbose, self.common.quiet); - - let mut builder = TpchGenerator::builder() - .with_scale_factor(self.common.scale_factor) - .with_output_dir(self.common.output_dir) - .with_format(OutputFormat::Csv) - .with_num_threads(self.common.num_threads) - .with_stdout(self.common.stdout) - .with_csv_delimiter(self.delimiter); - - if let Some(tables) = self.common.tables { - builder = builder.with_tables(tables); - } - - if let Some(parts) = self.common.parts { - builder = builder.with_parts(parts); - } - if let Some(part) = self.common.part { - builder = builder.with_part(part); - } - - builder.build().generate().await + self.common + .builder(OutputFormat::Csv) + .with_csv_delimiter(self.delimiter) + .build() + .generate() + .await } } impl ParquetArgs { async fn run(self) -> io::Result<()> { configure_logging(self.common.verbose, self.common.quiet); - - let mut builder = TpchGenerator::builder() - .with_scale_factor(self.common.scale_factor) - .with_output_dir(self.common.output_dir) - .with_format(OutputFormat::Parquet) - .with_num_threads(self.common.num_threads) - .with_stdout(self.common.stdout) + self.common + .builder(OutputFormat::Parquet) .with_parquet_compression(self.compression) - .with_parquet_row_group_bytes(self.row_group_bytes); - - if let Some(tables) = self.common.tables { - builder = builder.with_tables(tables); - } - - if let Some(parts) = self.common.parts { - builder = builder.with_parts(parts); - } - if let Some(part) = self.common.part { - builder = builder.with_part(part); - } - - builder.build().generate().await + .with_parquet_row_group_bytes(self.row_group_bytes) + .build() + .generate() + .await } } From 1100bd589713e48d0789c0c6b42e4a5d7575ad12 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 10:50:01 -0700 Subject: [PATCH 06/23] lint --- tpchgen-cli/bin/main.rs | 6 +++++- tpchgen-cli/tests/cli_integration.rs | 12 +++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 4c5fc4e..ad96b49 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -367,7 +367,11 @@ impl Cli { impl TblArgs { async fn run(self) -> io::Result<()> { configure_logging(self.common.verbose, self.common.quiet); - self.common.builder(OutputFormat::Tbl).build().generate().await + self.common + .builder(OutputFormat::Tbl) + .build() + .generate() + .await } } diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index 0d2d2d9..60e35bf 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -719,9 +719,7 @@ async fn test_format_parquet_warns_about_subcommand() { .arg(output_dir.path()) .assert() .success() - .stderr(predicates::str::contains( - "will be removed in v4.0.0", - )); + .stderr(predicates::str::contains("will be removed in v4.0.0")); } /// Test that using --format together with a subcommand errors @@ -832,9 +830,7 @@ async fn test_format_csv_warns_about_subcommand() { .arg(output_dir.path()) .assert() .success() - .stderr(predicates::str::contains( - "will be removed in v4.0.0", - )); + .stderr(predicates::str::contains("will be removed in v4.0.0")); } /// Test that --format=tbl emits a deprecation warning @@ -852,7 +848,5 @@ async fn test_format_tbl_warns_about_subcommand() { .arg(output_dir.path()) .assert() .success() - .stderr(predicates::str::contains( - "will be removed in v4.0.0", - )); + .stderr(predicates::str::contains("will be removed in v4.0.0")); } From d3cfe8948d1629077f1b26fcffe7d2f3d0555f09 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 12:46:36 -0700 Subject: [PATCH 07/23] add todo to remove top level parquet in v4.0.0 Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index ad96b49..a131c42 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -353,14 +353,15 @@ impl Cli { } } - self.args - .common - .builder(format) - .with_parquet_compression(parquet_compression) - .with_parquet_row_group_bytes(parquet_row_group_bytes) - .build() - .generate() - .await + let mut builder = self.args.common.builder(format); + // TODO: Remove this block when the deprecated top-level --parquet-compression + // and --parquet-row-group-bytes flags are removed in v4.0.0. + if format == OutputFormat::Parquet { + builder = builder + .with_parquet_compression(parquet_compression) + .with_parquet_row_group_bytes(parquet_row_group_bytes); + } + builder.build().generate().await } } From a15df5ed58d44cda4d8358fea203f732d77e7450 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 12:53:19 -0700 Subject: [PATCH 08/23] error when mixing top level option with subcommand Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 26 ++++++--- tpchgen-cli/tests/cli_integration.rs | 80 ++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 6 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index a131c42..76c9ab8 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -296,12 +296,26 @@ impl Cli { /// Main function to run the generation #[allow(deprecated)] async fn main(self) -> io::Result<()> { - // Error if both --format and a subcommand are specified - if self.args.format.is_some() && self.command.is_some() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "Cannot use --format with a subcommand. Use the subcommand directly, e.g. `tpchgen-cli parquet`", - )); + // Error if deprecated top-level flags are used with a subcommand + if self.command.is_some() { + if self.args.format.is_some() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Cannot use --format with a subcommand. Use the subcommand directly, e.g. `tpchgen-cli parquet`", + )); + } + if self.args.parquet_compression.is_some() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Cannot use --parquet-compression with a subcommand. Use `tpchgen-cli parquet --compression=...` instead", + )); + } + if self.args.parquet_row_group_bytes.is_some() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Cannot use --parquet-row-group-bytes with a subcommand. Use `tpchgen-cli parquet --row-group-bytes=...` instead", + )); + } } match self.command { Some(Commands::Tbl(args)) => args.run().await, diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index 60e35bf..6b1e86f 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -744,6 +744,86 @@ fn test_format_with_subcommand_conflict() { )); } +/// Test that using --parquet-compression together with a subcommand errors +#[test] +fn test_parquet_compression_with_subcommand_conflict() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + // With parquet subcommand + cargo_bin_cmd!("tpchgen-cli") + .arg("--parquet-compression") + .arg("SNAPPY") + .arg("parquet") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains( + "Cannot use --parquet-compression with a subcommand", + )); + + // With tbl subcommand + cargo_bin_cmd!("tpchgen-cli") + .arg("--parquet-compression") + .arg("SNAPPY") + .arg("tbl") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains( + "Cannot use --parquet-compression with a subcommand", + )); +} + +/// Test that using --parquet-row-group-bytes together with a subcommand errors +#[test] +fn test_parquet_row_group_bytes_with_subcommand_conflict() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + // With parquet subcommand + cargo_bin_cmd!("tpchgen-cli") + .arg("--parquet-row-group-bytes") + .arg("1000000") + .arg("parquet") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains( + "Cannot use --parquet-row-group-bytes with a subcommand", + )); + + // With csv subcommand + cargo_bin_cmd!("tpchgen-cli") + .arg("--parquet-row-group-bytes") + .arg("1000000") + .arg("csv") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains( + "Cannot use --parquet-row-group-bytes with a subcommand", + )); +} + /// Test that running with no --format and no subcommand defaults to TBL #[test] fn test_default_format_is_tbl() { From 20093b27c1e4b0e488b48b3ccca5433eaf7e5f97 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 13:06:30 -0700 Subject: [PATCH 09/23] use args_conflicts_with_subcommands Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 27 ++++------------- tpchgen-cli/tests/cli_integration.rs | 43 ++++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 76c9ab8..587152a 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -55,12 +55,16 @@ tpchgen-cli parquet -s 100 --tables=lineitem --parts=10 --output-dir=/tmp/tpch # Generate scale factor one in current directory, seeing debug output RUST_LOG=debug tpchgen-cli -s 1 --output-dir=/tmp/tpch -"# +"#, + args_conflicts_with_subcommands = true )] struct Cli { #[command(subcommand)] command: Option, + // Top-level args are only used when no subcommand is given (legacy path). + // args_conflicts_with_subcommands prevents these from being silently ignored + // when a subcommand is present (e.g. `tpchgen-cli -s 10 parquet` is an error). #[command(flatten)] args: TopLevelArgs, } @@ -296,27 +300,6 @@ impl Cli { /// Main function to run the generation #[allow(deprecated)] async fn main(self) -> io::Result<()> { - // Error if deprecated top-level flags are used with a subcommand - if self.command.is_some() { - if self.args.format.is_some() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "Cannot use --format with a subcommand. Use the subcommand directly, e.g. `tpchgen-cli parquet`", - )); - } - if self.args.parquet_compression.is_some() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "Cannot use --parquet-compression with a subcommand. Use `tpchgen-cli parquet --compression=...` instead", - )); - } - if self.args.parquet_row_group_bytes.is_some() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "Cannot use --parquet-row-group-bytes with a subcommand. Use `tpchgen-cli parquet --row-group-bytes=...` instead", - )); - } - } match self.command { Some(Commands::Tbl(args)) => args.run().await, Some(Commands::Csv(args)) => args.run().await, diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index 6b1e86f..a4bbc27 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -740,7 +740,7 @@ fn test_format_with_subcommand_conflict() { .assert() .failure() .stderr(predicates::str::contains( - "Cannot use --format with a subcommand", + "cannot be used with", )); } @@ -763,7 +763,7 @@ fn test_parquet_compression_with_subcommand_conflict() { .assert() .failure() .stderr(predicates::str::contains( - "Cannot use --parquet-compression with a subcommand", + "cannot be used with", )); // With tbl subcommand @@ -780,7 +780,7 @@ fn test_parquet_compression_with_subcommand_conflict() { .assert() .failure() .stderr(predicates::str::contains( - "Cannot use --parquet-compression with a subcommand", + "cannot be used with", )); } @@ -803,7 +803,7 @@ fn test_parquet_row_group_bytes_with_subcommand_conflict() { .assert() .failure() .stderr(predicates::str::contains( - "Cannot use --parquet-row-group-bytes with a subcommand", + "cannot be used with", )); // With csv subcommand @@ -820,10 +820,43 @@ fn test_parquet_row_group_bytes_with_subcommand_conflict() { .assert() .failure() .stderr(predicates::str::contains( - "Cannot use --parquet-row-group-bytes with a subcommand", + "cannot be used with", )); } +/// Test that common args before a subcommand are rejected +#[test] +fn test_common_args_with_subcommand_conflict() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + // -s before subcommand should error + cargo_bin_cmd!("tpchgen-cli") + .arg("-s") + .arg("0.01") + .arg("parquet") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains( + "cannot be used with", + )); + + // -s after subcommand should work + cargo_bin_cmd!("tpchgen-cli") + .arg("parquet") + .arg("-s") + .arg("0.01") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .success(); +} + /// Test that running with no --format and no subcommand defaults to TBL #[test] fn test_default_format_is_tbl() { From 183df47c6bbf17da5b888cf993e83eac25f6ae88 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 13:14:10 -0700 Subject: [PATCH 10/23] more tests --- tpchgen-cli/tests/cli_integration.rs | 115 +++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index a4bbc27..0c2139b 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -963,3 +963,118 @@ async fn test_format_tbl_warns_about_subcommand() { .success() .stderr(predicates::str::contains("will be removed in v4.0.0")); } + +/// Test that the `csv` subcommand with a custom delimiter produces tab-delimited output +#[test] +fn test_csv_subcommand_custom_delimiter() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("csv") + .arg("--delimiter") + .arg("\\t") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("region") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .success(); + + let csv_file = temp_dir.path().join("region.csv"); + assert!(csv_file.exists(), "Expected CSV file {:?} to exist", csv_file); + + let contents = std::fs::read_to_string(&csv_file).unwrap(); + // Region table has 5 rows; each should contain tabs as delimiters + assert!( + contents.contains('\t'), + "Expected tab-delimited output, got:\n{}", + contents + ); + // Verify multiple tab-separated fields per line + let first_line = contents.lines().next().unwrap(); + let tab_count = first_line.matches('\t').count(); + assert!( + tab_count >= 2, + "Expected at least 2 tabs per line, got {} in: {}", + tab_count, + first_line + ); +} + +/// Test that the `tbl` subcommand rejects --delimiter +#[test] +fn test_tbl_subcommand_rejects_delimiter() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("tbl") + .arg("--delimiter") + .arg(",") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("unexpected argument")); +} + +/// Test that deprecated --format=parquet with --parquet-compression still works +#[tokio::test] +async fn test_deprecated_parquet_compression_flag_works() { + let output_dir = tempdir().unwrap(); + + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("parquet") + .arg("--parquet-compression") + .arg("ZSTD(1)") + .arg("--tables") + .arg("region") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(output_dir.path()) + .assert() + .success() + .stderr(predicates::str::contains("--parquet-compression flag is deprecated")); + + let parquet_file = output_dir.path().join("region.parquet"); + assert!( + parquet_file.exists(), + "Expected Parquet file {:?} to exist", + parquet_file + ); +} + +/// Test that deprecated --format=parquet with --parquet-row-group-bytes still works +#[tokio::test] +async fn test_deprecated_parquet_row_group_bytes_flag_works() { + let output_dir = tempdir().unwrap(); + + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("parquet") + .arg("--parquet-row-group-bytes") + .arg("1000000") + .arg("--tables") + .arg("region") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(output_dir.path()) + .assert() + .success() + .stderr(predicates::str::contains("--parquet-row-group-bytes flag is deprecated")); + + let parquet_file = output_dir.path().join("region.parquet"); + assert!( + parquet_file.exists(), + "Expected Parquet file {:?} to exist", + parquet_file + ); +} From 898093fa166b19ce3513245c1f27eec20b33ea96 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 13:26:20 -0700 Subject: [PATCH 11/23] fixes Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 5 ----- tpchgen-cli/tests/cli_integration.rs | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 587152a..3f44c20 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -154,17 +154,14 @@ struct TopLevelArgs { /// /// The --format flag will be removed in v4.0.0. #[arg(short, long, hide = true)] - #[deprecated] format: Option, /// Parquet block compression format (deprecated: use 'parquet' subcommand instead) #[arg(short = 'c', long, hide = true)] - #[deprecated] parquet_compression: Option, /// Target row group size in bytes (deprecated: use 'parquet' subcommand instead) #[arg(long, hide = true)] - #[deprecated] parquet_row_group_bytes: Option, } @@ -298,7 +295,6 @@ async fn main() -> io::Result<()> { impl Cli { /// Main function to run the generation - #[allow(deprecated)] async fn main(self) -> io::Result<()> { match self.command { Some(Commands::Tbl(args)) => args.run().await, @@ -308,7 +304,6 @@ impl Cli { } } - #[allow(deprecated)] async fn run(self) -> io::Result<()> { let format = self.args.format.unwrap_or(OutputFormat::Tbl); diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index 0c2139b..efedbe8 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -944,6 +944,13 @@ async fn test_format_csv_warns_about_subcommand() { .assert() .success() .stderr(predicates::str::contains("will be removed in v4.0.0")); + + let expected_file = output_dir.path().join("part.csv"); + assert!( + expected_file.exists(), + "Expected CSV file {:?} to exist with deprecated --format=csv path", + expected_file + ); } /// Test that --format=tbl emits a deprecation warning From f76477263a546c96d5557fc5ba6752c2268d0aa6 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 13:31:14 -0700 Subject: [PATCH 12/23] fmt --- tpchgen-cli/tests/cli_integration.rs | 38 +++++++++++++--------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index efedbe8..2a8e240 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -739,9 +739,7 @@ fn test_format_with_subcommand_conflict() { .arg(temp_dir.path()) .assert() .failure() - .stderr(predicates::str::contains( - "cannot be used with", - )); + .stderr(predicates::str::contains("cannot be used with")); } /// Test that using --parquet-compression together with a subcommand errors @@ -762,9 +760,7 @@ fn test_parquet_compression_with_subcommand_conflict() { .arg(temp_dir.path()) .assert() .failure() - .stderr(predicates::str::contains( - "cannot be used with", - )); + .stderr(predicates::str::contains("cannot be used with")); // With tbl subcommand cargo_bin_cmd!("tpchgen-cli") @@ -779,9 +775,7 @@ fn test_parquet_compression_with_subcommand_conflict() { .arg(temp_dir.path()) .assert() .failure() - .stderr(predicates::str::contains( - "cannot be used with", - )); + .stderr(predicates::str::contains("cannot be used with")); } /// Test that using --parquet-row-group-bytes together with a subcommand errors @@ -802,9 +796,7 @@ fn test_parquet_row_group_bytes_with_subcommand_conflict() { .arg(temp_dir.path()) .assert() .failure() - .stderr(predicates::str::contains( - "cannot be used with", - )); + .stderr(predicates::str::contains("cannot be used with")); // With csv subcommand cargo_bin_cmd!("tpchgen-cli") @@ -819,9 +811,7 @@ fn test_parquet_row_group_bytes_with_subcommand_conflict() { .arg(temp_dir.path()) .assert() .failure() - .stderr(predicates::str::contains( - "cannot be used with", - )); + .stderr(predicates::str::contains("cannot be used with")); } /// Test that common args before a subcommand are rejected @@ -840,9 +830,7 @@ fn test_common_args_with_subcommand_conflict() { .arg(temp_dir.path()) .assert() .failure() - .stderr(predicates::str::contains( - "cannot be used with", - )); + .stderr(predicates::str::contains("cannot be used with")); // -s after subcommand should work cargo_bin_cmd!("tpchgen-cli") @@ -990,7 +978,11 @@ fn test_csv_subcommand_custom_delimiter() { .success(); let csv_file = temp_dir.path().join("region.csv"); - assert!(csv_file.exists(), "Expected CSV file {:?} to exist", csv_file); + assert!( + csv_file.exists(), + "Expected CSV file {:?} to exist", + csv_file + ); let contents = std::fs::read_to_string(&csv_file).unwrap(); // Region table has 5 rows; each should contain tabs as delimiters @@ -1048,7 +1040,9 @@ async fn test_deprecated_parquet_compression_flag_works() { .arg(output_dir.path()) .assert() .success() - .stderr(predicates::str::contains("--parquet-compression flag is deprecated")); + .stderr(predicates::str::contains( + "--parquet-compression flag is deprecated", + )); let parquet_file = output_dir.path().join("region.parquet"); assert!( @@ -1076,7 +1070,9 @@ async fn test_deprecated_parquet_row_group_bytes_flag_works() { .arg(output_dir.path()) .assert() .success() - .stderr(predicates::str::contains("--parquet-row-group-bytes flag is deprecated")); + .stderr(predicates::str::contains( + "--parquet-row-group-bytes flag is deprecated", + )); let parquet_file = output_dir.path().join("region.parquet"); assert!( From 1357305a53d9709dc67ddcd495e0b0b9d25f6ad0 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 13:44:51 -0700 Subject: [PATCH 13/23] nits Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 3f44c20..9e1dfae 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -43,7 +43,7 @@ tpchgen-cli -s 1 --output-dir=/tmp/tpch tpchgen-cli csv -s 1 --output-dir=/tmp/tpch -# Generate all tables in CSV format with tab delimiter: +# Generate scale factor one in CSV format with tab delimiter: tpchgen-cli csv -s 1 --delimiter='\t' --output-dir=/tmp/tpch @@ -346,8 +346,6 @@ impl Cli { } let mut builder = self.args.common.builder(format); - // TODO: Remove this block when the deprecated top-level --parquet-compression - // and --parquet-row-group-bytes flags are removed in v4.0.0. if format == OutputFormat::Parquet { builder = builder .with_parquet_compression(parquet_compression) From d55d50498618a2a79ef63f52357e0217cd7af446 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 14:38:28 -0700 Subject: [PATCH 14/23] warn when --delimiter is used with --format=csv Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 11 +++++++++++ tpchgen-cli/tests/cli_integration.rs | 23 +++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 9e1dfae..0fcf423 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -163,6 +163,10 @@ struct TopLevelArgs { /// Target row group size in bytes (deprecated: use 'parquet' subcommand instead) #[arg(long, hide = true)] parquet_row_group_bytes: Option, + + /// CSV delimiter character (use 'csv --delimiter=...' instead) + #[arg(long, hide = true, value_parser = parse_delimiter)] + delimiter: Option, } #[derive(clap::Args)] @@ -345,6 +349,13 @@ impl Cli { } } + if self.args.delimiter.is_some() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "The --delimiter flag is not supported at the top level. Use `tpchgen-cli csv --delimiter=...` instead.", + )); + } + let mut builder = self.args.common.builder(format); if format == OutputFormat::Parquet { builder = builder diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index 2a8e240..4b68e21 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -1081,3 +1081,26 @@ async fn test_deprecated_parquet_row_group_bytes_flag_works() { parquet_file ); } + +/// Test that --delimiter at the top level errors with a helpful message +#[test] +fn test_top_level_delimiter_errors() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("csv") + .arg("--delimiter") + .arg("\\t") + .arg("--tables") + .arg("region") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains( + "Use `tpchgen-cli csv --delimiter=...` instead", + )); +} From 59f200fb37ddf44439846662974cf4c8a8465bb0 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 15:21:24 -0700 Subject: [PATCH 15/23] remove top level delimiter Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 22 ++++++---------------- tpchgen-cli/tests/cli_integration.rs | 23 ----------------------- 2 files changed, 6 insertions(+), 39 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 0fcf423..7803493 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -163,10 +163,6 @@ struct TopLevelArgs { /// Target row group size in bytes (deprecated: use 'parquet' subcommand instead) #[arg(long, hide = true)] parquet_row_group_bytes: Option, - - /// CSV delimiter character (use 'csv --delimiter=...' instead) - #[arg(long, hide = true, value_parser = parse_delimiter)] - delimiter: Option, } #[derive(clap::Args)] @@ -328,7 +324,6 @@ impl Cli { } } - let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY); if self.args.parquet_compression.is_some() { if format == OutputFormat::Parquet { log::warn!("The --parquet-compression flag is deprecated. Use 'tpchgen-cli parquet --compression=...' instead"); @@ -337,10 +332,6 @@ impl Cli { } } - let parquet_row_group_bytes = self - .args - .parquet_row_group_bytes - .unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES); if self.args.parquet_row_group_bytes.is_some() { if format == OutputFormat::Parquet { log::warn!("The --parquet-row-group-bytes flag is deprecated. Use 'tpchgen-cli parquet --row-group-bytes=...' instead"); @@ -349,15 +340,14 @@ impl Cli { } } - if self.args.delimiter.is_some() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "The --delimiter flag is not supported at the top level. Use `tpchgen-cli csv --delimiter=...` instead.", - )); - } - let mut builder = self.args.common.builder(format); if format == OutputFormat::Parquet { + let parquet_compression = + self.args.parquet_compression.unwrap_or(Compression::SNAPPY); + let parquet_row_group_bytes = self + .args + .parquet_row_group_bytes + .unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES); builder = builder .with_parquet_compression(parquet_compression) .with_parquet_row_group_bytes(parquet_row_group_bytes); diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index 4b68e21..2a8e240 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -1081,26 +1081,3 @@ async fn test_deprecated_parquet_row_group_bytes_flag_works() { parquet_file ); } - -/// Test that --delimiter at the top level errors with a helpful message -#[test] -fn test_top_level_delimiter_errors() { - let temp_dir = tempdir().expect("Failed to create temporary directory"); - - cargo_bin_cmd!("tpchgen-cli") - .arg("--format") - .arg("csv") - .arg("--delimiter") - .arg("\\t") - .arg("--tables") - .arg("region") - .arg("--scale-factor") - .arg("0.001") - .arg("--output-dir") - .arg(temp_dir.path()) - .assert() - .failure() - .stderr(predicates::str::contains( - "Use `tpchgen-cli csv --delimiter=...` instead", - )); -} From 986137aa28b6aba1d07f7012f1d087e5dd62fa8c Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 18:11:17 -0700 Subject: [PATCH 16/23] fmt --- tpchgen-cli/bin/main.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 7803493..7c618ab 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -342,8 +342,7 @@ impl Cli { let mut builder = self.args.common.builder(format); if format == OutputFormat::Parquet { - let parquet_compression = - self.args.parquet_compression.unwrap_or(Compression::SNAPPY); + let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY); let parquet_row_group_bytes = self .args .parquet_row_group_bytes From ee6a992fc0f67f7f523f09f998fc23d51475c6be Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 18:16:57 -0700 Subject: [PATCH 17/23] simplify parquet_compression / parquet_row_group_bytes Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 7c618ab..c7bc4bd 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -324,33 +324,26 @@ impl Cli { } } - if self.args.parquet_compression.is_some() { + let mut builder = self.args.common.builder(format); + + if let Some(parquet_compression) = self.args.parquet_compression { if format == OutputFormat::Parquet { log::warn!("The --parquet-compression flag is deprecated. Use 'tpchgen-cli parquet --compression=...' instead"); + builder = builder.with_parquet_compression(parquet_compression); } else { log::warn!("Parquet compression option set but not generating Parquet files"); } } - if self.args.parquet_row_group_bytes.is_some() { + if let Some(parquet_row_group_bytes) = self.args.parquet_row_group_bytes { if format == OutputFormat::Parquet { log::warn!("The --parquet-row-group-bytes flag is deprecated. Use 'tpchgen-cli parquet --row-group-bytes=...' instead"); + builder = builder.with_parquet_row_group_bytes(parquet_row_group_bytes); } else { log::warn!("Parquet row group size option set but not generating Parquet files"); } } - let mut builder = self.args.common.builder(format); - if format == OutputFormat::Parquet { - let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY); - let parquet_row_group_bytes = self - .args - .parquet_row_group_bytes - .unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES); - builder = builder - .with_parquet_compression(parquet_compression) - .with_parquet_row_group_bytes(parquet_row_group_bytes); - } builder.build().generate().await } } From 802522d56cd93a3da712378033022afcdeca0e62 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 18:18:40 -0700 Subject: [PATCH 18/23] simplify logic Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index c7bc4bd..9e96d64 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -311,17 +311,14 @@ impl Cli { // Warn about --format migration to subcommands (only when explicitly provided) if self.args.format.is_some() { - match format { - OutputFormat::Parquet => { - log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli parquet` instead."); - } - OutputFormat::Csv => { - log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli csv` instead."); - } - OutputFormat::Tbl => { - log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli tbl` instead."); - } - } + let subcommand = match format { + OutputFormat::Parquet => "parquet", + OutputFormat::Csv => "csv", + OutputFormat::Tbl => "tbl", + }; + log::warn!( + "The --format flag will be removed in v4.0.0. Use `tpchgen-cli {subcommand}` instead." + ); } let mut builder = self.args.common.builder(format); From 8715d0cacae49dd1f7bf13edcbf5c6beef9bb5e8 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 18:20:35 -0700 Subject: [PATCH 19/23] format Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 9e96d64..080fae1 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -305,12 +305,10 @@ impl Cli { } async fn run(self) -> io::Result<()> { - let format = self.args.format.unwrap_or(OutputFormat::Tbl); - configure_logging(self.args.common.verbose, self.args.common.quiet); // Warn about --format migration to subcommands (only when explicitly provided) - if self.args.format.is_some() { + let format = if let Some(format) = self.args.format { let subcommand = match format { OutputFormat::Parquet => "parquet", OutputFormat::Csv => "csv", @@ -319,7 +317,10 @@ impl Cli { log::warn!( "The --format flag will be removed in v4.0.0. Use `tpchgen-cli {subcommand}` instead." ); - } + format + } else { + OutputFormat::Tbl + }; let mut builder = self.args.common.builder(format); From 63de9824976739fb67a08f8c369d75afb6102660 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 18:23:37 -0700 Subject: [PATCH 20/23] csv reject non ascii Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 11 ++++++++++- tpchgen-cli/tests/cli_integration.rs | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 080fae1..9fb0884 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -223,7 +223,10 @@ struct ParquetArgs { row_group_bytes: i64, } -/// Parse a delimiter string, handling escape sequences +/// Parse a delimiter string, handling escape sequences. +/// +/// The underlying arrow-csv writer requires an ASCII byte for the delimiter, +/// so non-ASCII characters are rejected here rather than failing mid-generation. fn parse_delimiter(s: &str) -> Result { // Handle common escape sequences let parsed = match s { @@ -243,6 +246,12 @@ fn parse_delimiter(s: &str) -> Result { chars[0] } }; + if !parsed.is_ascii() { + return Err(format!( + "Delimiter must be an ASCII character, got: '{}'", + parsed + )); + } Ok(parsed) } diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index 2a8e240..2fac2cb 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -1002,6 +1002,26 @@ fn test_csv_subcommand_custom_delimiter() { ); } +/// Test that the `csv` subcommand rejects a non-ASCII delimiter at parse time +#[test] +fn test_csv_subcommand_rejects_non_ascii_delimiter() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("csv") + .arg("--delimiter") + .arg("€") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("region") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("ASCII")); +} + /// Test that the `tbl` subcommand rejects --delimiter #[test] fn test_tbl_subcommand_rejects_delimiter() { From 9cc56cdc218a746ba57bdf8f46d239fd3ecd0801 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 18:26:28 -0700 Subject: [PATCH 21/23] configure logging once Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index 9fb0884..dc10932 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -305,6 +305,14 @@ async fn main() -> io::Result<()> { impl Cli { /// Main function to run the generation async fn main(self) -> io::Result<()> { + let common = match &self.command { + Some(Commands::Tbl(args)) => &args.common, + Some(Commands::Csv(args)) => &args.common, + Some(Commands::Parquet(args)) => &args.common, + None => &self.args.common, + }; + configure_logging(common.verbose, common.quiet); + match self.command { Some(Commands::Tbl(args)) => args.run().await, Some(Commands::Csv(args)) => args.run().await, @@ -314,8 +322,6 @@ impl Cli { } async fn run(self) -> io::Result<()> { - configure_logging(self.args.common.verbose, self.args.common.quiet); - // Warn about --format migration to subcommands (only when explicitly provided) let format = if let Some(format) = self.args.format { let subcommand = match format { @@ -357,7 +363,6 @@ impl Cli { impl TblArgs { async fn run(self) -> io::Result<()> { - configure_logging(self.common.verbose, self.common.quiet); self.common .builder(OutputFormat::Tbl) .build() @@ -368,7 +373,6 @@ impl TblArgs { impl CsvArgs { async fn run(self) -> io::Result<()> { - configure_logging(self.common.verbose, self.common.quiet); self.common .builder(OutputFormat::Csv) .with_csv_delimiter(self.delimiter) @@ -380,7 +384,6 @@ impl CsvArgs { impl ParquetArgs { async fn run(self) -> io::Result<()> { - configure_logging(self.common.verbose, self.common.quiet); self.common .builder(OutputFormat::Parquet) .with_parquet_compression(self.compression) From d9c8d2fe57527705cc528d56cd2652d65919480b Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 18:33:41 -0700 Subject: [PATCH 22/23] better error msg Co-authored-by: Copilot --- tpchgen-cli/bin/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index dc10932..f41e68c 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -344,7 +344,7 @@ impl Cli { log::warn!("The --parquet-compression flag is deprecated. Use 'tpchgen-cli parquet --compression=...' instead"); builder = builder.with_parquet_compression(parquet_compression); } else { - log::warn!("Parquet compression option set but not generating Parquet files"); + log::warn!("--parquet-compression ignored: output format is not parquet"); } } @@ -353,7 +353,7 @@ impl Cli { log::warn!("The --parquet-row-group-bytes flag is deprecated. Use 'tpchgen-cli parquet --row-group-bytes=...' instead"); builder = builder.with_parquet_row_group_bytes(parquet_row_group_bytes); } else { - log::warn!("Parquet row group size option set but not generating Parquet files"); + log::warn!("--parquet-row-group-bytes ignored: output format is not parquet"); } } From cabd01bd135219a8ea27c6ae645f9bb1a4208233 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 May 2026 18:35:38 -0700 Subject: [PATCH 23/23] fix test --- tpchgen-cli/tests/cli_integration.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index 2fac2cb..f25a7b5 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -605,10 +605,10 @@ async fn test_incompatible_options_warnings() { // still success, but should see warnings in stderr .success() .stderr(predicates::str::contains( - "Parquet compression option set but not generating Parquet files", + "--parquet-compression ignored: output format is not parquet", )) .stderr(predicates::str::contains( - "Parquet row group size option set but not generating Parquet files", + "--parquet-row-group-bytes ignored: output format is not parquet", )); }