diff --git a/tpchgen-cli/README.md b/tpchgen-cli/README.md index edcb590..2fb3087 100644 --- a/tpchgen-cli/README.md +++ b/tpchgen-cli/README.md @@ -49,6 +49,7 @@ tpchgen-cli parquet -s 10 # Scale Factor 10, all tables, in `tbl`(csv like) format in the `sf10` directory # (10GB, 8 files, 60M lineitem rows) +# Note: `tpchgen-cli tbl` also works explicitly tpchgen-cli -s 10 --output-dir sf10 # Scale Factor 1000, lineitem table, in Apache Parquet format in sf1000 directory, @@ -87,3 +88,15 @@ done Times to create TPCH tables in Parquet format using `tpchgen-cli` and `duckdb` for various scale factors. +## Deprecation Notice + +`--format`, `--parquet-compression`, and `--parquet-row-group-bytes` are deprecated as of v3.x and will be removed in v4.0.0. Use subcommands instead: + +```shell +# Before +tpchgen-cli --format=parquet --parquet-compression=ZSTD(1) -s 10 + +# After +tpchgen-cli parquet --compression=ZSTD(1) -s 10 +``` + diff --git a/tpchgen-cli/bin/main.rs b/tpchgen-cli/bin/main.rs index f9c6828..f41e68c 100644 --- a/tpchgen-cli/bin/main.rs +++ b/tpchgen-cli/bin/main.rs @@ -14,7 +14,8 @@ use std::io; use std::path::PathBuf; use std::str::FromStr; use tpchgen_cli::{ - Compression, OutputFormat, Table, TpchGenerator, DEFAULT_PARQUET_ROW_GROUP_BYTES, + Compression, OutputFormat, Table, TpchGenerator, TpchGeneratorBuilder, + DEFAULT_PARQUET_ROW_GROUP_BYTES, }; #[derive(Parser)] @@ -34,17 +35,17 @@ multiple files named //
.. Examples -# Generate all tables at scale factor 1 (1GB) in TBL format to /tmp/tpch directory: +# Generate all tables at scale factor 1 (1GB) in TBL format (default) to /tmp/tpch directory: tpchgen-cli -s 1 --output-dir=/tmp/tpch # Generate all tables in CSV format: -tpchgen-cli -s 1 --format=csv --output-dir=/tmp/tpch +tpchgen-cli csv -s 1 --output-dir=/tmp/tpch # Generate scale factor one in CSV format with tab delimiter: -tpchgen-cli -s 1 --format=csv --delimiter='\t' --output-dir=/tmp/tpch +tpchgen-cli csv -s 1 --delimiter='\t' --output-dir=/tmp/tpch # Generate the lineitem table at scale factor 100 in 10 Apache Parquet files to # /tmp/tpch/lineitem: @@ -54,18 +55,26 @@ tpchgen-cli parquet -s 100 --tables=lineitem --parts=10 --output-dir=/tmp/tpch # Generate scale factor one in current directory, seeing debug output RUST_LOG=debug tpchgen-cli -s 1 --output-dir=/tmp/tpch -"# +"#, + args_conflicts_with_subcommands = true )] struct Cli { #[command(subcommand)] command: Option, + // Top-level args are only used when no subcommand is given (legacy path). + // args_conflicts_with_subcommands prevents these from being silently ignored + // when a subcommand is present (e.g. `tpchgen-cli -s 10 parquet` is an error). #[command(flatten)] args: TopLevelArgs, } #[derive(clap::Subcommand)] enum Commands { + /// Generate TBL (pipe-delimited) output + Tbl(TblArgs), + /// Generate CSV output with CSV-specific options + Csv(CsvArgs), /// Generate Apache Parquet output with Parquet-specific options Parquet(ParquetArgs), } @@ -112,27 +121,60 @@ struct CommonArgs { stdout: bool, } +impl CommonArgs { + /// Create a [`TpchGeneratorBuilder`] pre-configured with the common options. + fn builder(self, format: OutputFormat) -> TpchGeneratorBuilder { + let mut builder = TpchGenerator::builder() + .with_scale_factor(self.scale_factor) + .with_output_dir(self.output_dir) + .with_format(format) + .with_num_threads(self.num_threads) + .with_stdout(self.stdout); + + if let Some(tables) = self.tables { + builder = builder.with_tables(tables); + } + if let Some(parts) = self.parts { + builder = builder.with_parts(parts); + } + if let Some(part) = self.part { + builder = builder.with_part(part); + } + + builder + } +} + #[derive(clap::Args)] struct TopLevelArgs { #[command(flatten)] common: CommonArgs, - /// Output format: tbl, csv, parquet (default: tbl) + /// Output format (deprecated: use subcommands `tbl`, `csv`, or `parquet` instead) /// - /// For Parquet output, prefer using the `parquet` subcommand instead. - /// The --format flag will be replaced by subcommands in v4.0.0. - #[arg(short, long)] + /// The --format flag will be removed in v4.0.0. + #[arg(short, long, hide = true)] format: Option, /// Parquet block compression format (deprecated: use 'parquet' subcommand instead) #[arg(short = 'c', long, hide = true)] - #[deprecated] parquet_compression: Option, /// Target row group size in bytes (deprecated: use 'parquet' subcommand instead) #[arg(long, hide = true)] - #[deprecated] parquet_row_group_bytes: Option, +} + +#[derive(clap::Args)] +struct TblArgs { + #[command(flatten)] + common: CommonArgs, +} + +#[derive(clap::Args)] +struct CsvArgs { + #[command(flatten)] + common: CommonArgs, /// CSV delimiter character (default: ',') /// @@ -181,7 +223,10 @@ struct ParquetArgs { row_group_bytes: i64, } -/// Parse a delimiter string, handling escape sequences +/// Parse a delimiter string, handling escape sequences. +/// +/// The underlying arrow-csv writer requires an ASCII byte for the delimiter, +/// so non-ASCII characters are rejected here rather than failing mid-generation. fn parse_delimiter(s: &str) -> Result { // Handle common escape sequences let parsed = match s { @@ -201,6 +246,12 @@ fn parse_delimiter(s: &str) -> Result { chars[0] } }; + if !parsed.is_ascii() { + return Err(format!( + "Delimiter must be an ASCII character, got: '{}'", + parsed + )); + } Ok(parsed) } @@ -254,123 +305,92 @@ async fn main() -> io::Result<()> { impl Cli { /// Main function to run the generation async fn main(self) -> io::Result<()> { - // Error if both --format and a subcommand are specified - if self.args.format.is_some() && self.command.is_some() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "Cannot use --format with a subcommand. Use the subcommand directly, e.g. `tpchgen-cli parquet`", - )); - } + let common = match &self.command { + Some(Commands::Tbl(args)) => &args.common, + Some(Commands::Csv(args)) => &args.common, + Some(Commands::Parquet(args)) => &args.common, + None => &self.args.common, + }; + configure_logging(common.verbose, common.quiet); + match self.command { + Some(Commands::Tbl(args)) => args.run().await, + Some(Commands::Csv(args)) => args.run().await, Some(Commands::Parquet(args)) => args.run().await, None => self.run().await, } } - #[allow(deprecated)] async fn run(self) -> io::Result<()> { - let format = self.args.format.unwrap_or(OutputFormat::Tbl); - let scale_factor = self.args.common.scale_factor; - let output_dir = self.args.common.output_dir; - let num_threads = self.args.common.num_threads; - let verbose = self.args.common.verbose; - let quiet = self.args.common.quiet; - let stdout = self.args.common.stdout; - let delimiter = self.args.delimiter; - let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY); - let parquet_row_group_bytes = self - .args - .parquet_row_group_bytes - .unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES); - - configure_logging(verbose, quiet); - - // Warn about --format=parquet migration to subcommand - if format == OutputFormat::Parquet { + // Warn about --format migration to subcommands (only when explicitly provided) + let format = if let Some(format) = self.args.format { + let subcommand = match format { + OutputFormat::Parquet => "parquet", + OutputFormat::Csv => "csv", + OutputFormat::Tbl => "tbl", + }; log::warn!( - "The --format=parquet flag will be replaced by the `parquet` subcommand in v4.0.0. Use `tpchgen-cli parquet` instead." + "The --format flag will be removed in v4.0.0. Use `tpchgen-cli {subcommand}` instead." ); - } + format + } else { + OutputFormat::Tbl + }; - if self.args.parquet_compression.is_some() { + let mut builder = self.args.common.builder(format); + + if let Some(parquet_compression) = self.args.parquet_compression { if format == OutputFormat::Parquet { log::warn!("The --parquet-compression flag is deprecated. Use 'tpchgen-cli parquet --compression=...' instead"); + builder = builder.with_parquet_compression(parquet_compression); } else { - log::warn!("Parquet compression option set but not generating Parquet files"); + log::warn!("--parquet-compression ignored: output format is not parquet"); } } - if self.args.parquet_row_group_bytes.is_some() { + + if let Some(parquet_row_group_bytes) = self.args.parquet_row_group_bytes { if format == OutputFormat::Parquet { log::warn!("The --parquet-row-group-bytes flag is deprecated. Use 'tpchgen-cli parquet --row-group-bytes=...' instead"); + builder = builder.with_parquet_row_group_bytes(parquet_row_group_bytes); } else { - log::warn!("Parquet row group size option set but not generating Parquet files"); + log::warn!("--parquet-row-group-bytes ignored: output format is not parquet"); } } - // Validate delimiter usage - if format == OutputFormat::Tbl && delimiter != ',' { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "The --delimiter option cannot be used with --format=tbl. TBL format uses the TPC-H standard pipe delimiter." - )); - } - - // Warn if delimiter is set but not generating CSV - if format != OutputFormat::Csv && delimiter != ',' { - log::warn!("Delimiter option set but not generating CSV"); - } - - // Build the generator using the library API - let mut builder = TpchGenerator::builder() - .with_scale_factor(scale_factor) - .with_output_dir(output_dir) - .with_format(format) - .with_num_threads(num_threads) - .with_parquet_compression(parquet_compression) - .with_parquet_row_group_bytes(parquet_row_group_bytes) - .with_stdout(stdout) - .with_csv_delimiter(delimiter); - - if let Some(tables) = self.args.common.tables { - builder = builder.with_tables(tables); - } + builder.build().generate().await + } +} - if let Some(parts) = self.args.common.parts { - builder = builder.with_parts(parts); - } - if let Some(part) = self.args.common.part { - builder = builder.with_part(part); - } +impl TblArgs { + async fn run(self) -> io::Result<()> { + self.common + .builder(OutputFormat::Tbl) + .build() + .generate() + .await + } +} - builder.build().generate().await +impl CsvArgs { + async fn run(self) -> io::Result<()> { + self.common + .builder(OutputFormat::Csv) + .with_csv_delimiter(self.delimiter) + .build() + .generate() + .await } } impl ParquetArgs { async fn run(self) -> io::Result<()> { - configure_logging(self.common.verbose, self.common.quiet); - - let mut builder = TpchGenerator::builder() - .with_scale_factor(self.common.scale_factor) - .with_output_dir(self.common.output_dir) - .with_format(OutputFormat::Parquet) - .with_num_threads(self.common.num_threads) - .with_stdout(self.common.stdout) + self.common + .builder(OutputFormat::Parquet) .with_parquet_compression(self.compression) - .with_parquet_row_group_bytes(self.row_group_bytes); - - if let Some(tables) = self.common.tables { - builder = builder.with_tables(tables); - } - - if let Some(parts) = self.common.parts { - builder = builder.with_parts(parts); - } - if let Some(part) = self.common.part { - builder = builder.with_part(part); - } - - builder.build().generate().await + .with_parquet_row_group_bytes(self.row_group_bytes) + .build() + .generate() + .await } } diff --git a/tpchgen-cli/tests/cli_integration.rs b/tpchgen-cli/tests/cli_integration.rs index a9e4095..f25a7b5 100644 --- a/tpchgen-cli/tests/cli_integration.rs +++ b/tpchgen-cli/tests/cli_integration.rs @@ -605,10 +605,10 @@ async fn test_incompatible_options_warnings() { // still success, but should see warnings in stderr .success() .stderr(predicates::str::contains( - "Parquet compression option set but not generating Parquet files", + "--parquet-compression ignored: output format is not parquet", )) .stderr(predicates::str::contains( - "Parquet row group size option set but not generating Parquet files", + "--parquet-row-group-bytes ignored: output format is not parquet", )); } @@ -719,9 +719,7 @@ async fn test_format_parquet_warns_about_subcommand() { .arg(output_dir.path()) .assert() .success() - .stderr(predicates::str::contains( - "will be replaced by the `parquet` subcommand in v4.0.0", - )); + .stderr(predicates::str::contains("will be removed in v4.0.0")); } /// Test that using --format together with a subcommand errors @@ -741,9 +739,110 @@ fn test_format_with_subcommand_conflict() { .arg(temp_dir.path()) .assert() .failure() - .stderr(predicates::str::contains( - "Cannot use --format with a subcommand", - )); + .stderr(predicates::str::contains("cannot be used with")); +} + +/// Test that using --parquet-compression together with a subcommand errors +#[test] +fn test_parquet_compression_with_subcommand_conflict() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + // With parquet subcommand + cargo_bin_cmd!("tpchgen-cli") + .arg("--parquet-compression") + .arg("SNAPPY") + .arg("parquet") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("cannot be used with")); + + // With tbl subcommand + cargo_bin_cmd!("tpchgen-cli") + .arg("--parquet-compression") + .arg("SNAPPY") + .arg("tbl") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("cannot be used with")); +} + +/// Test that using --parquet-row-group-bytes together with a subcommand errors +#[test] +fn test_parquet_row_group_bytes_with_subcommand_conflict() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + // With parquet subcommand + cargo_bin_cmd!("tpchgen-cli") + .arg("--parquet-row-group-bytes") + .arg("1000000") + .arg("parquet") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("cannot be used with")); + + // With csv subcommand + cargo_bin_cmd!("tpchgen-cli") + .arg("--parquet-row-group-bytes") + .arg("1000000") + .arg("csv") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("cannot be used with")); +} + +/// Test that common args before a subcommand are rejected +#[test] +fn test_common_args_with_subcommand_conflict() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + // -s before subcommand should error + cargo_bin_cmd!("tpchgen-cli") + .arg("-s") + .arg("0.01") + .arg("parquet") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("cannot be used with")); + + // -s after subcommand should work + cargo_bin_cmd!("tpchgen-cli") + .arg("parquet") + .arg("-s") + .arg("0.01") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .success(); } /// Test that running with no --format and no subcommand defaults to TBL @@ -768,3 +867,237 @@ fn test_default_format_is_tbl() { expected_file ); } + +/// Test that the `tbl` subcommand generates TBL files +#[test] +fn test_tbl_subcommand() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("tbl") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .success(); + + let expected_file = temp_dir.path().join("part.tbl"); + assert!( + expected_file.exists(), + "Expected TBL file {:?} to exist with `tbl` subcommand", + expected_file + ); +} + +/// Test that the `csv` subcommand generates CSV files +#[test] +fn test_csv_subcommand() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("csv") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .success(); + + let expected_file = temp_dir.path().join("part.csv"); + assert!( + expected_file.exists(), + "Expected CSV file {:?} to exist with `csv` subcommand", + expected_file + ); +} + +/// Test that --format=csv emits a deprecation warning +#[tokio::test] +async fn test_format_csv_warns_about_subcommand() { + let output_dir = tempdir().unwrap(); + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("csv") + .arg("--tables") + .arg("part") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(output_dir.path()) + .assert() + .success() + .stderr(predicates::str::contains("will be removed in v4.0.0")); + + let expected_file = output_dir.path().join("part.csv"); + assert!( + expected_file.exists(), + "Expected CSV file {:?} to exist with deprecated --format=csv path", + expected_file + ); +} + +/// Test that --format=tbl emits a deprecation warning +#[tokio::test] +async fn test_format_tbl_warns_about_subcommand() { + let output_dir = tempdir().unwrap(); + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("tbl") + .arg("--tables") + .arg("part") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(output_dir.path()) + .assert() + .success() + .stderr(predicates::str::contains("will be removed in v4.0.0")); +} + +/// Test that the `csv` subcommand with a custom delimiter produces tab-delimited output +#[test] +fn test_csv_subcommand_custom_delimiter() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("csv") + .arg("--delimiter") + .arg("\\t") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("region") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .success(); + + let csv_file = temp_dir.path().join("region.csv"); + assert!( + csv_file.exists(), + "Expected CSV file {:?} to exist", + csv_file + ); + + let contents = std::fs::read_to_string(&csv_file).unwrap(); + // Region table has 5 rows; each should contain tabs as delimiters + assert!( + contents.contains('\t'), + "Expected tab-delimited output, got:\n{}", + contents + ); + // Verify multiple tab-separated fields per line + let first_line = contents.lines().next().unwrap(); + let tab_count = first_line.matches('\t').count(); + assert!( + tab_count >= 2, + "Expected at least 2 tabs per line, got {} in: {}", + tab_count, + first_line + ); +} + +/// Test that the `csv` subcommand rejects a non-ASCII delimiter at parse time +#[test] +fn test_csv_subcommand_rejects_non_ascii_delimiter() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("csv") + .arg("--delimiter") + .arg("€") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("region") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("ASCII")); +} + +/// Test that the `tbl` subcommand rejects --delimiter +#[test] +fn test_tbl_subcommand_rejects_delimiter() { + let temp_dir = tempdir().expect("Failed to create temporary directory"); + + cargo_bin_cmd!("tpchgen-cli") + .arg("tbl") + .arg("--delimiter") + .arg(",") + .arg("--scale-factor") + .arg("0.001") + .arg("--tables") + .arg("part") + .arg("--output-dir") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicates::str::contains("unexpected argument")); +} + +/// Test that deprecated --format=parquet with --parquet-compression still works +#[tokio::test] +async fn test_deprecated_parquet_compression_flag_works() { + let output_dir = tempdir().unwrap(); + + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("parquet") + .arg("--parquet-compression") + .arg("ZSTD(1)") + .arg("--tables") + .arg("region") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(output_dir.path()) + .assert() + .success() + .stderr(predicates::str::contains( + "--parquet-compression flag is deprecated", + )); + + let parquet_file = output_dir.path().join("region.parquet"); + assert!( + parquet_file.exists(), + "Expected Parquet file {:?} to exist", + parquet_file + ); +} + +/// Test that deprecated --format=parquet with --parquet-row-group-bytes still works +#[tokio::test] +async fn test_deprecated_parquet_row_group_bytes_flag_works() { + let output_dir = tempdir().unwrap(); + + cargo_bin_cmd!("tpchgen-cli") + .arg("--format") + .arg("parquet") + .arg("--parquet-row-group-bytes") + .arg("1000000") + .arg("--tables") + .arg("region") + .arg("--scale-factor") + .arg("0.001") + .arg("--output-dir") + .arg(output_dir.path()) + .assert() + .success() + .stderr(predicates::str::contains( + "--parquet-row-group-bytes flag is deprecated", + )); + + let parquet_file = output_dir.path().join("region.parquet"); + assert!( + parquet_file.exists(), + "Expected Parquet file {:?} to exist", + parquet_file + ); +}