Skip to content

Commit 3ecba03

Browse files
authored
feat: Add parquet subcommand with format-specific argument validation (#234)
Adds a `parquet` subcommand so parquet-specific flags (`--compression`, `--row-group-bytes`) are only accepted when generating Parquet output, replacing the error-prone `--format=parquet` + `--parquet-*` flags pattern.

## Changes

- Add `parquet` subcommand with dedicated `ParquetArgs` (compression, row-group-bytes)
- Extract shared options into `CommonArgs` (scale-factor, output-dir, tables, parts, etc.)
- Deprecate top-level `--parquet-compression` and `--parquet-row-group-bytes` with migration warnings
- Warn on `--format=parquet` usage, directing users to the subcommand
- Rename `--parquet-compression` → `--compression` and `--parquet-row-group-bytes` → `--row-group-bytes` under the subcommand
- Extract `configure_logging()` helper
- Update CLI integration tests and help examples

## Migration

```sh
# Before
tpchgen-cli -s 100 --format=parquet --parquet-compression snappy --parquet-row-group-bytes=100000000 --output-dir /tmp/tpch

# After
tpchgen-cli parquet -s 100 --compression snappy --row-group-bytes=100000000 --output-dir /tmp/tpch
```

TBL/CSV generation via top-level flags is unchanged.

Closes #173
2 parents c8ac306 + 9bf4e23 commit 3ecba03

2 files changed

Lines changed: 178 additions & 77 deletions

File tree

tpchgen-cli/bin/main.rs

Lines changed: 175 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,40 @@ Examples
3838
3939
tpchgen-cli -s 1 --output-dir=/tmp/tpch
4040
41+
# Generate all tables in CSV format:
42+
43+
tpchgen-cli -s 1 --format=csv --output-dir=/tmp/tpch
44+
45+
# Generate scale factor one in CSV format with tab delimiter:
46+
47+
tpchgen-cli -s 1 --format=csv --delimiter='\t' --output-dir=/tmp/tpch
48+
4149
# Generate the lineitem table at scale factor 100 in 10 Apache Parquet files to
42-
# /tmp/tpch/lineitem
50+
# /tmp/tpch/lineitem:
4351
44-
tpchgen-cli -s 100 --tables=lineitem --format=parquet --parts=10 --output-dir=/tmp/tpch
52+
tpchgen-cli parquet -s 100 --tables=lineitem --parts=10 --output-dir=/tmp/tpch
4553
4654
# Generate scale factor one in current directory, seeing debug output
4755
48-
RUST_LOG=debug tpchgen -s 1
56+
RUST_LOG=debug tpchgen-cli -s 1 --output-dir=/tmp/tpch
4957
"#
5058
)]
5159
struct Cli {
60+
#[command(subcommand)]
61+
command: Option<Commands>,
62+
63+
#[command(flatten)]
64+
args: TopLevelArgs,
65+
}
66+
67+
// Subcommands understood by the CLI. `Cli::main` matches on this: the
// `Parquet` variant runs `ParquetArgs::run`, and when no subcommand is given
// generation falls back to the top-level arguments.
#[derive(clap::Subcommand)]
68+
enum Commands {
69+
/// Generate Apache Parquet output with Parquet-specific options
70+
Parquet(ParquetArgs),
71+
}
72+
73+
#[derive(clap::Args)]
74+
struct CommonArgs {
5275
/// Scale factor to create
5376
#[arg(short, long, default_value_t = 1.)]
5477
scale_factor: f64,
@@ -69,13 +92,59 @@ struct Cli {
6992
#[arg(long)]
7093
part: Option<i32>,
7194

95+
/// The number of threads for parallel generation, defaults to the number of CPUs
96+
#[arg(short, long, default_value_t = num_cpus::get())]
97+
num_threads: usize,
98+
99+
/// Verbose output
100+
///
101+
/// When specified, sets the log level to `info` and ignores the `RUST_LOG`
102+
/// environment variable. When not specified, uses `RUST_LOG`
103+
#[arg(short, long, default_value_t = false, conflicts_with = "quiet")]
104+
verbose: bool,
105+
106+
/// Quiet mode - only show error-level logs
107+
#[arg(short, long, default_value_t = false, conflicts_with = "verbose")]
108+
quiet: bool,
109+
110+
/// Write the output to stdout instead of a file.
111+
#[arg(long, default_value_t = false)]
112+
stdout: bool,
113+
}
114+
115+
// Arguments accepted when no subcommand is given: the shared `CommonArgs`
// plus the output format, the CSV delimiter, and the legacy (hidden,
// deprecated) parquet flags that predate the `parquet` subcommand.
#[derive(clap::Args)]
116+
struct TopLevelArgs {
117+
#[command(flatten)]
118+
common: CommonArgs,
119+
72120
/// Output format: tbl, csv, parquet
73121
#[arg(short, long, default_value = "tbl")]
74122
format: OutputFormat,
75123

76-
/// The number of threads for parallel generation, defaults to the number of CPUs
77-
#[arg(short, long, default_value_t = num_cpus::get())]
78-
num_threads: usize,
124+
/// Parquet block compression format (deprecated: use 'parquet' subcommand instead)
125+
// `Option` (no default) so `Cli::run` can tell whether the flag was passed
// at all; it warns when the value `is_some()` and otherwise falls back to
// `Compression::SNAPPY`.
#[arg(short = 'c', long, hide = true)]
126+
#[deprecated]
127+
parquet_compression: Option<Compression>,
128+
129+
/// Target row group size in bytes (deprecated: use 'parquet' subcommand instead)
130+
// Same pattern as above: `None` means "not passed"; `Cli::run` substitutes
// `DEFAULT_PARQUET_ROW_GROUP_BYTES` in that case.
#[arg(long, hide = true)]
131+
#[deprecated]
132+
parquet_row_group_bytes: Option<i64>,
133+
134+
/// CSV delimiter character (default: ',')
135+
///
136+
/// Specifies the delimiter character to use when generating CSV files.
137+
///
138+
/// Supports escape sequences: \t (tab), \n (newline), \r (carriage return), \\ (backslash)
139+
/// Common delimiters: ',' (comma), '|' (pipe), '\t' (tab), ';' (semicolon)
140+
// `Cli::run` rejects a non-',' value together with `--format=tbl` and only
// warns when the format is neither TBL nor CSV.
#[arg(long, default_value = ",", value_parser = parse_delimiter)]
141+
delimiter: char,
142+
}
143+
144+
#[derive(clap::Args)]
145+
struct ParquetArgs {
146+
#[command(flatten)]
147+
common: CommonArgs,
79148

80149
/// Parquet block compression format.
81150
///
@@ -91,22 +160,7 @@ struct Cli {
91160
/// SNAPPY: 2.4G (0.75 GB/sec)
92161
/// UNCOMPRESSED: 3.8G (1.41 GB/sec)
93162
#[arg(short = 'c', long, default_value = "SNAPPY")]
94-
parquet_compression: Compression,
95-
96-
/// Verbose output
97-
///
98-
/// When specified, sets the log level to `info` and ignores the `RUST_LOG`
99-
/// environment variable. When not specified, uses `RUST_LOG`
100-
#[arg(short, long, default_value_t = false, conflicts_with = "quiet")]
101-
verbose: bool,
102-
103-
/// Quiet mode - only show error-level logs
104-
#[arg(short, long, default_value_t = false, conflicts_with = "verbose")]
105-
quiet: bool,
106-
107-
/// Write the output to stdout instead of a file.
108-
#[arg(long, default_value_t = false)]
109-
stdout: bool,
163+
compression: Compression,
110164

111165
/// Target size in row group bytes in Parquet files
112166
///
@@ -121,17 +175,7 @@ struct Cli {
121175
///
122176
/// Typical values range from 10MB to 100MB.
123177
#[arg(long, default_value_t = DEFAULT_PARQUET_ROW_GROUP_BYTES)]
124-
parquet_row_group_bytes: i64,
125-
126-
/// CSV delimiter character (default: ',')
127-
///
128-
/// Specifies the delimiter character to use when generating CSV files.
129-
/// This option only applies to CSV format and cannot be used with TBL format.
130-
///
131-
/// Supports escape sequences: \t (tab), \n (newline), \r (carriage return), \\ (backslash)
132-
/// Common delimiters: ',' (comma), '|' (pipe), '\t' (tab), ';' (semicolon)
133-
#[arg(long, default_value = ",", value_parser = parse_delimiter)]
134-
delimiter: char,
178+
row_group_bytes: i64,
135179
}
136180

137181
/// Parse a delimiter string, handling escape sequences
@@ -207,71 +251,133 @@ async fn main() -> io::Result<()> {
207251
impl Cli {
208252
/// Main function to run the generation
209253
async fn main(self) -> io::Result<()> {
210-
// Configure logging
211-
if self.quiet {
212-
// Quiet mode: only show error-level logs
213-
env_logger::builder()
214-
.filter_level(LevelFilter::Error)
215-
.init();
216-
} else if self.verbose {
217-
env_logger::builder().filter_level(LevelFilter::Info).init();
218-
info!("Verbose output enabled (ignoring RUST_LOG environment variable)");
219-
} else {
220-
// Default: show warnings and errors, but respect RUST_LOG if set
221-
env_logger::builder()
222-
.filter_level(LevelFilter::Warn)
223-
.parse_default_env()
224-
.init();
254+
match self.command {
255+
Some(Commands::Parquet(args)) => args.run().await,
256+
None => self.run().await,
225257
}
258+
}
259+
260+
#[allow(deprecated)]
261+
async fn run(self) -> io::Result<()> {
262+
let format = self.args.format;
263+
let scale_factor = self.args.common.scale_factor;
264+
let output_dir = self.args.common.output_dir;
265+
let num_threads = self.args.common.num_threads;
266+
let verbose = self.args.common.verbose;
267+
let quiet = self.args.common.quiet;
268+
let stdout = self.args.common.stdout;
269+
let delimiter = self.args.delimiter;
270+
let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY);
271+
let parquet_row_group_bytes = self
272+
.args
273+
.parquet_row_group_bytes
274+
.unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES);
275+
276+
configure_logging(verbose, quiet);
226277

227278
// Warn if parquet specific options are set but not generating parquet
228-
if self.format != OutputFormat::Parquet {
229-
if self.parquet_compression != Compression::SNAPPY {
279+
if format == OutputFormat::Parquet {
280+
log::warn!(
281+
"Warning: Use 'tpchgen-cli parquet' subcommand instead of '--format=parquet' for better validation and control"
282+
);
283+
}
284+
285+
if self.args.parquet_compression.is_some() {
286+
if format == OutputFormat::Parquet {
287+
log::warn!("The --parquet-compression flag is deprecated. Use 'tpchgen-cli parquet --compression=...' instead");
288+
} else {
230289
log::warn!("Parquet compression option set but not generating Parquet files");
231290
}
232-
if self.parquet_row_group_bytes != DEFAULT_PARQUET_ROW_GROUP_BYTES {
291+
}
292+
if self.args.parquet_row_group_bytes.is_some() {
293+
if format == OutputFormat::Parquet {
294+
log::warn!("The --parquet-row-group-bytes flag is deprecated. Use 'tpchgen-cli parquet --row-group-bytes=...' instead");
295+
} else {
233296
log::warn!("Parquet row group size option set but not generating Parquet files");
234297
}
235298
}
236299

237300
// Validate delimiter usage
238-
if self.format == OutputFormat::Tbl && self.delimiter != ',' {
301+
if format == OutputFormat::Tbl && delimiter != ',' {
239302
return Err(io::Error::new(
240303
io::ErrorKind::InvalidInput,
241304
"The --delimiter option cannot be used with --format=tbl. TBL format uses the TPC-H standard pipe delimiter."
242305
));
243306
}
244307

245308
// Warn if delimiter is set but not generating CSV
246-
if self.format != OutputFormat::Csv && self.delimiter != ',' {
247-
eprintln!("Warning: Delimiter option set but not generating CSV files");
309+
if format != OutputFormat::Csv && delimiter != ',' {
310+
log::warn!("Warning: Delimiter option set but not generating CSV");
248311
}
249312

250313
// Build the generator using the library API
251314
let mut builder = TpchGenerator::builder()
252-
.with_scale_factor(self.scale_factor)
253-
.with_output_dir(self.output_dir)
254-
.with_format(self.format)
255-
.with_num_threads(self.num_threads)
256-
.with_parquet_compression(self.parquet_compression)
257-
.with_parquet_row_group_bytes(self.parquet_row_group_bytes)
258-
.with_stdout(self.stdout)
259-
.with_csv_delimiter(self.delimiter);
260-
261-
// Add tables if specified
262-
if let Some(tables) = self.tables {
315+
.with_scale_factor(scale_factor)
316+
.with_output_dir(output_dir)
317+
.with_format(format)
318+
.with_num_threads(num_threads)
319+
.with_parquet_compression(parquet_compression)
320+
.with_parquet_row_group_bytes(parquet_row_group_bytes)
321+
.with_stdout(stdout)
322+
.with_csv_delimiter(delimiter);
323+
324+
if let Some(tables) = self.args.common.tables {
325+
builder = builder.with_tables(tables);
326+
}
327+
328+
if let Some(parts) = self.args.common.parts {
329+
builder = builder.with_parts(parts);
330+
}
331+
if let Some(part) = self.args.common.part {
332+
builder = builder.with_part(part);
333+
}
334+
335+
builder.build().generate().await
336+
}
337+
}
338+
339+
impl ParquetArgs {
340+
async fn run(self) -> io::Result<()> {
341+
configure_logging(self.common.verbose, self.common.quiet);
342+
343+
let mut builder = TpchGenerator::builder()
344+
.with_scale_factor(self.common.scale_factor)
345+
.with_output_dir(self.common.output_dir)
346+
.with_format(OutputFormat::Parquet)
347+
.with_num_threads(self.common.num_threads)
348+
.with_stdout(self.common.stdout)
349+
.with_parquet_compression(self.compression)
350+
.with_parquet_row_group_bytes(self.row_group_bytes);
351+
352+
if let Some(tables) = self.common.tables {
263353
builder = builder.with_tables(tables);
264354
}
265355

266-
// Add parts/part if specified
267-
if let Some(parts) = self.parts {
356+
if let Some(parts) = self.common.parts {
268357
builder = builder.with_parts(parts);
269358
}
270-
if let Some(part) = self.part {
359+
if let Some(part) = self.common.part {
271360
builder = builder.with_part(part);
272361
}
273362

274-
// Generate using the library
275363
builder.build().generate().await
276364
}
277365
}
366+
367+
fn configure_logging(verbose: bool, quiet: bool) {
368+
if quiet {
369+
// Quiet mode: only show error-level logs
370+
env_logger::builder()
371+
.filter_level(LevelFilter::Error)
372+
.init();
373+
} else if verbose {
374+
env_logger::builder().filter_level(LevelFilter::Info).init();
375+
info!("Verbose output enabled (ignoring RUST_LOG environment variable)");
376+
} else {
377+
// Default: show warnings and errors, but respect RUST_LOG if set
378+
env_logger::builder()
379+
.filter_level(LevelFilter::Warn)
380+
.parse_default_env()
381+
.init();
382+
}
383+
}

tpchgen-cli/tests/cli_integration.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,12 +126,11 @@ fn test_tpchgen_cli_parquet_no_overwrite() {
126126

127127
// First run - create the file
128128
cargo_bin_cmd!("tpchgen-cli")
129+
.arg("parquet")
129130
.arg("--scale-factor")
130131
.arg("0.001")
131132
.arg("--tables")
132133
.arg("part")
133-
.arg("--format")
134-
.arg("parquet")
135134
.arg("--output-dir")
136135
.arg(temp_dir.path())
137136
.assert()
@@ -144,12 +143,11 @@ fn test_tpchgen_cli_parquet_no_overwrite() {
144143
// Run the tpchgen-cli command again with the same parameters and expect the
145144
// file to not be overwritten and a warning to be logged
146145
let output = cargo_bin_cmd!("tpchgen-cli")
146+
.arg("parquet")
147147
.arg("--scale-factor")
148148
.arg("0.001")
149149
.arg("--tables")
150150
.arg("part")
151-
.arg("--format")
152-
.arg("parquet")
153151
.arg("--output-dir")
154152
.arg(temp_dir.path())
155153
.assert()
@@ -351,7 +349,6 @@ async fn test_write_parquet_orders() {
351349
let output_dir = tempdir().unwrap();
352350
let output_path = output_dir.path().join("orders.parquet");
353351
cargo_bin_cmd!("tpchgen-cli")
354-
.arg("--format")
355352
.arg("parquet")
356353
.arg("--tables")
357354
.arg("orders")
@@ -396,7 +393,6 @@ async fn test_write_parquet_row_group_size_default() {
396393
// Run the CLI command to generate parquet data with default settings
397394
let output_dir = tempdir().unwrap();
398395
cargo_bin_cmd!("tpchgen-cli")
399-
.arg("--format")
400396
.arg("parquet")
401397
.arg("--scale-factor")
402398
.arg("1")
@@ -463,13 +459,12 @@ async fn test_write_parquet_row_group_size_20mb() {
463459
// Run the CLI command to generate parquet data with larger row group size
464460
let output_dir = tempdir().unwrap();
465461
cargo_bin_cmd!("tpchgen-cli")
466-
.arg("--format")
467462
.arg("parquet")
468463
.arg("--scale-factor")
469464
.arg("1")
470465
.arg("--output-dir")
471466
.arg(output_dir.path())
472-
.arg("--parquet-row-group-bytes")
467+
.arg("--row-group-bytes")
473468
.arg("20000000") // 20 MB
474469
.assert()
475470
.success();

0 commit comments

Comments
 (0)