Skip to content

Commit ee6c72a

Browse files
authored
chore: Follow-up improvements to parquet subcommand CLI (#254)
Follow-up to #234. Updates docs, benchmarks, and CLI behavior for the `parquet` subcommand. ## Changes ### CLI behavior - Make `--format` optional (defaults to `tbl`) instead of requiring an explicit value - Warn when `--format=parquet` is used, directing users to the `parquet` subcommand (v4.0.0 migration) - Error when `--format` is used together with a subcommand - Fix redundant "Warning:" prefix in `log::warn!` message ### Docs and benchmarks - Update examples to use `parquet` subcommand instead of `--format=parquet` - Add deprecation notice to README for `--format=parquet`, `--parquet-compression`, `--parquet-row-group-bytes` ### Tests - Add integration tests for deprecation warning, format+subcommand conflict, and default format
2 parents 3ecba03 + 27ae167 commit ee6c72a

5 files changed

Lines changed: 88 additions & 13 deletions

File tree

benchmarks/BENCHMARKS.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ tpchgen-cli -s 100 --stdout | pv -arb > /dev/null
1717
# Outputs something similar to
1818
# 106GiB [3.09GiB/s] (3.09GiB/s)
1919
# For parquet
20-
tpchgen-cli -s 100 --format=parquet --stdout | pv -arb > /dev/null
20+
tpchgen-cli parquet -s 100 --stdout | pv -arb > /dev/null
2121
# 38.2GiB [ 865MiB/s] ( 865MiB/s)
2222
```
2323

@@ -49,7 +49,7 @@ single parquet file per table, with snappy page compression.
4949
Example command to create Scale Factor 10
5050

5151
```shell
52-
tpchgen-cli -s 10 --format=parquet
52+
tpchgen-cli parquet -s 10
5353
```
5454

5555
## `parquet_duckdb.sh`

benchmarks/parquet_tpchgen.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,5 @@ uname -a >> $LOGFILE
1313
SCALE_FACTORS="1 10 100 1000"
1414
for sf in $SCALE_FACTORS ; do
1515
echo "SF=$sf" >> $LOGFILE
16-
/usr/bin/time -a -o $LOGFILE tpchgen-cli -s $sf --output-dir=out_tpchgen --format=parquet
16+
/usr/bin/time -a -o $LOGFILE tpchgen-cli parquet -s $sf --output-dir=out_tpchgen
1717
done

tpchgen-cli/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,16 @@ RUSTFLAGS='-C target-cpu=native' cargo install tpchgen-cli
4545
```shell
4646
# Scale Factor 10, all tables, in Apache Parquet format in the current directory
4747
# (3.6GB, 8 files, 60M lineitem rows, in 5 seconds on a modern laptop)
48-
tpchgen-cli -s 10 --format=parquet
48+
tpchgen-cli parquet -s 10
4949

5050
# Scale Factor 10, all tables, in `tbl`(csv like) format in the `sf10` directory
5151
# (10GB, 8 files, 60M lineitem rows)
5252
tpchgen-cli -s 10 --output-dir sf10
5353

5454
# Scale Factor 1000, lineitem table, in Apache Parquet format in sf1000 directory,
55-
# 20 part(ititons), 100MB row groups
55+
# 20 part(itions), 100MB row groups
5656
# (220GB, 20 files, 6B lineitem rows, 3.5 minutes on a modern laptop)
57-
tpchgen-cli -s 1000 --tables lineitem --parts 20 --format=parquet --parquet-row-group-bytes=100000000 --output-dir sf1000
57+
tpchgen-cli parquet -s 1000 --tables lineitem --parts 20 --row-group-bytes=100000000 --output-dir sf1000
5858

5959
# Scale Factor 10, partition 2 and 3 of 10 in sf10 directory
6060
#

tpchgen-cli/bin/main.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,12 @@ struct TopLevelArgs {
117117
#[command(flatten)]
118118
common: CommonArgs,
119119

120-
/// Output format: tbl, csv, parquet
121-
#[arg(short, long, default_value = "tbl")]
122-
format: OutputFormat,
120+
/// Output format: tbl, csv, parquet (default: tbl)
121+
///
122+
/// For Parquet output, prefer using the `parquet` subcommand instead.
123+
/// The --format flag will be replaced by subcommands in v4.0.0.
124+
#[arg(short, long)]
125+
format: Option<OutputFormat>,
123126

124127
/// Parquet block compression format (deprecated: use 'parquet' subcommand instead)
125128
#[arg(short = 'c', long, hide = true)]
@@ -251,6 +254,13 @@ async fn main() -> io::Result<()> {
251254
impl Cli {
252255
/// Main function to run the generation
253256
async fn main(self) -> io::Result<()> {
257+
// Error if both --format and a subcommand are specified
258+
if self.args.format.is_some() && self.command.is_some() {
259+
return Err(io::Error::new(
260+
io::ErrorKind::InvalidInput,
261+
"Cannot use --format with a subcommand. Use the subcommand directly, e.g. `tpchgen-cli parquet`",
262+
));
263+
}
254264
match self.command {
255265
Some(Commands::Parquet(args)) => args.run().await,
256266
None => self.run().await,
@@ -259,7 +269,7 @@ impl Cli {
259269

260270
#[allow(deprecated)]
261271
async fn run(self) -> io::Result<()> {
262-
let format = self.args.format;
272+
let format = self.args.format.unwrap_or(OutputFormat::Tbl);
263273
let scale_factor = self.args.common.scale_factor;
264274
let output_dir = self.args.common.output_dir;
265275
let num_threads = self.args.common.num_threads;
@@ -275,10 +285,10 @@ impl Cli {
275285

276286
configure_logging(verbose, quiet);
277287

278-
// Warn if parquet specific options are set but not generating parquet
288+
// Warn about --format=parquet migration to subcommand
279289
if format == OutputFormat::Parquet {
280290
log::warn!(
281-
"Warning: Use 'tpchgen-cli parquet' subcommand instead of '--format=parquet' for better validation and control"
291+
"The --format=parquet flag will be replaced by the `parquet` subcommand in v4.0.0. Use `tpchgen-cli parquet` instead."
282292
);
283293
}
284294

@@ -307,7 +317,7 @@ impl Cli {
307317

308318
// Warn if delimiter is set but not generating CSV
309319
if format != OutputFormat::Csv && delimiter != ',' {
310-
log::warn!("Warning: Delimiter option set but not generating CSV");
320+
log::warn!("Delimiter option set but not generating CSV");
311321
}
312322

313323
// Build the generator using the library API

tpchgen-cli/tests/cli_integration.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -703,3 +703,68 @@ fn expect_row_group_sizes(output_dir: &Path, expected_row_groups: Vec<RowGroups>
703703
let actual_row_groups = format!("{actual_row_groups:#?}");
704704
assert_eq!(actual_row_groups, expected_row_groups);
705705
}
706+
707+
/// Test that --format=parquet emits a warning about v4.0.0 migration
708+
#[tokio::test]
709+
async fn test_format_parquet_warns_about_subcommand() {
710+
let output_dir = tempdir().unwrap();
711+
cargo_bin_cmd!("tpchgen-cli")
712+
.arg("--format")
713+
.arg("parquet")
714+
.arg("--tables")
715+
.arg("part")
716+
.arg("--scale-factor")
717+
.arg("0.001")
718+
.arg("--output-dir")
719+
.arg(output_dir.path())
720+
.assert()
721+
.success()
722+
.stderr(predicates::str::contains(
723+
"will be replaced by the `parquet` subcommand in v4.0.0",
724+
));
725+
}
726+
727+
/// Test that using --format together with a subcommand errors
728+
#[test]
729+
fn test_format_with_subcommand_conflict() {
730+
let temp_dir = tempdir().expect("Failed to create temporary directory");
731+
732+
cargo_bin_cmd!("tpchgen-cli")
733+
.arg("--format")
734+
.arg("parquet")
735+
.arg("parquet")
736+
.arg("--scale-factor")
737+
.arg("0.001")
738+
.arg("--tables")
739+
.arg("part")
740+
.arg("--output-dir")
741+
.arg(temp_dir.path())
742+
.assert()
743+
.failure()
744+
.stderr(predicates::str::contains(
745+
"Cannot use --format with a subcommand",
746+
));
747+
}
748+
749+
/// Test that running with no --format and no subcommand defaults to TBL
750+
#[test]
751+
fn test_default_format_is_tbl() {
752+
let temp_dir = tempdir().expect("Failed to create temporary directory");
753+
754+
cargo_bin_cmd!("tpchgen-cli")
755+
.arg("--scale-factor")
756+
.arg("0.001")
757+
.arg("--tables")
758+
.arg("part")
759+
.arg("--output-dir")
760+
.arg(temp_dir.path())
761+
.assert()
762+
.success();
763+
764+
let expected_file = temp_dir.path().join("part.tbl");
765+
assert!(
766+
expected_file.exists(),
767+
"Expected TBL file {:?} to exist when no --format or subcommand is specified",
768+
expected_file
769+
);
770+
}

0 commit comments

Comments
 (0)