Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions tpchgen-cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ tpchgen-cli parquet -s 10

# Scale Factor 10, all tables, in `tbl` (CSV-like) format in the `sf10` directory
# (10GB, 8 files, 60M lineitem rows)
# Note: the explicit `tbl` subcommand is equivalent: `tpchgen-cli tbl -s 10 --output-dir sf10`
tpchgen-cli -s 10 --output-dir sf10

# Scale Factor 1000, lineitem table, in Apache Parquet format in sf1000 directory,
Expand Down Expand Up @@ -87,3 +88,15 @@ done

Times to create TPCH tables in Parquet format using `tpchgen-cli` and `duckdb` for various scale factors.

## Deprecation Notice

`--format`, `--parquet-compression`, and `--parquet-row-group-bytes` are deprecated as of v3.x and will be removed in v4.0.0. Use subcommands instead:

```shell
# Before
tpchgen-cli --format=parquet --parquet-compression='ZSTD(1)' -s 10

# After
tpchgen-cli parquet --compression='ZSTD(1)' -s 10
```

202 changes: 115 additions & 87 deletions tpchgen-cli/bin/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ use std::io;
use std::path::PathBuf;
use std::str::FromStr;
use tpchgen_cli::{
Compression, OutputFormat, Table, TpchGenerator, DEFAULT_PARQUET_ROW_GROUP_BYTES,
Compression, OutputFormat, Table, TpchGenerator, TpchGeneratorBuilder,
DEFAULT_PARQUET_ROW_GROUP_BYTES,
};

#[derive(Parser)]
Expand All @@ -34,17 +35,17 @@ multiple files named <output_dir>/<table>/<table>.<part>.<format>

Examples

# Generate all tables at scale factor 1 (1GB) in TBL format to /tmp/tpch directory:
# Generate all tables at scale factor 1 (1GB) in TBL format (default) to /tmp/tpch directory:

tpchgen-cli -s 1 --output-dir=/tmp/tpch

# Generate all tables in CSV format:

tpchgen-cli -s 1 --format=csv --output-dir=/tmp/tpch
tpchgen-cli csv -s 1 --output-dir=/tmp/tpch

# Generate scale factor one in CSV format with tab delimiter:

tpchgen-cli -s 1 --format=csv --delimiter='\t' --output-dir=/tmp/tpch
tpchgen-cli csv -s 1 --delimiter='\t' --output-dir=/tmp/tpch

# Generate the lineitem table at scale factor 100 in 10 Apache Parquet files to
# /tmp/tpch/lineitem:
Expand All @@ -54,18 +55,26 @@ tpchgen-cli parquet -s 100 --tables=lineitem --parts=10 --output-dir=/tmp/tpch
# Generate scale factor one in current directory, seeing debug output

RUST_LOG=debug tpchgen-cli -s 1 --output-dir=/tmp/tpch
"#
"#,
args_conflicts_with_subcommands = true
)]
struct Cli {
#[command(subcommand)]
command: Option<Commands>,

// Top-level args are only used when no subcommand is given (legacy path).
// args_conflicts_with_subcommands prevents these from being silently ignored
// when a subcommand is present (e.g. `tpchgen-cli -s 10 parquet` is an error).
#[command(flatten)]
args: TopLevelArgs,
}

// Output-format subcommands. Each variant wraps the argument struct for one
// format; `Cli::main` matches on the parsed variant and calls its `run`.
// NOTE(review): the `///` lines on the variants are presumably picked up by
// clap's derive as the subcommand help text, so they are user-facing — confirm
// before rewording them.
#[derive(clap::Subcommand)]
enum Commands {
/// Generate TBL (pipe-delimited) output
Tbl(TblArgs),
/// Generate CSV output with CSV-specific options
Csv(CsvArgs),
/// Generate Apache Parquet output with Parquet-specific options
Parquet(ParquetArgs),
}
Expand Down Expand Up @@ -112,28 +121,65 @@ struct CommonArgs {
stdout: bool,
}

impl CommonArgs {
    /// Build a [`TpchGeneratorBuilder`] seeded with the options shared by
    /// every output format.
    ///
    /// The scale factor, output directory, requested `format`, thread count
    /// and stdout flag are always applied. The table selection, `parts` and
    /// `part` are forwarded to the builder only when they were supplied.
    fn builder(self, format: OutputFormat) -> TpchGeneratorBuilder {
        // Options that are always set, in the same order as before.
        let base = TpchGenerator::builder()
            .with_scale_factor(self.scale_factor)
            .with_output_dir(self.output_dir)
            .with_format(format)
            .with_num_threads(self.num_threads)
            .with_stdout(self.stdout);

        // Optional settings: each step either refines the builder or passes
        // it through untouched.
        let with_tables = match self.tables {
            Some(selected) => base.with_tables(selected),
            None => base,
        };
        let with_parts = match self.parts {
            Some(count) => with_tables.with_parts(count),
            None => with_tables,
        };
        match self.part {
            Some(index) => with_parts.with_part(index),
            None => with_parts,
        }
    }
}

#[derive(clap::Args)]
struct TopLevelArgs {
#[command(flatten)]
common: CommonArgs,

/// Output format: tbl, csv, parquet (default: tbl)
/// Output format (deprecated: use subcommands `tbl`, `csv`, or `parquet` instead)
///
/// For Parquet output, prefer using the `parquet` subcommand instead.
/// The --format flag will be replaced by subcommands in v4.0.0.
#[arg(short, long)]
/// The --format flag will be removed in v4.0.0.
#[arg(short, long, hide = true)]
format: Option<OutputFormat>,

/// Parquet block compression format (deprecated: use 'parquet' subcommand instead)
#[arg(short = 'c', long, hide = true)]
#[deprecated]
parquet_compression: Option<Compression>,

/// Target row group size in bytes (deprecated: use 'parquet' subcommand instead)
#[arg(long, hide = true)]
#[deprecated]
parquet_row_group_bytes: Option<i64>,

/// CSV delimiter character (use 'csv --delimiter=...' instead)
#[arg(long, hide = true, value_parser = parse_delimiter)]
delimiter: Option<char>,
}

// Arguments for the `tbl` subcommand. It carries only the options shared by
// every format: `TblArgs::run` passes nothing format-specific to the builder.
// (A `//` comment is used on purpose; a `///` doc comment here could be picked
// up by clap's derive as help text — confirm before converting.)
#[derive(clap::Args)]
struct TblArgs {
#[command(flatten)]
common: CommonArgs,
}

#[derive(clap::Args)]
struct CsvArgs {
#[command(flatten)]
common: CommonArgs,
Comment thread
kevinjqliu marked this conversation as resolved.

/// CSV delimiter character (default: ',')
///
/// Specifies the delimiter character to use when generating CSV files.
Expand Down Expand Up @@ -254,51 +300,47 @@ async fn main() -> io::Result<()> {
impl Cli {
/// Main function to run the generation
async fn main(self) -> io::Result<()> {
// Error if both --format and a subcommand are specified
if self.args.format.is_some() && self.command.is_some() {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Cannot use --format with a subcommand. Use the subcommand directly, e.g. `tpchgen-cli parquet`",
));
}
match self.command {
Some(Commands::Tbl(args)) => args.run().await,
Some(Commands::Csv(args)) => args.run().await,
Some(Commands::Parquet(args)) => args.run().await,
None => self.run().await,
}
}

#[allow(deprecated)]
async fn run(self) -> io::Result<()> {
let format = self.args.format.unwrap_or(OutputFormat::Tbl);
let scale_factor = self.args.common.scale_factor;
let output_dir = self.args.common.output_dir;
let num_threads = self.args.common.num_threads;
let verbose = self.args.common.verbose;
let quiet = self.args.common.quiet;
let stdout = self.args.common.stdout;
let delimiter = self.args.delimiter;
let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY);
let parquet_row_group_bytes = self
.args
.parquet_row_group_bytes
.unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES);

configure_logging(verbose, quiet);

// Warn about --format=parquet migration to subcommand
if format == OutputFormat::Parquet {
log::warn!(
"The --format=parquet flag will be replaced by the `parquet` subcommand in v4.0.0. Use `tpchgen-cli parquet` instead."
);
configure_logging(self.args.common.verbose, self.args.common.quiet);

// Warn about --format migration to subcommands (only when explicitly provided)
if self.args.format.is_some() {
match format {
OutputFormat::Parquet => {
log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli parquet` instead.");
}
OutputFormat::Csv => {
log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli csv` instead.");
}
OutputFormat::Tbl => {
log::warn!("The --format flag will be removed in v4.0.0. Use `tpchgen-cli tbl` instead.");
}
}
}

let parquet_compression = self.args.parquet_compression.unwrap_or(Compression::SNAPPY);
Comment thread
kevinjqliu marked this conversation as resolved.
Outdated
if self.args.parquet_compression.is_some() {
if format == OutputFormat::Parquet {
log::warn!("The --parquet-compression flag is deprecated. Use 'tpchgen-cli parquet --compression=...' instead");
} else {
log::warn!("Parquet compression option set but not generating Parquet files");
}
}

let parquet_row_group_bytes = self
.args
.parquet_row_group_bytes
.unwrap_or(DEFAULT_PARQUET_ROW_GROUP_BYTES);
Comment thread
kevinjqliu marked this conversation as resolved.
Outdated
if self.args.parquet_row_group_bytes.is_some() {
if format == OutputFormat::Parquet {
log::warn!("The --parquet-row-group-bytes flag is deprecated. Use 'tpchgen-cli parquet --row-group-bytes=...' instead");
Expand All @@ -307,70 +349,56 @@ impl Cli {
}
}

// Validate delimiter usage
if format == OutputFormat::Tbl && delimiter != ',' {
if self.args.delimiter.is_some() {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"The --delimiter option cannot be used with --format=tbl. TBL format uses the TPC-H standard pipe delimiter."
"The --delimiter flag is not supported at the top level. Use `tpchgen-cli csv --delimiter=...` instead.",
));
}

// Warn if delimiter is set but not generating CSV
if format != OutputFormat::Csv && delimiter != ',' {
log::warn!("Delimiter option set but not generating CSV");
}

// Build the generator using the library API
let mut builder = TpchGenerator::builder()
.with_scale_factor(scale_factor)
.with_output_dir(output_dir)
.with_format(format)
.with_num_threads(num_threads)
.with_parquet_compression(parquet_compression)
.with_parquet_row_group_bytes(parquet_row_group_bytes)
.with_stdout(stdout)
.with_csv_delimiter(delimiter);

if let Some(tables) = self.args.common.tables {
builder = builder.with_tables(tables);
let mut builder = self.args.common.builder(format);
if format == OutputFormat::Parquet {
builder = builder
.with_parquet_compression(parquet_compression)
.with_parquet_row_group_bytes(parquet_row_group_bytes);
}
builder.build().generate().await
}
}

if let Some(parts) = self.args.common.parts {
builder = builder.with_parts(parts);
}
if let Some(part) = self.args.common.part {
builder = builder.with_part(part);
}
impl TblArgs {
    /// Run the `tbl` subcommand: configure logging from the common
    /// verbosity flags, then build a TBL generator and run it to completion.
    async fn run(self) -> io::Result<()> {
        let Self { common } = self;
        configure_logging(common.verbose, common.quiet);

        let generator = common.builder(OutputFormat::Tbl).build();
        generator.generate().await
    }
}

builder.build().generate().await
impl CsvArgs {
    /// Run the `csv` subcommand: configure logging from the common
    /// verbosity flags, then build a CSV generator that uses the requested
    /// delimiter and run it to completion.
    async fn run(self) -> io::Result<()> {
        let (common, delimiter) = (self.common, self.delimiter);
        configure_logging(common.verbose, common.quiet);

        let generator = common
            .builder(OutputFormat::Csv)
            .with_csv_delimiter(delimiter)
            .build();
        generator.generate().await
    }
}

impl ParquetArgs {
async fn run(self) -> io::Result<()> {
configure_logging(self.common.verbose, self.common.quiet);

let mut builder = TpchGenerator::builder()
.with_scale_factor(self.common.scale_factor)
.with_output_dir(self.common.output_dir)
.with_format(OutputFormat::Parquet)
.with_num_threads(self.common.num_threads)
.with_stdout(self.common.stdout)
self.common
.builder(OutputFormat::Parquet)
.with_parquet_compression(self.compression)
.with_parquet_row_group_bytes(self.row_group_bytes);

if let Some(tables) = self.common.tables {
builder = builder.with_tables(tables);
}

if let Some(parts) = self.common.parts {
builder = builder.with_parts(parts);
}
if let Some(part) = self.common.part {
builder = builder.with_part(part);
}

builder.build().generate().await
.with_parquet_row_group_bytes(self.row_group_bytes)
.build()
.generate()
.await
}
}

Expand Down
Loading
Loading