@@ -38,17 +38,40 @@ Examples
3838
3939tpchgen-cli -s 1 --output-dir=/tmp/tpch
4040
41+ # Generate all tables in CSV format:
42+
43+ tpchgen-cli -s 1 --format=csv --output-dir=/tmp/tpch
44+
45+ # Generate scale factor one in CSV format with tab delimiter:
46+
47+ tpchgen-cli -s 1 --format=csv --delimiter='\t' --output-dir=/tmp/tpch
48+
4149# Generate the lineitem table at scale factor 100 in 10 Apache Parquet files to
42- # /tmp/tpch/lineitem
50+ # /tmp/tpch/lineitem:
4351
44- tpchgen-cli -s 100 --tables=lineitem --format=parquet --parts=10 --output-dir=/tmp/tpch
52+ tpchgen-cli parquet -s 100 --tables=lineitem --parts=10 --output-dir=/tmp/tpch
4553
4654# Generate scale factor one in current directory, seeing debug output
4755
48- RUST_LOG=debug tpchgen -s 1
56+ RUST_LOG=debug tpchgen-cli -s 1 --output-dir=/tmp/tpch
4957"#
5058) ]
5159struct Cli {
60+ #[ command( subcommand) ]
61+ command : Option < Commands > ,
62+
63+ #[ command( flatten) ]
64+ args : TopLevelArgs ,
65+ }
66+
67+ #[ derive( clap:: Subcommand ) ]
68+ enum Commands {
69+ /// Generate Apache Parquet output with Parquet-specific options
70+ Parquet ( ParquetArgs ) ,
71+ }
72+
73+ #[ derive( clap:: Args ) ]
74+ struct CommonArgs {
5275 /// Scale factor to create
5376 #[ arg( short, long, default_value_t = 1. ) ]
5477 scale_factor : f64 ,
@@ -69,13 +92,59 @@ struct Cli {
6992 #[ arg( long) ]
7093 part : Option < i32 > ,
7194
95+ /// The number of threads for parallel generation, defaults to the number of CPUs
96+ #[ arg( short, long, default_value_t = num_cpus:: get( ) ) ]
97+ num_threads : usize ,
98+
99+ /// Verbose output
100+ ///
101+ /// When specified, sets the log level to `info` and ignores the `RUST_LOG`
102+ /// environment variable. When not specified, uses `RUST_LOG`
103+ #[ arg( short, long, default_value_t = false , conflicts_with = "quiet" ) ]
104+ verbose : bool ,
105+
106+ /// Quiet mode - only show error-level logs
107+ #[ arg( short, long, default_value_t = false , conflicts_with = "verbose" ) ]
108+ quiet : bool ,
109+
110+ /// Write the output to stdout instead of a file.
111+ #[ arg( long, default_value_t = false ) ]
112+ stdout : bool ,
113+ }
114+
115+ #[ derive( clap:: Args ) ]
116+ struct TopLevelArgs {
117+ #[ command( flatten) ]
118+ common : CommonArgs ,
119+
72120 /// Output format: tbl, csv, parquet
73121 #[ arg( short, long, default_value = "tbl" ) ]
74122 format : OutputFormat ,
75123
76- /// The number of threads for parallel generation, defaults to the number of CPUs
77- #[ arg( short, long, default_value_t = num_cpus:: get( ) ) ]
78- num_threads : usize ,
124+ /// Parquet block compression format (deprecated: use 'parquet' subcommand instead)
125+ #[ arg( short = 'c' , long, hide = true ) ]
126+ #[ deprecated]
127+ parquet_compression : Option < Compression > ,
128+
129+ /// Target row group size in bytes (deprecated: use 'parquet' subcommand instead)
130+ #[ arg( long, hide = true ) ]
131+ #[ deprecated]
132+ parquet_row_group_bytes : Option < i64 > ,
133+
134+ /// CSV delimiter character (default: ',')
135+ ///
136+ /// Specifies the delimiter character to use when generating CSV files.
137+ ///
138+ /// Supports escape sequences: \t (tab), \n (newline), \r (carriage return), \\ (backslash)
139+ /// Common delimiters: ',' (comma), '|' (pipe), '\t' (tab), ';' (semicolon)
140+ #[ arg( long, default_value = "," , value_parser = parse_delimiter) ]
141+ delimiter : char ,
142+ }
143+
144+ #[ derive( clap:: Args ) ]
145+ struct ParquetArgs {
146+ #[ command( flatten) ]
147+ common : CommonArgs ,
79148
80149 /// Parquet block compression format.
81150 ///
@@ -91,22 +160,7 @@ struct Cli {
91160 /// SNAPPY: 2.4G (0.75 GB/sec)
92161 /// UNCOMPRESSED: 3.8G (1.41 GB/sec)
93162 #[ arg( short = 'c' , long, default_value = "SNAPPY" ) ]
94- parquet_compression : Compression ,
95-
96- /// Verbose output
97- ///
98- /// When specified, sets the log level to `info` and ignores the `RUST_LOG`
99- /// environment variable. When not specified, uses `RUST_LOG`
100- #[ arg( short, long, default_value_t = false , conflicts_with = "quiet" ) ]
101- verbose : bool ,
102-
103- /// Quiet mode - only show error-level logs
104- #[ arg( short, long, default_value_t = false , conflicts_with = "verbose" ) ]
105- quiet : bool ,
106-
107- /// Write the output to stdout instead of a file.
108- #[ arg( long, default_value_t = false ) ]
109- stdout : bool ,
163+ compression : Compression ,
110164
111165 /// Target size in row group bytes in Parquet files
112166 ///
@@ -121,17 +175,7 @@ struct Cli {
121175 ///
122176 /// Typical values range from 10MB to 100MB.
123177 #[ arg( long, default_value_t = DEFAULT_PARQUET_ROW_GROUP_BYTES ) ]
124- parquet_row_group_bytes : i64 ,
125-
126- /// CSV delimiter character (default: ',')
127- ///
128- /// Specifies the delimiter character to use when generating CSV files.
129- /// This option only applies to CSV format and cannot be used with TBL format.
130- ///
131- /// Supports escape sequences: \t (tab), \n (newline), \r (carriage return), \\ (backslash)
132- /// Common delimiters: ',' (comma), '|' (pipe), '\t' (tab), ';' (semicolon)
133- #[ arg( long, default_value = "," , value_parser = parse_delimiter) ]
134- delimiter : char ,
178+ row_group_bytes : i64 ,
135179}
136180
137181/// Parse a delimiter string, handling escape sequences
@@ -207,71 +251,133 @@ async fn main() -> io::Result<()> {
207251impl Cli {
208252 /// Main function to run the generation
209253 async fn main ( self ) -> io:: Result < ( ) > {
210- // Configure logging
211- if self . quiet {
212- // Quiet mode: only show error-level logs
213- env_logger:: builder ( )
214- . filter_level ( LevelFilter :: Error )
215- . init ( ) ;
216- } else if self . verbose {
217- env_logger:: builder ( ) . filter_level ( LevelFilter :: Info ) . init ( ) ;
218- info ! ( "Verbose output enabled (ignoring RUST_LOG environment variable)" ) ;
219- } else {
220- // Default: show warnings and errors, but respect RUST_LOG if set
221- env_logger:: builder ( )
222- . filter_level ( LevelFilter :: Warn )
223- . parse_default_env ( )
224- . init ( ) ;
254+ match self . command {
255+ Some ( Commands :: Parquet ( args) ) => args. run ( ) . await ,
256+ None => self . run ( ) . await ,
225257 }
258+ }
259+
260+ #[ allow( deprecated) ]
261+ async fn run ( self ) -> io:: Result < ( ) > {
262+ let format = self . args . format ;
263+ let scale_factor = self . args . common . scale_factor ;
264+ let output_dir = self . args . common . output_dir ;
265+ let num_threads = self . args . common . num_threads ;
266+ let verbose = self . args . common . verbose ;
267+ let quiet = self . args . common . quiet ;
268+ let stdout = self . args . common . stdout ;
269+ let delimiter = self . args . delimiter ;
270+ let parquet_compression = self . args . parquet_compression . unwrap_or ( Compression :: SNAPPY ) ;
271+ let parquet_row_group_bytes = self
272+ . args
273+ . parquet_row_group_bytes
274+ . unwrap_or ( DEFAULT_PARQUET_ROW_GROUP_BYTES ) ;
275+
276+ configure_logging ( verbose, quiet) ;
226277
227278 // Warn if parquet specific options are set but not generating parquet
228- if self . format != OutputFormat :: Parquet {
229- if self . parquet_compression != Compression :: SNAPPY {
279+ if format == OutputFormat :: Parquet {
280+ log:: warn!(
281+ "Warning: Use 'tpchgen-cli parquet' subcommand instead of '--format=parquet' for better validation and control"
282+ ) ;
283+ }
284+
285+ if self . args . parquet_compression . is_some ( ) {
286+ if format == OutputFormat :: Parquet {
287+ log:: warn!( "The --parquet-compression flag is deprecated. Use 'tpchgen-cli parquet --compression=...' instead" ) ;
288+ } else {
230289 log:: warn!( "Parquet compression option set but not generating Parquet files" ) ;
231290 }
232- if self . parquet_row_group_bytes != DEFAULT_PARQUET_ROW_GROUP_BYTES {
291+ }
292+ if self . args . parquet_row_group_bytes . is_some ( ) {
293+ if format == OutputFormat :: Parquet {
294+ log:: warn!( "The --parquet-row-group-bytes flag is deprecated. Use 'tpchgen-cli parquet --row-group-bytes=...' instead" ) ;
295+ } else {
233296 log:: warn!( "Parquet row group size option set but not generating Parquet files" ) ;
234297 }
235298 }
236299
237300 // Validate delimiter usage
238- if self . format == OutputFormat :: Tbl && self . delimiter != ',' {
301+ if format == OutputFormat :: Tbl && delimiter != ',' {
239302 return Err ( io:: Error :: new (
240303 io:: ErrorKind :: InvalidInput ,
241304 "The --delimiter option cannot be used with --format=tbl. TBL format uses the TPC-H standard pipe delimiter."
242305 ) ) ;
243306 }
244307
245308 // Warn if delimiter is set but not generating CSV
246- if self . format != OutputFormat :: Csv && self . delimiter != ',' {
247- eprintln ! ( "Warning: Delimiter option set but not generating CSV files " ) ;
309+ if format != OutputFormat :: Csv && delimiter != ',' {
310+ log :: warn !( "Warning: Delimiter option set but not generating CSV" ) ;
248311 }
249312
250313 // Build the generator using the library API
251314 let mut builder = TpchGenerator :: builder ( )
252- . with_scale_factor ( self . scale_factor )
253- . with_output_dir ( self . output_dir )
254- . with_format ( self . format )
255- . with_num_threads ( self . num_threads )
256- . with_parquet_compression ( self . parquet_compression )
257- . with_parquet_row_group_bytes ( self . parquet_row_group_bytes )
258- . with_stdout ( self . stdout )
259- . with_csv_delimiter ( self . delimiter ) ;
260-
261- // Add tables if specified
262- if let Some ( tables) = self . tables {
315+ . with_scale_factor ( scale_factor)
316+ . with_output_dir ( output_dir)
317+ . with_format ( format)
318+ . with_num_threads ( num_threads)
319+ . with_parquet_compression ( parquet_compression)
320+ . with_parquet_row_group_bytes ( parquet_row_group_bytes)
321+ . with_stdout ( stdout)
322+ . with_csv_delimiter ( delimiter) ;
323+
324+ if let Some ( tables) = self . args . common . tables {
325+ builder = builder. with_tables ( tables) ;
326+ }
327+
328+ if let Some ( parts) = self . args . common . parts {
329+ builder = builder. with_parts ( parts) ;
330+ }
331+ if let Some ( part) = self . args . common . part {
332+ builder = builder. with_part ( part) ;
333+ }
334+
335+ builder. build ( ) . generate ( ) . await
336+ }
337+ }
338+
339+ impl ParquetArgs {
340+ async fn run ( self ) -> io:: Result < ( ) > {
341+ configure_logging ( self . common . verbose , self . common . quiet ) ;
342+
343+ let mut builder = TpchGenerator :: builder ( )
344+ . with_scale_factor ( self . common . scale_factor )
345+ . with_output_dir ( self . common . output_dir )
346+ . with_format ( OutputFormat :: Parquet )
347+ . with_num_threads ( self . common . num_threads )
348+ . with_stdout ( self . common . stdout )
349+ . with_parquet_compression ( self . compression )
350+ . with_parquet_row_group_bytes ( self . row_group_bytes ) ;
351+
352+ if let Some ( tables) = self . common . tables {
263353 builder = builder. with_tables ( tables) ;
264354 }
265355
266- // Add parts/part if specified
267- if let Some ( parts) = self . parts {
356+ if let Some ( parts) = self . common . parts {
268357 builder = builder. with_parts ( parts) ;
269358 }
270- if let Some ( part) = self . part {
359+ if let Some ( part) = self . common . part {
271360 builder = builder. with_part ( part) ;
272361 }
273362
274- // Generate using the library
275363 builder. build ( ) . generate ( ) . await
276364 }
277365}
366+
367+ fn configure_logging ( verbose : bool , quiet : bool ) {
368+ if quiet {
369+ // Quiet mode: only show error-level logs
370+ env_logger:: builder ( )
371+ . filter_level ( LevelFilter :: Error )
372+ . init ( ) ;
373+ } else if verbose {
374+ env_logger:: builder ( ) . filter_level ( LevelFilter :: Info ) . init ( ) ;
375+ info ! ( "Verbose output enabled (ignoring RUST_LOG environment variable)" ) ;
376+ } else {
377+ // Default: show warnings and errors, but respect RUST_LOG if set
378+ env_logger:: builder ( )
379+ . filter_level ( LevelFilter :: Warn )
380+ . parse_default_env ( )
381+ . init ( ) ;
382+ }
383+ }
0 commit comments