@@ -16,7 +16,10 @@ limitations under the License.
1616
1717use std:: collections:: HashMap ;
1818use std:: sync:: Arc ;
19+ use std:: time:: { SystemTime , UNIX_EPOCH } ;
1920
21+ use arrow:: array:: { RecordBatch , TimestampMicrosecondArray } ;
22+ use arrow:: datatypes:: { DataType , Field , Schema , SchemaRef , TimeUnit } ;
2023use data_generation:: config:: DatasetConfig as GenerationDatasetConfig ;
2124use data_generation:: dataset:: simple_sequence:: SimpleSequenceDataset ;
2225use data_generation:: dataset:: tpch:: TpchDataset ;
@@ -36,6 +39,38 @@ use crate::sink::{InsertOp, Sink};
3639
3740pub mod sink;
3841
42+ /// Column name appended by the ETL pipeline to every batch.
43+ const CREATED_AT_COLUMN : & str = "__created_at" ;
44+
45+ /// Returns a new schema with the `__created_at` timestamp column appended.
46+ fn schema_with_created_at ( schema : & SchemaRef ) -> SchemaRef {
47+ let mut fields: Vec < _ > = schema. fields ( ) . iter ( ) . cloned ( ) . collect ( ) ;
48+ fields. push ( Arc :: new ( Field :: new (
49+ CREATED_AT_COLUMN ,
50+ DataType :: Timestamp ( TimeUnit :: Microsecond , Some ( "UTC" . into ( ) ) ) ,
51+ true ,
52+ ) ) ) ;
53+ Arc :: new ( Schema :: new ( fields) )
54+ }
55+
56+ /// Appends a `__created_at` column (current wall-clock time, microsecond UTC)
57+ /// to the given batch.
58+ fn append_created_at ( batch : & RecordBatch ) -> anyhow:: Result < RecordBatch > {
59+ let now_us = SystemTime :: now ( )
60+ . duration_since ( UNIX_EPOCH )
61+ . expect ( "system time before UNIX epoch" )
62+ . as_micros ( ) as i64 ;
63+
64+ let timestamps =
65+ TimestampMicrosecondArray :: from ( vec ! [ Some ( now_us) ; batch. num_rows( ) ] ) . with_timezone ( "UTC" ) ;
66+
67+ let new_schema = schema_with_created_at ( & batch. schema ( ) ) ;
68+ let mut columns: Vec < _ > = batch. columns ( ) . to_vec ( ) ;
69+ columns. push ( Arc :: new ( timestamps) ) ;
70+
71+ Ok ( RecordBatch :: try_new ( new_schema, columns) ?)
72+ }
73+
3974/// Specifies which dataset implementation to use for the ETL pipeline.
4075#[ derive( Debug , Clone ) ]
4176pub enum DatasetSource {
@@ -191,7 +226,7 @@ impl ETLPipeline {
191226 . into_iter ( )
192227 . map ( |( name, table) | {
193228 let config = ProtocolDatasetConfig {
194- schema : table. rehydrated_schema ( ) ,
229+ schema : schema_with_created_at ( & table. schema ) ,
195230 } ;
196231 ( name, config)
197232 } )
@@ -220,7 +255,6 @@ impl ETLPipeline {
220255
221256 let mut join_set: JoinSet < Result < String , String > > = JoinSet :: new ( ) ;
222257 for table_name in tables. keys ( ) {
223- let dataset = Arc :: clone ( & self . dataset ) ;
224258 let source = Arc :: clone ( & self . data_storage ) ;
225259 let target = Arc :: clone ( & self . data_sink ) ;
226260 let table_name = table_name. clone ( ) ;
@@ -237,8 +271,8 @@ impl ETLPipeline {
237271 let op = sink_op_from_batch_op ( & read_result. operation ) ;
238272
239273 for batch in read_result. batches {
240- let rehydrated = dataset . rehydrate ( & table_name , & batch) . map_err ( |e| {
241- format ! ( "rehydrate {table_name} batch {first_batch_id}: {e}" )
274+ let rehydrated = append_created_at ( & batch) . map_err ( |e| {
275+ format ! ( "append __created_at to {table_name} batch {first_batch_id}: {e}" )
242276 } ) ?;
243277
244278 target
@@ -289,8 +323,8 @@ impl ETLPipeline {
289323 /// each batch the task:
290324 ///
291325 /// 1. Reads the batch from the [`Source`].
292- /// 2. Rehydrates it through the [`Dataset`] (appending time columns, etc.) .
293- /// 3. Writes the rehydrated batch to the [`Sink`].
326+ /// 2. Appends the `__created_at` timestamp column .
327+ /// 3. Writes the enriched batch to the [`Sink`].
294328 ///
295329 /// The task transitions to [`PipelineState::Stopped`] when all batches are
296330 /// processed, the [`CancellationToken`] is triggered, or an error occurs.
@@ -329,7 +363,7 @@ impl ETLPipeline {
329363 work. sort_by_key ( |( _, id) | * id) ;
330364
331365 let handle = tokio:: spawn ( async move {
332- let reason = run_pipeline ( dataset , source, target, work, cancel) . await ;
366+ let reason = run_pipeline ( source, target, work, cancel) . await ;
333367 let _ = state_tx. send ( PipelineState :: Stopped ( reason) ) ;
334368 } ) ;
335369
@@ -355,7 +389,6 @@ impl ETLPipeline {
355389/// Groups work items by batch ID and processes all tables within each step
356390/// concurrently, checking for cancellation between steps.
357391async fn run_pipeline (
358- dataset : Arc < dyn Dataset > ,
359392 data_storage : Arc < dyn DataStorage > ,
360393 data_sink : Arc < dyn Sink > ,
361394 work : Vec < ( String , u64 ) > ,
@@ -433,7 +466,6 @@ async fn run_pipeline(
433466 // Process all tables for this batch_id concurrently.
434467 let mut join_set: JoinSet < Result < ( String , bool ) , String > > = JoinSet :: new ( ) ;
435468 for table_name in active_tables {
436- let dataset = Arc :: clone ( & dataset) ;
437469 let data_storage = Arc :: clone ( & data_storage) ;
438470 let data_sink = Arc :: clone ( & data_sink) ;
439471
@@ -462,18 +494,20 @@ async fn run_pipeline(
462494
463495 let op = sink_op_from_batch_op ( & read_result. operation ) ;
464496
465- // 2. Rehydrate each record batch and write to target
497+ // 2. Append __created_at and write to target
466498 for batch in read_result. batches {
467- let rehydrated = match dataset . rehydrate ( & table_name , & batch) {
499+ let rehydrated = match append_created_at ( & batch) {
468500 Ok ( b) => b,
469501 Err ( e) => {
470502 error ! (
471503 table = %table_name,
472504 batch_id,
473505 error = %e,
474- "Failed to rehydrate batch "
506+ "Failed to append __created_at column "
475507 ) ;
476- return Err ( format ! ( "rehydrate {table_name} batch {batch_id}: {e}" ) ) ;
508+ return Err ( format ! (
509+ "append __created_at to {table_name} batch {batch_id}: {e}"
510+ ) ) ;
477511 }
478512 } ;
479513
0 commit comments