@@ -40,6 +40,7 @@ use crate::execution_plans::shuffle_manager::{
4040 InMemoryShuffleManager , ShufflePartitionData , global_shuffle_manager,
4141} ;
4242use crate :: extension:: SessionConfigExt ;
43+ use crate :: shuffle_storage:: ShuffleStorageType ;
4344use crate :: utils;
4445
4546use crate :: serde:: protobuf:: ShuffleWritePartition ;
@@ -317,6 +318,18 @@ impl ShuffleWriterExec {
317318 // Use memory mode only for intermediate stages, not for the final output stage
318319 let use_memory = memory_mode && !is_final_stage;
319320
321+ // Check for object store shuffle configuration
322+ let storage_type_str = context. session_config ( ) . ballista_shuffle_storage_type ( ) ;
323+ let storage_type: ShuffleStorageType = storage_type_str
324+ . parse ( )
325+ . unwrap_or ( ShuffleStorageType :: Local ) ;
326+ let storage_url = context. session_config ( ) . ballista_shuffle_storage_url ( ) ;
327+ let use_object_store = !use_memory
328+ && matches ! (
329+ storage_type,
330+ ShuffleStorageType :: S3 | ShuffleStorageType :: Azure
331+ ) ;
332+
320333 // Get shuffle format from session config
321334 let shuffle_format = context. session_config ( ) . ballista_shuffle_format ( ) ;
322335 let file_ext = utils:: shuffle_file_extension ( shuffle_format) ;
@@ -338,6 +351,20 @@ impl ShuffleWriterExec {
338351 shuffle_format,
339352 )
340353 . await
354+ } else if use_object_store {
355+ // Use object store (S3 or Azure) for shuffle data
356+ Self :: execute_shuffle_write_object_store (
357+ & job_id,
358+ stage_id,
359+ input_partition,
360+ & mut stream,
361+ output_partitioning,
362+ write_metrics,
363+ now,
364+ storage_type,
365+ storage_url,
366+ )
367+ . await
341368 } else {
342369 // Use disk-based shuffle storage with configurable format
343370 // This is used for:
@@ -507,6 +534,158 @@ impl ShuffleWriterExec {
507534 }
508535 }
509536
537+ /// Executes shuffle write to an object store (S3 or Azure).
538+ ///
539+ /// Uses Arrow IPC format with LZ4 compression for serialization. Data is serialized
540+ /// to an in-memory buffer and then uploaded to the object store in a single PUT request.
541+ #[ allow( clippy:: too_many_arguments) ]
542+ async fn execute_shuffle_write_object_store (
543+ job_id : & str ,
544+ stage_id : usize ,
545+ input_partition : usize ,
546+ stream : & mut std:: pin:: Pin <
547+ Box < dyn datafusion:: physical_plan:: RecordBatchStream + Send > ,
548+ > ,
549+ output_partitioning : Option < Partitioning > ,
550+ write_metrics : ShuffleWriteMetrics ,
551+ now : Instant ,
552+ storage_type : ShuffleStorageType ,
553+ storage_url : Option < String > ,
554+ ) -> Result < Vec < ShuffleWritePartition > > {
555+ use crate :: shuffle_storage:: { ShuffleStorageConfig , ShuffleStorageFactory } ;
556+
557+ let base_url = storage_url. ok_or_else ( || {
558+ DataFusionError :: Configuration ( format ! (
559+ "Shuffle storage URL must be set when using {storage_type} storage type. Set the 'ballista.shuffle.storage_url' configuration."
560+ ) )
561+ } ) ?;
562+
563+ let config = ShuffleStorageConfig :: from_type_and_url ( storage_type, & base_url)
564+ . map_err ( |e| DataFusionError :: External ( Box :: new ( e) ) ) ?;
565+ let storage = ShuffleStorageFactory :: create ( & config)
566+ . map_err ( |e| DataFusionError :: External ( Box :: new ( e) ) ) ?;
567+
568+ let schema = stream. schema ( ) ;
569+
570+ match output_partitioning {
571+ None => {
572+ // No repartitioning — collect all batches and write them as a single partition
573+ let mut batches = Vec :: new ( ) ;
574+ while let Some ( result) = stream. next ( ) . await {
575+ let batch = result?;
576+ write_metrics. input_rows . add ( batch. num_rows ( ) ) ;
577+ write_metrics. output_rows . add ( batch. num_rows ( ) ) ;
578+ batches. push ( batch) ;
579+ }
580+
581+ let ( path, stats) = storage
582+ . write_shuffle_data (
583+ job_id,
584+ stage_id,
585+ input_partition,
586+ input_partition,
587+ batches,
588+ schema,
589+ & write_metrics. write_time ,
590+ )
591+ . await
592+ . map_err ( |e| DataFusionError :: External ( Box :: new ( e) ) ) ?;
593+
594+ info ! (
595+ "Executed partition {} to object store in {} seconds. Statistics: {}" ,
596+ input_partition,
597+ now. elapsed( ) . as_secs( ) ,
598+ stats
599+ ) ;
600+
601+ Ok ( vec ! [ ShuffleWritePartition {
602+ partition_id: input_partition as u64 ,
603+ path,
604+ num_batches: stats. num_batches. unwrap_or( 0 ) ,
605+ num_rows: stats. num_rows. unwrap_or( 0 ) ,
606+ num_bytes: stats. num_bytes. unwrap_or( 0 ) ,
607+ } ] )
608+ }
609+
610+ Some ( Partitioning :: Hash ( exprs, num_output_partitions) ) => {
611+ // Hash-repartition: collect batches per output partition, then upload each
612+ let mut partition_batches: Vec < Option < ( Vec < RecordBatch > , usize , usize ) > > =
613+ ( 0 ..num_output_partitions) . map ( |_| None ) . collect ( ) ;
614+
615+ let mut partitioner = BatchPartitioner :: try_new (
616+ Partitioning :: Hash ( exprs, num_output_partitions) ,
617+ write_metrics. repart_time . clone ( ) ,
618+ ) ?;
619+
620+ while let Some ( result) = stream. next ( ) . await {
621+ let input_batch = result?;
622+ write_metrics. input_rows . add ( input_batch. num_rows ( ) ) ;
623+
624+ partitioner. partition (
625+ input_batch,
626+ |output_partition, output_batch| {
627+ let timer = write_metrics. write_time . timer ( ) ;
628+ let batch_rows = output_batch. num_rows ( ) ;
629+ match & mut partition_batches[ output_partition] {
630+ Some ( ( batches, num_batches, num_rows) ) => {
631+ * num_batches += 1 ;
632+ * num_rows += batch_rows;
633+ batches. push ( output_batch) ;
634+ }
635+ None => {
636+ partition_batches[ output_partition] =
637+ Some ( ( vec ! [ output_batch] , 1 , batch_rows) ) ;
638+ }
639+ }
640+ write_metrics. output_rows . add ( batch_rows) ;
641+ timer. done ( ) ;
642+ Ok ( ( ) )
643+ } ,
644+ ) ?;
645+ }
646+
647+ let mut part_locs = Vec :: new ( ) ;
648+
649+ for ( output_partition, entry) in
650+ partition_batches. into_iter ( ) . enumerate ( )
651+ {
652+ if let Some ( ( batches, _num_batches, _num_rows) ) = entry {
653+ let ( path, stats) = storage
654+ . write_shuffle_data (
655+ job_id,
656+ stage_id,
657+ output_partition,
658+ input_partition,
659+ batches,
660+ schema. clone ( ) ,
661+ & write_metrics. write_time ,
662+ )
663+ . await
664+ . map_err ( |e| DataFusionError :: External ( Box :: new ( e) ) ) ?;
665+
666+ debug ! (
667+ "Finished writing shuffle partition {} to object store. Stats: {}." ,
668+ output_partition, stats
669+ ) ;
670+
671+ part_locs. push ( ShuffleWritePartition {
672+ partition_id : output_partition as u64 ,
673+ path,
674+ num_batches : stats. num_batches . unwrap_or ( 0 ) ,
675+ num_rows : stats. num_rows . unwrap_or ( 0 ) ,
676+ num_bytes : stats. num_bytes . unwrap_or ( 0 ) ,
677+ } ) ;
678+ }
679+ }
680+ Ok ( part_locs)
681+ }
682+
683+ _ => Err ( DataFusionError :: Execution (
684+ "Invalid shuffle partitioning scheme" . to_owned ( ) ,
685+ ) ) ,
686+ }
687+ }
688+
510689 /// Executes shuffle write to in-memory storage.
511690 #[ allow( clippy:: too_many_arguments) ]
512691 async fn execute_shuffle_write_memory (
0 commit comments