spiceai
diff --git a/‎ballista/client/tests/context_checks.rs‎
Lines changed: 165 additions & 0 deletions b/‎ballista/client/tests/context_checks.rs‎
Lines changed: 165 additions & 0 deletions
diff --git a/‎ballista/core/proto/ballista.proto‎
Lines changed: 35 additions & 0 deletions b/‎ballista/core/proto/ballista.proto‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎ballista/core/proto/datafusion.proto‎
Lines changed: 10 additions & 0 deletions b/‎ballista/core/proto/datafusion.proto‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎ballista/core/src/execution_plans/distributed_query.rs‎
Lines changed: 36 additions & 4 deletions b/‎ballista/core/src/execution_plans/distributed_query.rs‎
Lines changed: 36 additions & 4 deletions
diff --git a/‎ballista/core/src/planner.rs‎
Lines changed: 41 additions & 2 deletions b/‎ballista/core/src/planner.rs‎
Lines changed: 41 additions & 2 deletions
@@ -1077,4 +1077,169 @@ mod supported {
         ];
         assert_batches_eq!(expected, &result);
     }
+
+    #[rstest]
+    #[case::standalone(standalone_context())]
+    #[case::remote(remote_context())]
+    #[tokio::test]
+    async fn should_execute_explain_format_tree_query_correctly(
+        #[future(awt)]
+        #[case]
+        ctx: SessionContext,
+    ) {
+        use datafusion::arrow::array::StringArray;
+
+        // Runs `EXPLAIN FORMAT TREE` against both the standalone and the
+        // remote context and checks the shape of the returned batch.
+        let query = "EXPLAIN FORMAT TREE select count(*), id from (select unnest([1,2,3,4,5]) as id) group by id";
+        let dataframe = ctx.sql(query).await.unwrap();
+        let batches = dataframe.collect().await.unwrap();
+
+        // A single batch is expected, holding two columns (plan_type, plan)
+        // and two rows: the tree-rendered physical_plan followed by the
+        // Ballista distributed_plan. No logical_plan row is expected.
+        assert_eq!(batches.len(), 1);
+        let explain_batch = &batches[0];
+        assert_eq!(explain_batch.num_columns(), 2);
+        assert_eq!(explain_batch.column(0).len(), 2);
+
+        // Column 0 names each row's plan kind.
+        let plan_types = explain_batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        assert_eq!(plan_types.value(0), "physical_plan");
+        assert_eq!(plan_types.value(1), "distributed_plan");
+
+        // Column 1 holds the rendered plan text for each row.
+        let plans = explain_batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+
+        // Tree rendering draws boxes, so at least one of these box-drawing
+        // glyphs has to show up in the physical plan text.
+        let rendered_physical = plans.value(0);
+        let has_box_glyph = rendered_physical
+            .chars()
+            .any(|glyph| matches!(glyph, '┌' | '│'));
+        assert!(
+            has_box_glyph,
+            "Expected tree format with box characters in physical_plan, got: {}",
+            rendered_physical
+        );
+
+        // The distributed plan only needs to be present.
+        let rendered_distributed = plans.value(1);
+        assert!(
+            !rendered_distributed.is_empty(),
+            "Expected non-empty distributed_plan"
+        );
+    }
+
+    #[rstest]
+    #[case::standalone(standalone_context())]
+    #[case::remote(remote_context())]
+    #[tokio::test]
+    async fn should_execute_explain_analyze_query_correctly(
+        #[future(awt)]
+        #[case]
+        ctx: SessionContext,
+    ) {
+        use datafusion::arrow::array::StringArray;
+
+        // Runs `EXPLAIN ANALYZE` against both contexts and inspects the
+        // single annotated-plan row that comes back.
+        let query = "EXPLAIN ANALYZE select count(*), id from (select unnest([1,2,3,4,5]) as id) group by id";
+        let dataframe = ctx.sql(query).await.unwrap();
+        let batches = dataframe.collect().await.unwrap();
+
+        assert_eq!(batches.len(), 1);
+        let analyze_batch = &batches[0];
+
+        // Expected shape: columns (plan_type, plan) and exactly one row
+        // carrying the annotated plan text rendered by the scheduler.
+        assert_eq!(analyze_batch.num_columns(), 2);
+        assert_eq!(analyze_batch.column(0).len(), 1);
+
+        let plan_types = analyze_batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        let plans = analyze_batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+
+        assert_eq!(plan_types.value(0), "Plan with Metrics");
+
+        let rendered = plans.value(0);
+        let is_blank = rendered.is_empty();
+        assert!(!is_blank, "Expected non-empty Plan with Metrics output");
+
+        // Each stage is introduced by a `Stage[stage_id=...` header.
+        let has_stage_header = rendered.contains("Stage[stage_id=");
+        assert!(
+            has_stage_header,
+            "Expected stage header in EXPLAIN ANALYZE output, got: {}",
+            rendered
+        );
+
+        // The per-stage rendering goes through the Ballista indent visitor,
+        // which appends a `metrics=[...]` suffix to every operator line.
+        let has_metrics_suffix = rendered.contains("metrics=[");
+        assert!(
+            has_metrics_suffix,
+            "Expected metrics=[ in EXPLAIN ANALYZE output, got: {}",
+            rendered
+        );
+
+        // The aggregation in the query should report a real output_rows metric.
+        let has_output_rows = rendered.contains("output_rows=");
+        assert!(
+            has_output_rows,
+            "Expected output_rows= in EXPLAIN ANALYZE output, got: {}",
+            rendered
+        );
+    }
+
+    #[rstest]
+    #[case::standalone(standalone_context())]
+    #[case::remote(remote_context())]
+    #[tokio::test]
+    async fn should_execute_explain_analyze_verbose_query_correctly(
+        #[future(awt)]
+        #[case]
+        ctx: SessionContext,
+    ) {
+        // VERBOSE must be accepted and propagate end-to-end. The rendered
+        // output shape is the same as plain EXPLAIN ANALYZE: one batch with
+        // columns (plan_type, plan) and a single "Plan with Metrics" row.
+        let result = ctx
+            .sql("EXPLAIN ANALYZE VERBOSE select count(*), id from (select unnest([1,2,3,4,5]) as id) group by id")
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+
+        assert_eq!(result.len(), 1);
+        let batch = &result[0];
+        assert_eq!(batch.num_columns(), 2);
+        assert_eq!(batch.column(0).len(), 1);
+
+        let plan_type_col = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<datafusion::arrow::array::StringArray>()
+            .unwrap();
+        let plan_col = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<datafusion::arrow::array::StringArray>()
+            .unwrap();
+
+        assert_eq!(plan_type_col.value(0), "Plan with Metrics");
+        let plan_txt = plan_col.value(0);
+        // Include the rendered plan in each failure message so a failing run
+        // shows what the scheduler actually produced.
+        assert!(
+            plan_txt.contains("Stage[stage_id="),
+            "Expected stage header in EXPLAIN ANALYZE VERBOSE output, got: {}",
+            plan_txt
+        );
+        assert!(
+            plan_txt.contains("metrics=["),
+            "Expected metrics=[ in EXPLAIN ANALYZE VERBOSE output, got: {}",
+            plan_txt
+        );
+    }
 }
@@ -38,6 +38,35 @@ message BallistaPhysicalPlanNode {
   }
 }
 
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Ballista Logical Plan extensions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Ballista wrapper around the DataFusion `Explain` logical plan node.
+// Preserves `explain_format` across the client -> scheduler boundary,
+// because the datafusion-proto `ExplainNode` does not encode that field.
+message BallistaExplainNode {
+  bool verbose = 1; // verbose flag carried over from the Explain node
+  // Requested output format; one of: "indent", "tree", "pgjson", "graphviz"
+  string explain_format = 2;
+}
+
+// Ballista wrapper around the DataFusion `Analyze` logical plan node.
+// Reserved for carrying distributed-analyze specific flags forward.
+message BallistaAnalyzeNode {
+  bool verbose = 1; // verbose flag carried over from the Analyze node
+}
+
+// Discriminating wrapper encoded as the payload of a Ballista logical
+// extension node. It gives the codec a single entry point from which to
+// route to the appropriate Ballista logical extension type.
+message BallistaLogicalExtensionNode {
+  oneof node { // at most one variant is set; it selects the extension type
+    BallistaExplainNode explain = 1; // wrapped EXPLAIN node
+    BallistaAnalyzeNode analyze = 2; // wrapped EXPLAIN ANALYZE node
+  }
+}
+
 message ShuffleWriterExecNode {
   //TODO it seems redundant to provide job and stage id here since we also have them
   // in the TaskDefinition that wraps this plan
@@ -611,6 +640,12 @@ message SuccessfulJob {
   uint64 queued_at = 2;
   uint64 started_at = 3;
   uint64 ended_at = 4;
+  // Set when the original query was `EXPLAIN ANALYZE`. Contains the
+  // rendered annotated plan text produced by the scheduler from the
+  // per-stage metrics collected during distributed execution. When
+  // present, the client synthesizes a 2-column (plan_type, plan) output
+  // batch from this text instead of fetching partition data.
+  optional string analyzed_plan_text = 5;
 }
 
 message QueuedJob {
 
@@ -223,9 +223,19 @@ message AnalyzeNode {
   bool verbose = 2;
 }
 
+// Output formats that an EXPLAIN statement can request.
+enum ExplainFormat {
+  EXPLAIN_FORMAT_UNSPECIFIED = 0; // zero value: no format was specified
+  EXPLAIN_FORMAT_INDENT = 1; // indented text rendering
+  EXPLAIN_FORMAT_TREE = 2; // tree rendering
+  EXPLAIN_FORMAT_POSTGRES_JSON = 3; // PostgreSQL-style JSON rendering
+  EXPLAIN_FORMAT_GRAPHVIZ = 4; // Graphviz rendering
+}
+
 message ExplainNode {
   LogicalPlanNode input = 1;
   bool verbose = 2;
+  ExplainFormat format = 3; // EXPLAIN output format (see ExplainFormat)
 }
 
 message AggregateNode {
 
@@ -48,7 +48,7 @@ use datafusion::physical_plan::{
 use datafusion_proto::logical_plan::{
     AsLogicalPlan, DefaultLogicalExtensionCodec, LogicalExtensionCodec,
 };
-use futures::{Stream, StreamExt, TryFutureExt, TryStreamExt};
+use futures::{StreamExt, TryFutureExt, TryStreamExt};
 use log::{debug, error, info};
 use std::any::Any;
 use std::fmt::Debug;
@@ -271,6 +271,7 @@ impl<T: 'static + AsLogicalPlan> ExecutionPlan for DistributedQueryExec<T> {
                 customize_endpoint,
                 use_tls,
                 result_fetch_callback,
+                self.schema(),
             )
             .map_err(|e| ArrowError::ExternalError(Box::new(e))),
         )
@@ -314,7 +315,8 @@ async fn execute_query(
     customize_endpoint: Option<Arc<BallistaConfigGrpcEndpoint>>,
     use_tls: bool,
     result_fetch_callback: Option<Arc<dyn ResultFetchMetricsCallback>>,
-) -> Result<impl Stream<Item = Result<RecordBatch>> + Send> {
+    output_schema: SchemaRef,
+) -> Result<futures::stream::BoxStream<'static, Result<RecordBatch>>> {
     // Capture query submission time for total_query_time_ms
     let query_start_time = std::time::Instant::now();
 
@@ -409,7 +411,7 @@ async fn execute_query(
                 started_at,
                 ended_at,
                 partition_location,
-                ..
+                analyzed_plan_text,
             })) => {
                 // Calculate job execution time (server-side execution)
                 let job_execution_ms = ended_at.saturating_sub(started_at);
@@ -442,6 +444,14 @@ async fn execute_query(
                 // happens lazily when the stream is consumed, not during execute_query.
                 // This could be added in a future enhancement by wrapping the stream.
 
+                // If the server reports an EXPLAIN ANALYZE result, synthesize the
+                // output locally using the scheduler-rendered plan text. We skip
+                // partition fetching entirely in this case.
+                if let Some(text) = analyzed_plan_text {
+                    let batch = build_analyze_record_batch(&output_schema, text)?;
+                    break Ok(futures::stream::iter(vec![Ok(batch)]).boxed());
+                }
+
                 let streams = partition_location.into_iter().map(move |partition| {
                     let callback = result_fetch_callback.clone();
                     let f = fetch_partition(
@@ -457,12 +467,34 @@ async fn execute_query(
                     futures::stream::once(f).try_flatten()
                 });
 
-                break Ok(futures::stream::iter(streams).flatten());
+                break Ok(futures::stream::iter(streams).flatten().boxed());
             }
         };
     }
 }
 
+/// Builds the one-row `RecordBatch` that a distributed `EXPLAIN ANALYZE`
+/// statement returns to the caller.
+///
+/// `schema` must describe exactly two columns (the `plan_type` / `plan`
+/// layout of `LogicalPlan::Analyze`); `text` is the rendered plan stored in
+/// the second column, while the first column is always `"Plan with Metrics"`.
+///
+/// # Errors
+///
+/// Returns `DataFusionError::Internal` when the schema does not have two
+/// fields, and `DataFusionError::ArrowError` when the batch cannot be built.
+fn build_analyze_record_batch(
+    schema: &SchemaRef,
+    text: String,
+) -> Result<RecordBatch, DataFusionError> {
+    use datafusion::arrow::array::StringArray;
+
+    // Guard: anything other than a two-column schema is a planning bug.
+    let column_count = schema.fields().len();
+    if column_count != 2 {
+        let message = format!(
+            "expected EXPLAIN ANALYZE schema to have 2 columns, got {}",
+            column_count
+        );
+        return Err(DataFusionError::Internal(message));
+    }
+
+    let label_column = Arc::new(StringArray::from(vec!["Plan with Metrics"]));
+    let text_column = Arc::new(StringArray::from(vec![text]));
+
+    match RecordBatch::try_new(Arc::clone(schema), vec![label_column, text_column]) {
+        Ok(batch) => Ok(batch),
+        Err(arrow_err) => Err(DataFusionError::ArrowError(Box::new(arrow_err), None)),
+    }
+}
+
 async fn fetch_partition(
     location: PartitionLocation,
     max_message_size: usize,
 
@@ -18,13 +18,14 @@
 use crate::config::BallistaConfig;
 use crate::execution_plans::DistributedQueryExec;
 use crate::serde::BallistaLogicalExtensionCodec;
+use crate::serde::logical_plan_ext::{BallistaAnalyzeNode, BallistaExplainNode};
 
 use async_trait::async_trait;
 use datafusion::arrow::datatypes::Schema;
 use datafusion::common::tree_node::{TreeNode, TreeNodeVisitor};
 use datafusion::error::DataFusionError;
 use datafusion::execution::context::{QueryPlanner, SessionState};
-use datafusion::logical_expr::{LogicalPlan, TableScan};
+use datafusion::logical_expr::{Extension, LogicalPlan, TableScan};
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::physical_plan::empty::EmptyExec;
 use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
@@ -131,10 +132,17 @@ impl<T: 'static + AsLogicalPlan> QueryPlanner for BallistaQueryPlanner<T> {
                 _ => {
                     log::debug!("create_physical_plan - handling general statement");
 
+                    // For EXPLAIN / EXPLAIN ANALYZE, wrap the plan in a Ballista
+                    // logical extension so fields (like `explain_format`) survive
+                    // the client -> scheduler serialization round-trip. The
+                    // scheduler unwraps these before physical planning.
+                    let plan_to_send =
+                        wrap_explain_analyze_for_distribution(logical_plan);
+
                     Ok(Arc::new(DistributedQueryExec::<T>::with_extension(
                         self.scheduler_url.clone(),
                         self.config.clone(),
-                        logical_plan.clone(),
+                        plan_to_send,
                         self.extension_codec.clone(),
                         session_state.session_id().to_string(),
                     )))
@@ -144,6 +152,37 @@ impl<T: 'static + AsLogicalPlan> QueryPlanner for BallistaQueryPlanner<T> {
     }
 }
 
+/// Prepares a logical plan for shipment to the scheduler.
+///
+/// A top-level `LogicalPlan::Explain` or `LogicalPlan::Analyze` is replaced
+/// by a `LogicalPlan::Extension` holding the matching Ballista node, so that
+/// fields such as `explain_format` survive serialization to the scheduler
+/// via `datafusion-proto`. Every other plan is returned as a plain clone.
+fn wrap_explain_analyze_for_distribution(plan: &LogicalPlan) -> LogicalPlan {
+    // EXPLAIN: carry verbosity, format, inner plan and output schema.
+    if let LogicalPlan::Explain(explain) = plan {
+        let wrapped = Arc::new(BallistaExplainNode {
+            verbose: explain.verbose,
+            explain_format: explain.explain_format.clone(),
+            plan: explain.plan.clone(),
+            schema: explain.schema.clone(),
+        });
+        return LogicalPlan::Extension(Extension { node: wrapped });
+    }
+
+    // EXPLAIN ANALYZE: carry verbosity, input plan and output schema.
+    if let LogicalPlan::Analyze(analyze) = plan {
+        let wrapped = Arc::new(BallistaAnalyzeNode {
+            verbose: analyze.verbose,
+            input: analyze.input.clone(),
+            schema: analyze.schema.clone(),
+        });
+        return LogicalPlan::Extension(Extension { node: wrapped });
+    }
+
+    // Anything else is sent as-is.
+    plan.clone()
+}
+
 /// A Visitor which detect if query is using local tables,
 /// such as tables located in `information_schema` and returns true
 /// only if all scans are in from local tables