|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +//! [`ProbeShuffleExec`] — a round-robin repartitioning wrapper that is invisible |
| 19 | +//! to DataFusion's `EnforceDistribution` / `EnforceSorting` optimizer passes. |
| 20 | +//! |
| 21 | +//! Those passes unconditionally strip every [`RepartitionExec`] before |
| 22 | +//! re-evaluating distribution requirements. Because `SpatialJoinExec` reports |
| 23 | +//! `UnspecifiedDistribution` for its inputs, a bare `RepartitionExec` that was |
| 24 | +//! inserted by the extension planner is removed and never re-added. |
| 25 | +//! |
| 26 | +//! `ProbeShuffleExec` wraps a hidden, internal `RepartitionExec` so that: |
| 27 | +//! * **Optimizer passes** see an opaque node (not a `RepartitionExec`) and leave |
| 28 | +//! it alone. |
| 29 | +//! * **`children()` / `with_new_children()`** expose the *original* input so |
| 30 | +//! the rest of the optimizer tree can still be rewritten normally. |
| 31 | +//! * **`execute()`** delegates to the internal `RepartitionExec` which performs |
| 32 | +//! the actual round-robin shuffle. |
| 33 | +
|
| 34 | +use std::any::Any; |
| 35 | +use std::fmt; |
| 36 | +use std::sync::Arc; |
| 37 | + |
| 38 | +use datafusion_common::config::ConfigOptions; |
| 39 | +use datafusion_common::{internal_err, plan_err, Result, Statistics}; |
| 40 | +use datafusion_execution::{SendableRecordBatchStream, TaskContext}; |
| 41 | +use datafusion_physical_expr::PhysicalExpr; |
| 42 | +use datafusion_physical_plan::execution_plan::CardinalityEffect; |
| 43 | +use datafusion_physical_plan::filter_pushdown::{ |
| 44 | + ChildPushdownResult, FilterDescription, FilterPushdownPhase, FilterPushdownPropagation, |
| 45 | +}; |
| 46 | +use datafusion_physical_plan::metrics::MetricsSet; |
| 47 | +use datafusion_physical_plan::projection::ProjectionExec; |
| 48 | +use datafusion_physical_plan::repartition::RepartitionExec; |
| 49 | +use datafusion_physical_plan::{ |
| 50 | + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning, |
| 51 | + PlanProperties, |
| 52 | +}; |
| 53 | + |
/// A round-robin repartitioning node that is invisible to DataFusion's
/// physical optimizer passes.
///
/// See [module-level documentation](self) for motivation and design.
#[derive(Debug)]
pub struct ProbeShuffleExec {
    /// The hidden [`RepartitionExec`] this node wraps. Plan properties,
    /// execution, metrics and statistics are all delegated to it, while
    /// `children()` exposes its input rather than the node itself.
    inner_repartition: RepartitionExec,
}
| 62 | + |
| 63 | +impl ProbeShuffleExec { |
| 64 | + /// Create a new [`ProbeShuffleExec`] that round-robin repartitions `input` |
| 65 | + /// into the same number of output partitions as `input`. This will ensure |
| 66 | + /// that the probe workload of a spatial join will be evenly distributed. |
| 67 | + /// More importantly, shuffled probe side data will be less likely to |
| 68 | + /// cause skew issues when out-of-core, spatial partitioned spatial join is enabled, |
| 69 | + /// especially when the input probe data is sorted by their spatial locations. |
| 70 | + pub fn try_new(input: Arc<dyn ExecutionPlan>) -> Result<Self> { |
| 71 | + let num_partitions = input.output_partitioning().partition_count(); |
| 72 | + let inner_repartition = RepartitionExec::try_new( |
| 73 | + Arc::clone(&input), |
| 74 | + Partitioning::RoundRobinBatch(num_partitions), |
| 75 | + )?; |
| 76 | + Ok(Self { inner_repartition }) |
| 77 | + } |
| 78 | + |
| 79 | + /// Try to wrap the given [`RepartitionExec`] `plan` with [`ProbeShuffleExec`]. |
| 80 | + pub fn try_wrap_repartition(plan: Arc<dyn ExecutionPlan>) -> Result<Self> { |
| 81 | + let Some(repartition_exec) = plan.as_any().downcast_ref::<RepartitionExec>() else { |
| 82 | + return plan_err!( |
| 83 | + "ProbeShuffleExec can only wrap RepartitionExec, but got {}", |
| 84 | + plan.name() |
| 85 | + ); |
| 86 | + }; |
| 87 | + Ok(Self { |
| 88 | + inner_repartition: repartition_exec.clone(), |
| 89 | + }) |
| 90 | + } |
| 91 | + |
| 92 | + /// Number of output partitions. |
| 93 | + pub fn num_partitions(&self) -> usize { |
| 94 | + self.inner_repartition |
| 95 | + .properties() |
| 96 | + .output_partitioning() |
| 97 | + .partition_count() |
| 98 | + } |
| 99 | +} |
| 100 | + |
| 101 | +impl DisplayAs for ProbeShuffleExec { |
| 102 | + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { |
| 103 | + match t { |
| 104 | + DisplayFormatType::Default | DisplayFormatType::Verbose => { |
| 105 | + write!( |
| 106 | + f, |
| 107 | + "ProbeShuffleExec: partitioning=RoundRobinBatch({})", |
| 108 | + self.num_partitions() |
| 109 | + ) |
| 110 | + } |
| 111 | + DisplayFormatType::TreeRender => { |
| 112 | + write!(f, "partitioning=RoundRobinBatch({})", self.num_partitions()) |
| 113 | + } |
| 114 | + } |
| 115 | + } |
| 116 | +} |
| 117 | + |
| 118 | +impl ExecutionPlan for ProbeShuffleExec { |
| 119 | + fn name(&self) -> &str { |
| 120 | + "ProbeShuffleExec" |
| 121 | + } |
| 122 | + |
| 123 | + fn as_any(&self) -> &dyn Any { |
| 124 | + self |
| 125 | + } |
| 126 | + |
| 127 | + fn properties(&self) -> &PlanProperties { |
| 128 | + self.inner_repartition.properties() |
| 129 | + } |
| 130 | + |
| 131 | + fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { |
| 132 | + vec![self.inner_repartition.input()] |
| 133 | + } |
| 134 | + |
| 135 | + fn with_new_children( |
| 136 | + self: Arc<Self>, |
| 137 | + mut children: Vec<Arc<dyn ExecutionPlan>>, |
| 138 | + ) -> Result<Arc<dyn ExecutionPlan>> { |
| 139 | + if children.len() != 1 { |
| 140 | + return internal_err!( |
| 141 | + "ProbeShuffleExec expects exactly 1 child, got {}", |
| 142 | + children.len() |
| 143 | + ); |
| 144 | + } |
| 145 | + let child = children.remove(0); |
| 146 | + Ok(Arc::new(Self::try_new(child)?)) |
| 147 | + } |
| 148 | + |
| 149 | + fn execute( |
| 150 | + &self, |
| 151 | + partition: usize, |
| 152 | + context: Arc<TaskContext>, |
| 153 | + ) -> Result<SendableRecordBatchStream> { |
| 154 | + self.inner_repartition.execute(partition, context) |
| 155 | + } |
| 156 | + |
| 157 | + fn maintains_input_order(&self) -> Vec<bool> { |
| 158 | + self.inner_repartition.maintains_input_order() |
| 159 | + } |
| 160 | + |
| 161 | + fn benefits_from_input_partitioning(&self) -> Vec<bool> { |
| 162 | + self.inner_repartition.benefits_from_input_partitioning() |
| 163 | + } |
| 164 | + |
| 165 | + fn cardinality_effect(&self) -> CardinalityEffect { |
| 166 | + self.inner_repartition.cardinality_effect() |
| 167 | + } |
| 168 | + |
| 169 | + fn metrics(&self) -> Option<MetricsSet> { |
| 170 | + self.inner_repartition.metrics() |
| 171 | + } |
| 172 | + |
| 173 | + fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> { |
| 174 | + self.inner_repartition.partition_statistics(partition) |
| 175 | + } |
| 176 | + |
| 177 | + fn try_swapping_with_projection( |
| 178 | + &self, |
| 179 | + projection: &ProjectionExec, |
| 180 | + ) -> Result<Option<Arc<dyn ExecutionPlan>>> { |
| 181 | + let Some(new_repartition) = self |
| 182 | + .inner_repartition |
| 183 | + .try_swapping_with_projection(projection)? |
| 184 | + else { |
| 185 | + return Ok(None); |
| 186 | + }; |
| 187 | + let new_plan = Self::try_wrap_repartition(new_repartition)?; |
| 188 | + Ok(Some(Arc::new(new_plan))) |
| 189 | + } |
| 190 | + |
| 191 | + fn gather_filters_for_pushdown( |
| 192 | + &self, |
| 193 | + phase: FilterPushdownPhase, |
| 194 | + parent_filters: Vec<Arc<dyn PhysicalExpr>>, |
| 195 | + config: &ConfigOptions, |
| 196 | + ) -> Result<FilterDescription> { |
| 197 | + self.inner_repartition |
| 198 | + .gather_filters_for_pushdown(phase, parent_filters, config) |
| 199 | + } |
| 200 | + |
| 201 | + fn handle_child_pushdown_result( |
| 202 | + &self, |
| 203 | + phase: FilterPushdownPhase, |
| 204 | + child_pushdown_result: ChildPushdownResult, |
| 205 | + config: &ConfigOptions, |
| 206 | + ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> { |
| 207 | + self.inner_repartition |
| 208 | + .handle_child_pushdown_result(phase, child_pushdown_result, config) |
| 209 | + } |
| 210 | + |
| 211 | + fn repartitioned( |
| 212 | + &self, |
| 213 | + target_partitions: usize, |
| 214 | + config: &ConfigOptions, |
| 215 | + ) -> Result<Option<Arc<dyn ExecutionPlan>>> { |
| 216 | + let Some(plan) = self |
| 217 | + .inner_repartition |
| 218 | + .repartitioned(target_partitions, config)? |
| 219 | + else { |
| 220 | + return Ok(None); |
| 221 | + }; |
| 222 | + let new_plan = Self::try_wrap_repartition(plan)?; |
| 223 | + Ok(Some(Arc::new(new_plan))) |
| 224 | + } |
| 225 | +} |
0 commit comments