From 340a5e2f80310aed91a77ea9a164c8edfdd1e785 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 8 Nov 2024 15:00:50 +0800 Subject: [PATCH 01/33] reservoir_sampling Signed-off-by: coldWater --- src/query/expression/src/lib.rs | 1 + src/query/expression/src/simpler/mod.rs | 15 +++ .../src/simpler/reservoir_sampling.rs | 96 +++++++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 src/query/expression/src/simpler/mod.rs create mode 100644 src/query/expression/src/simpler/reservoir_sampling.rs diff --git a/src/query/expression/src/lib.rs b/src/query/expression/src/lib.rs index f7b6824f4d153..bc576c3d633e3 100755 --- a/src/query/expression/src/lib.rs +++ b/src/query/expression/src/lib.rs @@ -64,6 +64,7 @@ mod register_vectorize; pub mod row; pub mod sampler; pub mod schema; +pub mod simpler; pub mod type_check; pub mod types; pub mod utils; diff --git a/src/query/expression/src/simpler/mod.rs b/src/query/expression/src/simpler/mod.rs new file mode 100644 index 0000000000000..d6e99fdf5a2e5 --- /dev/null +++ b/src/query/expression/src/simpler/mod.rs @@ -0,0 +1,15 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod reservoir_sampling; diff --git a/src/query/expression/src/simpler/reservoir_sampling.rs b/src/query/expression/src/simpler/reservoir_sampling.rs new file mode 100644 index 0000000000000..898303894a396 --- /dev/null +++ b/src/query/expression/src/simpler/reservoir_sampling.rs @@ -0,0 +1,96 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
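+// A note on the math below: Algorithm L keeps a reservoir of `k` items and,
+// instead of testing every element, draws how many elements to skip next.
+// `w` tracks the largest random "key" kept in the reservoir and shrinks as
+// `w *= rand()^(1/k)`; the next element kept is `floor(log(rand())/log(1-w)) + 1`
+// positions past the current one. The implementation expresses both with the
+// base-2 identities `x^(1/k) = exp2(log2(x)/k)` and
+// `log(a)/log(b) = log2(a)/log2(b)`, which is why only `log2`/`exp2` appear.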
+

use rand::Rng;

/// An implementation of Algorithm `L` (https://en.wikipedia.org/wiki/Reservoir_sampling#An_optimal_algorithm)
pub struct AlgoL<'a, R: Rng + ?Sized> {
    k: usize,
    r: &'a mut R,

    i: usize,
    w: f64,
}

impl<R: Rng + ?Sized> AlgoL<'_, R> {
    pub fn new<'a>(k: usize, rng: &'a mut R) -> AlgoL<'a, R> {
        assert!(k > 0);
        let mut al = AlgoL::<'a, R> {
            k,
            i: k - 1,
            w: 1.0,
            r: rng,
        };
        al.update_w();
        al
    }

    pub fn next_index(&mut self) -> usize {
        let i = (self.rng().log2() / (1.0 - self.w).log2()).floor() + 1.0 + self.i as f64;
        if i.is_normal() && i < u64::MAX as f64 {
            i as usize
        } else {
            usize::MAX
        }
    }

    pub fn pos(&mut self) -> usize {
        self.r.sample(rand::distributions::Uniform::new(0, self.k))
    }

    pub fn update(&mut self, i: usize) {
        self.i = i;
        self.update_w()
    }

    fn rng(&mut self) -> f64 {
        self.r.sample(rand::distributions::Open01)
    }

    fn update_w(&mut self) {
        self.w *= (self.rng().log2() / self.k as f64).exp2(); // rng ^ (1/k)
    }
}

#[cfg(test)]
mod tests {
    use rand::rngs::StdRng;
    use rand::SeedableRng;

    use super::*;

    #[test]
    fn test_algo_l() {
        let mut rng = StdRng::seed_from_u64(0);
        let mut sample = vec![0_u64; 10];

        let mut al = AlgoL::new(10, &mut rng);
        for (i, v) in sample.iter_mut().enumerate() {
            *v = i as u64
        }

        loop {
            let i = al.next_index();
            if i < 100 {
                sample[al.pos()] = i as u64;
                al.update(i)
            } else {
                break;
            }
        }

        let want: Vec<u64> = vec![69, 49, 53, 83, 4, 72, 88, 38, 45, 27];
        assert_eq!(want, sample)
    }
}
From 40fa33bd932126972543ffbd8abad6ec7aa94c51 Mon Sep 17 00:00:00 2001
From: coldWater 
Date: Fri, 8 Nov 2024 20:54:16 +0800
Subject: [PATCH 02/33] Simpler

Signed-off-by: coldWater 
---
 src/query/expression/src/simpler/mod.rs       |   1 +
 .../src/simpler/reservoir_sampling.rs         |  47 +++--
 src/query/expression/src/simpler/simpler.rs   | 176 ++++++++++++++++++
 3 files changed, 198 insertions(+), 26 deletions(-)
 create mode 100644 src/query/expression/src/simpler/simpler.rs

diff --git a/src/query/expression/src/simpler/mod.rs b/src/query/expression/src/simpler/mod.rs
index d6e99fdf5a2e5..1917509188299 100644
--- a/src/query/expression/src/simpler/mod.rs
+++ b/src/query/expression/src/simpler/mod.rs
@@ -13,3 +13,4 @@
 // limitations under the License.
 
 mod reservoir_sampling;
+mod simpler;
diff --git a/src/query/expression/src/simpler/reservoir_sampling.rs b/src/query/expression/src/simpler/reservoir_sampling.rs
index 898303894a396..2b9cd32d04d10 100644
--- a/src/query/expression/src/simpler/reservoir_sampling.rs
+++ b/src/query/expression/src/simpler/reservoir_sampling.rs
@@ -12,34 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
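+// The refactor below makes `AlgoL` own its RNG, encodes `k > 0` in the type
+// via `NonZeroUsize`, and turns the absolute `next_index` into a relative
+// skip count returned by `search()`. Callers now keep the running index
+// themselves; a minimal sketch of the new calling convention (the names
+// `reservoir`, `items` and `n` are illustrative, not part of this API):
+//
+//     let mut al = AlgoL::new(NonZeroUsize::new(k).unwrap(), rng);
+//     let mut i = k - 1;                  // reservoir starts as items[0..k]
+//     loop {
+//         i += al.search();               // jump over the skipped elements
+//         if i >= n { break; }
+//         reservoir[al.pos()] = items[i]; // overwrite a random slot
+//         al.update_w();
+//     }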
+use std::num::NonZeroUsize;
+
 use rand::Rng;
 
 /// An implementation of Algorithm `L` (https://en.wikipedia.org/wiki/Reservoir_sampling#An_optimal_algorithm)
-pub struct AlgoL<'a, R: Rng + ?Sized> {
+pub struct AlgoL<R: Rng> {
     k: usize,
-    r: &'a mut R,
-
-    i: usize,
     w: f64,
+
+    r: R,
 }
 
-impl<R: Rng + ?Sized> AlgoL<'_, R> {
-    pub fn new<'a>(k: usize, rng: &'a mut R) -> AlgoL<'a, R> {
-        assert!(k > 0);
-        let mut al = AlgoL::<'a, R> {
-            k,
-            i: k - 1,
+impl<R: Rng> AlgoL<R> {
+    pub fn new(k: NonZeroUsize, r: R) -> Self {
+        let mut al = Self {
+            k: k.into(),
             w: 1.0,
-            r: rng,
+            r,
         };
         al.update_w();
         al
     }
 
-    pub fn next_index(&mut self) -> usize {
-        let i = (self.rng().log2() / (1.0 - self.w).log2()).floor() + 1.0 + self.i as f64;
-        if i.is_normal() && i < u64::MAX as f64 {
-            i as usize
+    pub fn search(&mut self) -> usize {
+        let s = (self.rng().log2() / (1.0 - self.w).log2()).floor() + 1.0;
+        if s.is_normal() {
+            s as usize
         } else {
             usize::MAX
         }
     }
 
     pub fn pos(&mut self) -> usize {
         self.r.sample(rand::distributions::Uniform::new(0, self.k))
     }
 
-    pub fn update(&mut self, i: usize) {
-        self.i = i;
-        self.update_w()
+    pub fn update_w(&mut self) {
+        self.w *= (self.rng().log2() / self.k as f64).exp2(); // rng ^ (1/k)
     }
 
     fn rng(&mut self) -> f64 {
         self.r.sample(rand::distributions::Open01)
     }
-
-    fn update_w(&mut self) {
-        self.w *= (self.rng().log2() / self.k as f64).exp2(); // rng ^ (1/k)
-    }
 }
 
 #[cfg(test)]
 mod tests {
     use rand::rngs::StdRng;
     use rand::SeedableRng;
 
     use super::*;
 
     #[test]
     fn test_algo_l() {
-        let mut rng = StdRng::seed_from_u64(0);
+        let rng = StdRng::seed_from_u64(0);
         let mut sample = vec![0_u64; 10];
 
-        let mut al = AlgoL::new(10, &mut rng);
+        let mut al = AlgoL::new(10.try_into().unwrap(), rng);
         for (i, v) in sample.iter_mut().enumerate() {
             *v = i as u64
         }
 
+        let mut i = 9;
         loop {
-            let i = al.next_index();
+            i += al.search();
             if i < 100 {
                 sample[al.pos()] = i as u64;
-                al.update(i)
+                al.update_w()
             } else {
                 break;
             }
diff --git a/src/query/expression/src/simpler/simpler.rs b/src/query/expression/src/simpler/simpler.rs
new file mode 100644
index 0000000000000..15b7dfa82a854
--- /dev/null
+++ b/src/query/expression/src/simpler/simpler.rs
@@ -0,0 +1,176 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
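+// A block-level reservoir sampler. Instead of copying sampled rows right
+// away, `Simpler` records `(block, row, count)` indices driven by Algorithm
+// L and keeps (a projection of) an input block only if it contributed at
+// least one sampled row; `compact_indices` later drops blocks whose rows
+// have all been displaced from the reservoir.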
+

use std::collections::HashSet;

use rand::Rng;

use super::reservoir_sampling::AlgoL;
use crate::BlockRowIndex;
use crate::DataBlock;

pub struct Simpler<R: Rng> {
    columns: Vec<usize>,
    k: usize,

    blocks: Vec<DataBlock>,
    indices: Vec<BlockRowIndex>,
    core: AlgoL<R>,

    s: usize,
}

impl<R: Rng> Simpler<R> {
    pub fn new(columns: Vec<usize>, k: usize, rng: R) -> Self {
        let core = AlgoL::new(k.try_into().unwrap(), rng);
        Self {
            columns,
            blocks: Vec::new(),
            indices: Vec::with_capacity(k),
            k,
            core,
            s: usize::MAX,
        }
    }

    pub fn add_block(&mut self, data: DataBlock) {
        let rows = data.num_rows();
        assert!(rows > 0);
        let block_idx = self.blocks.len() as u32;
        let change = self.add_indices(rows, block_idx);
        if change {
            let columns = self
                .columns
                .iter()
                .map(|&offset| data.get_by_offset(offset).to_owned())
                .collect::<Vec<_>>();

            self.blocks.push(DataBlock::new(columns, rows));
        }
    }

    fn add_indices(&mut self, rows: usize, block_idx: u32) -> bool {
        let mut change = false;
        let mut cur: usize = 0;
        if self.indices.len() < self.k {
            if rows + self.indices.len() <= self.k {
                for i in 0..rows {
                    self.indices.push((block_idx, i as u32, 1));
                }
                if self.indices.len() == self.k {
                    self.s = self.core.search()
                }
                return true;
            }
            while self.indices.len() < self.k {
                self.indices.push((block_idx, cur as u32, 1));
                cur += 1;
            }
            self.s = self.core.search();
            change = true;
        }

        while rows - cur > self.s {
            change = true;
            cur += self.s;
            self.indices[self.core.pos()] = (block_idx, cur as u32, 1);
            self.core.update_w();
            self.s = self.core.search();
        }

        self.s -= rows - cur;
        change
    }

    pub fn compact_indices(&mut self) {
        let used_set: HashSet<_> = self.indices.iter().map(|&(b, _, _)| b).collect();
        if used_set.len() == self.blocks.len() {
            return;
        }

        let mut used: Vec<_> = used_set.iter().cloned().collect();
        used.sort();

        self.indices = self
            .indices
            .drain(..)
            .map(|(b, r, c)| (used.binary_search(&b).unwrap() as u32, r, c))
            .collect();

        self.blocks = self
            .blocks
            .drain(..)
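            // keep only the blocks that are still referenced by an index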
+ .enumerate() + .filter_map(|(i, block)| { + if used_set.contains(&(i as u32)) { + Some(block) + } else { + None + } + }) + .collect(); + } + + pub fn compact_blocks(&mut self) { + let rows = self.indices.len(); + let block = DataBlock::take_blocks(&self.blocks, &self.indices, rows); + self.blocks.clear(); + self.blocks.push(block); + + for (i, (b, r, _)) in self.indices.iter_mut().enumerate() { + *b = 0; + *r = i as u32; + } + } + + pub fn memory_size(self) -> usize { + self.blocks.iter().map(|b| b.memory_size()).sum() + } +} + +#[cfg(test)] +mod tests { + use rand::rngs::StdRng; + use rand::SeedableRng; + + use super::*; + + #[test] + fn test_add_indeces() { + let rng = StdRng::seed_from_u64(0); + let k = 5; + let core = AlgoL::new(k.try_into().unwrap(), rng); + let mut simpler = Simpler { + columns: vec![0], + blocks: Vec::new(), + indices: Vec::new(), + k, + core, + s: usize::MAX, + }; + + simpler.add_indices(15, 0); + + let want: Vec = + vec![(0, 10, 1), (0, 1, 1), (0, 2, 1), (0, 8, 1), (0, 12, 1)]; + assert_eq!(&want, &simpler.indices); + assert_eq!(0, simpler.s); + + simpler.add_indices(20, 1); + + let want: Vec = vec![(1, 0, 1), (0, 1, 1), (1, 6, 1), (0, 8, 1), (1, 9, 1)]; + assert_eq!(&want, &simpler.indices); + assert_eq!(1, simpler.s); + } +} From ac42d380864a651d6ff97dae932bd2920aadca6e Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 12 Nov 2024 12:12:19 +0800 Subject: [PATCH 03/33] TransformSortSimple Signed-off-by: coldWater --- src/common/base/src/base/watch_notify.rs | 23 +- .../expression/src/{simpler => }/simpler.rs | 52 ++- src/query/pipeline/core/src/processors/mod.rs | 1 + .../core/src/processors/shuffle_processor.rs | 10 +- .../src/processors/transforms/mod.rs | 2 +- .../transforms/transform_multi_sort_merge.rs | 4 +- .../src/pipelines/builders/builder_sort.rs | 93 ++++- .../pipelines/processors/transforms/mod.rs | 1 + .../processors/transforms/sort}/mod.rs | 9 +- .../transforms/sort/sort_exchange.rs | 148 ++++++++ .../processors/transforms/sort/sort_merge.rs | 62 ++++ .../processors/transforms/sort/sort_simple.rs | 333 ++++++++++++++++++ .../processors/transforms/sort/sort_wait.rs | 113 ++++++ src/query/settings/src/settings_default.rs | 7 + .../settings/src/settings_getter_setter.rs | 4 + 15 files changed, 825 insertions(+), 37 deletions(-) rename src/query/expression/src/{simpler => }/simpler.rs (80%) rename src/query/{expression/src/simpler => service/src/pipelines/processors/transforms/sort}/mod.rs (83%) create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_merge.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_simple.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs diff --git a/src/common/base/src/base/watch_notify.rs b/src/common/base/src/base/watch_notify.rs index be05dfc9028c0..61d0528a94c77 100644 --- a/src/common/base/src/base/watch_notify.rs +++ b/src/common/base/src/base/watch_notify.rs @@ -41,6 +41,16 @@ impl WatchNotify { let _ = rx.changed().await; } + pub fn has_notified(&self) -> bool { + match self.rx.has_changed() { + Ok(b) => b, + Err(_) => { + // The sender has never dropped before + unreachable!() + } + } + } + pub fn notify_waiters(&self) { let _ = self.tx.send_replace(true); } @@ -61,11 +71,18 @@ mod tests { #[tokio::test] async fn test_notify_waiters_ahead() { let notify = WatchNotify::new(); + assert!(!notify.has_notified()); + let 
notified1 = notify.notified(); + assert!(!notify.has_notified()); + // notify_waiters ahead of notified being instantiated and awaited notify.notify_waiters(); - + assert!(notify.has_notified()); // this should not await indefinitely - let notified = notify.notified(); - notified.await; + let notified2 = notify.notified(); + notified2.await; + + notified1.await; + assert!(notify.has_notified()); } } diff --git a/src/query/expression/src/simpler/simpler.rs b/src/query/expression/src/simpler.rs similarity index 80% rename from src/query/expression/src/simpler/simpler.rs rename to src/query/expression/src/simpler.rs index 15b7dfa82a854..ab928d721ec77 100644 --- a/src/query/expression/src/simpler/simpler.rs +++ b/src/query/expression/src/simpler.rs @@ -12,17 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod reservoir_sampling; + use std::collections::HashSet; use rand::Rng; +use reservoir_sampling::AlgoL; -use super::reservoir_sampling::AlgoL; use crate::BlockRowIndex; use crate::DataBlock; pub struct Simpler { columns: Vec, k: usize, + block_size: usize, blocks: Vec, indices: Vec, @@ -32,19 +35,20 @@ pub struct Simpler { } impl Simpler { - pub fn new(columns: Vec, k: usize, rng: R) -> Self { + pub fn new(columns: Vec, block_size: usize, k: usize, rng: R) -> Self { let core = AlgoL::new(k.try_into().unwrap(), rng); Self { columns, blocks: Vec::new(), indices: Vec::with_capacity(k), k, + block_size, core, s: usize::MAX, } } - pub fn add_block(&mut self, data: DataBlock) { + pub fn add_block(&mut self, data: DataBlock) -> bool { let rows = data.num_rows(); assert!(rows > 0); let block_idx = self.blocks.len() as u32; @@ -57,7 +61,11 @@ impl Simpler { .collect::>(); self.blocks.push(DataBlock::new(columns, rows)); + if self.blocks.len() > self.k { + self.compact_blocks() + } } + change } fn add_indices(&mut self, rows: usize, block_idx: u32) -> bool { @@ -123,20 +131,35 @@ impl Simpler { } pub fn compact_blocks(&mut self) { - let rows = self.indices.len(); - let block = DataBlock::take_blocks(&self.blocks, &self.indices, rows); - self.blocks.clear(); - self.blocks.push(block); - - for (i, (b, r, _)) in self.indices.iter_mut().enumerate() { - *b = 0; - *r = i as u32; - } + self.blocks = self + .indices + .chunks_mut(self.block_size) + .enumerate() + .map(|(i, indices)| { + let rows = indices.len(); + let block = DataBlock::take_blocks(&self.blocks, indices, rows); + + for (j, (b, r, _)) in indices.iter_mut().enumerate() { + *b = i as u32; + *r = j as u32; + } + + block + }) + .collect::>(); } pub fn memory_size(self) -> usize { self.blocks.iter().map(|b| b.memory_size()).sum() } + + pub fn take_blocks(&mut self) -> Vec { + std::mem::take(&mut self.blocks) + } + + pub fn k(&self) -> usize { + self.k + } } #[cfg(test)] @@ -147,15 +170,16 @@ mod tests { use super::*; #[test] - fn test_add_indeces() { + fn test_add_indices() { let rng = StdRng::seed_from_u64(0); let k = 5; let core = AlgoL::new(k.try_into().unwrap(), rng); let mut simpler = Simpler { columns: vec![0], + k, + block_size: 65536, blocks: Vec::new(), indices: Vec::new(), - k, core, s: usize::MAX, }; diff --git a/src/query/pipeline/core/src/processors/mod.rs b/src/query/pipeline/core/src/processors/mod.rs index c3b0e1772a341..095a9d597be61 100644 --- a/src/query/pipeline/core/src/processors/mod.rs +++ b/src/query/pipeline/core/src/processors/mod.rs @@ -39,5 +39,6 @@ pub use resize_processor::create_resize_item; pub use resize_processor::ResizeProcessor; pub use 
shuffle_processor::Exchange; pub use shuffle_processor::MergePartitionProcessor; +pub use shuffle_processor::MultiwayStrategy; pub use shuffle_processor::PartitionProcessor; pub use shuffle_processor::ShuffleProcessor; diff --git a/src/query/pipeline/core/src/processors/shuffle_processor.rs b/src/query/pipeline/core/src/processors/shuffle_processor.rs index 2b57c3b3cc333..dac49ea50b79e 100644 --- a/src/query/pipeline/core/src/processors/shuffle_processor.rs +++ b/src/query/pipeline/core/src/processors/shuffle_processor.rs @@ -345,7 +345,10 @@ impl Processor for MergePartitionProcessor { input.set_need_data(); } - if all_inputs_finished { + if all_inputs_finished + && (!matches!(T::STRATEGY, MultiwayStrategy::Custom) + || self.inputs_data.iter().all(Option::is_none)) + { self.output.finish(); return Ok(Event::Finished); } @@ -357,6 +360,11 @@ impl Processor for MergePartitionProcessor { self.output.push_data(Ok(block)); return Ok(Event::NeedConsume); } + + if all_inputs_finished && self.inputs_data.iter().all(Option::is_none) { + self.output.finish(); + return Ok(Event::Finished); + } } Ok(Event::NeedData) diff --git a/src/query/pipeline/transforms/src/processors/transforms/mod.rs b/src/query/pipeline/transforms/src/processors/transforms/mod.rs index ec6ca0faf96a0..8fe951ce2c89a 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/mod.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/mod.rs @@ -41,7 +41,7 @@ pub use transform_compact_builder::*; pub use transform_compact_no_split_builder::*; pub use transform_dummy::*; pub use transform_k_way_merge_sort::*; -pub use transform_multi_sort_merge::try_add_multi_sort_merge; +pub use transform_multi_sort_merge::*; pub use transform_pipeline_helper::TransformPipelineHelper; pub use transform_retry_async::*; pub use transform_sort_merge::sort_merge; diff --git a/src/query/pipeline/transforms/src/processors/transforms/transform_multi_sort_merge.rs b/src/query/pipeline/transforms/src/processors/transforms/transform_multi_sort_merge.rs index 573315604e414..5ab82226a9940 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/transform_multi_sort_merge.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/transform_multi_sort_merge.rs @@ -77,7 +77,7 @@ pub fn try_add_multi_sort_merge( } let output_port = OutputPort::create(); - let processor = ProcessorPtr::create(create_processor( + let processor = ProcessorPtr::create(create_multi_sort_merge_processor( inputs_port.clone(), output_port.clone(), schema, @@ -98,7 +98,7 @@ pub fn try_add_multi_sort_merge( } } -fn create_processor( +pub fn create_multi_sort_merge_processor( inputs: Vec>, output: Arc, schema: DataSchemaRef, diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index fe7f8b72356c8..c0e071ba20b56 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -33,6 +33,11 @@ use databend_common_storage::DataOperator; use databend_common_storages_fuse::TableContext; use crate::pipelines::memory_settings::MemorySettingsExt; +use crate::pipelines::processors::transforms::sort::add_range_shuffle; +use crate::pipelines::processors::transforms::sort::add_range_shuffle_merge; +use crate::pipelines::processors::transforms::sort::add_sort_simple; +use crate::pipelines::processors::transforms::sort::SortSimpleState; +use crate::pipelines::processors::transforms::TransformLimit; use 
crate::pipelines::processors::transforms::TransformSortBuilder; use crate::pipelines::PipelineBuilder; use crate::sessions::QueryContext; @@ -133,9 +138,16 @@ impl PipelineBuilder { None => { // Build for single node mode. // We build the full sort pipeline for it. - builder - .remove_order_col_at_last() - .build_full_sort_pipeline(&mut self.main_pipeline) + let k = self.settings.get_range_shuffle_sort_simple_size()?; + if k > 0 && self.main_pipeline.output_len() > 1 { + builder + .remove_order_col_at_last() + .build_range_shuffle_sort_pipeline(&mut self.main_pipeline, k) + } else { + builder + .remove_order_col_at_last() + .build_full_sort_pipeline(&mut self.main_pipeline) + } } } } @@ -148,6 +160,7 @@ pub struct SortPipelineBuilder { limit: Option, block_size: usize, remove_order_col_at_last: bool, + enable_loser_tree: bool, } impl SortPipelineBuilder { @@ -156,7 +169,9 @@ impl SortPipelineBuilder { schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, ) -> Result { - let block_size = ctx.get_settings().get_max_block_size()? as usize; + let settings = ctx.get_settings(); + let block_size = settings.get_max_block_size()? as usize; + let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?; Ok(Self { ctx, schema, @@ -164,6 +179,7 @@ impl SortPipelineBuilder { limit: None, block_size, remove_order_col_at_last: false, + enable_loser_tree, }) } @@ -195,11 +211,52 @@ impl SortPipelineBuilder { self.build_merge_sort_pipeline(pipeline, false) } - pub fn build_merge_sort_pipeline( - self, - pipeline: &mut Pipeline, - order_col_generated: bool, - ) -> Result<()> { + fn build_range_shuffle_sort_pipeline(self, pipeline: &mut Pipeline, k: usize) -> Result<()> { + let inputs = pipeline.output_len(); + let settings = self.ctx.get_settings(); + let max_threads = settings.get_max_threads()? 
as usize; + let simple = SortSimpleState::new( + inputs, + max_threads, + self.schema.clone(), + self.sort_desc.clone(), + ); + + add_sort_simple(pipeline, simple.clone(), self.sort_desc.clone(), k)?; + + // Partial sort + pipeline.add_transformer(|| { + TransformSortPartial::new( + LimitType::from_limit_rows(self.limit), + self.sort_desc.clone(), + ) + }); + + self.build_merge_sort(pipeline, false)?; + + add_range_shuffle( + pipeline, + simple.clone(), + self.sort_desc.clone(), + self.schema.clone(), + self.block_size, + self.limit, + self.remove_order_col_at_last, + self.enable_loser_tree, + )?; + + add_range_shuffle_merge(pipeline)?; + + if self.limit.is_none() { + return Ok(()); + } + + pipeline.add_transform(|input, output| { + TransformLimit::try_create(self.limit, 0, input, output).map(ProcessorPtr::create) + }) + } + + fn build_merge_sort(&self, pipeline: &mut Pipeline, order_col_generated: bool) -> Result<()> { // Merge sort let need_multi_merge = pipeline.output_len() > 1; let output_order_col = need_multi_merge || !self.remove_order_col_at_last; @@ -247,7 +304,16 @@ impl SortPipelineBuilder { .with_enable_loser_tree(enable_loser_tree); Ok(ProcessorPtr::create(builder.build()?)) - })?; + }) + } + + pub fn build_merge_sort_pipeline( + self, + pipeline: &mut Pipeline, + order_col_generated: bool, + ) -> Result<()> { + let need_multi_merge = pipeline.output_len() > 1; + self.build_merge_sort(pipeline, order_col_generated)?; if !need_multi_merge { return Ok(()); @@ -259,9 +325,8 @@ impl SortPipelineBuilder { pub fn build_multi_merge(self, pipeline: &mut Pipeline) -> Result<()> { // Multi-pipelines merge sort let settings = self.ctx.get_settings(); - let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?; - let max_threads = settings.get_max_threads()? as usize; if settings.get_enable_parallel_multi_merge_sort()? { + let max_threads = settings.get_max_threads()? as usize; add_k_way_merge_sort( pipeline, self.schema.clone(), @@ -270,7 +335,7 @@ impl SortPipelineBuilder { self.limit, self.sort_desc, self.remove_order_col_at_last, - enable_loser_tree, + self.enable_loser_tree, ) } else { try_add_multi_sort_merge( @@ -280,7 +345,7 @@ impl SortPipelineBuilder { self.limit, self.sort_desc, self.remove_order_col_at_last, - enable_loser_tree, + self.enable_loser_tree, ) } } diff --git a/src/query/service/src/pipelines/processors/transforms/mod.rs b/src/query/service/src/pipelines/processors/transforms/mod.rs index da8b50455878f..cdefac2d1a7e9 100644 --- a/src/query/service/src/pipelines/processors/transforms/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/mod.rs @@ -16,6 +16,7 @@ pub mod aggregator; mod hash_join; pub(crate) mod range_join; mod runtime_pool; +pub mod sort; mod transform_add_computed_columns; mod transform_add_const_columns; mod transform_add_internal_columns; diff --git a/src/query/expression/src/simpler/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs similarity index 83% rename from src/query/expression/src/simpler/mod.rs rename to src/query/service/src/pipelines/processors/transforms/sort/mod.rs index 1917509188299..8049256d68e67 100644 --- a/src/query/expression/src/simpler/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -12,5 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
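+// Range-shuffle sort, as wired up in builder_sort.rs (a sketch of the flow):
+//   sort_simple   - sample rows on every input and agree on partition bounds
+//   sort_wait     - hold blocks back until all inputs committed their sample
+//   sort_exchange - scatter each block to a partition by comparing its rows
+//                   against the sampled bounds
+//   sort_merge    - stitch the per-partition sorted streams back into one
+//                   output, picking partitions in bound order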
-mod reservoir_sampling; -mod simpler; +mod sort_exchange; +mod sort_merge; +mod sort_simple; +mod sort_wait; + +pub use sort_merge::*; +pub use sort_simple::*; diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs new file mode 100644 index 0000000000000..1bdc66673ebb4 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs @@ -0,0 +1,148 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Ordering; +use std::iter; +use std::marker::PhantomData; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_pipeline_core::processors::Exchange; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::Pipe; +use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_transforms::processors::sort::select_row_type; +use databend_common_pipeline_transforms::processors::sort::Rows; +use databend_common_pipeline_transforms::processors::sort::RowsTypeVisitor; +use databend_common_pipeline_transforms::sort::RowConverter; + +use super::sort_simple::SortSimpleState; +use crate::pipelines::processors::PartitionProcessor; + +pub struct SortRangeExchange { + state: Arc, + _r: PhantomData, +} + +unsafe impl Send for SortRangeExchange {} + +unsafe impl Sync for SortRangeExchange {} + +impl Exchange for SortRangeExchange { + const NAME: &'static str = "SortRange"; + fn partition(&self, data: DataBlock, n: usize) -> Result> { + let bounds = self.state.bounds().unwrap(); + debug_assert_eq!(n, self.state.partitions()); + debug_assert!(bounds.len() < n); + + if data.is_empty() { + return Ok(vec![]); + } + + if bounds.len() == 0 { + return Ok(vec![data]); + } + + let bounds = R::from_column(&bounds)?; + let rows = R::from_column(data.get_last_column())?; + + let mut i = 0; + let mut j = 0; + let mut bound = bounds.row(j); + let mut indices = Vec::new(); + while i < rows.len() { + match rows.row(i).cmp(&bound) { + Ordering::Less => indices.push(j as u32), + Ordering::Greater if j + 1 < bounds.len() => { + j += 1; + bound = bounds.row(j); + continue; + } + _ => indices.push(j as u32 + 1), + } + i += 1; + } + + DataBlock::scatter(&data, &indices, n) + } +} + +pub fn create_exchange_pipe( + inputs: usize, + partitions: usize, + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + state: Arc, +) -> Pipe { + let mut builder = Builder { + inputs, + partitions, + sort_desc, + schema, + state, + items: Vec::new(), + }; + + select_row_type(&mut builder); + + Pipe::create(inputs, inputs * partitions, builder.items) +} + +struct Builder { + inputs: usize, + partitions: usize, + sort_desc: Arc<[SortColumnDescription]>, + 
schema: DataSchemaRef, + state: Arc, + items: Vec, +} + +impl RowsTypeVisitor for Builder { + fn visit_type(&mut self) + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + { + let exchange = Arc::new(SortRangeExchange:: { + state: self.state.clone(), + _r: PhantomData, + }); + self.items = iter::repeat_with(|| { + let input = InputPort::create(); + let outputs = iter::repeat_with(OutputPort::create) + .take(self.partitions) + .collect::>(); + + PipeItem::create( + PartitionProcessor::create(input.clone(), outputs.clone(), exchange.clone()), + vec![input], + outputs, + ) + }) + .take(self.inputs) + .collect::>(); + } + + fn schema(&self) -> DataSchemaRef { + self.schema.clone() + } + + fn sort_desc(&self) -> &[SortColumnDescription] { + &self.sort_desc + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_merge.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_merge.rs new file mode 100644 index 0000000000000..569a1e54e83b3 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_merge.rs @@ -0,0 +1,62 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Exchange; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::MergePartitionProcessor; +use databend_common_pipeline_core::processors::MultiwayStrategy; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::Pipe; +use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_core::Pipeline; + +pub struct TransformSortRangeMerge {} + +impl Exchange for TransformSortRangeMerge { + const NAME: &'static str = "SortRangeMerge"; + const STRATEGY: MultiwayStrategy = MultiwayStrategy::Custom; + + fn partition(&self, block: DataBlock, _: usize) -> Result> { + Ok(vec![block]) + } + + fn multiway_pick(&self, partitions: &[Option]) -> Result { + Ok(partitions.iter().position(Option::is_some).unwrap()) + } +} + +pub fn add_range_shuffle_merge(pipeline: &mut Pipeline) -> Result<()> { + let inputs = pipeline.output_len(); + let inputs_port = (0..inputs).map(|_| InputPort::create()).collect::>(); + let output = OutputPort::create(); + + let processor = MergePartitionProcessor::create( + inputs_port.clone(), + output.clone(), + Arc::new(TransformSortRangeMerge {}), + ); + + let pipe = Pipe::create(inputs, 1, vec![PipeItem::create( + processor, + inputs_port, + vec![output], + )]); + + pipeline.add_pipe(pipe); + Ok(()) +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_simple.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_simple.rs new file mode 100644 index 0000000000000..7195769842c42 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_simple.rs @@ -0,0 +1,333 @@ +// Copyright 2021 
Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::sync::RwLock; + +use databend_common_base::base::WatchNotify; +use databend_common_exception::Result; +use databend_common_expression::simpler::Simpler; +use databend_common_expression::visitor::ValueVisitor; +use databend_common_expression::Column; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_expression::SortCompare; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::Pipe; +use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_core::Pipeline; +use databend_common_pipeline_transforms::processors::create_multi_sort_merge_processor; +use databend_common_pipeline_transforms::processors::sort::convert_rows; +use databend_common_pipeline_transforms::processors::Transform; +use databend_common_pipeline_transforms::TransformPipelineHelper; +use rand::rngs::StdRng; +use rand::SeedableRng; + +use super::sort_exchange::create_exchange_pipe; +use super::sort_wait::TransformSortSimpleWait; + +pub struct SortSimpleState { + inner: RwLock, + pub(super) done: WatchNotify, +} + +impl SortSimpleState { + pub fn partitions(&self) -> usize { + self.inner.read().unwrap().partitions + } +} + +struct StateInner { + partitions: usize, + // schema for bounds DataBlock + schema: DataSchemaRef, + // sort_desc for bounds DataBlock + sort_desc: Vec, + partial: Vec>, + bounds: Option, +} + +impl StateInner { + fn determine_bounds(&mut self) -> Result<()> { + let partial = std::mem::take(&mut self.partial) + .into_iter() + .filter_map(|b| { + let b = b.unwrap(); + if b.is_empty() { + None + } else { + Some(b) + } + }) + .collect::>(); + + if partial.is_empty() { + let bounds = convert_rows( + self.schema.clone(), + &self.sort_desc, + DataBlock::empty_with_schema(self.schema.clone()), + )?; + + self.bounds = Some(bounds); + return Ok(()); + } + + let candidates = DataBlock::concat(&partial)?; + let rows = candidates.num_rows(); + + let mut sort_compare = SortCompare::with_force_equality(self.sort_desc.clone(), rows); + + for desc in &self.sort_desc { + let array = candidates.get_by_offset(desc.offset).value.clone(); + sort_compare.visit_value(array)?; + sort_compare.increment_column_index(); + } + + let equality = sort_compare.equality_index().to_vec(); + let permutation = sort_compare.take_permutation(); + + let step = permutation.len() as f64 / self.partitions as f64; + let mut target = step; + let mut bounds = Vec::with_capacity(self.partitions - 1); + let mut equals = true; + for (i, (&pos, eq)) in permutation.iter().zip(equality).enumerate() { + if bounds.len() >= self.partitions - 1 { + break; + } + if equals && eq == 0 { + equals = false + } + if i as f64 >= target && (!equals || i != 0) { + 
bounds.push(pos); + target += step; + equals = true + } + } + + let bounds = convert_rows( + self.schema.clone(), + &self.sort_desc, + candidates.take(&bounds)?, + )?; + self.bounds = Some(bounds); + Ok(()) + } +} + +impl SortSimpleState { + pub fn new( + inputs: usize, + partitions: usize, + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + ) -> Arc { + let columns = sort_desc.iter().map(|desc| desc.offset).collect::>(); + let schema = schema.project(&columns).into(); + let sort_desc = sort_desc + .iter() + .enumerate() + .map(|(i, desc)| SortColumnDescription { + offset: i, + asc: desc.asc, + nulls_first: desc.nulls_first, + }) + .collect::>(); + Arc::new(SortSimpleState { + inner: RwLock::new(StateInner { + partitions, + schema, + sort_desc, + partial: vec![None; inputs], + bounds: None, + }), + done: WatchNotify::new(), + }) + } + + pub fn bounds(&self) -> Option { + if let Some(bounds) = &self.inner.read().unwrap().bounds { + return Some(bounds.clone()); + } + None + } + + pub fn commit_simple(&self, id: usize, block: Option) -> Result { + let mut inner = self.inner.write().unwrap(); + + let block = block.unwrap_or(DataBlock::empty_with_schema(inner.schema.clone())); + let x = inner.partial[id].replace(block); + debug_assert!(x.is_none()); + let done = inner.partial.iter().all(|x| x.is_some()); + if done { + inner.determine_bounds()?; + self.done.notify_waiters(); + } + Ok(done) + } +} + +pub struct TransformSortSimple { + id: usize, + simpler: Simpler, + state: Arc, +} + +unsafe impl Send for TransformSortSimple {} + +impl TransformSortSimple { + fn new(id: usize, k: usize, columns: Vec, state: Arc) -> Self { + let rng = StdRng::from_rng(rand::thread_rng()).unwrap(); + let simpler = Simpler::new(columns, 65536, k, rng); + TransformSortSimple { id, simpler, state } + } +} + +impl Transform for TransformSortSimple { + const NAME: &'static str = "TransformSortSimple"; + + fn transform(&mut self, data: DataBlock) -> Result { + self.simpler.add_block(data.clone()); + Ok(data) + } + + fn on_finish(&mut self) -> Result<()> { + self.simpler.compact_blocks(); + let mut simple = self.simpler.take_blocks(); + assert!(simple.len() <= 1); // Unlikely to sample rows greater than 65536 + self.state.commit_simple( + self.id, + if simple.is_empty() { + None + } else { + Some(simple.remove(0)) + }, + )?; + Ok(()) + } +} + +pub fn add_sort_simple( + pipeline: &mut Pipeline, + state: Arc, + sort_desc: Arc<[SortColumnDescription]>, + k: usize, +) -> Result<()> { + use std::sync::atomic; + let i = atomic::AtomicUsize::new(0); + let columns = sort_desc.iter().map(|desc| desc.offset).collect::>(); + pipeline.add_transformer(|| { + let id = i.fetch_add(1, atomic::Ordering::AcqRel); + TransformSortSimple::new(id, k, columns.clone(), state.clone()) + }); + Ok(()) +} + +pub fn add_range_shuffle( + pipeline: &mut Pipeline, + state: Arc, + sort_desc: Arc<[SortColumnDescription]>, + schema: DataSchemaRef, + block_size: usize, + limit: Option, + remove_order_col: bool, + enable_loser_tree: bool, +) -> Result<()> { + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformSortSimpleWait::new(input, output, state.clone()), + ))) + })?; + + // partition data block + let input_len = pipeline.output_len(); + let n = state.partitions(); + let exchange = create_exchange_pipe(input_len, n, schema.clone(), sort_desc.clone(), state); + pipeline.add_pipe(exchange); + + let reorder_edges = (0..input_len * n) + .map(|index| (index % n) * input_len + (index / n)) + .collect::>(); 
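    // Transpose the (input_len x n) grid of exchange output ports so the
    // j-th output of every partitioner feeds the j-th merge processor; for
    // input_len = 2 and n = 3 this yields [0, 2, 4, 1, 3, 5].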
+ + pipeline.reorder_inputs(reorder_edges); + + let mut items = Vec::with_capacity(input_len); + for _ in 0..n { + let output = OutputPort::create(); + let inputs: Vec<_> = (0..input_len).map(|_| InputPort::create()).collect(); + + let proc = create_multi_sort_merge_processor( + inputs.clone(), + output.clone(), + schema.clone(), + block_size, + limit, + sort_desc.clone(), + remove_order_col, + enable_loser_tree, + )?; + + items.push(PipeItem::create(ProcessorPtr::create(proc), inputs, vec![ + output, + ])); + } + + // merge partition + pipeline.add_pipe(Pipe::create(input_len * n, n, items)); + + Ok(()) +} + +#[cfg(test)] +mod tests { + use databend_common_expression::types::ArgType; + use databend_common_expression::types::Int32Type; + use databend_common_expression::DataField; + use databend_common_expression::DataSchemaRefExt; + use databend_common_expression::FromData; + + use super::*; + + #[test] + fn test_determine_bounds() { + let partial = vec![vec![1, 2, 3, 4], vec![4, 5, 6, 7], vec![0, 2, 4, 5]] + .into_iter() + .map(|data| { + Some(DataBlock::new_from_columns(vec![Int32Type::from_data( + data, + )])) + }) + .collect::>(); + + let schema = DataSchemaRefExt::create(vec![DataField::new("a", Int32Type::data_type())]); + let mut inner = StateInner { + partitions: 3, + schema, + sort_desc: vec![SortColumnDescription { + offset: 0, + asc: true, + nulls_first: false, + }], + partial, + bounds: None, + }; + + inner.determine_bounds().unwrap(); + + // 0 1 2 2 | 3 4 4 4 | 5 5 6 7 + assert_eq!(Int32Type::from_data(vec![3, 5]), inner.bounds.unwrap()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs new file mode 100644 index 0000000000000..2fc6ec3cecea9 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs @@ -0,0 +1,113 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
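+// A barrier in front of the range exchange: each `TransformSortSimpleWait`
+// buffers its input blocks and releases them only after `SortSimpleState`
+// has the final bounds (signalled via the `WatchNotify`), so no block is
+// scattered against bounds that are still being computed.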
+ +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; + +use super::SortSimpleState; + +pub struct TransformSortSimpleWait { + input: Arc, + output: Arc, + output_data: VecDeque, + blocks: Vec, + state: Arc, +} + +impl TransformSortSimpleWait { + pub fn new( + input: Arc, + output: Arc, + state: Arc, + ) -> Self { + Self { + input, + output, + output_data: VecDeque::new(), + blocks: Vec::new(), + state, + } + } +} + +#[async_trait::async_trait] +impl Processor for TransformSortSimpleWait { + fn name(&self) -> String { + "TransformSortSimpleWait".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.has_data() { + self.blocks.push(self.input.pull_data().unwrap()?); + self.input.set_need_data(); + return Ok(Event::NeedData); + } + + if self.input.is_finished() { + if self.blocks.is_empty() { + self.output.finish(); + return Ok(Event::Finished); + } + + return if self.state.done.has_notified() { + Ok(Event::Sync) + } else { + Ok(Event::Async) + }; + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + debug_assert!(!self.blocks.is_empty()); + self.output_data = VecDeque::from(std::mem::take(&mut self.blocks)); + Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + self.state.done.notified().await; + self.output_data = VecDeque::from(std::mem::take(&mut self.blocks)); + Ok(()) + } +} diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index f70a446a63a52..cfb981b79580b 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -605,6 +605,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(4 * 1024..=u64::MAX)), }), + ("range_shuffle_sort_simple_size", DefaultSettingValue { + value: UserSettingValue::UInt64(20), + desc: "Sets the simple size per partition used for range shuffle sorting, 0 to disable range shuffle sorting.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=500)), + }), ("group_by_shuffle_mode", DefaultSettingValue { value: UserSettingValue::String(String::from("before_merge")), desc: "Group by shuffle mode, 'before_partial' is more balanced, but more data needs to exchange.", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 7deb0fd812d9f..cf35a2f4a8eda 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -486,6 +486,10 @@ impl Settings { Ok(self.try_get_u64("sort_spilling_memory_ratio")? as usize) } + pub fn get_range_shuffle_sort_simple_size(&self) -> Result { + Ok(self.try_get_u64("range_shuffle_sort_simple_size")? 
as usize) + } + pub fn get_group_by_shuffle_mode(&self) -> Result { self.try_get_string("group_by_shuffle_mode") } From 809535c9fef0141ff9c428b8835b2c133c18527b Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 15 Apr 2025 17:25:34 +0800 Subject: [PATCH 04/33] rename --- src/query/expression/src/lib.rs | 1 - .../src/simpler/reservoir_sampling.rs | 91 ------------------- .../src/pipelines/builders/builder_sort.rs | 10 +- .../processors/transforms/sort/mod.rs | 4 +- .../transforms/sort/sort_exchange.rs | 8 +- .../sort/{sort_simple.rs => sort_sample.rs} | 54 +++++------ .../processors/transforms/sort/sort_wait.rs | 12 +-- 7 files changed, 44 insertions(+), 136 deletions(-) delete mode 100644 src/query/expression/src/simpler/reservoir_sampling.rs rename src/query/service/src/pipelines/processors/transforms/sort/{sort_simple.rs => sort_sample.rs} (89%) diff --git a/src/query/expression/src/lib.rs b/src/query/expression/src/lib.rs index bc576c3d633e3..f7b6824f4d153 100755 --- a/src/query/expression/src/lib.rs +++ b/src/query/expression/src/lib.rs @@ -64,7 +64,6 @@ mod register_vectorize; pub mod row; pub mod sampler; pub mod schema; -pub mod simpler; pub mod type_check; pub mod types; pub mod utils; diff --git a/src/query/expression/src/simpler/reservoir_sampling.rs b/src/query/expression/src/simpler/reservoir_sampling.rs deleted file mode 100644 index 2b9cd32d04d10..0000000000000 --- a/src/query/expression/src/simpler/reservoir_sampling.rs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::num::NonZeroUsize; - -use rand::Rng; - -/// An implementation of Algorithm `L` (https://en.wikipedia.org/wiki/Reservoir_sampling#An_optimal_algorithm) -pub struct AlgoL { - k: usize, - w: f64, - - r: R, -} - -impl AlgoL { - pub fn new(k: NonZeroUsize, r: R) -> Self { - let mut al = Self { - k: k.into(), - w: 1.0, - r, - }; - al.update_w(); - al - } - - pub fn search(&mut self) -> usize { - let s = (self.rng().log2() / (1.0 - self.w).log2()).floor() + 1.0; - if s.is_normal() { - s as usize - } else { - usize::MAX - } - } - - pub fn pos(&mut self) -> usize { - self.r.sample(rand::distributions::Uniform::new(0, self.k)) - } - - pub fn update_w(&mut self) { - self.w *= (self.rng().log2() / self.k as f64).exp2(); // rng ^ (1/k) - } - - fn rng(&mut self) -> f64 { - self.r.sample(rand::distributions::Open01) - } -} - -#[cfg(test)] -mod tests { - use rand::rngs::StdRng; - use rand::SeedableRng; - - use super::*; - - #[test] - fn test_algo_l() { - let rng = StdRng::seed_from_u64(0); - let mut sample = vec![0_u64; 10]; - - let mut al = AlgoL::new(10.try_into().unwrap(), rng); - for (i, v) in sample.iter_mut().enumerate() { - *v = i as u64 - } - - let mut i = 9; - loop { - i += al.search(); - if i < 100 { - sample[al.pos()] = i as u64; - al.update_w() - } else { - break; - } - } - - let want: Vec = vec![69, 49, 53, 83, 4, 72, 88, 38, 45, 27]; - assert_eq!(want, sample) - } -} diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index c0e071ba20b56..2d0a4b1fae4b5 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -35,8 +35,8 @@ use databend_common_storages_fuse::TableContext; use crate::pipelines::memory_settings::MemorySettingsExt; use crate::pipelines::processors::transforms::sort::add_range_shuffle; use crate::pipelines::processors::transforms::sort::add_range_shuffle_merge; -use crate::pipelines::processors::transforms::sort::add_sort_simple; -use crate::pipelines::processors::transforms::sort::SortSimpleState; +use crate::pipelines::processors::transforms::sort::add_sort_sample; +use crate::pipelines::processors::transforms::sort::SortSampleState; use crate::pipelines::processors::transforms::TransformLimit; use crate::pipelines::processors::transforms::TransformSortBuilder; use crate::pipelines::PipelineBuilder; @@ -215,14 +215,14 @@ impl SortPipelineBuilder { let inputs = pipeline.output_len(); let settings = self.ctx.get_settings(); let max_threads = settings.get_max_threads()? 
as usize; - let simple = SortSimpleState::new( + let sample = SortSampleState::new( inputs, max_threads, self.schema.clone(), self.sort_desc.clone(), ); - add_sort_simple(pipeline, simple.clone(), self.sort_desc.clone(), k)?; + add_sort_sample(pipeline, sample.clone(), self.sort_desc.clone(), k)?; // Partial sort pipeline.add_transformer(|| { @@ -236,7 +236,7 @@ impl SortPipelineBuilder { add_range_shuffle( pipeline, - simple.clone(), + sample.clone(), self.sort_desc.clone(), self.schema.clone(), self.block_size, diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index 8049256d68e67..c237beb982d8e 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -14,8 +14,8 @@ mod sort_exchange; mod sort_merge; -mod sort_simple; +mod sort_sample; mod sort_wait; pub use sort_merge::*; -pub use sort_simple::*; +pub use sort_sample::*; diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs index 1bdc66673ebb4..a528b5f3352f0 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs @@ -31,11 +31,11 @@ use databend_common_pipeline_transforms::processors::sort::Rows; use databend_common_pipeline_transforms::processors::sort::RowsTypeVisitor; use databend_common_pipeline_transforms::sort::RowConverter; -use super::sort_simple::SortSimpleState; +use super::sort_sample::SortSampleState; use crate::pipelines::processors::PartitionProcessor; pub struct SortRangeExchange { - state: Arc, + state: Arc, _r: PhantomData, } @@ -87,7 +87,7 @@ pub fn create_exchange_pipe( partitions: usize, schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, - state: Arc, + state: Arc, ) -> Pipe { let mut builder = Builder { inputs, @@ -108,7 +108,7 @@ struct Builder { partitions: usize, sort_desc: Arc<[SortColumnDescription]>, schema: DataSchemaRef, - state: Arc, + state: Arc, items: Vec, } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_simple.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs similarity index 89% rename from src/query/service/src/pipelines/processors/transforms/sort/sort_simple.rs rename to src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs index 7195769842c42..6f5adb69a3c84 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_simple.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs @@ -17,7 +17,7 @@ use std::sync::RwLock; use databend_common_base::base::WatchNotify; use databend_common_exception::Result; -use databend_common_expression::simpler::Simpler; +use databend_common_expression::sampler::FixedSizeSampler; use databend_common_expression::visitor::ValueVisitor; use databend_common_expression::Column; use databend_common_expression::DataBlock; @@ -38,14 +38,14 @@ use rand::rngs::StdRng; use rand::SeedableRng; use super::sort_exchange::create_exchange_pipe; -use super::sort_wait::TransformSortSimpleWait; +use super::sort_wait::TransformSortSampleWait; -pub struct SortSimpleState { +pub struct SortSampleState { inner: RwLock, pub(super) done: WatchNotify, } -impl SortSimpleState { +impl SortSampleState { pub fn partitions(&self) -> usize { 
self.inner.read().unwrap().partitions } @@ -128,13 +128,13 @@ impl StateInner { } } -impl SortSimpleState { +impl SortSampleState { pub fn new( inputs: usize, partitions: usize, schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, - ) -> Arc { + ) -> Arc { let columns = sort_desc.iter().map(|desc| desc.offset).collect::>(); let schema = schema.project(&columns).into(); let sort_desc = sort_desc @@ -146,7 +146,7 @@ impl SortSimpleState { nulls_first: desc.nulls_first, }) .collect::>(); - Arc::new(SortSimpleState { + Arc::new(SortSampleState { inner: RwLock::new(StateInner { partitions, schema, @@ -165,7 +165,7 @@ impl SortSimpleState { None } - pub fn commit_simple(&self, id: usize, block: Option) -> Result { + pub fn commit_sample(&self, id: usize, block: Option) -> Result { let mut inner = self.inner.write().unwrap(); let block = block.unwrap_or(DataBlock::empty_with_schema(inner.schema.clone())); @@ -180,35 +180,35 @@ impl SortSimpleState { } } -pub struct TransformSortSimple { +pub struct TransformSortSample { id: usize, - simpler: Simpler, - state: Arc, + sampler: FixedSizeSampler, + state: Arc, } -unsafe impl Send for TransformSortSimple {} +unsafe impl Send for TransformSortSample {} -impl TransformSortSimple { - fn new(id: usize, k: usize, columns: Vec, state: Arc) -> Self { +impl TransformSortSample { + fn new(id: usize, k: usize, columns: Vec, state: Arc) -> Self { let rng = StdRng::from_rng(rand::thread_rng()).unwrap(); - let simpler = Simpler::new(columns, 65536, k, rng); - TransformSortSimple { id, simpler, state } + let sampler = FixedSizeSampler::new(columns, 65536, k, rng); + TransformSortSample { id, sampler, state } } } -impl Transform for TransformSortSimple { - const NAME: &'static str = "TransformSortSimple"; +impl Transform for TransformSortSample { + const NAME: &'static str = "TransformSortSample"; fn transform(&mut self, data: DataBlock) -> Result { - self.simpler.add_block(data.clone()); + self.sampler.add_block(data.clone()); Ok(data) } fn on_finish(&mut self) -> Result<()> { - self.simpler.compact_blocks(); - let mut simple = self.simpler.take_blocks(); + self.sampler.compact_blocks(); + let mut simple = self.sampler.take_blocks(); assert!(simple.len() <= 1); // Unlikely to sample rows greater than 65536 - self.state.commit_simple( + self.state.commit_sample( self.id, if simple.is_empty() { None @@ -220,9 +220,9 @@ impl Transform for TransformSortSimple { } } -pub fn add_sort_simple( +pub fn add_sort_sample( pipeline: &mut Pipeline, - state: Arc, + state: Arc, sort_desc: Arc<[SortColumnDescription]>, k: usize, ) -> Result<()> { @@ -231,14 +231,14 @@ pub fn add_sort_simple( let columns = sort_desc.iter().map(|desc| desc.offset).collect::>(); pipeline.add_transformer(|| { let id = i.fetch_add(1, atomic::Ordering::AcqRel); - TransformSortSimple::new(id, k, columns.clone(), state.clone()) + TransformSortSample::new(id, k, columns.clone(), state.clone()) }); Ok(()) } pub fn add_range_shuffle( pipeline: &mut Pipeline, - state: Arc, + state: Arc, sort_desc: Arc<[SortColumnDescription]>, schema: DataSchemaRef, block_size: usize, @@ -248,7 +248,7 @@ pub fn add_range_shuffle( ) -> Result<()> { pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(Box::new( - TransformSortSimpleWait::new(input, output, state.clone()), + TransformSortSampleWait::new(input, output, state.clone()), ))) })?; diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs index 
2fc6ec3cecea9..3f1543d6b2760 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs @@ -23,21 +23,21 @@ use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; -use super::SortSimpleState; +use super::SortSampleState; -pub struct TransformSortSimpleWait { +pub struct TransformSortSampleWait { input: Arc, output: Arc, output_data: VecDeque, blocks: Vec, - state: Arc, + state: Arc, } -impl TransformSortSimpleWait { +impl TransformSortSampleWait { pub fn new( input: Arc, output: Arc, - state: Arc, + state: Arc, ) -> Self { Self { input, @@ -50,7 +50,7 @@ impl TransformSortSimpleWait { } #[async_trait::async_trait] -impl Processor for TransformSortSimpleWait { +impl Processor for TransformSortSampleWait { fn name(&self) -> String { "TransformSortSimpleWait".to_string() } From d6dd7b96534973785facc1d86a07134950a05cf7 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 18 Apr 2025 19:12:24 +0800 Subject: [PATCH 05/33] fix --- src/query/service/src/pipelines/builders/builder_sort.rs | 2 +- .../pipelines/processors/transforms/transform_merge_sort.rs | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index 2d0a4b1fae4b5..a9ac35ea53dce 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -285,7 +285,7 @@ impl SortPipelineBuilder { use_parquet: settings.get_spilling_file_format()?.is_parquet(), }; let op = DataOperator::instance().spill_operator(); - Arc::new(Spiller::create(self.ctx.clone(), op, config.clone())?) + Arc::new(Spiller::create(self.ctx.clone(), op, config)?) }; pipeline.add_transform(|input, output| { diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs index 413e7fa1c3fd7..be189a8602184 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs @@ -363,6 +363,7 @@ where return match self.state { State::Collect => { if self.check_spill() { + // delay the handle of input until the next call. 
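+                        // Returning Async hands control to async_process, which spills the
+                        // data collected so far; the pending block stays in the input port
+                        // and is pulled on a later Sync round.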
Ok(Event::Async) } else { Ok(Event::Sync) @@ -438,7 +439,7 @@ where self.prepare_spill_limit()?; } Inner::Collect(input_data) => { - debug_assert!(!finished); + assert!(!finished); let input_data = std::mem::take(input_data); self.prepare_spill(input_data); } @@ -470,7 +471,7 @@ where let Inner::Spill(input_data, spill_sort) = &mut self.inner else { unreachable!() }; - debug_assert!(input_data.is_empty()); + assert!(input_data.is_empty()); let (block, finish) = spill_sort.on_restore().await?; self.output_data.extend(block); if finish { From 7f5e43b2d55aa8c6312be306c3121aa92647f943 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 21 Apr 2025 14:15:29 +0800 Subject: [PATCH 06/33] trait Spill --- .../transforms/transform_merge_sort.rs | 18 + .../transform_merge_sort/collect.rs | 377 ++++++++++++++++++ .../transform_merge_sort/sort_spill.rs | 156 +++++--- 3 files changed, 504 insertions(+), 47 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs index be189a8602184..2e7ffe1f650dc 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs @@ -20,7 +20,10 @@ use std::sync::atomic::AtomicBool; use std::sync::Arc; use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; use databend_common_expression::BlockEntry; +use databend_common_expression::BlockMetaInfo; +use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::SortColumnDescription; @@ -36,6 +39,7 @@ use databend_common_pipeline_transforms::MemorySettings; use databend_common_pipeline_transforms::MergeSort; use databend_common_pipeline_transforms::SortSpillParams; use databend_common_pipeline_transforms::TransformSortMergeLimit; +use sort_spill::SpillableBlock; use crate::spillers::Spiller; @@ -47,6 +51,8 @@ use sort_spill::SortSpill; mod builder; pub use builder::TransformSortBuilder; +mod collect; + #[derive(Debug)] enum State { /// This state means the processor will collect incoming blocks. @@ -65,6 +71,18 @@ struct Base { limit: Option, } +#[derive(Debug)] +pub struct SortCollectedMeta { + batch_rows: usize, + bounds: Vec, + blocks: Vec>, +} + +local_block_meta_serde!(SortCollectedMeta); + +#[typetag::serde(name = "sort_collected")] +impl BlockMetaInfo for SortCollectedMeta {} + enum Inner { Collect(Vec), Limit(TransformSortMergeLimit), diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs new file mode 100644 index 0000000000000..121f3a258d51c --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs @@ -0,0 +1,377 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::atomic; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_expression::Value; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_transforms::processors::sort::algorithm::SortAlgorithm; +use databend_common_pipeline_transforms::sort::RowConverter; +use databend_common_pipeline_transforms::sort::Rows; +use databend_common_pipeline_transforms::MemorySettings; +use databend_common_pipeline_transforms::MergeSort; +use databend_common_pipeline_transforms::SortSpillParams; +use databend_common_pipeline_transforms::TransformSortMergeLimit; + +use super::sort_spill::SortSpill; +use super::Base; +use crate::spillers::Spiller; + +enum Inner { + Collect(Vec), + Limit(TransformSortMergeLimit), + Spill(Vec, SortSpill), + None, +} + +pub struct TransformSortCollect { + name: &'static str, + input: Arc, + output: Arc, + output_data: Option, + + row_converter: C, + sort_desc: Arc<[SortColumnDescription]>, + /// If this transform is after an Exchange transform, + /// it means it will compact the data from cluster nodes. + /// And the order column is already generated in each cluster node, + /// so we don't need to generate the order column again. 
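+    /// When this is true, `collect_block` skips the conversion and, on the
+    /// limit path, rebuilds the rows directly from that last order column.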
+ order_col_generated: bool, + + base: Base, + inner: Inner, + + aborting: AtomicBool, + + memory_settings: MemorySettings, +} + +impl TransformSortCollect +where + A: SortAlgorithm, + C: RowConverter, +{ + #[allow(clippy::too_many_arguments)] + fn new( + input: Arc, + output: Arc, + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + limit: Option<(usize, bool)>, + spiller: Arc, + order_col_generated: bool, + memory_settings: MemorySettings, + ) -> Result { + let sort_row_offset = schema.fields().len() - 1; + let row_converter = C::create(&sort_desc, schema.clone())?; + let (name, inner, limit) = match limit { + Some((limit, true)) => ( + "TransformSortMergeCollectLimit", + Inner::Limit(TransformSortMergeLimit::create(max_block_size, limit)), + Some(limit), + ), + Some((limit, false)) => ( + "TransformSortMergeCollect", + Inner::Collect(vec![]), + Some(limit), + ), + None => ("TransformSortMergeCollect", Inner::Collect(vec![]), None), + }; + Ok(Self { + input, + output, + name, + row_converter, + output_data: None, + sort_desc, + order_col_generated, + base: Base { + schema, + spiller, + sort_row_offset, + limit, + }, + inner, + aborting: AtomicBool::new(false), + memory_settings, + }) + } + + fn generate_order_column(&self, mut block: DataBlock) -> Result<(A::Rows, DataBlock)> { + let order_by_cols = self + .sort_desc + .iter() + .map(|desc| block.get_by_offset(desc.offset).clone()) + .collect::>(); + let rows = self + .row_converter + .convert(&order_by_cols, block.num_rows())?; + let order_col = rows.to_column(); + block.add_column(BlockEntry { + data_type: order_col.data_type(), + value: Value::Column(order_col), + }); + Ok((rows, block)) + } + + fn limit_trans_to_spill(&mut self) -> Result<()> { + let Inner::Limit(merger) = &self.inner else { + unreachable!() + }; + assert!(merger.num_rows() > 0); + let params = self.determine_params(merger.num_bytes(), merger.num_rows()); + let Inner::Limit(merger) = &mut self.inner else { + unreachable!() + }; + let blocks = merger.prepare_spill(params.batch_rows)?; + let spill_sort = SortSpill::new(self.base.clone(), params); + self.inner = Inner::Spill(blocks, spill_sort); + Ok(()) + } + + fn collect_trans_to_spill(&mut self, input_data: Vec) { + let (num_rows, num_bytes) = input_data + .iter() + .map(|block| (block.num_rows(), block.memory_size())) + .fold((0, 0), |(acc_rows, acc_bytes), (rows, bytes)| { + (acc_rows + rows, acc_bytes + bytes) + }); + assert!(num_rows > 0); + let params = self.determine_params(num_bytes, num_rows); + let spill_sort = SortSpill::new(self.base.clone(), params); + self.inner = Inner::Spill(input_data, spill_sort); + } + + fn trans_to_spill(&mut self) -> Result<()> { + match &mut self.inner { + Inner::Limit(_) => self.limit_trans_to_spill(), + Inner::Collect(input_data) => { + let input_data = std::mem::take(input_data); + self.collect_trans_to_spill(input_data); + Ok(()) + } + Inner::Spill(_, _) => Ok(()), + Inner::None => unreachable!(), + } + } + + fn determine_params(&self, bytes: usize, rows: usize) -> SortSpillParams { + // We use the first memory calculation to estimate the batch size and the number of merge. 
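+        // Worked example (hypothetical numbers): with spill_unit_size = 128MiB,
+        // a first buffer of bytes = 512MiB holding rows = 4_000_000 gives
+        // num_merge = ceil(512 / 128) = 4 and batch_rows = ceil(4_000_000 / 4) = 1_000_000,
+        // so each sorted batch spills at roughly one memory unit.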
+ let unit_size = self.memory_settings.spill_unit_size; + let num_merge = bytes.div_ceil(unit_size).max(2); + let batch_rows = rows.div_ceil(num_merge); + log::info!("determine sort spill params, buffer_bytes: {bytes}, buffer_rows: {rows}, spill_unit_size: {unit_size}, batch_rows: {batch_rows}, batch_num_merge {num_merge}"); + SortSpillParams { + batch_rows, + num_merge, + } + } + + fn collect_block(&mut self, block: DataBlock) -> Result<()> { + if self.order_col_generated { + return match &mut self.inner { + Inner::Limit(limit_sort) => { + let rows = A::Rows::from_column(block.get_last_column())?; + limit_sort.add_block(block, rows) + } + Inner::Collect(input_data) | Inner::Spill(input_data, _) => { + input_data.push(block); + Ok(()) + } + _ => unreachable!(), + }; + } + + let (rows, block) = self.generate_order_column(block)?; + match &mut self.inner { + Inner::Limit(limit_sort) => limit_sort.add_block(block, rows), + Inner::Collect(input_data) | Inner::Spill(input_data, _) => { + input_data.push(block); + Ok(()) + } + _ => unreachable!(), + } + } + + fn input_rows(&self) -> usize { + match &self.inner { + Inner::Collect(input_data) | Inner::Spill(input_data, _) => input_data.in_memory_rows(), + _ => 0, + } + } + + fn check_spill(&self) -> bool { + if !self.memory_settings.check_spill() { + return false; + } + + match &self.inner { + Inner::Limit(limit_sort) => { + limit_sort.num_bytes() > self.memory_settings.spill_unit_size * 2 + } + Inner::Collect(input_data) => { + input_data.iter().map(|b| b.memory_size()).sum::() + > self.memory_settings.spill_unit_size * 2 + } + Inner::Spill(input_data, sort_spill) => { + input_data.in_memory_rows() > sort_spill.max_rows() + } + _ => unreachable!(), + } + } + + fn create_output(&mut self) -> Result<()> { + let Inner::Spill(input_data, spill_sort) = std::mem::replace(&mut self.inner, Inner::None) + else { + unreachable!() + }; + assert!(input_data.is_empty()); + + let meta = spill_sort.dump_collect()?; + self.output_data = Some(DataBlock::empty_with_meta(Box::new(meta))); + Ok(()) + } +} + +trait MemoryRows { + fn in_memory_rows(&self) -> usize; +} + +impl MemoryRows for Vec { + fn in_memory_rows(&self) -> usize { + self.iter().map(|s| s.num_rows()).sum::() + } +} + +#[async_trait::async_trait] +impl Processor for TransformSortCollect +where + A: SortAlgorithm + 'static, + A::Rows: 'static, + C: RowConverter + Send + 'static, +{ + fn name(&self) -> String { + self.name.to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(block) = self.output_data.take() { + assert!(self.input.is_finished()); + self.output.push_data(Ok(block)); + self.output.finish(); + return Ok(Event::Finished); + } + + if self.input.has_data() { + return if self.check_spill() { + // delay the handle of input until the next call. 
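+                // The block stays in the input port while async_process spills the
+                // data already collected; it is pulled once event() returns Sync again.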
+ Ok(Event::Async) + } else { + Ok(Event::Sync) + }; + } + + if self.input.is_finished() { + return match &self.inner { + Inner::Limit(merger) => { + if merger.num_rows() == 0 { + self.output.finish(); + Ok(Event::Finished) + } else { + Ok(Event::Async) + } + } + Inner::Collect(input_data) => { + if input_data.is_empty() { + self.output.finish(); + Ok(Event::Finished) + } else { + Ok(Event::Async) + } + } + Inner::Spill(_, _) => Ok(Event::Async), + Inner::None => unreachable!(), + }; + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + if let Some(block) = self.input.pull_data().transpose()? { + self.input.set_need_data(); + if !block.is_empty() { + self.collect_block(block)?; + } + } + Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + let finished = self.input.is_finished(); + self.trans_to_spill()?; + + let input = self.input_rows(); + let Inner::Spill(input_data, spill_sort) = &mut self.inner else { + unreachable!() + }; + let memory_rows = spill_sort.collect_memory_rows(); + let max = spill_sort.max_rows(); + + if memory_rows > 0 && memory_rows + input > max { + spill_sort + .subsequent_spill_last(memory_rows + input - max) + .await?; + } + if input > max || finished && input > 0 { + spill_sort.sort_input_data(std::mem::take(input_data), &self.aborting)?; + } + if finished { + self.create_output() + } else { + Ok(()) + } + } + + fn interrupt(&self) { + self.aborting.store(true, atomic::Ordering::Release); + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs index 629f0761121bd..7f8911325bd08 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs @@ -40,6 +40,7 @@ use rand::SeedableRng; use super::Base; use super::MemoryRows; +use super::SortCollectedMeta; use crate::spillers::Location; use crate::spillers::Spiller; @@ -56,7 +57,7 @@ enum Step { struct StepCollect { params: SortSpillParams, sampler: FixedRateSampler, - streams: Vec>, + streams: Vec>>, } struct StepSort { @@ -66,10 +67,11 @@ struct StepSort { bounds: Vec, cur_bound: Option, - subsequent: Vec>, - current: Vec>, + subsequent: Vec>>, + current: Vec>>, - output_merger: Option>>, + #[allow(clippy::type_complexity)] + output_merger: Option>>>, } impl SortSpill @@ -155,6 +157,34 @@ where A: SortAlgorithm pub fn format_memory_usage(&self) -> FmtMemoryUsage<'_, A> { FmtMemoryUsage(self) } + + pub fn dump_collect(self) -> Result { + let Self { + base, + step: Step::Collect(mut collect), + } = self + else { + unreachable!() + }; + + let StepSort { + params, + bounds, + subsequent, + .. 
+ } = collect.next_step(&base)?; + + let blocks = subsequent + .into_iter() + .map(|stream| Vec::from(stream.blocks).into_boxed_slice()) + .collect(); + + Ok(SortCollectedMeta { + batch_rows: params.batch_rows, + blocks, + bounds, + }) + } } impl StepCollect { @@ -443,8 +473,8 @@ impl Base { &self, blocks: VecDeque, bound: Option, - ) -> BoundBlockStream { - BoundBlockStream:: { + ) -> BoundBlockStream> { + BoundBlockStream { blocks, bound, sort_row_offset: self.sort_row_offset, @@ -511,7 +541,7 @@ impl Base { } } -impl MemoryRows for Vec> { +impl MemoryRows for Vec> { fn in_memory_rows(&self) -> usize { self.iter().map(|s| s.in_memory_rows()).sum::() } @@ -548,7 +578,7 @@ impl fmt::Debug for FmtMemoryUsage<'_, A> { } } -struct SpillableBlock { +pub struct SpillableBlock { data: Option, rows: usize, location: Option, @@ -586,10 +616,10 @@ impl SpillableBlock { R::from_column(&self.domain).unwrap() } - async fn spill(&mut self, spiller: &Spiller) -> Result<()> { + async fn spill(&mut self, spiller: &impl Spill) -> Result<()> { let data = self.data.take().unwrap(); if self.location.is_none() { - let location = spiller.spill(vec![data]).await?; + let location = spiller.spill(data).await?; self.location = Some(location); } Ok(()) @@ -615,15 +645,32 @@ fn sort_column(data: &DataBlock, sort_row_offset: usize) -> &Column { .unwrap() } +#[async_trait::async_trait] +pub trait Spill: Send { + async fn spill(&self, data_block: DataBlock) -> Result; + async fn resotre(&self, location: &Location) -> Result; +} + +#[async_trait::async_trait] +impl Spill for Arc { + async fn spill(&self, data_block: DataBlock) -> Result { + self.as_ref().spill(vec![data_block]).await + } + + async fn resotre(&self, location: &Location) -> Result { + self.read_spilled_file(location).await + } +} + /// BoundBlockStream is a stream of blocks that are cutoff less or equal than bound. 
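+/// For example, given ascending blocks [1, 2, 3] and [4, 5, 6] with bound = 5, the
+/// stream yields [1, 2, 3] and then [4, 5]; the remaining [6] is held back until a
+/// larger bound is set.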
-struct BoundBlockStream { +struct BoundBlockStream { blocks: VecDeque, bound: Option, sort_row_offset: usize, - spiller: Arc, + spiller: S, } -impl Debug for BoundBlockStream { +impl Debug for BoundBlockStream { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("BoundBlockStream") .field("blocks", &self.blocks) @@ -634,7 +681,7 @@ impl Debug for BoundBlockStream { } #[async_trait::async_trait] -impl SortedStream for BoundBlockStream { +impl SortedStream for BoundBlockStream { async fn async_next(&mut self) -> Result<(Option<(DataBlock, Column)>, bool)> { if self.should_include_first() { self.restore_first().await?; @@ -647,7 +694,7 @@ impl SortedStream for BoundBlockStream { } } -impl BoundBlockStream { +impl BoundBlockStream { fn should_include_first(&self) -> bool { let Some(block) = self.blocks.front() else { return false; @@ -679,6 +726,23 @@ impl BoundBlockStream { block.data.take().unwrap() } + fn len(&self) -> usize { + self.blocks.len() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn in_memory_rows(&self) -> usize { + self.blocks + .iter() + .map(|b| if b.data.is_some() { b.rows } else { 0 }) + .sum() + } +} + +impl BoundBlockStream { async fn restore_first(&mut self) -> Result<()> { let block = self.blocks.front_mut().unwrap(); if block.data.is_some() { @@ -686,7 +750,7 @@ impl BoundBlockStream { } let location = block.location.as_ref().unwrap(); - let data = self.spiller.read_spilled_file(location).await?; + let data = self.spiller.resotre(location).await?; block.data = Some(if block.processed != 0 { debug_assert_eq!(block.rows + block.processed, data.num_rows()); data.slice(block.processed..data.num_rows()) @@ -703,21 +767,6 @@ impl BoundBlockStream { Ok(()) } - fn len(&self) -> usize { - self.blocks.len() - } - - fn is_empty(&self) -> bool { - self.len() == 0 - } - - fn in_memory_rows(&self) -> usize { - self.blocks - .iter() - .map(|b| if b.data.is_some() { b.rows } else { 0 }) - .sum() - } - async fn spill(&mut self, skip: usize) -> Result<()> { for b in &mut self .blocks @@ -812,6 +861,10 @@ fn get_domain(col: &Column) -> Column { #[cfg(test)] mod tests { + use std::collections::HashMap; + use std::sync::Mutex; + + use databend_common_base::base::GlobalUniqName; use databend_common_expression::types::DataType; use databend_common_expression::types::Int32Type; use databend_common_expression::types::NumberDataType; @@ -825,12 +878,8 @@ mod tests { use databend_common_pipeline_transforms::processors::sort::convert_rows; use databend_common_pipeline_transforms::processors::sort::SimpleRowsAsc; use databend_common_pipeline_transforms::sort::SimpleRowsDesc; - use databend_common_storage::DataOperator; use super::*; - use crate::spillers::SpillerConfig; - use crate::spillers::SpillerType; - use crate::test_kits::*; fn test_data() -> (DataSchemaRef, DataBlock) { let col1 = Int32Type::from_data(vec![7, 7, 8, 11, 3, 5, 10, 11]); @@ -847,7 +896,7 @@ mod tests { } async fn run_bound_block_stream( - spiller: Arc, + spiller: impl Spill + Clone, sort_desc: Arc>, bound: Column, block_part: usize, @@ -870,7 +919,7 @@ mod tests { }) .collect::>(); - let mut stream = BoundBlockStream:: { + let mut stream = BoundBlockStream:: { blocks, bound, sort_row_offset, @@ -886,17 +935,9 @@ mod tests { #[tokio::test] async fn test_bound_block_stream() -> Result<()> { - let fixture = TestFixture::setup().await?; - let ctx = fixture.new_query_ctx().await?; - - let op = DataOperator::instance().spill_operator(); - let spill_config = SpillerConfig { - spiller_type: 
SpillerType::OrderBy, - location_prefix: "_spill_test".to_string(), - disk_spill: None, - use_parquet: true, + let spiller = MockSpiller { + map: Arc::new(Mutex::new(HashMap::new())), }; - let spiller = Arc::new(Spiller::create(ctx.clone(), op, spill_config)?); { let sort_desc = Arc::new(vec![SortColumnDescription { @@ -943,4 +984,25 @@ mod tests { Ok(()) } + + #[derive(Clone)] + struct MockSpiller { + map: Arc>>, + } + + #[async_trait::async_trait] + impl Spill for MockSpiller { + async fn spill(&self, data_block: DataBlock) -> Result { + let name = GlobalUniqName::unique(); + self.map.lock().unwrap().insert(name.clone(), data_block); + Ok(Location::Remote(name)) + } + + async fn resotre(&self, location: &Location) -> Result { + match location { + Location::Remote(name) => Ok(self.map.lock().unwrap().get(name).unwrap().clone()), + _ => unreachable!(), + } + } + } } From f192ca40a3da8e67a6f2791e4a13f0b8da96c9d6 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 21 Apr 2025 18:55:24 +0800 Subject: [PATCH 07/33] execute --- .../transforms/transform_merge_sort.rs | 3 +- .../transform_merge_sort/builder.rs | 118 ++++++++++++++- .../transform_merge_sort/collect.rs | 3 +- .../transform_merge_sort/execute.rs | 139 ++++++++++++++++++ .../transform_merge_sort/sort_spill.rs | 40 ++++- 5 files changed, 287 insertions(+), 16 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/transform_merge_sort/execute.rs diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs index 2e7ffe1f650dc..e48cfaf9188fb 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs @@ -52,6 +52,7 @@ mod builder; pub use builder::TransformSortBuilder; mod collect; +mod execute; #[derive(Debug)] enum State { @@ -73,7 +74,7 @@ struct Base { #[derive(Debug)] pub struct SortCollectedMeta { - batch_rows: usize, + params: SortSpillParams, bounds: Vec, blocks: Vec>, } diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/builder.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/builder.rs index 313e69d4412c6..e3a6712dc897e 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/builder.rs @@ -31,9 +31,17 @@ use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::sort::RowsTypeVisitor; use databend_common_pipeline_transforms::MemorySettings; +use super::collect::TransformSortCollect; +use super::execute::TransformSortExecute; use super::TransformSort; use crate::spillers::Spiller; +enum SortType { + Sort, + Collect, + Execute, +} + pub struct TransformSortBuilder { input: Arc, output: Arc, @@ -47,6 +55,7 @@ pub struct TransformSortBuilder { enable_loser_tree: bool, limit: Option, processor: Option>>, + typ: SortType, } impl TransformSortBuilder { @@ -71,6 +80,7 @@ impl TransformSortBuilder { limit: None, memory_settings: MemorySettings::disable_spill(), processor: None, + typ: SortType::Sort, } } @@ -149,6 +159,83 @@ impl TransformSortBuilder { self.memory_settings.clone(), )?)) } + + pub fn build_collect(mut self) -> Result> { + debug_assert!(if self.output_order_col { + self.schema.has_field(ORDER_COL_NAME) + } else { + 
!self.schema.has_field(ORDER_COL_NAME) + }); + self.typ = SortType::Collect; + + select_row_type(&mut self); + self.processor.unwrap() + } + + fn build_sort_collect(&mut self) -> Result> + where + A: SortAlgorithm + 'static, + C: RowConverter + Send + 'static, + { + let schema = add_order_field(self.schema.clone(), &self.sort_desc); + + Ok(Box::new(TransformSortCollect::::new( + self.input.clone(), + self.output.clone(), + schema, + self.sort_desc.clone(), + self.block_size, + self.limit.map(|limit| (limit, false)), + self.spiller.clone(), + self.order_col_generated, + self.memory_settings.clone(), + )?)) + } + + fn build_sort_limit_collect(&mut self) -> Result> + where + A: SortAlgorithm + 'static, + C: RowConverter + Send + 'static, + { + let schema = add_order_field(self.schema.clone(), &self.sort_desc); + Ok(Box::new(TransformSortCollect::::new( + self.input.clone(), + self.output.clone(), + schema, + self.sort_desc.clone(), + self.block_size, + Some((self.limit.unwrap(), true)), + self.spiller.clone(), + self.order_col_generated, + self.memory_settings.clone(), + )?)) + } + + pub fn build_exec(mut self) -> Result> { + debug_assert!(if self.output_order_col { + self.schema.has_field(ORDER_COL_NAME) + } else { + !self.schema.has_field(ORDER_COL_NAME) + }); + self.typ = SortType::Execute; + + select_row_type(&mut self); + self.processor.unwrap() + } + + fn build_sort_exec(&mut self) -> Result> + where A: SortAlgorithm + 'static { + let schema = add_order_field(self.schema.clone(), &self.sort_desc); + + Ok(Box::new(TransformSortExecute::::new( + self.input.clone(), + self.output.clone(), + schema, + self.limit, + self.spiller.clone(), + self.output_order_col, + )?)) + } } impl RowsTypeVisitor for TransformSortBuilder { @@ -165,14 +252,29 @@ impl RowsTypeVisitor for TransformSortBuilder { R: Rows + 'static, C: RowConverter + Send + 'static, { - let processor = match ( - self.limit.map(|limit| limit < 10000).unwrap_or_default(), - self.enable_loser_tree, - ) { - (true, true) => self.build_sort_limit::, C>(), - (true, false) => self.build_sort_limit::, C>(), - (false, true) => self.build_sort::, C>(), - (false, false) => self.build_sort::, C>(), + let processor = match self.typ { + SortType::Sort => match ( + self.limit.map(|limit| limit < 10000).unwrap_or_default(), + self.enable_loser_tree, + ) { + (true, true) => self.build_sort_limit::, C>(), + (true, false) => self.build_sort_limit::, C>(), + (false, true) => self.build_sort::, C>(), + (false, false) => self.build_sort::, C>(), + }, + SortType::Collect => match ( + self.limit.map(|limit| limit < 10000).unwrap_or_default(), + self.enable_loser_tree, + ) { + (true, true) => self.build_sort_limit_collect::, C>(), + (true, false) => self.build_sort_limit_collect::, C>(), + (false, true) => self.build_sort_collect::, C>(), + (false, false) => self.build_sort_collect::, C>(), + }, + SortType::Execute => match self.enable_loser_tree { + true => self.build_sort_exec::>(), + false => self.build_sort_exec::>(), + }, }; self.processor = Some(processor) } diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs index 121f3a258d51c..4f2e58ef2e7b4 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs @@ -74,11 +74,12 @@ where C: RowConverter, { #[allow(clippy::too_many_arguments)] - fn 
new( + pub fn new( input: Arc, output: Arc, schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, + max_block_size: usize, limit: Option<(usize, bool)>, spiller: Arc, order_col_generated: bool, diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/execute.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/execute.rs new file mode 100644 index 0000000000000..3612af89d156d --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/execute.rs @@ -0,0 +1,139 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_transforms::processors::sort::algorithm::SortAlgorithm; + +use super::sort_spill::SortSpill; +use super::Base; +use super::SortCollectedMeta; +use crate::spillers::Spiller; + +pub struct TransformSortExecute { + input: Arc, + output: Arc, + + /// If the next transform of current transform is [`super::transform_multi_sort_merge::MultiSortMergeProcessor`], + /// we can generate and output the order column to avoid the extra converting in the next transform. + remove_order_col: bool, + + base: Base, + inner: Option>, +} + +impl TransformSortExecute +where A: SortAlgorithm +{ + #[allow(clippy::too_many_arguments)] + pub fn new( + input: Arc, + output: Arc, + schema: DataSchemaRef, + limit: Option, + spiller: Arc, + output_order_col: bool, + ) -> Result { + let sort_row_offset = schema.fields().len() - 1; + Ok(Self { + input, + output, + remove_order_col: !output_order_col, + base: Base { + schema, + spiller, + sort_row_offset, + limit, + }, + inner: None, + }) + } + + fn output_block(&self, mut block: DataBlock) { + if self.remove_order_col { + block.pop_columns(1); + } + self.output.push_data(Ok(block)); + } +} + +#[async_trait::async_trait] +impl Processor for TransformSortExecute +where + A: SortAlgorithm + 'static, + A::Rows: 'static, +{ + fn name(&self) -> String { + "TransformSortExecute".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(mut block) = self.input.pull_data().transpose()? 
{ + assert!(self.inner.is_none()); + let meta = block + .take_meta() + .and_then(SortCollectedMeta::downcast_from) + .expect("require a SortCollectedMeta"); + + self.inner = Some(SortSpill::::from_meta(self.base.clone(), meta)); + return Ok(Event::Async); + } + + if self.input.is_finished() { + Ok(Event::Async) + } else { + self.input.set_need_data(); + Ok(Event::NeedData) + } + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + let Some(spill_sort) = &mut self.inner else { + unreachable!() + }; + let (block, finish) = spill_sort.on_restore().await?; + if let Some(block) = block { + assert!(!self.output.has_data()); + self.output_block(block); + } + if finish { + self.output.finish(); + } + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs index 7f8911325bd08..2c3a85b4c75b9 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs @@ -94,6 +94,31 @@ where A: SortAlgorithm Self { base, step } } + pub fn from_meta(base: Base, meta: SortCollectedMeta) -> Self { + let SortCollectedMeta { + params, + bounds, + blocks, + } = meta; + + let subsequent = blocks + .into_iter() + .map(|list| base.new_stream(Vec::from(list).into(), None)) + .collect(); + + Self { + base, + step: Step::Sort(StepSort { + params, + bounds, + cur_bound: None, + subsequent, + current: vec![], + output_merger: None, + }), + } + } + pub fn sort_input_data( &mut self, input_data: Vec, @@ -176,11 +201,14 @@ where A: SortAlgorithm let blocks = subsequent .into_iter() - .map(|stream| Vec::from(stream.blocks).into_boxed_slice()) + .map(|stream| { + assert!(stream.bound.is_none()); + Vec::from(stream.blocks).into_boxed_slice() + }) .collect(); Ok(SortCollectedMeta { - batch_rows: params.batch_rows, + params, blocks, bounds, }) @@ -648,7 +676,7 @@ fn sort_column(data: &DataBlock, sort_row_offset: usize) -> &Column { #[async_trait::async_trait] pub trait Spill: Send { async fn spill(&self, data_block: DataBlock) -> Result; - async fn resotre(&self, location: &Location) -> Result; + async fn restore(&self, location: &Location) -> Result; } #[async_trait::async_trait] @@ -657,7 +685,7 @@ impl Spill for Arc { self.as_ref().spill(vec![data_block]).await } - async fn resotre(&self, location: &Location) -> Result { + async fn restore(&self, location: &Location) -> Result { self.read_spilled_file(location).await } } @@ -750,7 +778,7 @@ impl BoundBlockStream { } let location = block.location.as_ref().unwrap(); - let data = self.spiller.resotre(location).await?; + let data = self.spiller.restore(location).await?; block.data = Some(if block.processed != 0 { debug_assert_eq!(block.rows + block.processed, data.num_rows()); data.slice(block.processed..data.num_rows()) @@ -998,7 +1026,7 @@ mod tests { Ok(Location::Remote(name)) } - async fn resotre(&self, location: &Location) -> Result { + async fn restore(&self, location: &Location) -> Result { match location { Location::Remote(name) => Ok(self.map.lock().unwrap().get(name).unwrap().clone()), _ => unreachable!(), From 4a2bed42f6ad3b6ad6a395f7b954f98297565b5e Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 21 Apr 2025 19:26:25 +0800 Subject: [PATCH 08/33] move --- .../pipelines/processors/transforms/mod.rs | 3 +- .../{transform_merge_sort => sort}/builder.rs | 2 +- 
.../{transform_merge_sort => sort}/collect.rs | 5 +-- .../{transform_merge_sort => sort}/execute.rs | 0 .../merge_sort.rs} | 41 +++--------------- .../processors/transforms/sort/mod.rs | 43 +++++++++++++++++++ .../sort_spill.rs | 0 7 files changed, 51 insertions(+), 43 deletions(-) rename src/query/service/src/pipelines/processors/transforms/{transform_merge_sort => sort}/builder.rs (99%) rename src/query/service/src/pipelines/processors/transforms/{transform_merge_sort => sort}/collect.rs (99%) rename src/query/service/src/pipelines/processors/transforms/{transform_merge_sort => sort}/execute.rs (100%) rename src/query/service/src/pipelines/processors/transforms/{transform_merge_sort.rs => sort/merge_sort.rs} (95%) rename src/query/service/src/pipelines/processors/transforms/{transform_merge_sort => sort}/sort_spill.rs (100%) diff --git a/src/query/service/src/pipelines/processors/transforms/mod.rs b/src/query/service/src/pipelines/processors/transforms/mod.rs index cdefac2d1a7e9..546ca3a9595ab 100644 --- a/src/query/service/src/pipelines/processors/transforms/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/mod.rs @@ -30,7 +30,6 @@ mod transform_expression_scan; mod transform_filter; mod transform_limit; mod transform_merge_block; -mod transform_merge_sort; mod transform_null_if; mod transform_recursive_cte_scan; mod transform_recursive_cte_source; @@ -42,6 +41,7 @@ mod transform_udf_server; mod window; pub use hash_join::*; +pub use sort::*; pub use transform_add_computed_columns::TransformAddComputedColumns; pub use transform_add_const_columns::TransformAddConstColumns; pub use transform_add_internal_columns::TransformAddInternalColumns; @@ -56,7 +56,6 @@ pub use transform_expression_scan::TransformExpressionScan; pub use transform_filter::TransformFilter; pub use transform_limit::TransformLimit; pub use transform_merge_block::TransformMergeBlock; -pub use transform_merge_sort::*; pub use transform_null_if::TransformNullIf; pub use transform_recursive_cte_scan::TransformRecursiveCteScan; pub use transform_recursive_cte_source::TransformRecursiveCteSource; diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/builder.rs b/src/query/service/src/pipelines/processors/transforms/sort/builder.rs similarity index 99% rename from src/query/service/src/pipelines/processors/transforms/transform_merge_sort/builder.rs rename to src/query/service/src/pipelines/processors/transforms/sort/builder.rs index e3a6712dc897e..b16686b4829fd 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/builder.rs @@ -33,7 +33,7 @@ use databend_common_pipeline_transforms::MemorySettings; use super::collect::TransformSortCollect; use super::execute::TransformSortExecute; -use super::TransformSort; +use super::merge_sort::TransformSort; use crate::spillers::Spiller; enum SortType { diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/collect.rs similarity index 99% rename from src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs rename to src/query/service/src/pipelines/processors/transforms/sort/collect.rs index 4f2e58ef2e7b4..90f3055b8fcc0 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/collect.rs @@ -37,6 
+37,7 @@ use databend_common_pipeline_transforms::TransformSortMergeLimit; use super::sort_spill::SortSpill; use super::Base; +use super::MemoryRows; use crate::spillers::Spiller; enum Inner { @@ -256,10 +257,6 @@ where } } -trait MemoryRows { - fn in_memory_rows(&self) -> usize; -} - impl MemoryRows for Vec { fn in_memory_rows(&self) -> usize { self.iter().map(|s| s.num_rows()).sum::() diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/execute.rs b/src/query/service/src/pipelines/processors/transforms/sort/execute.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/transform_merge_sort/execute.rs rename to src/query/service/src/pipelines/processors/transforms/sort/execute.rs diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs similarity index 95% rename from src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs rename to src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs index e48cfaf9188fb..74ac2b9c7425d 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs @@ -20,10 +20,7 @@ use std::sync::atomic::AtomicBool; use std::sync::Arc; use databend_common_exception::Result; -use databend_common_expression::local_block_meta_serde; use databend_common_expression::BlockEntry; -use databend_common_expression::BlockMetaInfo; -use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::SortColumnDescription; @@ -39,21 +36,13 @@ use databend_common_pipeline_transforms::MemorySettings; use databend_common_pipeline_transforms::MergeSort; use databend_common_pipeline_transforms::SortSpillParams; use databend_common_pipeline_transforms::TransformSortMergeLimit; -use sort_spill::SpillableBlock; +use super::sort_spill::create_memory_merger; +use super::sort_spill::MemoryMerger; +use super::sort_spill::SortSpill; +use super::Base; use crate::spillers::Spiller; -mod sort_spill; -use sort_spill::create_memory_merger; -use sort_spill::MemoryMerger; -use sort_spill::SortSpill; - -mod builder; -pub use builder::TransformSortBuilder; - -mod collect; -mod execute; - #[derive(Debug)] enum State { /// This state means the processor will collect incoming blocks. 
@@ -64,26 +53,6 @@ enum State { Finish, } -#[derive(Clone)] -struct Base { - schema: DataSchemaRef, - spiller: Arc, - sort_row_offset: usize, - limit: Option, -} - -#[derive(Debug)] -pub struct SortCollectedMeta { - params: SortSpillParams, - bounds: Vec, - blocks: Vec>, -} - -local_block_meta_serde!(SortCollectedMeta); - -#[typetag::serde(name = "sort_collected")] -impl BlockMetaInfo for SortCollectedMeta {} - enum Inner { Collect(Vec), Limit(TransformSortMergeLimit), @@ -124,7 +93,7 @@ where C: RowConverter, { #[allow(clippy::too_many_arguments)] - fn new( + pub(super) fn new( input: Arc, output: Arc, schema: DataSchemaRef, diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index c237beb982d8e..832f165d6341f 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -17,5 +17,48 @@ mod sort_merge; mod sort_sample; mod sort_wait; +use std::sync::Arc; + +use databend_common_expression::local_block_meta_serde; +use databend_common_expression::BlockMetaInfo; +use databend_common_expression::Column; +use databend_common_expression::DataSchemaRef; +use databend_common_pipeline_transforms::SortSpillParams; pub use sort_merge::*; pub use sort_sample::*; + +mod builder; +pub use builder::TransformSortBuilder; + +mod collect; +mod execute; +mod merge_sort; +mod sort_spill; + +use sort_spill::SpillableBlock; + +use crate::spillers::Spiller; + +#[derive(Clone)] +struct Base { + schema: DataSchemaRef, + spiller: Arc, + sort_row_offset: usize, + limit: Option, +} + +#[derive(Debug)] +pub struct SortCollectedMeta { + params: SortSpillParams, + bounds: Vec, + blocks: Vec>, +} + +local_block_meta_serde!(SortCollectedMeta); + +#[typetag::serde(name = "sort_collected")] +impl BlockMetaInfo for SortCollectedMeta {} + +trait MemoryRows { + fn in_memory_rows(&self) -> usize; +} diff --git a/src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/transform_merge_sort/sort_spill.rs rename to src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs From c4b1182435066686e142524962f408d06fa24326 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 22 Apr 2025 10:50:14 +0800 Subject: [PATCH 09/33] update --- .../processors/transforms/sort/rows/mod.rs | 13 +++++ .../processors/transforms/sort/collect.rs | 15 +----- .../processors/transforms/sort/execute.rs | 2 +- .../processors/transforms/sort/merge_sort.rs | 48 +++++++------------ .../processors/transforms/sort/mod.rs | 9 +++- .../processors/transforms/sort/sort_spill.rs | 2 +- 6 files changed, 43 insertions(+), 46 deletions(-) diff --git a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs index 479d2559c049e..64c2599fe4e79 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs @@ -25,6 +25,7 @@ use databend_common_expression::types::ArgType; use databend_common_expression::types::DataType; use databend_common_expression::BlockEntry; use databend_common_expression::Column; +use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use 
databend_common_expression::SortColumnDescription; pub use simple::*; @@ -39,6 +40,18 @@ where Self: Sized + Debug output_schema: DataSchemaRef, ) -> Result; fn convert(&self, columns: &[BlockEntry], num_rows: usize) -> Result; + + fn convert_data_block( + &self, + sort_desc: &[SortColumnDescription], + data_block: &DataBlock, + ) -> Result { + let order_by_cols = sort_desc + .iter() + .map(|desc| block.get_by_offset(desc.offset).clone()) + .collect::>(); + self.convert(&order_by_cols, block.num_rows()) + } } /// Rows can be compared. diff --git a/src/query/service/src/pipelines/processors/transforms/sort/collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/collect.rs index 90f3055b8fcc0..c09301379ee90 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/collect.rs @@ -75,7 +75,7 @@ where C: RowConverter, { #[allow(clippy::too_many_arguments)] - pub fn new( + pub(super) fn new( input: Arc, output: Arc, schema: DataSchemaRef, @@ -122,14 +122,9 @@ where } fn generate_order_column(&self, mut block: DataBlock) -> Result<(A::Rows, DataBlock)> { - let order_by_cols = self - .sort_desc - .iter() - .map(|desc| block.get_by_offset(desc.offset).clone()) - .collect::>(); let rows = self .row_converter - .convert(&order_by_cols, block.num_rows())?; + .convert_data_block(&self.sort_desc, &block); let order_col = rows.to_column(); block.add_column(BlockEntry { data_type: order_col.data_type(), @@ -257,12 +252,6 @@ where } } -impl MemoryRows for Vec { - fn in_memory_rows(&self) -> usize { - self.iter().map(|s| s.num_rows()).sum::() - } -} - #[async_trait::async_trait] impl Processor for TransformSortCollect where diff --git a/src/query/service/src/pipelines/processors/transforms/sort/execute.rs b/src/query/service/src/pipelines/processors/transforms/sort/execute.rs index 3612af89d156d..c8b8f12f46855 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/execute.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/execute.rs @@ -46,7 +46,7 @@ impl TransformSortExecute where A: SortAlgorithm { #[allow(clippy::too_many_arguments)] - pub fn new( + pub(super) fn new( input: Arc, output: Arc, schema: DataSchemaRef, diff --git a/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs index 74ac2b9c7425d..af6d2ad7591a7 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs @@ -41,6 +41,7 @@ use super::sort_spill::create_memory_merger; use super::sort_spill::MemoryMerger; use super::sort_spill::SortSpill; use super::Base; +use super::MemoryRows; use crate::spillers::Spiller; #[derive(Debug)] @@ -140,14 +141,9 @@ where } fn generate_order_column(&self, mut block: DataBlock) -> Result<(A::Rows, DataBlock)> { - let order_by_cols = self - .sort_desc - .iter() - .map(|desc| block.get_by_offset(desc.offset).clone()) - .collect::>(); let rows = self .row_converter - .convert(&order_by_cols, block.num_rows())?; + .convert_data_block(&self.sort_desc, &block); let order_col = rows.to_column(); block.add_column(BlockEntry { data_type: order_col.data_type(), @@ -156,7 +152,7 @@ where Ok((rows, block)) } - fn prepare_spill_limit(&mut self) -> Result<()> { + fn limit_trans_to_spill(&mut self) -> Result<()> { let Inner::Limit(merger) = &self.inner else { unreachable!() }; @@ -170,7 
+166,7 @@ where Ok(()) } - fn prepare_spill(&mut self, input_data: Vec) { + fn collect_trans_to_spill(&mut self, input_data: Vec) { let (num_rows, num_bytes) = input_data .iter() .map(|block| (block.num_rows(), block.memory_size())) @@ -182,6 +178,19 @@ where self.inner = Inner::Spill(input_data, spill_sort); } + fn trans_to_spill(&mut self) -> Result<()> { + match &mut self.inner { + Inner::Limit(_) => self.limit_trans_to_spill(), + Inner::Collect(input_data) => { + let input_data = std::mem::take(input_data); + self.collect_trans_to_spill(input_data); + Ok(()) + } + Inner::Spill(_, _) => Ok(()), + Inner::Memory(_) => unreachable!(), + } + } + fn determine_params(&self, bytes: usize, rows: usize) -> SortSpillParams { // We use the first memory calculation to estimate the batch size and the number of merge. let unit_size = self.memory_settings.spill_unit_size; @@ -295,16 +304,6 @@ where } } -trait MemoryRows { - fn in_memory_rows(&self) -> usize; -} - -impl MemoryRows for Vec { - fn in_memory_rows(&self) -> usize { - self.iter().map(|s| s.num_rows()).sum::() - } -} - #[async_trait::async_trait] impl Processor for TransformSort where @@ -422,18 +421,7 @@ where match &self.state { State::Collect => { let finished = self.input.is_finished(); - match &mut self.inner { - Inner::Limit(_) => { - self.prepare_spill_limit()?; - } - Inner::Collect(input_data) => { - assert!(!finished); - let input_data = std::mem::take(input_data); - self.prepare_spill(input_data); - } - Inner::Spill(_, _) => (), - Inner::Memory(_) => unreachable!(), - }; + self.trans_to_spill(); let input = self.input_rows(); let Inner::Spill(input_data, spill_sort) = &mut self.inner else { diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index 832f165d6341f..9f220cdf03c24 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use databend_common_expression::local_block_meta_serde; use databend_common_expression::BlockMetaInfo; use databend_common_expression::Column; +use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_pipeline_transforms::SortSpillParams; pub use sort_merge::*; @@ -48,7 +49,7 @@ struct Base { } #[derive(Debug)] -pub struct SortCollectedMeta { +struct SortCollectedMeta { params: SortSpillParams, bounds: Vec, blocks: Vec>, @@ -62,3 +63,9 @@ impl BlockMetaInfo for SortCollectedMeta {} trait MemoryRows { fn in_memory_rows(&self) -> usize; } + +impl MemoryRows for Vec { + fn in_memory_rows(&self) -> usize { + self.iter().map(|s| s.num_rows()).sum::() + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs index 2c3a85b4c75b9..28e03b335ccbd 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs @@ -44,7 +44,7 @@ use super::SortCollectedMeta; use crate::spillers::Location; use crate::spillers::Spiller; -pub struct SortSpill { +pub(super) struct SortSpill { base: Base, step: Step, } From 71fce4cbcf74a6d898e6c42f734b6519ebebfec4 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 22 Apr 2025 13:37:06 +0800 Subject: [PATCH 10/33] bounds --- .../processors/transforms/sort/rows/mod.rs | 4 +- 
.../processors/transforms/sort/bounds.rs | 171 ++++++++++++++++++ .../processors/transforms/sort/collect.rs | 3 +- .../processors/transforms/sort/execute.rs | 1 - .../processors/transforms/sort/merge_sort.rs | 5 +- .../processors/transforms/sort/mod.rs | 5 +- .../processors/transforms/sort/sort_spill.rs | 80 ++------ 7 files changed, 198 insertions(+), 71 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/bounds.rs diff --git a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs index 64c2599fe4e79..b1498c183c1e3 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs @@ -48,9 +48,9 @@ where Self: Sized + Debug ) -> Result { let order_by_cols = sort_desc .iter() - .map(|desc| block.get_by_offset(desc.offset).clone()) + .map(|desc| data_block.get_by_offset(desc.offset).clone()) .collect::>(); - self.convert(&order_by_cols, block.num_rows()) + self.convert(&order_by_cols, data_block.num_rows()) } } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs new file mode 100644 index 0000000000000..fe9d357fd8ba1 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs @@ -0,0 +1,171 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::Result; +use databend_common_expression::Column; +use databend_common_expression::DataBlock; +use databend_common_expression::DataField; +use databend_common_expression::DataSchema; +use databend_common_expression::SortColumnDescription; +use databend_common_pipeline_transforms::sort::LoserTreeMerger; +use databend_common_pipeline_transforms::sort::Rows; +use databend_common_pipeline_transforms::sort::SortedStream; + +#[derive(Debug, PartialEq, Eq, Default)] +pub struct Bounds( + // stored in reverse order of Column. + Vec, +); + +impl Bounds { + pub fn from_column(column: Column) -> Result { + let block = DataBlock::sort( + &DataBlock::new_from_columns(vec![column]), + &[SortColumnDescription { + offset: 0, + asc: R::IS_ASC_COLUMN, + nulls_first: false, + }], + None, + )?; + + Ok(Bounds(vec![block.get_last_column().clone()])) + } + + pub fn merge(mut vector: Vec, batch_rows: usize) -> Result { + match vector.len() { + 0 => Ok(Bounds(vec![])), + 1 => Ok(vector.pop().unwrap()), + _ => { + let schema = DataSchema::new(vec![DataField::new("order_col", R::data_type())]); + let mut merger = + LoserTreeMerger::::create(schema.into(), vector, batch_rows, None); + + let mut blocks = Vec::new(); + while let Some(block) = merger.next_block()? 
{ + blocks.push(block) + } + debug_assert!(merger.is_finished()); + + Ok(Bounds( + blocks + .iter() + .rev() + .map(|b| b.get_last_column().clone()) + .collect(), + )) + } + } + } + + pub fn next_bound(&mut self) -> Option { + let last = self.0.last_mut()?; + match last.len() { + 0 => unreachable!(), + 1 => Some(self.0.pop().unwrap()), + _ => { + let bound = last.slice(0..1).maybe_gc(); + *last = last.slice(1..last.len()); + Some(bound) + } + } + } + + #[expect(dead_code)] + pub fn len(&self) -> usize { + self.0.iter().map(Column::len).sum() + } + + #[expect(dead_code)] + pub fn is_empty(&self) -> bool { + self.0.iter().all(|col| col.len() == 0) + } +} + +impl SortedStream for Bounds { + fn next(&mut self) -> Result<(Option<(DataBlock, Column)>, bool)> { + match self.0.pop() { + Some(column) => Ok(( + Some((DataBlock::new_from_columns(vec![column.clone()]), column)), + false, + )), + None => Ok((None, false)), + } + } +} + +#[cfg(test)] +mod tests { + + use databend_common_expression::types::Int32Type; + use databend_common_expression::FromData; + use databend_common_pipeline_transforms::sort::SimpleRowsAsc; + use databend_common_pipeline_transforms::sort::SimpleRowsDesc; + + use super::*; + + #[test] + fn test_merge() -> Result<()> { + { + let column = Int32Type::from_data(vec![0, 7, 6, 6, 6]); + let bounds = Bounds::from_column::>(column)?; + assert_eq!( + bounds, + Bounds(vec![Int32Type::from_data(vec![0, 6, 6, 6, 7])]) + ); + + let vector = vec![ + bounds, + Bounds::default(), + Bounds::from_column::>(Int32Type::from_data(vec![ + 0, 1, 2, + ])) + .unwrap(), + ]; + let bounds = Bounds::merge::>(vector, 3)?; + + assert_eq!( + bounds, + Bounds(vec![ + Int32Type::from_data(vec![6, 7]), + Int32Type::from_data(vec![2, 6, 6]), + Int32Type::from_data(vec![0, 0, 1]), + ]) + ); + } + + { + let data = vec![vec![77, -2, 7], vec![3, 8, 6, 1, 1], vec![2]]; + + let data = data + .into_iter() + .map(|v| Bounds::from_column::>(Int32Type::from_data(v))) + .collect::>>()?; + let bounds = Bounds::merge::>(data, 2)?; + + assert_eq!( + bounds, + Bounds(vec![ + Int32Type::from_data(vec![-2]), + Int32Type::from_data(vec![1, 1]), + Int32Type::from_data(vec![3, 2]), + Int32Type::from_data(vec![7, 6]), + Int32Type::from_data(vec![77, 8]), + ]) + ); + } + + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/collect.rs index c09301379ee90..83a9001a59ea3 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/collect.rs @@ -74,7 +74,6 @@ where A: SortAlgorithm, C: RowConverter, { - #[allow(clippy::too_many_arguments)] pub(super) fn new( input: Arc, output: Arc, @@ -124,7 +123,7 @@ where fn generate_order_column(&self, mut block: DataBlock) -> Result<(A::Rows, DataBlock)> { let rows = self .row_converter - .convert_data_block(&self.sort_desc, &block); + .convert_data_block(&self.sort_desc, &block)?; let order_col = rows.to_column(); block.add_column(BlockEntry { data_type: order_col.data_type(), diff --git a/src/query/service/src/pipelines/processors/transforms/sort/execute.rs b/src/query/service/src/pipelines/processors/transforms/sort/execute.rs index c8b8f12f46855..39f01dc0497e1 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/execute.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/execute.rs @@ -45,7 +45,6 @@ pub struct TransformSortExecute { impl TransformSortExecute where 
A: SortAlgorithm { - #[allow(clippy::too_many_arguments)] pub(super) fn new( input: Arc, output: Arc, diff --git a/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs index af6d2ad7591a7..165664a9df464 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs @@ -93,7 +93,6 @@ where A: SortAlgorithm, C: RowConverter, { - #[allow(clippy::too_many_arguments)] pub(super) fn new( input: Arc, output: Arc, @@ -143,7 +142,7 @@ where fn generate_order_column(&self, mut block: DataBlock) -> Result<(A::Rows, DataBlock)> { let rows = self .row_converter - .convert_data_block(&self.sort_desc, &block); + .convert_data_block(&self.sort_desc, &block)?; let order_col = rows.to_column(); block.add_column(BlockEntry { data_type: order_col.data_type(), @@ -421,7 +420,7 @@ where match &self.state { State::Collect => { let finished = self.input.is_finished(); - self.trans_to_spill(); + self.trans_to_spill()?; let input = self.input_rows(); let Inner::Spill(input_data, spill_sort) = &mut self.inner else { diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index 9f220cdf03c24..128bd76f80a9e 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -19,9 +19,9 @@ mod sort_wait; use std::sync::Arc; +use bounds::Bounds; use databend_common_expression::local_block_meta_serde; use databend_common_expression::BlockMetaInfo; -use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_pipeline_transforms::SortSpillParams; @@ -31,6 +31,7 @@ pub use sort_sample::*; mod builder; pub use builder::TransformSortBuilder; +mod bounds; mod collect; mod execute; mod merge_sort; @@ -51,7 +52,7 @@ struct Base { #[derive(Debug)] struct SortCollectedMeta { params: SortSpillParams, - bounds: Vec, + bounds: Bounds, blocks: Vec>, } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs index 28e03b335ccbd..f64805a649e5e 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs @@ -29,7 +29,6 @@ use databend_common_expression::sampler::FixedRateSampler; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; -use databend_common_expression::SortColumnDescription; use databend_common_pipeline_transforms::processors::sort::algorithm::SortAlgorithm; use databend_common_pipeline_transforms::processors::sort::Merger; use databend_common_pipeline_transforms::processors::sort::Rows; @@ -38,6 +37,7 @@ use databend_common_pipeline_transforms::processors::SortSpillParams; use rand::rngs::StdRng; use rand::SeedableRng; +use super::bounds::Bounds; use super::Base; use super::MemoryRows; use super::SortCollectedMeta; @@ -62,15 +62,15 @@ struct StepCollect { struct StepSort { params: SortSpillParams, - /// Partition boundaries for restoring and sorting blocks, stored in reverse order of Column. + /// Partition boundaries for restoring and sorting blocks. 
/// Each boundary represents a cutoff point where data less than or equal to it belongs to one partition. - bounds: Vec, + bounds: Bounds, cur_bound: Option, subsequent: Vec>>, current: Vec>>, - #[allow(clippy::type_complexity)] + #[expect(clippy::type_complexity)] output_merger: Option>>>, } @@ -178,7 +178,7 @@ where A: SortAlgorithm params.num_merge * params.batch_rows } - #[allow(unused)] + #[expect(unused)] pub fn format_memory_usage(&self) -> FmtMemoryUsage<'_, A> { FmtMemoryUsage(self) } @@ -297,20 +297,10 @@ impl StepCollect { impl StepSort { fn next_bound(&mut self) { - let Some(last) = self.bounds.last_mut() else { - self.cur_bound = None; - return; - }; - let bound = match last.len() { - 0 => unreachable!(), - 1 => self.bounds.pop().unwrap(), - _ => { - let bound = last.slice(0..1).maybe_gc(); - *last = last.slice(1..last.len()); - bound - } - }; - self.cur_bound = Some(A::Rows::from_column(&bound).unwrap()); + match self.bounds.next_bound() { + Some(bound) => self.cur_bound = Some(A::Rows::from_column(&bound).unwrap()), + None => self.cur_bound = None, + } } async fn merge_current(&mut self, base: &Base) -> Result<()> { @@ -518,52 +508,19 @@ impl Base { &self, sampled_rows: Vec, batch_rows: usize, - ) -> Result> { + ) -> Result { match sampled_rows.len() { - 0 => Ok(vec![]), - 1 => Ok(vec![DataBlock::sort( - &sampled_rows[0], - &[SortColumnDescription { - offset: 0, - asc: A::Rows::IS_ASC_COLUMN, - nulls_first: false, - }], - None, - )? - .get_last_column() - .clone()]), + 0 => Ok(Bounds::default()), + 1 => Bounds::from_column::(sampled_rows[0].get_last_column().clone()), _ => { - let streams = sampled_rows + let ls = sampled_rows .into_iter() .map(|data| { - let data = DataBlock::sort( - &data, - &[SortColumnDescription { - offset: 0, - asc: A::Rows::IS_ASC_COLUMN, - nulls_first: false, - }], - None, - ) - .unwrap(); - DataBlockStream::new(data, 0) + let col = data.get_last_column().clone(); + Bounds::from_column::(col) }) - .collect::>(); - - let schema = self.schema.project(&[self.sort_row_offset]); - let mut merger = Merger::::create(schema.into(), streams, batch_rows, None); - - let mut blocks = Vec::new(); - while let Some(block) = merger.next_block()? 
{ - blocks.push(block) - } - debug_assert!(merger.is_finished()); - - Ok(blocks - .iter() - .rev() - .map(|b| b.get_last_column().clone()) - .collect::>()) + .collect::>>()?; + Bounds::merge::(ls, batch_rows) } } } @@ -850,7 +807,7 @@ impl SortedStream for DataBlockStream { } impl DataBlockStream { - fn new(data: DataBlock, sort_row_offset: usize) -> Self { + pub(super) fn new(data: DataBlock, sort_row_offset: usize) -> Self { let col = sort_column(&data, sort_row_offset).clone(); Self(Some((data, col))) } @@ -902,6 +859,7 @@ mod tests { use databend_common_expression::DataField; use databend_common_expression::DataSchemaRefExt; use databend_common_expression::FromData; + use databend_common_expression::SortColumnDescription; use databend_common_expression::Value; use databend_common_pipeline_transforms::processors::sort::convert_rows; use databend_common_pipeline_transforms::processors::sort::SimpleRowsAsc; From 7c3ae2bd92826e8062bbc8263554540058e8ec46 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 22 Apr 2025 18:46:28 +0800 Subject: [PATCH 11/33] wait --- .../processors/transforms/sort/bounds.rs | 67 +++++- .../processors/transforms/sort/mod.rs | 1 + .../processors/transforms/sort/sort_sample.rs | 8 +- .../processors/transforms/sort/wait.rs | 210 ++++++++++++++++++ 4 files changed, 279 insertions(+), 7 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/wait.rs diff --git a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs index fe9d357fd8ba1..172a8fb1a80fc 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs @@ -13,6 +13,7 @@ // limitations under the License. use databend_common_exception::Result; +use databend_common_expression::types::DataType; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataField; @@ -22,7 +23,7 @@ use databend_common_pipeline_transforms::sort::LoserTreeMerger; use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::sort::SortedStream; -#[derive(Debug, PartialEq, Eq, Default)] +#[derive(Debug, PartialEq, Eq, Default, Clone)] pub struct Bounds( // stored in reverse order of Column. 
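    // i.e. the smallest bounds sit in the *last* column, so `next_bound`
    // below can hand out bounds in ascending order by slicing the head of
    // `last_mut()` instead of shifting the whole Vec.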
Vec, @@ -82,7 +83,6 @@ impl Bounds { } } - #[expect(dead_code)] pub fn len(&self) -> usize { self.0.iter().map(Column::len).sum() } @@ -91,6 +91,42 @@ impl Bounds { pub fn is_empty(&self) -> bool { self.0.iter().all(|col| col.len() == 0) } + + pub fn reduce(&self, n: usize, data_type: DataType) -> Option { + if n == 0 { + return Some(Self::default()); + } + let count = self.len(); + if n >= count { + return None; + } + + let step = count / n; + let offset = step / 2; + let indices = self + .0 + .iter() + .enumerate() + .rev() + .flat_map(|(b_idx, col)| std::iter::repeat_n(b_idx, col.len()).zip(0..col.len())) + .enumerate() + .take(step * n) + .filter_map(|(i, (block, row))| { + if i % step == offset { + Some((block as u32, row as u32, 1)) + } else { + None + } + }) + .collect::>(); + + Some(Bounds(vec![Column::take_column_indices( + &self.0, + data_type, + &indices, + indices.len(), + )])) + } } impl SortedStream for Bounds { @@ -107,7 +143,7 @@ impl SortedStream for Bounds { #[cfg(test)] mod tests { - + use databend_common_expression::types::ArgType; use databend_common_expression::types::Int32Type; use databend_common_expression::FromData; use databend_common_pipeline_transforms::sort::SimpleRowsAsc; @@ -168,4 +204,29 @@ mod tests { Ok(()) } + + #[test] + fn test_reduce() -> Result<()> { + let data = vec![vec![77, -2, 7], vec![3, 8, 6, 1, 1], vec![2]]; + + let data = data + .into_iter() + .map(|v| Bounds::from_column::>(Int32Type::from_data(v))) + .collect::>>()?; + let bounds = Bounds::merge::>(data, 2)?; + + let got = bounds.reduce(4, Int32Type::data_type()).unwrap(); + assert_eq!(got, Bounds(vec![Int32Type::from_data(vec![8, 6, 2, 1])])); // 77 _8 7 _6 3 _2 1 _1 -2 + + let got = bounds.reduce(3, Int32Type::data_type()).unwrap(); + assert_eq!(got, Bounds(vec![Int32Type::from_data(vec![8, 3, 1])])); // 77 _8 7 6 _3 2 1 _1 -2 + + let got = bounds.reduce(2, Int32Type::data_type()).unwrap(); + assert_eq!(got, Bounds(vec![Int32Type::from_data(vec![7, 1])])); // 77 8 _7 6 3 2 _1 1 -2 + + let got = bounds.reduce(1, Int32Type::data_type()).unwrap(); + assert_eq!(got, Bounds(vec![Int32Type::from_data(vec![3])])); // 77 8 7 6 _3 2 1 1 -2 + + Ok(()) + } } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index 128bd76f80a9e..4223252dd2b34 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -36,6 +36,7 @@ mod collect; mod execute; mod merge_sort; mod sort_spill; +mod wait; use sort_spill::SpillableBlock; diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs index 6f5adb69a3c84..96c3ace403033 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs @@ -206,14 +206,14 @@ impl Transform for TransformSortSample { fn on_finish(&mut self) -> Result<()> { self.sampler.compact_blocks(); - let mut simple = self.sampler.take_blocks(); - assert!(simple.len() <= 1); // Unlikely to sample rows greater than 65536 + let mut sample = self.sampler.take_blocks(); + assert!(sample.len() <= 1); // Unlikely to sample rows greater than 65536 self.state.commit_sample( self.id, - if simple.is_empty() { + if sample.is_empty() { None } else { - Some(simple.remove(0)) + Some(sample.remove(0)) }, )?; Ok(()) diff 
--git a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs b/src/query/service/src/pipelines/processors/transforms/sort/wait.rs new file mode 100644 index 0000000000000..7029a4ae551fb --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/sort/wait.rs @@ -0,0 +1,210 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::Arc; +use std::sync::RwLock; + +use databend_common_base::base::WatchNotify; +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_pipeline_transforms::processors::sort::select_row_type; +use databend_common_pipeline_transforms::processors::sort::Rows; +use databend_common_pipeline_transforms::processors::sort::RowsTypeVisitor; +use databend_common_pipeline_transforms::sort::RowConverter; + +use super::bounds::Bounds; +use super::SortCollectedMeta; +use crate::pipelines::processors::Event; +use crate::pipelines::processors::InputPort; +use crate::pipelines::processors::OutputPort; +use crate::pipelines::processors::Processor; + +pub struct TransformSortSampleWait { + input: Arc, + output: Arc, + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + id: usize, + meta: Option>, + state: Arc, +} + +impl TransformSortSampleWait { + pub fn new( + input: Arc, + output: Arc, + id: usize, + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + state: Arc, + ) -> Self { + Self { + input, + output, + id, + state, + schema, + sort_desc, + meta: None, + } + } +} + +#[async_trait::async_trait] +impl Processor for TransformSortSampleWait { + fn name(&self) -> String { + "TransformSortSimpleWait".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(meta) = self.meta.take() { + self.output.push_data(Ok(DataBlock::empty_with_meta(meta))); + self.output.finish(); + return Ok(Event::Finished); + } + + if let Some(mut block) = self.input.pull_data().transpose()? 
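+        // Exactly one upstream block is expected, carrying the
+        // SortCollectedMeta; after stashing it we return Event::Async so
+        // async_process() can commit this worker's bounds and then park on
+        // the shared barrier until every worker has reported.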
{ + assert!(self.meta.is_none()); + let meta = block + .take_meta() + .and_then(SortCollectedMeta::downcast_from) + .expect("require a SortCollectedMeta"); + + self.meta = Some(Box::new(meta)); + return Ok(Event::Async); + } + + if self.input.is_finished() { + if self.state.done.has_notified() { + self.output.finish(); + Ok(Event::Finished) + } else { + Ok(Event::Async) + } + } else { + self.input.set_need_data(); + Ok(Event::NeedData) + } + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + let bounds = self + .meta + .as_ref() + .map(|meta| meta.bounds.clone()) + .unwrap_or_default(); + + let mut commit = CommitSample { + inner: self, + bounds: Some(bounds), + result: Ok(false), + }; + select_row_type(&mut commit); + commit.result?; + self.state.done.notified().await; + Ok(()) + } +} + +struct CommitSample<'a> { + inner: &'a TransformSortSampleWait, + bounds: Option, + result: Result, +} + +impl<'a> RowsTypeVisitor for CommitSample<'a> { + fn schema(&self) -> DataSchemaRef { + self.inner.schema.clone() + } + + fn sort_desc(&self) -> &[SortColumnDescription] { + &self.inner.sort_desc + } + + fn visit_type(&mut self) + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + { + self.result = self + .inner + .state + .commit_sample::(self.inner.id, self.bounds.take().unwrap()); + } +} + +pub struct SortSampleState { + inner: RwLock, + pub(super) done: WatchNotify, +} + +impl SortSampleState { + pub fn commit_sample(&self, id: usize, bounds: Bounds) -> Result { + let mut inner = self.inner.write().unwrap(); + + let x = inner.partial[id].replace(bounds); + assert!(x.is_none()); + let done = inner.partial.iter().all(Option::is_some); + if done { + inner.determine_bounds::()?; + self.done.notify_waiters(); + } + Ok(done) + } +} + +struct StateInner { + // target partitions + partitions: usize, + // schema for bounds DataBlock + // schema: DataSchemaRef, + // sort_desc for bounds DataBlock + // sort_desc: Vec, + partial: Vec>, + bounds: Option, + batch_rows: usize, +} + +impl StateInner { + fn determine_bounds(&mut self) -> Result<()> { + let v = self.partial.drain(..).map(Option::unwrap).collect(); + let bounds = Bounds::merge::(v, self.batch_rows)?; + let bounds = bounds + .reduce(self.partitions - 1, R::data_type()) + .unwrap_or(bounds); + assert!(bounds.len() <= self.partitions - 1); + + self.bounds = Some(bounds); + Ok(()) + } +} From 7966294b6af849afb7b86bb5fbe003393b13555e Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 23 Apr 2025 11:07:11 +0800 Subject: [PATCH 12/33] remove --- src/query/expression/src/simpler.rs | 200 ---------------------------- 1 file changed, 200 deletions(-) delete mode 100644 src/query/expression/src/simpler.rs diff --git a/src/query/expression/src/simpler.rs b/src/query/expression/src/simpler.rs deleted file mode 100644 index ab928d721ec77..0000000000000 --- a/src/query/expression/src/simpler.rs +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
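For context on the deletion below: `simpler` appears to be a misspelling of `sampler`, and the spill path now draws its samples from the expression crate's `sampler` module (see the `FixedRateSampler` import in sort_spill.rs), leaving this block-level reservoir sampler unused. The subtlest part of the removed code is how Algorithm L's skip count is carried across incoming blocks; a hypothetical, self-contained sketch of that bookkeeping (the rng-driven slot choice and next skip are stand-ins):

// Sketch: Algorithm L yields a skip count `s`; rows are consumed block by
// block, and whenever the current block still holds more than `s` unseen
// rows, one reservoir slot is overwritten with a (block, row) coordinate.
fn add_indices(indices: &mut [(u32, u32)], s: &mut usize, rows: usize, block: u32) {
    let mut cur = 0;
    while rows - cur > *s {
        cur += *s;
        let slot = cur % indices.len(); // stand-in for AlgoL::pos()
        indices[slot] = (block, cur as u32);
        *s = 7; // stand-in for the next Algorithm-L skip
    }
    *s -= rows - cur; // carry the remaining skip into the next block
}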
- -mod reservoir_sampling; - -use std::collections::HashSet; - -use rand::Rng; -use reservoir_sampling::AlgoL; - -use crate::BlockRowIndex; -use crate::DataBlock; - -pub struct Simpler { - columns: Vec, - k: usize, - block_size: usize, - - blocks: Vec, - indices: Vec, - core: AlgoL, - - s: usize, -} - -impl Simpler { - pub fn new(columns: Vec, block_size: usize, k: usize, rng: R) -> Self { - let core = AlgoL::new(k.try_into().unwrap(), rng); - Self { - columns, - blocks: Vec::new(), - indices: Vec::with_capacity(k), - k, - block_size, - core, - s: usize::MAX, - } - } - - pub fn add_block(&mut self, data: DataBlock) -> bool { - let rows = data.num_rows(); - assert!(rows > 0); - let block_idx = self.blocks.len() as u32; - let change = self.add_indices(rows, block_idx); - if change { - let columns = self - .columns - .iter() - .map(|&offset| data.get_by_offset(offset).to_owned()) - .collect::>(); - - self.blocks.push(DataBlock::new(columns, rows)); - if self.blocks.len() > self.k { - self.compact_blocks() - } - } - change - } - - fn add_indices(&mut self, rows: usize, block_idx: u32) -> bool { - let mut change = false; - let mut cur: usize = 0; - if self.indices.len() < self.k { - if rows + self.indices.len() <= self.k { - for i in 0..rows { - self.indices.push((block_idx, i as u32, 1)); - } - if self.indices.len() == self.k { - self.s = self.core.search() - } - return true; - } - while self.indices.len() < self.k { - self.indices.push((block_idx, cur as u32, 1)); - cur += 1; - } - self.s = self.core.search(); - change = true; - } - - while rows - cur > self.s { - change = true; - cur += self.s; - self.indices[self.core.pos()] = (block_idx, cur as u32, 1); - self.core.update_w(); - self.s = self.core.search(); - } - - self.s -= rows - cur; - change - } - - pub fn compact_indices(&mut self) { - let used_set: HashSet<_> = self.indices.iter().map(|&(b, _, _)| b).collect(); - if used_set.len() == self.blocks.len() { - return; - } - - let mut used: Vec<_> = used_set.iter().cloned().collect(); - used.sort(); - - self.indices = self - .indices - .drain(..) - .map(|(b, r, c)| (used.binary_search(&b).unwrap() as u32, r, c)) - .collect(); - - self.blocks = self - .blocks - .drain(..) 
- .enumerate() - .filter_map(|(i, block)| { - if used_set.contains(&(i as u32)) { - Some(block) - } else { - None - } - }) - .collect(); - } - - pub fn compact_blocks(&mut self) { - self.blocks = self - .indices - .chunks_mut(self.block_size) - .enumerate() - .map(|(i, indices)| { - let rows = indices.len(); - let block = DataBlock::take_blocks(&self.blocks, indices, rows); - - for (j, (b, r, _)) in indices.iter_mut().enumerate() { - *b = i as u32; - *r = j as u32; - } - - block - }) - .collect::>(); - } - - pub fn memory_size(self) -> usize { - self.blocks.iter().map(|b| b.memory_size()).sum() - } - - pub fn take_blocks(&mut self) -> Vec { - std::mem::take(&mut self.blocks) - } - - pub fn k(&self) -> usize { - self.k - } -} - -#[cfg(test)] -mod tests { - use rand::rngs::StdRng; - use rand::SeedableRng; - - use super::*; - - #[test] - fn test_add_indices() { - let rng = StdRng::seed_from_u64(0); - let k = 5; - let core = AlgoL::new(k.try_into().unwrap(), rng); - let mut simpler = Simpler { - columns: vec![0], - k, - block_size: 65536, - blocks: Vec::new(), - indices: Vec::new(), - core, - s: usize::MAX, - }; - - simpler.add_indices(15, 0); - - let want: Vec = - vec![(0, 10, 1), (0, 1, 1), (0, 2, 1), (0, 8, 1), (0, 12, 1)]; - assert_eq!(&want, &simpler.indices); - assert_eq!(0, simpler.s); - - simpler.add_indices(20, 1); - - let want: Vec = vec![(1, 0, 1), (0, 1, 1), (1, 6, 1), (0, 8, 1), (1, 9, 1)]; - assert_eq!(&want, &simpler.indices); - assert_eq!(1, simpler.s); - } -} From 0fe057df46e12a9b511354011f73af788dcf7f01 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 23 Apr 2025 11:53:52 +0800 Subject: [PATCH 13/33] scalar --- src/query/expression/src/types.rs | 2 +- src/query/expression/src/types/any.rs | 2 +- src/query/expression/src/types/array.rs | 2 +- src/query/expression/src/types/binary.rs | 2 +- src/query/expression/src/types/bitmap.rs | 2 +- src/query/expression/src/types/boolean.rs | 2 +- src/query/expression/src/types/date.rs | 2 +- src/query/expression/src/types/decimal.rs | 2 +- src/query/expression/src/types/empty_array.rs | 2 +- src/query/expression/src/types/empty_map.rs | 2 +- src/query/expression/src/types/generic.rs | 2 +- src/query/expression/src/types/geography.rs | 2 +- src/query/expression/src/types/geometry.rs | 2 +- src/query/expression/src/types/interval.rs | 2 +- src/query/expression/src/types/map.rs | 4 +- src/query/expression/src/types/null.rs | 2 +- src/query/expression/src/types/nullable.rs | 2 +- src/query/expression/src/types/number.rs | 2 +- src/query/expression/src/types/string.rs | 2 +- src/query/expression/src/types/timestamp.rs | 2 +- src/query/expression/src/types/variant.rs | 2 +- .../processors/transforms/sort/rows/common.rs | 11 +++ .../processors/transforms/sort/rows/mod.rs | 8 +- .../processors/transforms/sort/rows/simple.rs | 19 +++++ .../processors/transforms/sort/bounds.rs | 11 ++- .../processors/transforms/sort/sort_spill.rs | 33 +++++---- .../processors/transforms/sort/wait.rs | 73 +++++++++---------- 27 files changed, 120 insertions(+), 79 deletions(-) diff --git a/src/query/expression/src/types.rs b/src/query/expression/src/types.rs index 12999b7c501b8..9a812de36dcec 100755 --- a/src/query/expression/src/types.rs +++ b/src/query/expression/src/types.rs @@ -373,7 +373,7 @@ pub trait ValueType: Debug + Clone + PartialEq + Sized + 'static { fn to_owned_scalar(scalar: Self::ScalarRef<'_>) -> Self::Scalar; fn to_scalar_ref(scalar: &Self::Scalar) -> Self::ScalarRef<'_>; - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option>; + fn 
try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option>; fn try_downcast_column(col: &Column) -> Option; fn try_downcast_domain(domain: &Domain) -> Option; diff --git a/src/query/expression/src/types/any.rs b/src/query/expression/src/types/any.rs index b0ed227866ca7..6d29f530362a0 100755 --- a/src/query/expression/src/types/any.rs +++ b/src/query/expression/src/types/any.rs @@ -48,7 +48,7 @@ impl ValueType for AnyType { scalar.as_ref() } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { Some(scalar.clone()) } diff --git a/src/query/expression/src/types/array.rs b/src/query/expression/src/types/array.rs index dff2a7383adf8..aca49f149d6f6 100755 --- a/src/query/expression/src/types/array.rs +++ b/src/query/expression/src/types/array.rs @@ -57,7 +57,7 @@ impl ValueType for ArrayType { scalar.clone() } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Array(array) => T::try_downcast_column(array), _ => None, diff --git a/src/query/expression/src/types/binary.rs b/src/query/expression/src/types/binary.rs index 8b125cd4e7d04..7288eb8091a35 100644 --- a/src/query/expression/src/types/binary.rs +++ b/src/query/expression/src/types/binary.rs @@ -54,7 +54,7 @@ impl ValueType for BinaryType { scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { scalar.as_binary().cloned() } diff --git a/src/query/expression/src/types/bitmap.rs b/src/query/expression/src/types/bitmap.rs index 1bd9e6386e7d2..28a9d1042712a 100644 --- a/src/query/expression/src/types/bitmap.rs +++ b/src/query/expression/src/types/bitmap.rs @@ -53,7 +53,7 @@ impl ValueType for BitmapType { scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { scalar.as_bitmap().cloned() } diff --git a/src/query/expression/src/types/boolean.rs b/src/query/expression/src/types/boolean.rs index 4724d930dde2c..58c0037cbf87a 100644 --- a/src/query/expression/src/types/boolean.rs +++ b/src/query/expression/src/types/boolean.rs @@ -53,7 +53,7 @@ impl ValueType for BooleanType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Boolean(scalar) => Some(*scalar), _ => None, diff --git a/src/query/expression/src/types/date.rs b/src/query/expression/src/types/date.rs index 794fea85022b0..e5cbecdc5e428 100644 --- a/src/query/expression/src/types/date.rs +++ b/src/query/expression/src/types/date.rs @@ -83,7 +83,7 @@ impl ValueType for DateType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Date(scalar) => Some(*scalar), _ => None, diff --git a/src/query/expression/src/types/decimal.rs b/src/query/expression/src/types/decimal.rs index 6fdaee2845f67..3bca4711c617d 100644 --- a/src/query/expression/src/types/decimal.rs +++ b/src/query/expression/src/types/decimal.rs @@ -76,7 +76,7 @@ impl ValueType for DecimalType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { Num::try_downcast_scalar(scalar.as_decimal()?) 
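        // (Series-wide signature change: `&ScalarRef<'a>` ties the returned
        // item to the scalar's *contents* rather than to the reference
        // itself, so a downcast result may outlive the temporary borrow;
        // the new Rows::scalar_as_item helpers below rely on this.)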
} diff --git a/src/query/expression/src/types/empty_array.rs b/src/query/expression/src/types/empty_array.rs index e67b7ba3f6aad..a622fd42df1f5 100644 --- a/src/query/expression/src/types/empty_array.rs +++ b/src/query/expression/src/types/empty_array.rs @@ -48,7 +48,7 @@ impl ValueType for EmptyArrayType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::EmptyArray => Some(()), _ => None, diff --git a/src/query/expression/src/types/empty_map.rs b/src/query/expression/src/types/empty_map.rs index d32fd921c4b52..f97be542d6dcd 100644 --- a/src/query/expression/src/types/empty_map.rs +++ b/src/query/expression/src/types/empty_map.rs @@ -48,7 +48,7 @@ impl ValueType for EmptyMapType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::EmptyMap => Some(()), _ => None, diff --git a/src/query/expression/src/types/generic.rs b/src/query/expression/src/types/generic.rs index 91f5e9fd9de7d..92cf62d1db53e 100755 --- a/src/query/expression/src/types/generic.rs +++ b/src/query/expression/src/types/generic.rs @@ -51,7 +51,7 @@ impl ValueType for GenericType { scalar.as_ref() } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { Some(scalar.clone()) } diff --git a/src/query/expression/src/types/geography.rs b/src/query/expression/src/types/geography.rs index 25f3522176984..e920ffc0450e8 100644 --- a/src/query/expression/src/types/geography.rs +++ b/src/query/expression/src/types/geography.rs @@ -130,7 +130,7 @@ impl ValueType for GeographyType { scalar.as_ref() } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { scalar.as_geography().cloned() } diff --git a/src/query/expression/src/types/geometry.rs b/src/query/expression/src/types/geometry.rs index fac3c4ad4c060..0451df5a7bb12 100644 --- a/src/query/expression/src/types/geometry.rs +++ b/src/query/expression/src/types/geometry.rs @@ -57,7 +57,7 @@ impl ValueType for GeometryType { scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { scalar.as_geometry().cloned() } diff --git a/src/query/expression/src/types/interval.rs b/src/query/expression/src/types/interval.rs index 243f257ed805c..409c8e365d20d 100644 --- a/src/query/expression/src/types/interval.rs +++ b/src/query/expression/src/types/interval.rs @@ -57,7 +57,7 @@ impl ValueType for IntervalType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Interval(scalar) => Some(*scalar), _ => None, diff --git a/src/query/expression/src/types/map.rs b/src/query/expression/src/types/map.rs index b430bb2946f6d..c9095c949538e 100755 --- a/src/query/expression/src/types/map.rs +++ b/src/query/expression/src/types/map.rs @@ -54,7 +54,7 @@ impl ValueType for KvPair { (K::to_scalar_ref(k), V::to_scalar_ref(v)) } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Tuple(fields) if fields.len() == 2 => Some(( K::try_downcast_scalar(&fields[0])?, @@ -351,7 +351,7 @@ impl ValueType for MapType { as 
ValueType>::to_scalar_ref(scalar) } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Map(array) => KvPair::::try_downcast_column(array), _ => None, diff --git a/src/query/expression/src/types/null.rs b/src/query/expression/src/types/null.rs index b975046ae1cad..ca9778a5f860b 100644 --- a/src/query/expression/src/types/null.rs +++ b/src/query/expression/src/types/null.rs @@ -49,7 +49,7 @@ impl ValueType for NullType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Null => Some(()), _ => None, diff --git a/src/query/expression/src/types/nullable.rs b/src/query/expression/src/types/nullable.rs index 51b74539f1020..2955a059bddc1 100755 --- a/src/query/expression/src/types/nullable.rs +++ b/src/query/expression/src/types/nullable.rs @@ -60,7 +60,7 @@ impl ValueType for NullableType { scalar.as_ref().map(T::to_scalar_ref) } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Null => Some(None), scalar => Some(Some(T::try_downcast_scalar(scalar)?)), diff --git a/src/query/expression/src/types/number.rs b/src/query/expression/src/types/number.rs index 462a70efed168..4a3d99aea7bb1 100644 --- a/src/query/expression/src/types/number.rs +++ b/src/query/expression/src/types/number.rs @@ -122,7 +122,7 @@ impl ValueType for NumberType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { Num::try_downcast_scalar(scalar.as_number()?) } diff --git a/src/query/expression/src/types/string.rs b/src/query/expression/src/types/string.rs index 0da330c0df77d..e8a817cb9a130 100644 --- a/src/query/expression/src/types/string.rs +++ b/src/query/expression/src/types/string.rs @@ -56,7 +56,7 @@ impl ValueType for StringType { scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { scalar.as_string().cloned() } diff --git a/src/query/expression/src/types/timestamp.rs b/src/query/expression/src/types/timestamp.rs index 02a6ec922fdce..4ea0a901b436d 100644 --- a/src/query/expression/src/types/timestamp.rs +++ b/src/query/expression/src/types/timestamp.rs @@ -88,7 +88,7 @@ impl ValueType for TimestampType { *scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { match scalar { ScalarRef::Timestamp(scalar) => Some(*scalar), _ => None, diff --git a/src/query/expression/src/types/variant.rs b/src/query/expression/src/types/variant.rs index 41af0f3518818..a755e84f40471 100644 --- a/src/query/expression/src/types/variant.rs +++ b/src/query/expression/src/types/variant.rs @@ -71,7 +71,7 @@ impl ValueType for VariantType { scalar } - fn try_downcast_scalar<'a>(scalar: &'a ScalarRef) -> Option> { + fn try_downcast_scalar<'a>(scalar: &ScalarRef<'a>) -> Option> { scalar.as_variant().cloned() } diff --git a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/common.rs b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/common.rs index 3af2c413e7f28..7584378015106 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/common.rs +++ 
b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/common.rs @@ -60,6 +60,17 @@ impl Rows for BinaryColumn { fn slice(&self, range: Range) -> Self { self.slice(range) } + + fn scalar_as_item<'a>(s: &'a Scalar) -> Self::Item<'a> { + match s { + Scalar::Binary(s) => s, + _ => unreachable!(), + } + } + + fn owned_item(item: Self::Item<'_>) -> Scalar { + Scalar::Binary(Vec::from(item)) + } } impl RowConverter for CommonRowConverter { diff --git a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs index b1498c183c1e3..5eb5036aec6c5 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/mod.rs @@ -17,6 +17,7 @@ mod simple; mod utils; use std::fmt::Debug; +use std::ops::Range; pub use common::*; use databend_common_exception::ErrorCode; @@ -27,6 +28,7 @@ use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; +use databend_common_expression::Scalar; use databend_common_expression::SortColumnDescription; pub use simple::*; pub use utils::*; @@ -95,5 +97,9 @@ where Self: Sized + Clone + Debug + Send self.row(self.len() - 1) } - fn slice(&self, range: std::ops::Range) -> Self; + fn slice(&self, range: Range) -> Self; + + fn scalar_as_item<'a>(s: &'a Scalar) -> Self::Item<'a>; + + fn owned_item(item: Self::Item<'_>) -> Scalar; } diff --git a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/simple.rs b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/simple.rs index 0c7b45ab0268d..046dfa1b753c5 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/simple.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/simple.rs @@ -24,6 +24,7 @@ use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::ColumnBuilder; use databend_common_expression::DataSchemaRef; +use databend_common_expression::Scalar; use databend_common_expression::SortColumnDescription; use databend_common_expression::Value; @@ -70,6 +71,15 @@ where inner: T::slice_column(&self.inner, range), } } + + fn scalar_as_item<'a>(s: &'a Scalar) -> Self::Item<'a> { + let s = &s.as_ref(); + T::try_downcast_scalar(s).unwrap() + } + + fn owned_item(item: Self::Item<'_>) -> Scalar { + T::upcast_scalar(T::to_owned_scalar(item)) + } } /// Rows structure for single simple types. 
(numbers, date, timestamp) @@ -113,6 +123,15 @@ where inner: T::slice_column(&self.inner, range), } } + + fn scalar_as_item<'a>(s: &'a Scalar) -> Self::Item<'a> { + let s = &s.as_ref(); + Reverse(T::try_downcast_scalar(s).unwrap()) + } + + fn owned_item(item: Self::Item<'_>) -> Scalar { + T::upcast_scalar(T::to_owned_scalar(item.0)) + } } /// If there is only one sort field and its type is a primitive type, diff --git a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs index 172a8fb1a80fc..f139d6ec2a409 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs @@ -18,6 +18,7 @@ use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataField; use databend_common_expression::DataSchema; +use databend_common_expression::Scalar; use databend_common_expression::SortColumnDescription; use databend_common_pipeline_transforms::sort::LoserTreeMerger; use databend_common_pipeline_transforms::sort::Rows; @@ -70,13 +71,17 @@ impl Bounds { } } - pub fn next_bound(&mut self) -> Option { + pub fn next_bound(&mut self) -> Option { let last = self.0.last_mut()?; match last.len() { 0 => unreachable!(), - 1 => Some(self.0.pop().unwrap()), + 1 => { + let bound = last.index(0).unwrap().to_owned(); + self.0.pop(); + Some(bound) + } _ => { - let bound = last.slice(0..1).maybe_gc(); + let bound = last.index(0).unwrap().to_owned(); *last = last.slice(1..last.len()); Some(bound) } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs index f64805a649e5e..995111dffc320 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs @@ -17,6 +17,7 @@ use std::fmt; use std::fmt::Debug; use std::fmt::Formatter; use std::intrinsics::unlikely; +use std::marker::PhantomData; use std::mem; use std::sync::atomic; use std::sync::atomic::AtomicBool; @@ -29,6 +30,7 @@ use databend_common_expression::sampler::FixedRateSampler; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; +use databend_common_expression::Scalar; use databend_common_pipeline_transforms::processors::sort::algorithm::SortAlgorithm; use databend_common_pipeline_transforms::processors::sort::Merger; use databend_common_pipeline_transforms::processors::sort::Rows; @@ -65,7 +67,7 @@ struct StepSort { /// Partition boundaries for restoring and sorting blocks. /// Each boundary represents a cutoff point where data less than or equal to it belongs to one partition. 
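    /// Bounds are consumed smallest-first via `Bounds::next_bound`; the
    /// active one is cached in `cur_bound` below, now as an owned `Scalar`
    /// so a single bound no longer needs a one-row `Rows` value.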
bounds: Bounds, - cur_bound: Option, + cur_bound: Option, subsequent: Vec>>, current: Vec>>, @@ -298,7 +300,7 @@ impl StepCollect { impl StepSort { fn next_bound(&mut self) { match self.bounds.next_bound() { - Some(bound) => self.cur_bound = Some(A::Rows::from_column(&bound).unwrap()), + Some(bound) => self.cur_bound = Some(bound), None => self.cur_bound = None, } } @@ -490,13 +492,14 @@ impl Base { fn new_stream( &self, blocks: VecDeque, - bound: Option, + bound: Option, ) -> BoundBlockStream> { BoundBlockStream { blocks, bound, sort_row_offset: self.sort_row_offset, spiller: self.spiller.clone(), + _r: Default::default(), } } @@ -650,9 +653,10 @@ impl Spill for Arc { /// BoundBlockStream is a stream of blocks that are cutoff less or equal than bound. struct BoundBlockStream { blocks: VecDeque, - bound: Option, + bound: Option, sort_row_offset: usize, spiller: S, + _r: PhantomData, } impl Debug for BoundBlockStream { @@ -686,7 +690,7 @@ impl BoundBlockStream { }; match &self.bound { - Some(bound) => block.domain::().first() <= bound.row(0), + Some(bound) => block.domain::().first() <= R::scalar_as_item(bound), None => true, } } @@ -698,7 +702,7 @@ impl BoundBlockStream { let block = self.blocks.front_mut().unwrap(); if let Some(pos) = - block_split_off_position(block.data.as_ref().unwrap(), bound, self.sort_row_offset) + block_split_off_position::(block.data.as_ref().unwrap(), bound, self.sort_row_offset) { block.slice(pos, self.sort_row_offset) } else { @@ -767,13 +771,12 @@ impl BoundBlockStream { fn block_split_off_position( data: &DataBlock, - bound: &R, + bound: &Scalar, sort_row_offset: usize, ) -> Option { let rows = R::from_column(sort_column(data, sort_row_offset)).unwrap(); debug_assert!(rows.len() > 0); - debug_assert!(bound.len() == 1); - let bound = bound.row(0); + let bound = R::scalar_as_item(bound); partition_point(&rows, &bound) } @@ -853,6 +856,7 @@ mod tests { use databend_common_expression::types::DataType; use databend_common_expression::types::Int32Type; use databend_common_expression::types::NumberDataType; + use databend_common_expression::types::NumberScalar; use databend_common_expression::types::StringType; use databend_common_expression::BlockEntry; use databend_common_expression::Column; @@ -884,13 +888,13 @@ mod tests { async fn run_bound_block_stream( spiller: impl Spill + Clone, sort_desc: Arc>, - bound: Column, + bound: Scalar, block_part: usize, want: Column, ) -> Result<()> { let (schema, block) = test_data(); let block = DataBlock::sort(&block, &sort_desc, None)?; - let bound = Some(R::from_column(&bound)?); + let bound = Some(bound); let sort_row_offset = schema.fields().len(); let blocks = vec![ @@ -910,6 +914,7 @@ mod tests { bound, sort_row_offset, spiller: spiller.clone(), + _r: Default::default(), }; let data = stream.take_next_bounded_block(); @@ -935,7 +940,7 @@ mod tests { run_bound_block_stream::>( spiller.clone(), sort_desc.clone(), - Int32Type::from_data(vec![5]), + Scalar::Number(NumberScalar::Int32(5)), 4, Int32Type::from_data(vec![3, 5]), ) @@ -944,7 +949,7 @@ mod tests { run_bound_block_stream::>( spiller.clone(), sort_desc.clone(), - Int32Type::from_data(vec![8]), + Scalar::Number(NumberScalar::Int32(8)), 4, Int32Type::from_data(vec![3, 5, 7, 7]), ) @@ -961,7 +966,7 @@ mod tests { run_bound_block_stream::>( spiller.clone(), sort_desc.clone(), - StringType::from_data(vec!["f"]), + Scalar::String("f".to_string()), 4, StringType::from_data(vec!["w", "h", "g", "f"]), ) diff --git 
a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs b/src/query/service/src/pipelines/processors/transforms/sort/wait.rs index 7029a4ae551fb..7e1fc53740541 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/wait.rs @@ -124,59 +124,31 @@ impl Processor for TransformSortSampleWait { .map(|meta| meta.bounds.clone()) .unwrap_or_default(); - let mut commit = CommitSample { - inner: self, - bounds: Some(bounds), - result: Ok(false), - }; - select_row_type(&mut commit); - commit.result?; + self.state.commit_sample(self.id, bounds)?; self.state.done.notified().await; Ok(()) } } -struct CommitSample<'a> { - inner: &'a TransformSortSampleWait, - bounds: Option, - result: Result, -} - -impl<'a> RowsTypeVisitor for CommitSample<'a> { - fn schema(&self) -> DataSchemaRef { - self.inner.schema.clone() - } - - fn sort_desc(&self) -> &[SortColumnDescription] { - &self.inner.sort_desc - } - - fn visit_type(&mut self) - where - R: Rows + 'static, - C: RowConverter + Send + 'static, - { - self.result = self - .inner - .state - .commit_sample::(self.inner.id, self.bounds.take().unwrap()); - } -} - pub struct SortSampleState { inner: RwLock, pub(super) done: WatchNotify, } impl SortSampleState { - pub fn commit_sample(&self, id: usize, bounds: Bounds) -> Result { + pub fn commit_sample(&self, id: usize, bounds: Bounds) -> Result { let mut inner = self.inner.write().unwrap(); let x = inner.partial[id].replace(bounds); assert!(x.is_none()); let done = inner.partial.iter().all(Option::is_some); if done { - inner.determine_bounds::()?; + let mut visitor = DetermineBounds { + inner: &mut inner, + result: Ok(()), + }; + select_row_type(&mut visitor); + visitor.result?; self.done.notify_waiters(); } Ok(done) @@ -187,9 +159,9 @@ struct StateInner { // target partitions partitions: usize, // schema for bounds DataBlock - // schema: DataSchemaRef, + schema: DataSchemaRef, // sort_desc for bounds DataBlock - // sort_desc: Vec, + sort_desc: Arc<[SortColumnDescription]>, partial: Vec>, bounds: Option, batch_rows: usize, @@ -202,9 +174,32 @@ impl StateInner { let bounds = bounds .reduce(self.partitions - 1, R::data_type()) .unwrap_or(bounds); - assert!(bounds.len() <= self.partitions - 1); + assert!(bounds.len() < self.partitions); self.bounds = Some(bounds); Ok(()) } } + +struct DetermineBounds<'a> { + inner: &'a mut StateInner, + result: Result<()>, +} + +impl<'a> RowsTypeVisitor for DetermineBounds<'a> { + fn schema(&self) -> DataSchemaRef { + self.inner.schema.clone() + } + + fn sort_desc(&self) -> &[SortColumnDescription] { + &self.inner.sort_desc + } + + fn visit_type(&mut self) + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + { + self.result = self.inner.determine_bounds::(); + } +} From 5312e494d1cbd5de862e6c2ca7ead77f3b3a8f07 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 23 Apr 2025 16:46:29 +0800 Subject: [PATCH 14/33] exchange --- .../processors/transforms/sort/exchange.rs | 72 +++++++++++++++++++ .../processors/transforms/sort/mod.rs | 1 + 2 files changed, 73 insertions(+) create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/exchange.rs diff --git a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs new file mode 100644 index 0000000000000..8f47822383d78 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs @@ -0,0 +1,72 @@ +// 
Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Ordering; +use std::marker::PhantomData; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Exchange; +use databend_common_pipeline_transforms::processors::sort::Rows; + +use super::sort_sample::SortSampleState; + +pub struct SortRangeExchange { + state: Arc, + _r: PhantomData, +} + +unsafe impl Send for SortRangeExchange {} + +unsafe impl Sync for SortRangeExchange {} + +impl Exchange for SortRangeExchange { + const NAME: &'static str = "SortRange"; + fn partition(&self, data: DataBlock, n: usize) -> Result> { + let bounds = self.state.bounds().unwrap(); + debug_assert_eq!(n, self.state.partitions()); + debug_assert!(bounds.len() < n); + + if data.is_empty() { + return Ok(vec![]); + } + + if bounds.len() == 0 { + return Ok(vec![data]); + } + + let bounds = R::from_column(&bounds)?; + let rows = R::from_column(data.get_last_column())?; + + let mut i = 0; + let mut j = 0; + let mut bound = bounds.row(j); + let mut indices = Vec::new(); + while i < rows.len() { + match rows.row(i).cmp(&bound) { + Ordering::Less => indices.push(j as u32), + Ordering::Greater if j + 1 < bounds.len() => { + j += 1; + bound = bounds.row(j); + continue; + } + _ => indices.push(j as u32 + 1), + } + i += 1; + } + + DataBlock::scatter(&data, &indices, n) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index 4223252dd2b34..bb40e97ee8d6d 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -33,6 +33,7 @@ pub use builder::TransformSortBuilder; mod bounds; mod collect; +mod exchange; mod execute; mod merge_sort; mod sort_spill; From 751505d394933f0959c0ee9912cdd5945a49e22b Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 23 Apr 2025 21:59:07 +0800 Subject: [PATCH 15/33] update --- .../processors/transforms/sort/bounds.rs | 1 - .../processors/transforms/sort/exchange.rs | 53 +++--- .../processors/transforms/sort/mod.rs | 8 + .../processors/transforms/sort/sort_spill.rs | 55 +++++- .../processors/transforms/sort/wait.rs | 180 +++++++++++------- 5 files changed, 198 insertions(+), 99 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs index f139d6ec2a409..741ccca55d66a 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs @@ -92,7 +92,6 @@ impl Bounds { self.0.iter().map(Column::len).sum() } - #[expect(dead_code)] pub fn is_empty(&self) -> bool { self.0.iter().all(|col| col.len() == 0) } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs 
b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs index 8f47822383d78..9aeeb29cc7414 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp::Ordering; use std::marker::PhantomData; use std::sync::Arc; @@ -21,7 +20,7 @@ use databend_common_expression::DataBlock; use databend_common_pipeline_core::processors::Exchange; use databend_common_pipeline_transforms::processors::sort::Rows; -use super::sort_sample::SortSampleState; +use super::wait::SortSampleState; pub struct SortRangeExchange { state: Arc, @@ -35,38 +34,40 @@ unsafe impl Sync for SortRangeExchange {} impl Exchange for SortRangeExchange { const NAME: &'static str = "SortRange"; fn partition(&self, data: DataBlock, n: usize) -> Result> { - let bounds = self.state.bounds().unwrap(); - debug_assert_eq!(n, self.state.partitions()); - debug_assert!(bounds.len() < n); - if data.is_empty() { return Ok(vec![]); } - if bounds.len() == 0 { + let bounds = self.state.bounds(); + // debug_assert_eq!(n, self.state.partitions()); + debug_assert!(bounds.len() < n); + + if bounds.is_empty() { return Ok(vec![data]); } - let bounds = R::from_column(&bounds)?; - let rows = R::from_column(data.get_last_column())?; + todo!() - let mut i = 0; - let mut j = 0; - let mut bound = bounds.row(j); - let mut indices = Vec::new(); - while i < rows.len() { - match rows.row(i).cmp(&bound) { - Ordering::Less => indices.push(j as u32), - Ordering::Greater if j + 1 < bounds.len() => { - j += 1; - bound = bounds.row(j); - continue; - } - _ => indices.push(j as u32 + 1), - } - i += 1; - } + // let bounds = R::from_column(&bounds.0)?; + // let rows = R::from_column(data.get_last_column())?; + + // let mut i = 0; + // let mut j = 0; + // let mut bound = bounds.row(j); + // let mut indices = Vec::new(); + // while i < rows.len() { + // match rows.row(i).cmp(&bound) { + // Ordering::Less => indices.push(j as u32), + // Ordering::Greater if j + 1 < bounds.len() => { + // j += 1; + // bound = bounds.row(j); + // continue; + // } + // _ => indices.push(j as u32 + 1), + // } + // i += 1; + // } - DataBlock::scatter(&data, &indices, n) + // DataBlock::scatter(&data, &indices, n) } } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index bb40e97ee8d6d..307c1cefdf4d9 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -63,6 +63,14 @@ local_block_meta_serde!(SortCollectedMeta); #[typetag::serde(name = "sort_collected")] impl BlockMetaInfo for SortCollectedMeta {} +#[derive(Debug)] +struct SortScatteredMeta(pub Vec); + +local_block_meta_serde!(SortScatteredMeta); + +#[typetag::serde(name = "sort_scattered")] +impl BlockMetaInfo for SortScatteredMeta {} + trait MemoryRows { fn in_memory_rows(&self) -> usize; } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs index 995111dffc320..010e19c8f876e 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs @@ -527,6 +527,27 @@ impl Base { } } } + + pub async fn 
scatter_stream( + &self, + mut blocks: VecDeque, + mut bounds: Bounds, + ) -> Result>> { + let mut scattered = Vec::with_capacity(bounds.len() + 1); + while !blocks.is_empty() { + let bound = bounds.next_bound(); + let mut stream = self.new_stream::(blocks, bound); + + let mut part = Vec::new(); + while let Some(block) = stream.take_next_bounded_spillable().await? { + part.push(block); + } + + scattered.push(part); + blocks = stream.blocks; + } + Ok(scattered) + } } impl MemoryRows for Vec> { @@ -690,8 +711,8 @@ impl BoundBlockStream { }; match &self.bound { - Some(bound) => block.domain::().first() <= R::scalar_as_item(bound), None => true, + Some(bound) => block.domain::().first() <= R::scalar_as_item(bound), } } @@ -767,6 +788,38 @@ impl BoundBlockStream { } Ok(()) } + + async fn take_next_bounded_spillable(&mut self) -> Result> { + let Some(bound) = &self.bound else { + return Ok(self.blocks.pop_front()); + }; + let Some(block) = self.blocks.front() else { + return Ok(None); + }; + { + let domain = block.domain::(); + let bound_item = R::scalar_as_item(bound); + if domain.first() > bound_item { + return Ok(None); + } + if domain.last() <= bound_item { + return Ok(self.blocks.pop_front()); + } + } + self.restore_first().await?; + + let block = self.blocks.front_mut().unwrap(); + if let Some(pos) = block_split_off_position::( + block.data.as_ref().unwrap(), + self.bound.as_ref().unwrap(), + self.sort_row_offset, + ) { + let data = block.slice(pos, self.sort_row_offset); + Ok(Some(SpillableBlock::new(data, self.sort_row_offset))) + } else { + Ok(self.blocks.pop_front()) + } + } } fn block_split_off_position( diff --git a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs b/src/query/service/src/pipelines/processors/transforms/sort/wait.rs index 7e1fc53740541..1e2f04581c1ae 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/wait.rs @@ -13,6 +13,8 @@ // limitations under the License. 
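The wait.rs hunks below turn the sample-wait processor into a scatter step: once every worker has committed its bounds, each one cuts its collected runs at the shared bounds (via `scatter_stream` above) and emits a `SortScatteredMeta` holding one partition per bound interval. A simplified, hypothetical model of that cutting, with plain `i64` rows standing in for `Rows` and in-memory `Vec`s for spillable blocks:

use std::collections::VecDeque;

// Each partition receives the rows <= its bound; a run straddling a bound
// is split in place, and whatever exceeds every bound forms the last part.
fn scatter(mut blocks: VecDeque<Vec<i64>>, bounds: &[i64]) -> Vec<Vec<Vec<i64>>> {
    let mut parts = Vec::with_capacity(bounds.len() + 1);
    for &bound in bounds {
        let mut part = Vec::new();
        while let Some(front) = blocks.front_mut() {
            let pos = front.partition_point(|&v| v <= bound); // rows <= bound
            if pos < front.len() {
                if pos > 0 {
                    let rest = front.split_off(pos);
                    part.push(std::mem::replace(front, rest));
                }
                break;
            }
            part.push(blocks.pop_front().unwrap());
        }
        parts.push(part);
    }
    parts.push(blocks.into_iter().collect());
    parts
}

For instance, scattering the runs `[1, 3]`, `[4, 7, 9]` at bound `5` yields the partitions `[[1, 3], [4]]` and `[[7, 9]]`.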
 use std::any::Any;
+use std::assert_matches::assert_matches;
+use std::marker::PhantomData;
 use std::sync::Arc;
 use std::sync::RwLock;
 
@@ -21,54 +23,108 @@ use databend_common_exception::Result;
 use databend_common_expression::BlockMetaInfoDowncast;
 use databend_common_expression::DataBlock;
 use databend_common_expression::DataSchemaRef;
-use databend_common_expression::SortColumnDescription;
-use databend_common_pipeline_transforms::processors::sort::select_row_type;
 use databend_common_pipeline_transforms::processors::sort::Rows;
-use databend_common_pipeline_transforms::processors::sort::RowsTypeVisitor;
-use databend_common_pipeline_transforms::sort::RowConverter;
 
 use super::bounds::Bounds;
+use super::Base;
 use super::SortCollectedMeta;
+use super::SortScatteredMeta;
 use crate::pipelines::processors::Event;
 use crate::pipelines::processors::InputPort;
 use crate::pipelines::processors::OutputPort;
 use crate::pipelines::processors::Processor;
+use crate::spillers::Spiller;
 
-pub struct TransformSortSampleWait {
+#[derive(Debug)]
+enum Step {
+    None,
+    Meta(Box<SortCollectedMeta>),
+    Scattered(Vec<SortCollectedMeta>),
+}
+
+pub struct TransformSortWait<R: Rows> {
     input: Arc<InputPort>,
     output: Arc<OutputPort>,
-    schema: DataSchemaRef,
-    sort_desc: Arc<[SortColumnDescription]>,
     id: usize,
-    meta: Option<Box<SortCollectedMeta>>,
+    step: Step,
     state: Arc<SortSampleState>,
+    spiller: Arc<Spiller>,
+    _r: PhantomData<R>,
 }
 
-impl TransformSortSampleWait {
-    pub fn new(
-        input: Arc<InputPort>,
-        output: Arc<OutputPort>,
-        id: usize,
-        schema: DataSchemaRef,
-        sort_desc: Arc<[SortColumnDescription]>,
-        state: Arc<SortSampleState>,
-    ) -> Self {
-        Self {
-            input,
-            output,
-            id,
-            state,
-            schema,
-            sort_desc,
-            meta: None,
+impl<R: Rows + 'static> TransformSortWait<R> {
+    // pub fn new(
+    //     input: Arc<InputPort>,
+    //     output: Arc<OutputPort>,
+    //     id: usize,
+    //     state: Arc<SortSampleState>,
+    // ) -> Self {
+    //     Self {
+    //         input,
+    //         output,
+    //         id,
+    //         state,
+    //         meta: None,
+    //         _r: Default::default(),
+    //     }
+    // }
+
+    async fn scatter(&mut self) -> Result<()> {
+        let scatter_bounds = self.state.bounds();
+
+        let Step::Meta(box SortCollectedMeta {
+            params,
+            bounds,
+            blocks,
+        }) = std::mem::replace(&mut self.step, Step::None)
+        else {
+            unreachable!()
+        };
+
+        if scatter_bounds.is_empty() {
+            Step::Scattered(vec![SortCollectedMeta {
+                params,
+                bounds,
+                blocks,
+            }]);
+            return Ok(());
+        }
+
+        let base = {
+            let inner = self.state.inner.read().unwrap();
+            Base {
+                schema: inner.schema.clone(),
+                spiller: self.spiller.clone(),
+                sort_row_offset: inner.schema.fields.len() - 1,
+                limit: None,
+            }
+        };
+
+        let mut scattered_meta = std::iter::repeat_with(|| SortCollectedMeta {
+            params,
+            bounds: bounds.clone(),
+            blocks: vec![],
+        })
+        .take(scatter_bounds.len() + 1)
+        .collect::<Vec<_>>();
+        for blocks in blocks {
+            let scattered = base
+                .scatter_stream::<R>(Vec::from(blocks).into(), scatter_bounds.clone())
+                .await?;
+            for (i, part) in scattered.into_iter().enumerate() {
+                scattered_meta[i].blocks.push(part.into_boxed_slice());
+            }
         }
+        self.step = Step::Scattered(scattered_meta);
+
+        Ok(())
     }
 }
 
 #[async_trait::async_trait]
-impl Processor for TransformSortSampleWait {
+impl<R: Rows + 'static> Processor for TransformSortWait<R> {
     fn name(&self) -> String {
-        "TransformSortSimpleWait".to_string()
+        "TransformSortWait".to_string()
     }
 
     fn as_any(&mut self) -> &mut dyn Any {
@@ -86,20 +142,25 @@ impl Processor for TransformSortSampleWait {
             return Ok(Event::NeedConsume);
         }
 
-        if let Some(meta) = self.meta.take() {
-            self.output.push_data(Ok(DataBlock::empty_with_meta(meta)));
+        if matches!(self.step, Step::Scattered(_)) {
+            let Step::Scattered(scattered) = std::mem::replace(&mut self.step, Step::None) else {
+                unreachable!()
+            };
+
+            let data = DataBlock::empty_with_meta(Box::new(SortScatteredMeta(scattered)));
+            self.output.push_data(Ok(data));
             self.output.finish();
             return Ok(Event::Finished);
         }
 
         if let Some(mut block) = self.input.pull_data().transpose()? {
-            assert!(self.meta.is_none());
+            assert_matches!(self.step, Step::None);
             let meta = block
                 .take_meta()
                 .and_then(SortCollectedMeta::downcast_from)
                 .expect("require a SortCollectedMeta");
 
-            self.meta = Some(Box::new(meta));
+            self.step = Step::Meta(Box::new(meta));
             return Ok(Event::Async);
         }
 
@@ -118,15 +179,13 @@ impl Processor for TransformSortSampleWait {
 
     #[async_backtrace::framed]
     async fn async_process(&mut self) -> Result<()> {
-        let bounds = self
-            .meta
-            .as_ref()
-            .map(|meta| meta.bounds.clone())
-            .unwrap_or_default();
-
-        self.state.commit_sample(self.id, bounds)?;
+        let bounds = match &self.step {
+            Step::Meta(meta) => meta.bounds.clone(),
+            _ => unreachable!(),
+        };
+        self.state.commit_sample::<R>(self.id, bounds)?;
         self.state.done.notified().await;
-        Ok(())
+        self.scatter().await
     }
 }
 
@@ -136,23 +195,27 @@ pub struct SortSampleState {
 }
 
 impl SortSampleState {
-    pub fn commit_sample(&self, id: usize, bounds: Bounds) -> Result<bool> {
+    pub fn commit_sample<R: Rows>(&self, id: usize, bounds: Bounds) -> Result<bool> {
         let mut inner = self.inner.write().unwrap();
 
         let x = inner.partial[id].replace(bounds);
         assert!(x.is_none());
         let done = inner.partial.iter().all(Option::is_some);
         if done {
-            let mut visitor = DetermineBounds {
-                inner: &mut inner,
-                result: Ok(()),
-            };
-            select_row_type(&mut visitor);
-            visitor.result?;
+            inner.determine_bounds::<R>()?;
            self.done.notify_waiters();
         }
         Ok(done)
     }
+
+    pub fn bounds(&self) -> Bounds {
+        self.inner
+            .read()
+            .unwrap()
+            .bounds
+            .clone()
+            .unwrap_or_default()
+    }
 }
 
 struct StateInner {
@@ -160,8 +223,6 @@ struct StateInner {
     partitions: usize,
     // schema for bounds DataBlock
     schema: DataSchemaRef,
-    // sort_desc for bounds DataBlock
-    sort_desc: Arc<[SortColumnDescription]>,
     partial: Vec<Option<Bounds>>,
     bounds: Option<Bounds>,
     batch_rows: usize,
@@ -180,26 +241,3 @@ impl StateInner {
         Ok(())
     }
 }
-
-struct DetermineBounds<'a> {
-    inner: &'a mut StateInner,
-    result: Result<()>,
-}
-
-impl<'a> RowsTypeVisitor for DetermineBounds<'a> {
-    fn schema(&self) -> DataSchemaRef {
-        self.inner.schema.clone()
-    }
-
-    fn sort_desc(&self) -> &[SortColumnDescription] {
-        &self.inner.sort_desc
-    }
-
-    fn visit_type<R, C>(&mut self)
-    where
-        R: Rows + 'static,
-        C: RowConverter<R> + Send + 'static,
-    {
-        self.result = self.inner.determine_bounds::<R, C>();
-    }
-}

From 3a1af372bae7656aa1505e733eb5ad5c696b4406 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Thu, 24 Apr 2025 00:58:07 +0800
Subject: [PATCH 16/33] test

Signed-off-by: coldWater
---
 .../processors/transforms/sort/sort_spill.rs  | 294 ++++++++++++++++++
 1 file changed, 294 insertions(+)

diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
index 010e19c8f876e..61af0983335d8 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
@@ -1029,6 +1029,300 @@ mod tests {
         Ok(())
     }
 
+    // Create test data with multiple blocks, including spilled and sliced blocks
+    async fn run_take_next_bounded_spillable<R: Rows>(
+        spiller: impl Spill + Clone,
+        sort_desc: Arc<Vec<SortColumnDescription>>,
+        bound: Option<Scalar>,
+        expected_blocks: Vec<Column>,
+        // Flag to test with spilled blocks
+        with_spilled: bool,
+        // Flag to test with sliced blocks
+        with_sliced: bool,
+    ) -> Result<()> {
+        let (schema, block) = test_data();
+        let block = DataBlock::sort(&block, &sort_desc, None)?;
+        let sort_row_offset = schema.fields().len();
+
+        // Create multiple blocks with different splits
+        let mut blocks = VecDeque::new();
+
+        // First block: 0..2
+        let mut block1 = block.slice(0..2);
+        let col1 = convert_rows(schema.clone(), &sort_desc, block1.clone()).unwrap();
+        block1.add_column(BlockEntry::new(col1.data_type(), Value::Column(col1)));
+        blocks.push_back(SpillableBlock::new(block1, sort_row_offset));
+
+        // Second block: 2..5
+        let mut block2 = block.slice(2..5);
+        let col2 = convert_rows(schema.clone(), &sort_desc, block2.clone()).unwrap();
+        block2.add_column(BlockEntry::new(col2.data_type(), Value::Column(col2)));
+        blocks.push_back(SpillableBlock::new(block2, sort_row_offset));
+
+        // We'll add the third block only if we're not using sliced blocks
+        // This is to avoid duplicating the data with additional_block
+        if !with_sliced {
+            // Third block: 5..8
+            let mut block3 = block.slice(5..8);
+            let col3 = convert_rows(schema.clone(), &sort_desc, block3.clone()).unwrap();
+            block3.add_column(BlockEntry::new(col3.data_type(), Value::Column(col3)));
+            blocks.push_back(SpillableBlock::new(block3, sort_row_offset));
+        }
+
+        // Spill some blocks if requested
+        if with_spilled {
+            // Spill the second block
+            blocks[1].spill(&spiller).await?;
+        }
+
+        // Create a sliced block if requested
+        if with_sliced {
+            // Create a block for values 8..11 (the last part of the sorted data)
+            let mut additional_block = block.slice(5..8);
+            let col = convert_rows(schema.clone(), &sort_desc, additional_block.clone()).unwrap();
+            additional_block.add_column(BlockEntry::new(col.data_type(), Value::Column(col)));
+            let mut spillable_block = SpillableBlock::new(additional_block, sort_row_offset);
+
+            // Use SpillableBlock::slice to create a sliced block
+            // This tests the SpillableBlock::slice functionality by slicing at position 1
+            // For ascending Int32: [8, 10, 11] -> [8] and [10, 11]
+            // For descending String: ["d", "e", "f"] -> ["d"] and ["e", "f"]
+            let sliced_data = spillable_block.slice(1, sort_row_offset);
+            let sliced_block = SpillableBlock::new(sliced_data, sort_row_offset);
+
+            // Add both blocks to maintain the order
+            blocks.push_back(sliced_block);
+            blocks.push_back(spillable_block);
+        }
+
+        let mut stream = BoundBlockStream::<R, _> {
+            blocks,
+            bound,
+            sort_row_offset,
+            spiller: spiller.clone(),
+            _r: Default::default(),
+        };
+
+        // Take blocks one by one and compare with expected
+        let mut result_blocks = Vec::new();
+        while let Some(mut block) = stream.take_next_bounded_spillable().await? {
+            // If the block data is None (spilled), restore it first
+            if block.data.is_none() {
+                block.data = Some(spiller.restore(block.location.as_ref().unwrap()).await?);
+            }
+
+            let data = block.data.unwrap();
+            let col = sort_column(&data, sort_row_offset).clone();
+            result_blocks.push(col);
+        }
+
+        assert_eq!(
+            expected_blocks.len(),
+            result_blocks.len(),
+            "Number of blocks doesn't match"
+        );
+        for (expected, actual) in expected_blocks.iter().zip(result_blocks.iter()) {
+            assert_eq!(expected, actual, "Block content doesn't match");
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_take_next_bounded_spillable() -> Result<()> {
+        let spiller = MockSpiller {
+            map: Arc::new(Mutex::new(HashMap::new())),
+        };
+
+        // Test with ascending Int32 type
+        {
+            let sort_desc = Arc::new(vec![SortColumnDescription {
+                offset: 0,
+                asc: true,
+                nulls_first: false,
+            }]);
+
+            // Test 1: Basic test with bound = 5 (should return blocks with values <= 5)
+            // No spilled blocks, no sliced blocks
+            run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::Number(NumberScalar::Int32(5))),
+                vec![Int32Type::from_data(vec![3, 5])],
+                false, // no spilled blocks
+                false, // no sliced blocks
+            )
+            .await?;
+
+            // Test 2: With spilled blocks, bound = 8 (should return blocks with values <= 8)
+            run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::Number(NumberScalar::Int32(8))),
+                vec![
+                    Int32Type::from_data(vec![3, 5]),
+                    Int32Type::from_data(vec![7, 7, 8]),
+                ],
+                true,  // with spilled blocks
+                false, // no sliced blocks
+            )
+            .await?;
+
+            // Test 3: With sliced blocks, bound = 7 (should return blocks with values <= 7)
+            run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::Number(NumberScalar::Int32(7))),
+                vec![
+                    Int32Type::from_data(vec![3, 5]),
+                    Int32Type::from_data(vec![7, 7]),
+                ],
+                false, // no spilled blocks
+                true,  // with sliced blocks
+            )
+            .await?;
+
+            // Test 4: With both spilled and sliced blocks, bound = 10
+            run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::Number(NumberScalar::Int32(10))),
+                vec![
+                    Int32Type::from_data(vec![3, 5]),
+                    Int32Type::from_data(vec![7, 7, 8]),
+                    Int32Type::from_data(vec![10]),
+                ],
+                true, // with spilled blocks
+                true, // with sliced blocks
+            )
+            .await?;
+
+            // Test 5: With bound = 2 (should return no blocks as all values > 2)
+            run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::Number(NumberScalar::Int32(2))),
+                vec![],
+                true, // with spilled blocks
+                true, // with sliced blocks
+            )
+            .await?;
+
+            // Test 6: With bound = 12 (should return all blocks as all values <= 12)
+            run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::Number(NumberScalar::Int32(12))),
+                vec![
+                    Int32Type::from_data(vec![3, 5]),
+                    Int32Type::from_data(vec![7, 7, 8]),
+                    Int32Type::from_data(vec![10, 11, 11]),
+                ],
+                true,  // with spilled blocks
+                false, // no sliced blocks
+            )
+            .await?;
+
+            // Test 7: With no bound (should return all blocks)
+            run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                None,
+                vec![
+                    Int32Type::from_data(vec![3, 5]),
+                    Int32Type::from_data(vec![7, 7, 8]),
+                    Int32Type::from_data(vec![10, 11, 11]),
+                ],
+                true,  // with spilled blocks
+                false, // no sliced blocks
+            )
+            .await?;
+        }
+
+        // Test with descending String type
+        {
+            let sort_desc = Arc::new(vec![SortColumnDescription {
+                offset: 1,
+                asc: false,
+                nulls_first: false,
+            }]);
+
+            // Test 8: With bound = "f" (should return blocks with values >= "f")
+            run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::String("f".to_string())),
+                vec![
+                    StringType::from_data(vec!["w", "h"]),
+                    StringType::from_data(vec!["g", "f"]),
+                ],
+                false, // no spilled blocks
+                false, // no sliced blocks
+            )
+            .await?;
+
+            // Test 9: With spilled blocks, bound = "e" (should return blocks with values >= "e")
+            run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::String("e".to_string())),
+                vec![
+                    StringType::from_data(vec!["w", "h"]),
+                    StringType::from_data(vec!["g", "f", "e"]),
+                    StringType::from_data(vec!["e"]),
+                ],
+                true,  // with spilled blocks
+                false, // no sliced blocks
+            )
+            .await?;
+
+            // Test 10: With sliced blocks, bound = "d" (should return blocks with values >= "d")
+            run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::String("d".to_string())),
+                vec![
+                    StringType::from_data(vec!["w", "h"]),
+                    StringType::from_data(vec!["g", "f", "e"]),
+                    StringType::from_data(vec!["e"]),
+                    StringType::from_data(vec!["d", "d"]),
+                ],
+                false, // no spilled blocks
+                true,  // with sliced blocks
+            )
+            .await?;
+
+            // Test 11: With both spilled and sliced blocks, bound = "c" (should return all blocks)
+            run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::String("c".to_string())),
+                vec![
+                    StringType::from_data(vec!["w", "h"]),
+                    StringType::from_data(vec!["g", "f", "e"]),
+                    StringType::from_data(vec!["e"]),
+                    StringType::from_data(vec!["d", "d"]),
+                ],
+                true, // with spilled blocks
+                true, // with sliced blocks
+            )
+            .await?;
+
+            // Test 12: With bound = "z" (should return no blocks as all values < "z")
+            run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
+                spiller.clone(),
+                sort_desc.clone(),
+                Some(Scalar::String("z".to_string())),
+                vec![],
+                true, // with spilled blocks
+                true, // with sliced blocks
+            )
+            .await?;
+        }
+
+        Ok(())
+    }
+
     #[derive(Clone)]
     struct MockSpiller {
         map: Arc<Mutex<HashMap<String, DataBlock>>>,

From b430371c055328df60f16dbf58682dff62d62250 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Thu, 24 Apr 2025 12:06:05 +0800
Subject: [PATCH 17/33] x

Signed-off-by: coldWater
---
 .../processors/transforms/sort/sort_spill.rs  | 219 ++++++++++--------
 .../processors/transforms/sort/wait.rs        |  64 +++--
 2 files changed, 176 insertions(+), 107 deletions(-)

diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
index 61af0983335d8..40881547751db 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
@@ -903,6 +903,7 @@ fn get_domain(col: &Column) -> Column {
 #[cfg(test)]
 mod tests {
     use std::collections::HashMap;
+    use std::ops::Range;
     use std::sync::Mutex;
 
     use databend_common_base::base::GlobalUniqName;
@@ -940,7 +941,7 @@ mod tests {
 
     async fn run_bound_block_stream<R: Rows>(
         spiller: impl Spill + Clone,
-        sort_desc: Arc<Vec<SortColumnDescription>>,
+        sort_desc: Arc<[SortColumnDescription]>,
         bound: Scalar,
         block_part: usize,
         want: Column,
@@ -985,7 +985,7 @@ mod tests {
         };
 
         {
-            let sort_desc = Arc::new(vec![SortColumnDescription {
+            let sort_desc = Arc::new([SortColumnDescription {
                 offset: 0,
                 asc: true,
                 nulls_first: false,
@@ -1010,7 +1011,7 @@ mod tests {
         }
 
        {
-            let sort_desc = Arc::new(vec![SortColumnDescription {
+            let sort_desc = Arc::new([SortColumnDescription {
                 offset: 1,
                 asc: false,
                 nulls_first: false,
@@ -1029,64 +1030,77 @@ mod tests {
         Ok(())
     }
 
-    // Create test data with multiple blocks, including spilled and sliced blocks
-    async fn run_take_next_bounded_spillable<R: Rows>(
-        spiller: impl Spill + Clone,
-        sort_desc: Arc<Vec<SortColumnDescription>>,
-        bound: Option<Scalar>,
-        expected_blocks: Vec<Column>,
-        // Flag to test with spilled blocks
+    fn create_spillable_block(
+        block: &DataBlock,
+        range: Range<usize>,
+        schema: &DataSchemaRef,
+        sort_desc: &[SortColumnDescription],
+        sort_row_offset: usize,
+    ) -> SpillableBlock {
+        let mut sliced_block = block.slice(range);
+        let col = convert_rows(schema.clone(), sort_desc, sliced_block.clone()).unwrap();
+        sliced_block.add_column(BlockEntry::new(col.data_type(), Value::Column(col)));
+        SpillableBlock::new(sliced_block, sort_row_offset)
+    }
+
+    async fn prepare_test_blocks(
+        spiller: &impl Spill,
+        sort_desc: &[SortColumnDescription],
         with_spilled: bool,
-        // Flag to test with sliced blocks
         with_sliced: bool,
-    ) -> Result<()> {
+    ) -> Result<(DataSchemaRef, VecDeque<SpillableBlock>, usize)> {
         let (schema, block) = test_data();
-        let block = DataBlock::sort(&block, &sort_desc, None)?;
+        let block = DataBlock::sort(&block, sort_desc, None)?;
         let sort_row_offset = schema.fields().len();
 
         // Create multiple blocks with different splits
         let mut blocks = VecDeque::new();
 
         // First block: 0..2
-        let mut block1 = block.slice(0..2);
-        let col1 = convert_rows(schema.clone(), &sort_desc, block1.clone()).unwrap();
-        block1.add_column(BlockEntry::new(col1.data_type(), Value::Column(col1)));
-        blocks.push_back(SpillableBlock::new(block1, sort_row_offset));
+        blocks.push_back(create_spillable_block(
+            &block,
+            0..2,
+            &schema,
+            sort_desc,
+            sort_row_offset,
+        ));
 
         // Second block: 2..5
-        let mut block2 = block.slice(2..5);
-        let col2 = convert_rows(schema.clone(), &sort_desc, block2.clone()).unwrap();
-        block2.add_column(BlockEntry::new(col2.data_type(), Value::Column(col2)));
-        blocks.push_back(SpillableBlock::new(block2, sort_row_offset));
-
-        // We'll add the third block only if we're not using sliced blocks
-        // This is to avoid duplicating the data with additional_block
-        if !with_sliced {
-            // Third block: 5..8
-            let mut block3 = block.slice(5..8);
-            let col3 = convert_rows(schema.clone(), &sort_desc, block3.clone()).unwrap();
-            block3.add_column(BlockEntry::new(col3.data_type(), Value::Column(col3)));
-            blocks.push_back(SpillableBlock::new(block3, sort_row_offset));
-        }
+        blocks.push_back(create_spillable_block(
+            &block,
+            2..5,
+            &schema,
+            sort_desc,
+            sort_row_offset,
+        ));
 
         // Spill some blocks if requested
         if with_spilled {
             // Spill the second block
-            blocks[1].spill(&spiller).await?;
+            blocks[1].spill(spiller).await?;
         }
 
-        // Create a sliced block if requested
-        if with_sliced {
+        if !with_sliced {
+            // Third block: 5..8
+            blocks.push_back(create_spillable_block(
+                &block,
+                5..8,
+                &schema,
+                sort_desc,
+                sort_row_offset,
+            ));
+        } else {
             // Create a block for values 8..11 (the last part of the sorted data)
-            let mut additional_block = block.slice(5..8);
-            let col = convert_rows(schema.clone(), &sort_desc, additional_block.clone()).unwrap();
-            additional_block.add_column(BlockEntry::new(col.data_type(), Value::Column(col)));
-            let mut spillable_block = SpillableBlock::new(additional_block, sort_row_offset);
-
-            // Use SpillableBlock::slice to create a sliced block
-            // This tests the SpillableBlock::slice functionality by slicing at position 1
-            // For ascending Int32: [8, 10, 11] -> [8] and [10, 11]
-            // For descending String: ["d", "e", "f"] -> ["d"] and ["e", "f"]
+            let mut spillable_block =
+                create_spillable_block(&block, 5..8, &schema, sort_desc, sort_row_offset);
+
+            spillable_block.spill(spiller).await?;
+            spillable_block.data = Some(
+                spiller
+                    .restore(spillable_block.location.as_ref().unwrap())
+                    .await?,
+            );
+
             let sliced_data = spillable_block.slice(1, sort_row_offset);
             let sliced_block = SpillableBlock::new(sliced_data, sort_row_offset);
 
@@ -1095,15 +1109,14 @@ mod tests {
             blocks.push_back(sliced_block);
             blocks.push_back(spillable_block);
         }
 
-        let mut stream = BoundBlockStream::<R, _> {
-            blocks,
-            bound,
-            sort_row_offset,
-            spiller: spiller.clone(),
-            _r: Default::default(),
-        };
+        Ok((schema, blocks, sort_row_offset))
+    }
 
-        // Take blocks one by one and compare with expected
+    async fn collect_and_verify_blocks<R: Rows>(
+        stream: &mut BoundBlockStream<R, impl Spill>,
+        spiller: &impl Spill,
+        expected_blocks: &[Column],
+    ) -> Result<()> {
         let mut result_blocks = Vec::new();
         while let Some(mut block) = stream.take_next_bounded_spillable().await? {
             // If the block data is None (spilled), restore it first
@@ -1112,7 +1125,7 @@ mod tests {
             }
 
             let data = block.data.unwrap();
-            let col = sort_column(&data, sort_row_offset).clone();
+            let col = sort_column(&data, stream.sort_row_offset).clone();
             result_blocks.push(col);
         }
 
@@ -1128,6 +1141,28 @@ mod tests {
         Ok(())
     }
 
+    async fn run_take_next_bounded_spillable<R: Rows>(
+        spiller: impl Spill + Clone,
+        sort_desc: &[SortColumnDescription],
+        bound: Option<Scalar>,
+        expected_blocks: Vec<Column>,
+        with_spilled: bool,
+        with_sliced: bool,
+    ) -> Result<()> {
+        let (_, blocks, sort_row_offset) =
+            prepare_test_blocks::<R>(&spiller, sort_desc, with_spilled, with_sliced).await?;
+
+        let mut stream = BoundBlockStream::<R, _> {
+            blocks,
+            bound,
+            sort_row_offset,
+            spiller: spiller.clone(),
+            _r: Default::default(),
+        };
+
+        collect_and_verify_blocks(&mut stream, &spiller, &expected_blocks).await
+    }
+
     #[tokio::test]
     async fn test_take_next_bounded_spillable() -> Result<()> {
         let spiller = MockSpiller {
@@ -1136,150 +1171,150 @@ mod tests {
 
         // Test with ascending Int32 type
         {
-            let sort_desc = Arc::new(vec![SortColumnDescription {
+            let sort_desc = [SortColumnDescription {
                 offset: 0,
                 asc: true,
                 nulls_first: false,
-            }]);
+            }];
 
             // Test 1: Basic test with bound = 5 (should return blocks with values <= 5)
             // No spilled blocks, no sliced blocks
             run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::Number(NumberScalar::Int32(5))),
                 vec![Int32Type::from_data(vec![3, 5])],
-                false, // no spilled blocks
-                false, // no sliced blocks
+                false,
+                false,
             )
             .await?;
 
             // Test 2: With spilled blocks, bound = 8 (should return blocks with values <= 8)
             run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::Number(NumberScalar::Int32(8))),
                 vec![
                     Int32Type::from_data(vec![3, 5]),
                     Int32Type::from_data(vec![7, 7, 8]),
                 ],
-                true,  // with spilled blocks
-                false, // no sliced blocks
+                true,
+                false,
             )
             .await?;
 
             // Test 3: With sliced blocks, bound = 7 (should return blocks with values <= 7)
             run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::Number(NumberScalar::Int32(7))),
                 vec![
                     Int32Type::from_data(vec![3, 5]),
                     Int32Type::from_data(vec![7, 7]),
                 ],
-                false, // no spilled blocks
-                true,  // with sliced blocks
+                false,
+                true,
            )
             .await?;
 
             // Test 4: With both spilled and sliced blocks, bound = 10
             run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::Number(NumberScalar::Int32(10))),
                 vec![
                     Int32Type::from_data(vec![3, 5]),
                     Int32Type::from_data(vec![7, 7, 8]),
                     Int32Type::from_data(vec![10]),
                 ],
-                true, // with spilled blocks
-                true, // with sliced blocks
+                true,
+                true,
             )
             .await?;
 
             // Test 5: With bound = 2 (should return no blocks as all values > 2)
             run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::Number(NumberScalar::Int32(2))),
                 vec![],
-                true, // with spilled blocks
-                true, // with sliced blocks
+                true,
+                true,
             )
             .await?;
 
             // Test 6: With bound = 12 (should return all blocks as all values <= 12)
             run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::Number(NumberScalar::Int32(12))),
                 vec![
                     Int32Type::from_data(vec![3, 5]),
                     Int32Type::from_data(vec![7, 7, 8]),
                     Int32Type::from_data(vec![10, 11, 11]),
                 ],
-                true,  // with spilled blocks
-                false, // no sliced blocks
+                true,
+                false,
             )
             .await?;
 
             // Test 7: With no bound (should return all blocks)
             run_take_next_bounded_spillable::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 None,
                 vec![
                     Int32Type::from_data(vec![3, 5]),
                     Int32Type::from_data(vec![7, 7, 8]),
                     Int32Type::from_data(vec![10, 11, 11]),
                 ],
-                true,  // with spilled blocks
-                false, // no sliced blocks
+                true,
+                false,
             )
             .await?;
         }
 
         // Test with descending String type
         {
-            let sort_desc = Arc::new(vec![SortColumnDescription {
+            let sort_desc = [SortColumnDescription {
                 offset: 1,
                 asc: false,
                 nulls_first: false,
-            }]);
+            }];
 
             // Test 8: With bound = "f" (should return blocks with values >= "f")
             run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::String("f".to_string())),
                 vec![
                     StringType::from_data(vec!["w", "h"]),
                     StringType::from_data(vec!["g", "f"]),
                 ],
-                false, // no spilled blocks
-                false, // no sliced blocks
+                false,
+                false,
             )
             .await?;
 
             // Test 9: With spilled blocks, bound = "e" (should return blocks with values >= "e")
             run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::String("e".to_string())),
                 vec![
                     StringType::from_data(vec!["w", "h"]),
                     StringType::from_data(vec!["g", "f", "e"]),
                     StringType::from_data(vec!["e"]),
                 ],
-                true,  // with spilled blocks
-                false, // no sliced blocks
+                true,
+                false,
             )
             .await?;
 
             // Test 10: With sliced blocks, bound = "d" (should return blocks with values >= "d")
             run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::String("d".to_string())),
                 vec![
                     StringType::from_data(vec!["w", "h"]),
@@ -1287,15 +1322,15 @@ mod tests {
                     StringType::from_data(vec!["e"]),
                     StringType::from_data(vec!["d", "d"]),
                 ],
-                false, // no spilled blocks
-                true,  // with sliced blocks
+                false,
+                true,
             )
             .await?;
 
             // Test 11: With both spilled and sliced blocks, bound = "c" (should return all blocks)
             run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::String("c".to_string())),
                 vec![
                     StringType::from_data(vec!["w", "h"]),
@@ -1303,19 +1338,19 @@ mod tests {
                     StringType::from_data(vec!["e"]),
                     StringType::from_data(vec!["d", "d"]),
                 ],
-                true, // with spilled blocks
-                true, // with sliced blocks
+                true,
+                true,
             )
             .await?;
 
             // Test 12: With bound = "z" (should return no blocks as all values < "z")
             run_take_next_bounded_spillable::<SimpleRowsDesc<StringType>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Some(Scalar::String("z".to_string())),
                 vec![],
-                true, // with spilled blocks
-                true, // with sliced blocks
+                true,
+                true,
             )
             .await?;
         }
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs b/src/query/service/src/pipelines/processors/transforms/sort/wait.rs
index 1e2f04581c1ae..193dcaf4e89a6 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/wait.rs
@@ -53,21 +53,37 @@ pub struct TransformSortWait<R: Rows> {
 }
 
 impl<R: Rows + 'static> TransformSortWait<R> {
-    // pub fn new(
-    //     input: Arc<InputPort>,
-    //     output: Arc<OutputPort>,
-    //     id: usize,
-    //     state: Arc<SortSampleState>,
-    // ) -> Self {
-    //     Self {
-    //         input,
-    //         output,
-    //         id,
-    //         state,
-    //         meta: None,
-    //         _r: Default::default(),
-    //     }
-    // }
+    pub fn new(
+        input: Arc<InputPort>,
+        output: Arc<OutputPort>,
+        id: usize,
+        state: Arc<SortSampleState>,
+        spiller: Arc<Spiller>,
+    ) -> Self {
+        Self {
+            input,
+            output,
+            id,
+            state,
+            spiller,
+            step: Step::None,
+            _r: PhantomData,
+        }
+    }
+
+    pub fn create(
+        input: Arc<InputPort>,
+        output: Arc<OutputPort>,
+        id: usize,
+        inputs: usize,
+        partitions: usize,
+        schema: DataSchemaRef,
+        batch_rows: usize,
+        spiller: Arc<Spiller>,
+    ) -> Self {
+        let state = SortSampleState::new(inputs, partitions, schema, batch_rows);
+        Self::new(input, output, id, state, spiller)
+    }
 
     async fn scatter(&mut self) -> Result<()> {
         let scatter_bounds = self.state.bounds();
@@ -195,6 +211,24 @@ pub struct SortSampleState {
 }
 
 impl SortSampleState {
+    pub fn new(
+        inputs: usize,
+        partitions: usize,
+        schema: DataSchemaRef,
+        batch_rows: usize,
+    ) -> Arc<SortSampleState> {
+        Arc::new(SortSampleState {
+            inner: RwLock::new(StateInner {
+                partitions,
+                schema,
+                partial: vec![None; inputs],
+                bounds: None,
+                batch_rows,
+            }),
+            done: WatchNotify::new(),
+        })
+    }
+
     pub fn commit_sample<R: Rows>(&self, id: usize, bounds: Bounds) -> Result<bool> {
         let mut inner = self.inner.write().unwrap();
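Aside: the commit_sample/notified pairing in the patch above is a one-shot barrier. Every shuffle processor publishes the bounds sampled from its own input, and whichever processor completes the set derives the global partition bounds and wakes the rest. The following is a minimal, std-only sketch of that rendezvous, written for this review; all names are illustrative and the series itself uses RwLock plus WatchNotify rather than a Condvar:

    use std::sync::{Condvar, Mutex};

    struct SampleState {
        inner: Mutex<Inner>,
        done: Condvar,
    }

    struct Inner {
        // One slot per input pipe; None until that pipe has committed.
        partial: Vec<Option<Vec<i32>>>,
        // Global bounds, produced once by the last committer.
        merged: Option<Vec<i32>>,
    }

    impl SampleState {
        fn commit_and_wait(&self, id: usize, bounds: Vec<i32>) -> Vec<i32> {
            let mut inner = self.inner.lock().unwrap();
            // Each pipe may commit exactly once.
            assert!(inner.partial[id].replace(bounds).is_none());
            if inner.partial.iter().all(Option::is_some) {
                // Last committer merges every partial sample into global bounds.
                let mut all: Vec<i32> =
                    inner.partial.iter().flatten().flatten().copied().collect();
                all.sort_unstable();
                inner.merged = Some(all);
                self.done.notify_all();
            }
            while inner.merged.is_none() {
                inner = self.done.wait(inner).unwrap();
            }
            inner.merged.clone().unwrap()
        }
    }

After the barrier releases, each processor scatters its own spilled stream against the shared bounds, which is the role of scatter() above.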
From 83094fe9de3ef7f8a94b3c17f6c4faa6e22a239e Mon Sep 17 00:00:00 2001
From: coldWater
Date: Thu, 24 Apr 2025 13:45:55 +0800
Subject: [PATCH 18/33] route

Signed-off-by: coldWater
---
 .../processors/transforms/sort/exchange.rs    | 50 +++-------
 .../processors/transforms/sort/mod.rs         |  1 +
 .../processors/transforms/sort/route.rs       | 96 +++++++++++++++++++
 .../processors/transforms/sort/sort_spill.rs  | 20 ++--
 4 files changed, 122 insertions(+), 45 deletions(-)
 create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/route.rs

diff --git a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs
index 9aeeb29cc7414..06b990e28a617 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs
@@ -13,17 +13,16 @@
 // limitations under the License.
 
 use std::marker::PhantomData;
-use std::sync::Arc;
 
 use databend_common_exception::Result;
+use databend_common_expression::BlockMetaInfoDowncast;
 use databend_common_expression::DataBlock;
 use databend_common_pipeline_core::processors::Exchange;
 use databend_common_pipeline_transforms::processors::sort::Rows;
 
-use super::wait::SortSampleState;
+use super::SortScatteredMeta;
 
 pub struct SortRangeExchange<R: Rows> {
-    state: Arc<SortSampleState>,
     _r: PhantomData<R>,
 }
 
@@ -33,41 +32,22 @@ unsafe impl<R: Rows> Sync for SortRangeExchange<R> {}
 impl<R: Rows + 'static> Exchange for SortRangeExchange<R> {
     const NAME: &'static str = "SortRange";
 
-    fn partition(&self, data: DataBlock, n: usize) -> Result<Vec<DataBlock>> {
-        if data.is_empty() {
-            return Ok(vec![]);
-        }
+    fn partition(&self, mut data: DataBlock, n: usize) -> Result<Vec<DataBlock>> {
+        let Some(meta) = data.take_meta() else {
+            unreachable!();
+        };
 
-        let bounds = self.state.bounds();
-        // debug_assert_eq!(n, self.state.partitions());
-        debug_assert!(bounds.len() < n);
+        let Some(SortScatteredMeta(scattered)) = SortScatteredMeta::downcast_from(meta) else {
+            unreachable!();
+        };
 
-        if bounds.is_empty() {
-            return Ok(vec![data]);
-        }
+        assert!(scattered.len() <= n);
 
-        todo!()
+        let blocks = scattered
+            .into_iter()
+            .map(|meta| DataBlock::empty_with_meta(Box::new(meta)))
+            .collect();
 
-        // let bounds = R::from_column(&bounds.0)?;
-        // let rows = R::from_column(data.get_last_column())?;
-
-        // let mut i = 0;
-        // let mut j = 0;
-        // let mut bound = bounds.row(j);
-        // let mut indices = Vec::new();
-        // while i < rows.len() {
-        //     match rows.row(i).cmp(&bound) {
-        //         Ordering::Less => indices.push(j as u32),
-        //         Ordering::Greater if j + 1 < bounds.len() => {
-        //             j += 1;
-        //             bound = bounds.row(j);
-        //             continue;
-        //         }
-        //         _ => indices.push(j as u32 + 1),
-        //     }
-        //     i += 1;
-        // }
-
-        // DataBlock::scatter(&data, &indices, n)
+        Ok(blocks)
     }
 }
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs
index 307c1cefdf4d9..b897d2a262270 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs
@@ -36,6 +36,7 @@ mod collect;
 mod exchange;
 mod execute;
 mod merge_sort;
+mod route;
 mod sort_spill;
 mod wait;
 
 use sort_spill::SpillableBlock;
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/route.rs b/src/query/service/src/pipelines/processors/transforms/sort/route.rs
new file mode 100644
index 0000000000000..12d88fac8abfd
--- /dev/null
+++ b/src/query/service/src/pipelines/processors/transforms/sort/route.rs
@@ -0,0 +1,96 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use databend_common_exception::Result;
+use databend_common_pipeline_core::processors::Event;
+use databend_common_pipeline_core::processors::InputPort;
+use databend_common_pipeline_core::processors::OutputPort;
+use databend_common_pipeline_core::processors::Processor;
+
+pub struct TransformSortRoute {
+    inputs: Vec<Arc<InputPort>>,
+    output: Arc<OutputPort>,
+    cur_input: usize,
+}
+
+impl TransformSortRoute {
+    pub fn new(inputs: Vec<Arc<InputPort>>, output: Arc<OutputPort>) -> Self {
+        Self {
+            inputs,
+            output,
+            cur_input: 0,
+        }
+    }
+
+    fn process_input(&mut self) -> Result<()> {
+        for (i, input) in self.inputs.iter().enumerate() {
+            if i != self.cur_input {
+                if !input.is_finished() && !input.has_data() {
+                    input.set_need_data();
+                }
+                continue;
+            }
+
+            if input.is_finished() {
+                self.cur_input = i + 1;
+                continue;
+            }
+
+            match input.pull_data() {
+                Some(data) => self.output.push_data(data),
+                None => input.set_need_data(),
+            }
+        }
+
+        Ok(())
+    }
+}
+
+impl Processor for TransformSortRoute {
+    fn name(&self) -> String {
+        "SortRoute".to_string()
+    }
+
+    fn as_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn event(&mut self) -> Result<Event> {
+        if self.output.is_finished() {
+            for input in &self.inputs {
+                input.finish();
+            }
+            return Ok(Event::Finished);
+        }
+
+        if !self.output.can_push() {
+            for input in &self.inputs {
+                input.set_not_need_data();
+            }
+            return Ok(Event::NeedConsume);
+        }
+
+        self.process_input()?;
+
+        if self.inputs.iter().all(|input| input.is_finished()) {
+            self.output.finish();
+            return Ok(Event::Finished);
+        }
+
+        Ok(Event::NeedData)
+    }
+}
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
index 40881547751db..7f3ffa1b514c0 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
@@ -941,13 +941,13 @@ mod tests {
 
     async fn run_bound_block_stream<R: Rows>(
         spiller: impl Spill + Clone,
-        sort_desc: Arc<[SortColumnDescription]>,
+        sort_desc: &[SortColumnDescription],
         bound: Scalar,
         block_part: usize,
         want: Column,
     ) -> Result<()> {
         let (schema, block) = test_data();
-        let block = DataBlock::sort(&block, &sort_desc, None)?;
+        let block = DataBlock::sort(&block, sort_desc, None)?;
         let bound = Some(bound);
         let sort_row_offset = schema.fields().len();
 
@@ -957,7 +957,7 @@ mod tests {
         ]
         .into_iter()
         .map(|mut data| {
-            let col = convert_rows(schema.clone(), &sort_desc, data.clone()).unwrap();
+            let col = convert_rows(schema.clone(), sort_desc, data.clone()).unwrap();
             data.add_column(BlockEntry::new(col.data_type(), Value::Column(col)));
             SpillableBlock::new(data, sort_row_offset)
         })
@@ -985,15 +985,15 @@ mod tests {
         };
 
         {
-            let sort_desc = Arc::new([SortColumnDescription {
+            let sort_desc = [SortColumnDescription {
                 offset: 0,
                 asc: true,
                 nulls_first: false,
-            }]);
+            }];
 
             run_bound_block_stream::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Scalar::Number(NumberScalar::Int32(5)),
                 4,
                 Int32Type::from_data(vec![3, 5]),
             )
             .await?;
 
             run_bound_block_stream::<SimpleRowsAsc<Int32Type>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Scalar::Number(NumberScalar::Int32(8)),
                 4,
                 Int32Type::from_data(vec![3, 5, 7, 7]),
             )
             .await?;
         }
 
         {
-            let sort_desc = Arc::new([SortColumnDescription {
+            let sort_desc = [SortColumnDescription {
                 offset: 1,
                 asc: false,
                 nulls_first: false,
-            }]);
+            }];
 
             run_bound_block_stream::<SimpleRowsDesc<StringType>>(
                 spiller.clone(),
-                sort_desc.clone(),
+                &sort_desc,
                 Scalar::String("f".to_string()),
                 4,
                 StringType::from_data(vec!["w", "h", "g", "f"]),
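Aside on the route step introduced above: once streams have been range-partitioned so that every row of partition i sorts before every row of partition i + 1, restoring a total order needs no merging at all. TransformSortRoute simply drains its inputs one after another with the cur_input cursor. A small standalone sketch of that invariant, written for this review (illustrative only, operating on plain vectors instead of pipeline ports):

    // Each partition is already sorted and disjoint from the next one,
    // so appending whole partitions in partition order keeps the global order.
    fn route_in_order<T: Ord>(partitions: Vec<Vec<T>>) -> Vec<T> {
        let mut out = Vec::new();
        for part in partitions {
            debug_assert!(part.windows(2).all(|w| w[0] <= w[1]));
            out.extend(part);
        }
        out
    }

This is why the processor above only ever forwards data from one input at a time and never compares rows across inputs.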
+ +use std::any::Any; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; + +pub struct TransformSortRoute { + inputs: Vec>, + output: Arc, + cur_input: usize, +} + +impl TransformSortRoute { + pub fn new(inputs: Vec>, output: Arc) -> Self { + Self { + inputs, + output, + cur_input: 0, + } + } + + fn process_input(&mut self) -> Result<()> { + for (i, input) in self.inputs.iter().enumerate() { + if i != self.cur_input { + if !input.is_finished() && !input.has_data() { + input.set_need_data(); + } + continue; + } + + if input.is_finished() { + self.cur_input = i + 1; + continue; + } + + match input.pull_data() { + Some(data) => self.output.push_data(data), + None => input.set_need_data(), + } + } + + Ok(()) + } +} + +impl Processor for TransformSortRoute { + fn name(&self) -> String { + "SortRoute".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + for input in &self.inputs { + input.finish(); + } + return Ok(Event::Finished); + } + + if !self.output.can_push() { + for input in &self.inputs { + input.set_not_need_data(); + } + return Ok(Event::NeedConsume); + } + + self.process_input()?; + + if self.inputs.iter().all(|input| input.is_finished()) { + self.output.finish(); + return Ok(Event::Finished); + } + + Ok(Event::NeedData) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs index 40881547751db..7f3ffa1b514c0 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs @@ -941,13 +941,13 @@ mod tests { async fn run_bound_block_stream( spiller: impl Spill + Clone, - sort_desc: Arc<[SortColumnDescription]>, + sort_desc: &[SortColumnDescription], bound: Scalar, block_part: usize, want: Column, ) -> Result<()> { let (schema, block) = test_data(); - let block = DataBlock::sort(&block, &sort_desc, None)?; + let block = DataBlock::sort(&block, sort_desc, None)?; let bound = Some(bound); let sort_row_offset = schema.fields().len(); @@ -957,7 +957,7 @@ mod tests { ] .into_iter() .map(|mut data| { - let col = convert_rows(schema.clone(), &sort_desc, data.clone()).unwrap(); + let col = convert_rows(schema.clone(), sort_desc, data.clone()).unwrap(); data.add_column(BlockEntry::new(col.data_type(), Value::Column(col))); SpillableBlock::new(data, sort_row_offset) }) @@ -985,15 +985,15 @@ mod tests { }; { - let sort_desc = Arc::new([SortColumnDescription { + let sort_desc = [SortColumnDescription { offset: 0, asc: true, nulls_first: false, - }]); + }]; run_bound_block_stream::>( spiller.clone(), - sort_desc.clone(), + &sort_desc, Scalar::Number(NumberScalar::Int32(5)), 4, Int32Type::from_data(vec![3, 5]), @@ -1002,7 +1002,7 @@ mod tests { run_bound_block_stream::>( spiller.clone(), - sort_desc.clone(), + &sort_desc, Scalar::Number(NumberScalar::Int32(8)), 4, Int32Type::from_data(vec![3, 5, 7, 7]), @@ -1011,15 +1011,15 @@ mod tests { } { - let sort_desc = Arc::new([SortColumnDescription { + let sort_desc = [SortColumnDescription { offset: 1, asc: false, nulls_first: false, - }]); + }]; run_bound_block_stream::>( spiller.clone(), - sort_desc.clone(), + 
&sort_desc, Scalar::String("f".to_string()), 4, StringType::from_data(vec!["w", "h", "g", "f"]), From b7b15af09bf60dc6b12377a4c7749793f681e195 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 24 Apr 2025 17:26:07 +0800 Subject: [PATCH 19/33] builder Signed-off-by: coldWater --- .../src/pipelines/builders/builder_sort.rs | 4 +- .../processors/transforms/sort/builder.rs | 244 ++++++++++++------ .../processors/transforms/sort/exchange.rs | 13 +- .../processors/transforms/sort/mod.rs | 2 +- .../transforms/sort/{wait.rs => shuffle.rs} | 22 +- 5 files changed, 170 insertions(+), 115 deletions(-) rename src/query/service/src/pipelines/processors/transforms/sort/{wait.rs => shuffle.rs} (92%) diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index a9ac35ea53dce..a3f5bd6132c2b 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -290,8 +290,6 @@ impl SortPipelineBuilder { pipeline.add_transform(|input, output| { let builder = TransformSortBuilder::create( - input, - output, sort_merge_output_schema.clone(), self.sort_desc.clone(), self.block_size, @@ -303,7 +301,7 @@ impl SortPipelineBuilder { .with_memory_settings(memory_settings.clone()) .with_enable_loser_tree(enable_loser_tree); - Ok(ProcessorPtr::create(builder.build()?)) + Ok(ProcessorPtr::create(builder.build(input, output)?)) }) } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/builder.rs b/src/query/service/src/pipelines/processors/transforms/sort/builder.rs index b16686b4829fd..80343cf5acca4 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/builder.rs @@ -34,17 +34,18 @@ use databend_common_pipeline_transforms::MemorySettings; use super::collect::TransformSortCollect; use super::execute::TransformSortExecute; use super::merge_sort::TransformSort; +use super::shuffle::SortSampleState; +use super::shuffle::TransformSortShuffle; use crate::spillers::Spiller; enum SortType { Sort, Collect, Execute, + Shuffle, } pub struct TransformSortBuilder { - input: Arc, - output: Arc, schema: DataSchemaRef, block_size: usize, sort_desc: Arc<[SortColumnDescription]>, @@ -54,22 +55,16 @@ pub struct TransformSortBuilder { spiller: Arc, enable_loser_tree: bool, limit: Option, - processor: Option>>, - typ: SortType, } impl TransformSortBuilder { pub fn create( - input: Arc, - output: Arc, schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, block_size: usize, spiller: Arc, ) -> Self { - Self { - input, - output, + TransformSortBuilder { block_size, schema, sort_desc, @@ -79,8 +74,6 @@ impl TransformSortBuilder { enable_loser_tree: false, limit: None, memory_settings: MemorySettings::disable_spill(), - processor: None, - typ: SortType::Sort, } } @@ -109,34 +102,133 @@ impl TransformSortBuilder { self } - pub fn build(mut self) -> Result> { - debug_assert!(if self.output_order_col { + pub fn build( + &self, + input: Arc, + output: Arc, + ) -> Result> { + self.check(); + + let mut build = Build { + params: self, + input, + output, + processor: None, + typ: SortType::Sort, + id: 0, + state: None, + }; + + select_row_type(&mut build); + build.processor.unwrap() + } + + pub fn build_collect( + &self, + input: Arc, + output: Arc, + ) -> Result> { + self.check(); + + let mut build = Build { + params: self, + input, + output, + processor: None, + typ: SortType::Collect, + id: 
0, + state: None, + }; + + select_row_type(&mut build); + build.processor.unwrap() + } + + pub fn build_exec( + &self, + input: Arc, + output: Arc, + ) -> Result> { + self.check(); + + let mut build = Build { + params: self, + input, + output, + processor: None, + typ: SortType::Execute, + id: 0, + state: None, + }; + + select_row_type(&mut build); + build.processor.unwrap() + } + + pub fn build_shuffle( + &self, + input: Arc, + output: Arc, + id: usize, + state: Arc, + ) -> Result> { + self.check(); + + let mut build = Build { + params: self, + input, + output, + processor: None, + typ: SortType::Shuffle, + id, + state: Some(state), + }; + + select_row_type(&mut build); + build.processor.unwrap() + } + + fn should_use_sort_limit(&self) -> bool { + self.limit.map(|limit| limit < 10000).unwrap_or_default() + } + + fn check(&self) { + assert!(if self.output_order_col { self.schema.has_field(ORDER_COL_NAME) } else { !self.schema.has_field(ORDER_COL_NAME) }); - - select_row_type(&mut self); - self.processor.unwrap() } +} +pub struct Build<'a> { + params: &'a TransformSortBuilder, + typ: SortType, + input: Arc, + output: Arc, + processor: Option>>, + id: usize, + state: Option>, +} + +impl Build<'_> { fn build_sort(&mut self) -> Result> where A: SortAlgorithm + 'static, C: RowConverter + Send + 'static, { - let schema = add_order_field(self.schema.clone(), &self.sort_desc); + let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); Ok(Box::new(TransformSort::::new( self.input.clone(), self.output.clone(), schema, - self.sort_desc.clone(), - self.block_size, - self.limit.map(|limit| (limit, false)), - self.spiller.clone(), - self.output_order_col, - self.order_col_generated, - self.memory_settings.clone(), + self.params.sort_desc.clone(), + self.params.block_size, + self.params.limit.map(|limit| (limit, false)), + self.params.spiller.clone(), + self.params.output_order_col, + self.params.order_col_generated, + self.params.memory_settings.clone(), )?)) } @@ -145,50 +237,38 @@ impl TransformSortBuilder { A: SortAlgorithm + 'static, C: RowConverter + Send + 'static, { - let schema = add_order_field(self.schema.clone(), &self.sort_desc); + let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); Ok(Box::new(TransformSort::::new( self.input.clone(), self.output.clone(), schema, - self.sort_desc.clone(), - self.block_size, - Some((self.limit.unwrap(), true)), - self.spiller.clone(), - self.output_order_col, - self.order_col_generated, - self.memory_settings.clone(), + self.params.sort_desc.clone(), + self.params.block_size, + Some((self.params.limit.unwrap(), true)), + self.params.spiller.clone(), + self.params.output_order_col, + self.params.order_col_generated, + self.params.memory_settings.clone(), )?)) } - pub fn build_collect(mut self) -> Result> { - debug_assert!(if self.output_order_col { - self.schema.has_field(ORDER_COL_NAME) - } else { - !self.schema.has_field(ORDER_COL_NAME) - }); - self.typ = SortType::Collect; - - select_row_type(&mut self); - self.processor.unwrap() - } - fn build_sort_collect(&mut self) -> Result> where A: SortAlgorithm + 'static, C: RowConverter + Send + 'static, { - let schema = add_order_field(self.schema.clone(), &self.sort_desc); + let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); Ok(Box::new(TransformSortCollect::::new( self.input.clone(), self.output.clone(), schema, - self.sort_desc.clone(), - self.block_size, - self.limit.map(|limit| (limit, false)), - self.spiller.clone(), - 
self.order_col_generated, - self.memory_settings.clone(), + self.params.sort_desc.clone(), + self.params.block_size, + self.params.limit.map(|limit| (limit, false)), + self.params.spiller.clone(), + self.params.order_col_generated, + self.params.memory_settings.clone(), )?)) } @@ -197,54 +277,53 @@ impl TransformSortBuilder { A: SortAlgorithm + 'static, C: RowConverter + Send + 'static, { - let schema = add_order_field(self.schema.clone(), &self.sort_desc); + let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); Ok(Box::new(TransformSortCollect::::new( self.input.clone(), self.output.clone(), schema, - self.sort_desc.clone(), - self.block_size, - Some((self.limit.unwrap(), true)), - self.spiller.clone(), - self.order_col_generated, - self.memory_settings.clone(), + self.params.sort_desc.clone(), + self.params.block_size, + Some((self.params.limit.unwrap(), true)), + self.params.spiller.clone(), + self.params.order_col_generated, + self.params.memory_settings.clone(), )?)) } - pub fn build_exec(mut self) -> Result> { - debug_assert!(if self.output_order_col { - self.schema.has_field(ORDER_COL_NAME) - } else { - !self.schema.has_field(ORDER_COL_NAME) - }); - self.typ = SortType::Execute; - - select_row_type(&mut self); - self.processor.unwrap() - } - fn build_sort_exec(&mut self) -> Result> where A: SortAlgorithm + 'static { - let schema = add_order_field(self.schema.clone(), &self.sort_desc); + let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); Ok(Box::new(TransformSortExecute::::new( self.input.clone(), self.output.clone(), schema, - self.limit, - self.spiller.clone(), - self.output_order_col, + self.params.limit, + self.params.spiller.clone(), + self.params.output_order_col, )?)) } + + fn build_sort_shuffle(&mut self) -> Result> + where R: Rows + 'static { + Ok(Box::new(TransformSortShuffle::::new( + self.input.clone(), + self.output.clone(), + self.id, + self.state.clone().unwrap(), + self.params.spiller.clone(), + ))) + } } -impl RowsTypeVisitor for TransformSortBuilder { +impl RowsTypeVisitor for Build<'_> { fn schema(&self) -> DataSchemaRef { - self.schema.clone() + self.params.schema.clone() } fn sort_desc(&self) -> &[SortColumnDescription] { - &self.sort_desc + &self.params.sort_desc } fn visit_type(&mut self) @@ -254,8 +333,8 @@ impl RowsTypeVisitor for TransformSortBuilder { { let processor = match self.typ { SortType::Sort => match ( - self.limit.map(|limit| limit < 10000).unwrap_or_default(), - self.enable_loser_tree, + self.params.should_use_sort_limit(), + self.params.enable_loser_tree, ) { (true, true) => self.build_sort_limit::, C>(), (true, false) => self.build_sort_limit::, C>(), @@ -263,18 +342,19 @@ impl RowsTypeVisitor for TransformSortBuilder { (false, false) => self.build_sort::, C>(), }, SortType::Collect => match ( - self.limit.map(|limit| limit < 10000).unwrap_or_default(), - self.enable_loser_tree, + self.params.should_use_sort_limit(), + self.params.enable_loser_tree, ) { (true, true) => self.build_sort_limit_collect::, C>(), (true, false) => self.build_sort_limit_collect::, C>(), (false, true) => self.build_sort_collect::, C>(), (false, false) => self.build_sort_collect::, C>(), }, - SortType::Execute => match self.enable_loser_tree { + SortType::Execute => match self.params.enable_loser_tree { true => self.build_sort_exec::>(), false => self.build_sort_exec::>(), }, + SortType::Shuffle => self.build_sort_shuffle::(), }; self.processor = Some(processor) } diff --git 
a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs index 06b990e28a617..53795063c4f61 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs @@ -12,25 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::marker::PhantomData; - use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_pipeline_core::processors::Exchange; -use databend_common_pipeline_transforms::processors::sort::Rows; use super::SortScatteredMeta; -pub struct SortRangeExchange { - _r: PhantomData, -} - -unsafe impl Send for SortRangeExchange {} - -unsafe impl Sync for SortRangeExchange {} +pub struct SortRangeExchange; -impl Exchange for SortRangeExchange { +impl Exchange for SortRangeExchange { const NAME: &'static str = "SortRange"; fn partition(&self, mut data: DataBlock, n: usize) -> Result> { let Some(meta) = data.take_meta() else { diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index b897d2a262270..120ad5a556b97 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -37,8 +37,8 @@ mod exchange; mod execute; mod merge_sort; mod route; +mod shuffle; mod sort_spill; -mod wait; use sort_spill::SpillableBlock; diff --git a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs b/src/query/service/src/pipelines/processors/transforms/sort/shuffle.rs similarity index 92% rename from src/query/service/src/pipelines/processors/transforms/sort/wait.rs rename to src/query/service/src/pipelines/processors/transforms/sort/shuffle.rs index 193dcaf4e89a6..9c9a3b7067019 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/wait.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/shuffle.rs @@ -42,7 +42,7 @@ enum Step { Scattered(Vec), } -pub struct TransformSortWait { +pub struct TransformSortShuffle { input: Arc, output: Arc, id: usize, @@ -52,7 +52,7 @@ pub struct TransformSortWait { _r: PhantomData, } -impl TransformSortWait { +impl TransformSortShuffle { pub fn new( input: Arc, output: Arc, @@ -71,20 +71,6 @@ impl TransformSortWait { } } - pub fn create( - input: Arc, - output: Arc, - id: usize, - inputs: usize, - partitions: usize, - schema: DataSchemaRef, - batch_rows: usize, - spiller: Arc, - ) -> Self { - let state = SortSampleState::new(inputs, partitions, schema, batch_rows); - Self::new(input, output, id, state, spiller) - } - async fn scatter(&mut self) -> Result<()> { let scatter_bounds = self.state.bounds(); @@ -138,9 +124,9 @@ impl TransformSortWait { } #[async_trait::async_trait] -impl Processor for TransformSortWait { +impl Processor for TransformSortShuffle { fn name(&self) -> String { - "TransformSortWait".to_string() + "TransformSortShuffle".to_string() } fn as_any(&mut self) -> &mut dyn Any { From 056c3d99a2a2465024f6913be33f2671d82872bf Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 24 Apr 2025 18:34:45 +0800 Subject: [PATCH 20/33] update Signed-off-by: coldWater --- .../processors/transforms/sort/rows/utils.rs | 125 ++++++++++-------- .../processors/transforms/sort/builder.rs | 93 ++++--------- 
.../processors/transforms/sort/collect.rs | 29 +--- .../processors/transforms/sort/execute.rs | 14 +- .../processors/transforms/sort/sort_spill.rs | 4 +- 5 files changed, 105 insertions(+), 160 deletions(-) diff --git a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/utils.rs b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/utils.rs index da1a63bf2deeb..49d0dc1c25ed8 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/utils.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/utils.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::row::RowConverter as CommonConverter; use databend_common_expression::types::DataType; @@ -21,7 +22,6 @@ use databend_common_expression::types::NumberType; use databend_common_expression::types::StringType; use databend_common_expression::types::TimestampType; use databend_common_expression::with_number_mapped_type; -use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::DataSchema; @@ -41,55 +41,50 @@ pub fn convert_rows( sort_desc: &[SortColumnDescription], data: DataBlock, ) -> Result { - let num_rows = data.num_rows(); + struct ConvertRowsVisitor<'a> { + schema: DataSchemaRef, + sort_desc: &'a [SortColumnDescription], + data: DataBlock, + result: Result, + } - if sort_desc.len() == 1 { - let sort_type = schema.field(sort_desc[0].offset).data_type(); - let asc = sort_desc[0].asc; + impl RowsTypeVisitor for ConvertRowsVisitor<'_> { + fn schema(&self) -> DataSchemaRef { + self.schema.clone() + } - let offset = sort_desc[0].offset; - let columns = &data.columns()[offset..offset + 1]; + fn sort_desc(&self) -> &[SortColumnDescription] { + self.sort_desc + } - match_template! 
{ - T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], - match sort_type { - DataType::T => { - if asc { - convert_columns::,SimpleRowConverter<_>>(schema, sort_desc, columns, num_rows) - } else { - convert_columns::,SimpleRowConverter<_>>(schema, sort_desc, columns, num_rows) - } - }, - DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { - NumberDataType::NUM_TYPE => { - if asc { - convert_columns::>,SimpleRowConverter<_>>(schema, sort_desc, columns, num_rows) - } else { - convert_columns::>,SimpleRowConverter<_>>(schema, sort_desc, columns, num_rows) - } - } - }), - _ => convert_columns::(schema, sort_desc, columns, num_rows), + fn visit_type(&mut self) + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + { + let columns = self + .sort_desc + .iter() + .map(|desc| self.data.get_by_offset(desc.offset).to_owned()) + .collect::>(); + + self.result = try { + let converter = C::create(self.sort_desc, self.schema.clone())?; + let rows = C::convert(&converter, &columns, self.data.num_rows())?; + rows.to_column() } } - } else { - let columns = sort_desc - .iter() - .map(|desc| data.get_by_offset(desc.offset).to_owned()) - .collect::>(); - convert_columns::(schema, sort_desc, &columns, num_rows) } -} -fn convert_columns>( - schema: DataSchemaRef, - sort_desc: &[SortColumnDescription], - columns: &[BlockEntry], - num_rows: usize, -) -> Result { - let converter = C::create(sort_desc, schema)?; - let rows = C::convert(&converter, columns, num_rows)?; - Ok(rows.to_column()) + let mut visitor = ConvertRowsVisitor { + schema: schema.clone(), + sort_desc, + data, + result: Err(ErrorCode::Internal("unreachable")), + }; + + select_row_type(&mut visitor); + visitor.result } pub fn select_row_type(visitor: &mut impl RowsTypeVisitor) { @@ -138,19 +133,37 @@ pub trait RowsTypeVisitor { } pub fn order_field_type(schema: &DataSchema, desc: &[SortColumnDescription]) -> DataType { - debug_assert!(!desc.is_empty()); - if desc.len() == 1 { - let order_by_field = schema.field(desc[0].offset); - if matches!( - order_by_field.data_type(), - DataType::Number(_) - | DataType::Date - | DataType::Timestamp - | DataType::Binary - | DataType::String - ) { - return order_by_field.data_type().clone(); + struct OrderFieldTypeVisitor<'a> { + schema: DataSchemaRef, + sort_desc: &'a [SortColumnDescription], + result: Option, + } + + impl RowsTypeVisitor for OrderFieldTypeVisitor<'_> { + fn schema(&self) -> DataSchemaRef { + self.schema.clone() + } + + fn sort_desc(&self) -> &[SortColumnDescription] { + self.sort_desc + } + + fn visit_type(&mut self) + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + { + self.result = Some(R::data_type()); } } - DataType::Binary + + assert!(!desc.is_empty()); + let mut visitor = OrderFieldTypeVisitor { + schema: schema.clone().into(), + sort_desc: desc, + result: None, + }; + + select_row_type(&mut visitor); + visitor.result.unwrap() } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/builder.rs b/src/query/service/src/pipelines/processors/transforms/sort/builder.rs index 80343cf5acca4..00bd45de5f434 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/builder.rs @@ -36,6 +36,7 @@ use super::execute::TransformSortExecute; use super::merge_sort::TransformSort; use super::shuffle::SortSampleState; use super::shuffle::TransformSortShuffle; +use super::Base; use crate::spillers::Spiller; enum SortType { @@ 
-199,6 +200,17 @@ impl TransformSortBuilder { !self.schema.has_field(ORDER_COL_NAME) }); } + + fn new_base(&self) -> Base { + let schema = add_order_field(self.schema.clone(), &self.sort_desc); + let sort_row_offset = schema.fields().len() - 1; + Base { + sort_row_offset, + schema, + spiller: self.spiller.clone(), + limit: self.limit, + } + } } pub struct Build<'a> { @@ -212,27 +224,7 @@ pub struct Build<'a> { } impl Build<'_> { - fn build_sort(&mut self) -> Result> - where - A: SortAlgorithm + 'static, - C: RowConverter + Send + 'static, - { - let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); - Ok(Box::new(TransformSort::::new( - self.input.clone(), - self.output.clone(), - schema, - self.params.sort_desc.clone(), - self.params.block_size, - self.params.limit.map(|limit| (limit, false)), - self.params.spiller.clone(), - self.params.output_order_col, - self.params.order_col_generated, - self.params.memory_settings.clone(), - )?)) - } - - fn build_sort_limit(&mut self) -> Result> + fn build_sort(&mut self, limit_sort: bool) -> Result> where A: SortAlgorithm + 'static, C: RowConverter + Send + 'static, @@ -244,7 +236,7 @@ impl Build<'_> { schema, self.params.sort_desc.clone(), self.params.block_size, - Some((self.params.limit.unwrap(), true)), + self.params.limit.map(|limit| (limit, limit_sort)), self.params.spiller.clone(), self.params.output_order_col, self.params.order_col_generated, @@ -252,40 +244,18 @@ impl Build<'_> { )?)) } - fn build_sort_collect(&mut self) -> Result> + fn build_sort_collect(&mut self, limit_sort: bool) -> Result> where A: SortAlgorithm + 'static, C: RowConverter + Send + 'static, { - let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); - Ok(Box::new(TransformSortCollect::::new( self.input.clone(), self.output.clone(), - schema, + self.params.new_base(), self.params.sort_desc.clone(), self.params.block_size, - self.params.limit.map(|limit| (limit, false)), - self.params.spiller.clone(), - self.params.order_col_generated, - self.params.memory_settings.clone(), - )?)) - } - - fn build_sort_limit_collect(&mut self) -> Result> - where - A: SortAlgorithm + 'static, - C: RowConverter + Send + 'static, - { - let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); - Ok(Box::new(TransformSortCollect::::new( - self.input.clone(), - self.output.clone(), - schema, - self.params.sort_desc.clone(), - self.params.block_size, - Some((self.params.limit.unwrap(), true)), - self.params.spiller.clone(), + limit_sort, self.params.order_col_generated, self.params.memory_settings.clone(), )?)) @@ -293,14 +263,10 @@ impl Build<'_> { fn build_sort_exec(&mut self) -> Result> where A: SortAlgorithm + 'static { - let schema = add_order_field(self.params.schema.clone(), &self.params.sort_desc); - Ok(Box::new(TransformSortExecute::::new( self.input.clone(), self.output.clone(), - schema, - self.params.limit, - self.params.spiller.clone(), + self.params.new_base(), self.params.output_order_col, )?)) } @@ -331,24 +297,15 @@ impl RowsTypeVisitor for Build<'_> { R: Rows + 'static, C: RowConverter + Send + 'static, { + let limit_sort = self.params.should_use_sort_limit(); let processor = match self.typ { - SortType::Sort => match ( - self.params.should_use_sort_limit(), - self.params.enable_loser_tree, - ) { - (true, true) => self.build_sort_limit::, C>(), - (true, false) => self.build_sort_limit::, C>(), - (false, true) => self.build_sort::, C>(), - (false, false) => self.build_sort::, C>(), + SortType::Sort => 
match self.params.enable_loser_tree { + true => self.build_sort::, C>(limit_sort), + false => self.build_sort::, C>(limit_sort), }, - SortType::Collect => match ( - self.params.should_use_sort_limit(), - self.params.enable_loser_tree, - ) { - (true, true) => self.build_sort_limit_collect::, C>(), - (true, false) => self.build_sort_limit_collect::, C>(), - (false, true) => self.build_sort_collect::, C>(), - (false, false) => self.build_sort_collect::, C>(), + SortType::Collect => match self.params.enable_loser_tree { + true => self.build_sort_collect::, C>(limit_sort), + false => self.build_sort_collect::, C>(limit_sort), }, SortType::Execute => match self.params.enable_loser_tree { true => self.build_sort_exec::>(), diff --git a/src/query/service/src/pipelines/processors/transforms/sort/collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/collect.rs index 83a9001a59ea3..2f7330ea2c494 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/collect.rs @@ -20,7 +20,6 @@ use std::sync::Arc; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; -use databend_common_expression::DataSchemaRef; use databend_common_expression::SortColumnDescription; use databend_common_expression::Value; use databend_common_pipeline_core::processors::Event; @@ -38,7 +37,6 @@ use databend_common_pipeline_transforms::TransformSortMergeLimit; use super::sort_spill::SortSpill; use super::Base; use super::MemoryRows; -use crate::spillers::Spiller; enum Inner { Collect(Vec), @@ -77,28 +75,20 @@ where pub(super) fn new( input: Arc, output: Arc, - schema: DataSchemaRef, + base: Base, sort_desc: Arc<[SortColumnDescription]>, max_block_size: usize, - limit: Option<(usize, bool)>, - spiller: Arc, + sort_limit: bool, order_col_generated: bool, memory_settings: MemorySettings, ) -> Result { - let sort_row_offset = schema.fields().len() - 1; - let row_converter = C::create(&sort_desc, schema.clone())?; - let (name, inner, limit) = match limit { - Some((limit, true)) => ( + let row_converter = C::create(&sort_desc, base.schema.clone())?; + let (name, inner) = match base.limit { + Some(limit) if sort_limit => ( "TransformSortMergeCollectLimit", Inner::Limit(TransformSortMergeLimit::create(max_block_size, limit)), - Some(limit), ), - Some((limit, false)) => ( - "TransformSortMergeCollect", - Inner::Collect(vec![]), - Some(limit), - ), - None => ("TransformSortMergeCollect", Inner::Collect(vec![]), None), + _ => ("TransformSortMergeCollect", Inner::Collect(vec![])), }; Ok(Self { input, @@ -108,12 +98,7 @@ where output_data: None, sort_desc, order_col_generated, - base: Base { - schema, - spiller, - sort_row_offset, - limit, - }, + base, inner, aborting: AtomicBool::new(false), memory_settings, diff --git a/src/query/service/src/pipelines/processors/transforms/sort/execute.rs b/src/query/service/src/pipelines/processors/transforms/sort/execute.rs index 39f01dc0497e1..3c5d832aa5973 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/execute.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/execute.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; -use databend_common_expression::DataSchemaRef; use databend_common_pipeline_core::processors::Event; use 
databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; @@ -28,7 +27,6 @@ use databend_common_pipeline_transforms::processors::sort::algorithm::SortAlgori use super::sort_spill::SortSpill; use super::Base; use super::SortCollectedMeta; -use crate::spillers::Spiller; pub struct TransformSortExecute { input: Arc, @@ -48,22 +46,14 @@ where A: SortAlgorithm pub(super) fn new( input: Arc, output: Arc, - schema: DataSchemaRef, - limit: Option, - spiller: Arc, + base: Base, output_order_col: bool, ) -> Result { - let sort_row_offset = schema.fields().len() - 1; Ok(Self { input, output, remove_order_col: !output_order_col, - base: Base { - schema, - spiller, - sort_row_offset, - limit, - }, + base, inner: None, }) } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs index 7f3ffa1b514c0..379ea962ed772 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs @@ -1043,7 +1043,7 @@ mod tests { SpillableBlock::new(sliced_block, sort_row_offset) } - async fn prepare_test_blocks( + async fn prepare_test_blocks( spiller: &impl Spill, sort_desc: &[SortColumnDescription], with_spilled: bool, @@ -1150,7 +1150,7 @@ mod tests { with_sliced: bool, ) -> Result<()> { let (_, blocks, sort_row_offset) = - prepare_test_blocks::(&spiller, sort_desc, with_spilled, with_sliced).await?; + prepare_test_blocks(&spiller, sort_desc, with_spilled, with_sliced).await?; let mut stream = BoundBlockStream:: { blocks, From a5af6b7b0787af73af07eba6fef5fa303611e3c5 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 24 Apr 2025 21:20:25 +0800 Subject: [PATCH 21/33] build Signed-off-by: coldWater --- .../processors/transforms/sort/rows/utils.rs | 81 ++--- .../src/pipelines/builders/builder_sort.rs | 70 ++-- .../processors/transforms/sort/builder.rs | 43 ++- .../processors/transforms/sort/exchange.rs | 45 ++- .../processors/transforms/sort/mod.rs | 19 +- .../processors/transforms/sort/route.rs | 32 +- .../transforms/sort/sort_exchange.rs | 148 -------- .../processors/transforms/sort/sort_merge.rs | 62 ---- .../processors/transforms/sort/sort_sample.rs | 333 ------------------ .../processors/transforms/sort/sort_wait.rs | 113 ------ 10 files changed, 187 insertions(+), 759 deletions(-) delete mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs delete mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_merge.rs delete mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs delete mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs diff --git a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/utils.rs b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/utils.rs index 49d0dc1c25ed8..373e2e2e99281 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/sort/rows/utils.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/sort/rows/utils.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
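A note on the `RowsTypeVisitor` hunks below: the refactor replaces the write-into-a-field-then-unwrap pattern with an associated `type Result`, so `select_row_type` can hand the visitor's value straight back to the caller. The following is a minimal, self-contained sketch of the same pattern; `TypeVisitor`, `dispatch`, and `DefaultPrinter` are illustrative names, not Databend APIs.

    // Sketch: a dispatcher picks the concrete type parameter, and the
    // visitor returns its result directly via an associated type instead
    // of stashing it in a mutable field.
    trait TypeVisitor {
        type Result;
        fn visit_type<T: Default + std::fmt::Debug>(&mut self) -> Self::Result;
    }

    fn dispatch<V: TypeVisitor>(use_int: bool, visitor: &mut V) -> V::Result {
        // Mirrors select_row_type: the branch chooses T, the caller gets the value.
        if use_int {
            visitor.visit_type::<i64>()
        } else {
            visitor.visit_type::<String>()
        }
    }

    struct DefaultPrinter;

    impl TypeVisitor for DefaultPrinter {
        type Result = String;
        fn visit_type<T: Default + std::fmt::Debug>(&mut self) -> String {
            format!("{:?}", T::default())
        }
    }

    fn main() {
        let mut v = DefaultPrinter;
        assert_eq!(dispatch(true, &mut v), "0");
        assert_eq!(dispatch(false, &mut v), "\"\"");
    }

The payoff in the real code is that `convert_rows` and `order_field_type` no longer need a sentinel `Err`/`None` placeholder in the visitor struct.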
-use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::row::RowConverter as CommonConverter; use databend_common_expression::types::DataType; @@ -45,10 +44,10 @@ pub fn convert_rows( schema: DataSchemaRef, sort_desc: &'a [SortColumnDescription], data: DataBlock, - result: Result, } impl RowsTypeVisitor for ConvertRowsVisitor<'_> { + type Result = Result; fn schema(&self) -> DataSchemaRef { self.schema.clone() } @@ -57,7 +56,7 @@ pub fn convert_rows( self.sort_desc } - fn visit_type(&mut self) + fn visit_type(&mut self) -> Self::Result where R: Rows + 'static, C: RowConverter + Send + 'static, @@ -68,11 +67,9 @@ pub fn convert_rows( .map(|desc| self.data.get_by_offset(desc.offset).to_owned()) .collect::>(); - self.result = try { - let converter = C::create(self.sort_desc, self.schema.clone())?; - let rows = C::convert(&converter, &columns, self.data.num_rows())?; - rows.to_column() - } + let converter = C::create(self.sort_desc, self.schema.clone())?; + let rows = C::convert(&converter, &columns, self.data.num_rows())?; + Ok(rows.to_column()) } } @@ -80,53 +77,53 @@ pub fn convert_rows( schema: schema.clone(), sort_desc, data, - result: Err(ErrorCode::Internal("unreachable")), }; - select_row_type(&mut visitor); - visitor.result + select_row_type(&mut visitor) } -pub fn select_row_type(visitor: &mut impl RowsTypeVisitor) { - let sort_desc = visitor.sort_desc(); - if sort_desc.len() == 1 { - let schema = visitor.schema(); - let sort_type = schema.field(sort_desc[0].offset).data_type(); - let asc = sort_desc[0].asc; - - match_template! { - T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], - match sort_type { - DataType::T => { - if asc { - visitor.visit_type::, SimpleRowConverter>() - } else { - visitor.visit_type::, SimpleRowConverter>() - } - }, - DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { - NumberDataType::NUM_TYPE => { +pub fn select_row_type(visitor: &mut V) -> V::Result +where V: RowsTypeVisitor { + match &visitor.sort_desc() { + &[desc] => { + let schema = visitor.schema(); + let sort_type = schema.field(desc.offset).data_type(); + let asc = desc.asc; + + match_template! 
{ + T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], + match sort_type { + DataType::T => { if asc { - visitor.visit_type::>, SimpleRowConverter>>() + visitor.visit_type::, SimpleRowConverter>() } else { - visitor.visit_type::>, SimpleRowConverter>>() + visitor.visit_type::, SimpleRowConverter>() + } + }, + DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { + NumberDataType::NUM_TYPE => { + if asc { + visitor.visit_type::>, SimpleRowConverter>>() + } else { + visitor.visit_type::>, SimpleRowConverter>>() + } } + }), + _ => visitor.visit_type::() } - }), - _ => visitor.visit_type::() } } - } else { - visitor.visit_type::() + _ => visitor.visit_type::(), } } pub trait RowsTypeVisitor { + type Result; fn schema(&self) -> DataSchemaRef; fn sort_desc(&self) -> &[SortColumnDescription]; - fn visit_type(&mut self) + fn visit_type(&mut self) -> Self::Result where R: Rows + 'static, C: RowConverter + Send + 'static; @@ -136,10 +133,10 @@ pub fn order_field_type(schema: &DataSchema, desc: &[SortColumnDescription]) -> struct OrderFieldTypeVisitor<'a> { schema: DataSchemaRef, sort_desc: &'a [SortColumnDescription], - result: Option, } impl RowsTypeVisitor for OrderFieldTypeVisitor<'_> { + type Result = DataType; fn schema(&self) -> DataSchemaRef { self.schema.clone() } @@ -148,12 +145,12 @@ pub fn order_field_type(schema: &DataSchema, desc: &[SortColumnDescription]) -> self.sort_desc } - fn visit_type(&mut self) + fn visit_type(&mut self) -> Self::Result where R: Rows + 'static, C: RowConverter + Send + 'static, { - self.result = Some(R::data_type()); + R::data_type() } } @@ -161,9 +158,7 @@ pub fn order_field_type(schema: &DataSchema, desc: &[SortColumnDescription]) -> let mut visitor = OrderFieldTypeVisitor { schema: schema.clone().into(), sort_desc: desc, - result: None, }; - select_row_type(&mut visitor); - visitor.result.unwrap() + select_row_type(&mut visitor) } diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index a3f5bd6132c2b..e5041bb4a9ed6 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -33,10 +33,9 @@ use databend_common_storage::DataOperator; use databend_common_storages_fuse::TableContext; use crate::pipelines::memory_settings::MemorySettingsExt; -use crate::pipelines::processors::transforms::sort::add_range_shuffle; -use crate::pipelines::processors::transforms::sort::add_range_shuffle_merge; -use crate::pipelines::processors::transforms::sort::add_sort_sample; -use crate::pipelines::processors::transforms::sort::SortSampleState; +use crate::pipelines::processors::transforms::add_range_shuffle_exchange; +use crate::pipelines::processors::transforms::add_range_shuffle_route; +use crate::pipelines::processors::transforms::SortSampleState; use crate::pipelines::processors::transforms::TransformLimit; use crate::pipelines::processors::transforms::TransformSortBuilder; use crate::pipelines::PipelineBuilder; @@ -142,7 +141,7 @@ impl PipelineBuilder { if k > 0 && self.main_pipeline.output_len() > 1 { builder .remove_order_col_at_last() - .build_range_shuffle_sort_pipeline(&mut self.main_pipeline, k) + .build_range_shuffle_sort_pipeline(&mut self.main_pipeline) } else { builder .remove_order_col_at_last() @@ -211,18 +210,11 @@ impl SortPipelineBuilder { self.build_merge_sort_pipeline(pipeline, false) } - fn build_range_shuffle_sort_pipeline(self, pipeline: &mut Pipeline, k: 
usize) -> Result<()> { + fn build_range_shuffle_sort_pipeline(self, pipeline: &mut Pipeline) -> Result<()> { let inputs = pipeline.output_len(); let settings = self.ctx.get_settings(); let max_threads = settings.get_max_threads()? as usize; - let sample = SortSampleState::new( - inputs, - max_threads, - self.schema.clone(), - self.sort_desc.clone(), - ); - - add_sort_sample(pipeline, sample.clone(), self.sort_desc.clone(), k)?; + let max_block_size = settings.get_max_block_size()? as usize; // Partial sort pipeline.add_transformer(|| { @@ -232,20 +224,48 @@ impl SortPipelineBuilder { ) }); - self.build_merge_sort(pipeline, false)?; + let spiller = { + let location_prefix = self.ctx.query_id_spill_prefix(); + let config = SpillerConfig { + spiller_type: SpillerType::OrderBy, + location_prefix, + disk_spill: None, + use_parquet: settings.get_spilling_file_format()?.is_parquet(), + }; + let op = DataOperator::instance().spill_operator(); + Arc::new(Spiller::create(self.ctx.clone(), op, config)?) + }; + + let memory_settings = MemorySettings::from_sort_settings(&self.ctx)?; + let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?; - add_range_shuffle( - pipeline, - sample.clone(), - self.sort_desc.clone(), + let builder = TransformSortBuilder::create( self.schema.clone(), - self.block_size, - self.limit, - self.remove_order_col_at_last, - self.enable_loser_tree, - )?; + self.sort_desc.clone(), + max_block_size, + spiller, + ) + .with_limit(self.limit) + .with_order_col_generated(false) + .with_output_order_col(false) + .with_memory_settings(memory_settings) + .with_enable_loser_tree(enable_loser_tree); + + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(builder.build_collect(input, output)?)) + })?; + + let state = SortSampleState::new(inputs, max_threads, self.schema.clone(), max_block_size); + + builder.add_shuffle(pipeline, state.clone())?; + + add_range_shuffle_exchange(pipeline, max_threads)?; + + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(builder.build_exec(input, output)?)) + })?; - add_range_shuffle_merge(pipeline)?; + add_range_shuffle_route(pipeline)?; if self.limit.is_none() { return Ok(()); diff --git a/src/query/service/src/pipelines/processors/transforms/sort/builder.rs b/src/query/service/src/pipelines/processors/transforms/sort/builder.rs index 00bd45de5f434..da9e0e232cf16 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/builder.rs @@ -20,6 +20,8 @@ use databend_common_expression::SortColumnDescription; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_transforms::processors::sort::algorithm::SortAlgorithm; use databend_common_pipeline_transforms::sort::algorithm::HeapSort; use databend_common_pipeline_transforms::sort::algorithm::LoserTreeSort; @@ -114,14 +116,12 @@ impl TransformSortBuilder { params: self, input, output, - processor: None, typ: SortType::Sort, id: 0, state: None, }; - select_row_type(&mut build); - build.processor.unwrap() + select_row_type(&mut build) } pub fn build_collect( @@ -135,14 +135,12 @@ impl TransformSortBuilder { params: self, input, output, - processor: None, typ: SortType::Collect, id: 0, state: None, }; - select_row_type(&mut build); 
- build.processor.unwrap() + select_row_type(&mut build) } pub fn build_exec( @@ -156,14 +154,12 @@ impl TransformSortBuilder { params: self, input, output, - processor: None, typ: SortType::Execute, id: 0, state: None, }; - select_row_type(&mut build); - build.processor.unwrap() + select_row_type(&mut build) } pub fn build_shuffle( @@ -179,14 +175,12 @@ impl TransformSortBuilder { params: self, input, output, - processor: None, typ: SortType::Shuffle, id, state: Some(state), }; - select_row_type(&mut build); - build.processor.unwrap() + select_row_type(&mut build) } fn should_use_sort_limit(&self) -> bool { @@ -211,14 +205,27 @@ impl TransformSortBuilder { limit: self.limit, } } + + pub fn add_shuffle(&self, pipeline: &mut Pipeline, state: Arc) -> Result<()> { + use std::sync::atomic; + let i = atomic::AtomicUsize::new(0); + pipeline.add_transform(|input, output| { + let id = i.fetch_add(1, atomic::Ordering::AcqRel); + Ok(ProcessorPtr::create(self.build_shuffle( + input, + output, + id, + state.clone(), + )?)) + }) + } } -pub struct Build<'a> { +struct Build<'a> { params: &'a TransformSortBuilder, typ: SortType, input: Arc, output: Arc, - processor: Option>>, id: usize, state: Option>, } @@ -284,6 +291,7 @@ impl Build<'_> { } impl RowsTypeVisitor for Build<'_> { + type Result = Result>; fn schema(&self) -> DataSchemaRef { self.params.schema.clone() } @@ -292,13 +300,13 @@ impl RowsTypeVisitor for Build<'_> { &self.params.sort_desc } - fn visit_type(&mut self) + fn visit_type(&mut self) -> Self::Result where R: Rows + 'static, C: RowConverter + Send + 'static, { let limit_sort = self.params.should_use_sort_limit(); - let processor = match self.typ { + match self.typ { SortType::Sort => match self.params.enable_loser_tree { true => self.build_sort::, C>(limit_sort), false => self.build_sort::, C>(limit_sort), @@ -312,7 +320,6 @@ impl RowsTypeVisitor for Build<'_> { false => self.build_sort_exec::>(), }, SortType::Shuffle => self.build_sort_shuffle::(), - }; - self.processor = Some(processor) + } } } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs index 53795063c4f61..cb378244c61d3 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs @@ -12,14 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::iter; +use std::sync::Arc; + use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_pipeline_core::processors::Exchange; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::PartitionProcessor; +use databend_common_pipeline_core::Pipe; +use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_core::Pipeline; use super::SortScatteredMeta; -pub struct SortRangeExchange; +struct SortRangeExchange; impl Exchange for SortRangeExchange { const NAME: &'static str = "SortRange"; @@ -42,3 +51,37 @@ impl Exchange for SortRangeExchange { Ok(blocks) } } + +fn create_exchange_pipe(num_input: usize, num_output: usize) -> Pipe { + let items = iter::repeat_with(|| { + let input = InputPort::create(); + let outputs = iter::repeat_with(OutputPort::create) + .take(num_output) + .collect::>(); + + PipeItem::create( + PartitionProcessor::create(input.clone(), outputs.clone(), Arc::new(SortRangeExchange)), + vec![input], + outputs, + ) + }) + .take(num_input) + .collect::>(); + + Pipe::create(num_input, num_input * num_output, items) +} + +pub fn add_range_shuffle_exchange(pipeline: &mut Pipeline, num_output: usize) -> Result<()> { + let num_input = pipeline.output_len(); + + pipeline.add_pipe(create_exchange_pipe(num_input, num_output)); + + let n = num_output; + let reorder_edges = (0..num_input * n) + .map(|i| (i % n) * num_input + (i / n)) + .collect::>(); + + pipeline.reorder_inputs(reorder_edges); + + Ok(()) +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index 120ad5a556b97..7e1e4fdff52bd 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -12,11 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod sort_exchange; -mod sort_merge; -mod sort_sample; -mod sort_wait; - use std::sync::Arc; use bounds::Bounds; @@ -25,13 +20,12 @@ use databend_common_expression::BlockMetaInfo; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_pipeline_transforms::SortSpillParams; -pub use sort_merge::*; -pub use sort_sample::*; +use sort_spill::SpillableBlock; -mod builder; -pub use builder::TransformSortBuilder; +use crate::spillers::Spiller; mod bounds; +mod builder; mod collect; mod exchange; mod execute; @@ -40,9 +34,10 @@ mod route; mod shuffle; mod sort_spill; -use sort_spill::SpillableBlock; - -use crate::spillers::Spiller; +pub use builder::*; +pub use exchange::*; +pub use route::*; +pub use shuffle::*; #[derive(Clone)] struct Base { diff --git a/src/query/service/src/pipelines/processors/transforms/sort/route.rs b/src/query/service/src/pipelines/processors/transforms/sort/route.rs index 12d88fac8abfd..a9592f4cd405a 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/route.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/route.rs @@ -20,15 +20,19 @@ use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::Pipe; +use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_core::Pipeline; -pub struct TransformSortRoute { +struct TransformSortRoute { inputs: Vec>, output: Arc, cur_input: usize, } impl TransformSortRoute { - pub fn new(inputs: Vec>, output: Arc) -> Self { + fn new(inputs: Vec>, output: Arc) -> Self { Self { inputs, output, @@ -36,7 +40,7 @@ impl TransformSortRoute { } } - fn process_input(&mut self) -> Result<()> { + fn process(&mut self) -> Result<()> { for (i, input) in self.inputs.iter().enumerate() { if i != self.cur_input { if !input.is_finished() && !input.has_data() { @@ -84,7 +88,7 @@ impl Processor for TransformSortRoute { return Ok(Event::NeedConsume); } - self.process_input()?; + self.process()?; if self.inputs.iter().all(|input| input.is_finished()) { self.output.finish(); @@ -94,3 +98,23 @@ impl Processor for TransformSortRoute { Ok(Event::NeedData) } } + +pub fn add_range_shuffle_route(pipeline: &mut Pipeline) -> Result<()> { + let inputs = pipeline.output_len(); + let inputs_port = (0..inputs).map(|_| InputPort::create()).collect::>(); + let output = OutputPort::create(); + + let processor = ProcessorPtr::create(Box::new(TransformSortRoute::new( + inputs_port.clone(), + output.clone(), + ))); + + let pipe = Pipe::create(inputs, 1, vec![PipeItem::create( + processor, + inputs_port, + vec![output], + )]); + + pipeline.add_pipe(pipe); + Ok(()) +} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs deleted file mode 100644 index a528b5f3352f0..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; -use std::iter; -use std::marker::PhantomData; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::SortColumnDescription; -use databend_common_pipeline_core::processors::Exchange; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; -use databend_common_pipeline_transforms::processors::sort::select_row_type; -use databend_common_pipeline_transforms::processors::sort::Rows; -use databend_common_pipeline_transforms::processors::sort::RowsTypeVisitor; -use databend_common_pipeline_transforms::sort::RowConverter; - -use super::sort_sample::SortSampleState; -use crate::pipelines::processors::PartitionProcessor; - -pub struct SortRangeExchange { - state: Arc, - _r: PhantomData, -} - -unsafe impl Send for SortRangeExchange {} - -unsafe impl Sync for SortRangeExchange {} - -impl Exchange for SortRangeExchange { - const NAME: &'static str = "SortRange"; - fn partition(&self, data: DataBlock, n: usize) -> Result> { - let bounds = self.state.bounds().unwrap(); - debug_assert_eq!(n, self.state.partitions()); - debug_assert!(bounds.len() < n); - - if data.is_empty() { - return Ok(vec![]); - } - - if bounds.len() == 0 { - return Ok(vec![data]); - } - - let bounds = R::from_column(&bounds)?; - let rows = R::from_column(data.get_last_column())?; - - let mut i = 0; - let mut j = 0; - let mut bound = bounds.row(j); - let mut indices = Vec::new(); - while i < rows.len() { - match rows.row(i).cmp(&bound) { - Ordering::Less => indices.push(j as u32), - Ordering::Greater if j + 1 < bounds.len() => { - j += 1; - bound = bounds.row(j); - continue; - } - _ => indices.push(j as u32 + 1), - } - i += 1; - } - - DataBlock::scatter(&data, &indices, n) - } -} - -pub fn create_exchange_pipe( - inputs: usize, - partitions: usize, - schema: DataSchemaRef, - sort_desc: Arc<[SortColumnDescription]>, - state: Arc, -) -> Pipe { - let mut builder = Builder { - inputs, - partitions, - sort_desc, - schema, - state, - items: Vec::new(), - }; - - select_row_type(&mut builder); - - Pipe::create(inputs, inputs * partitions, builder.items) -} - -struct Builder { - inputs: usize, - partitions: usize, - sort_desc: Arc<[SortColumnDescription]>, - schema: DataSchemaRef, - state: Arc, - items: Vec, -} - -impl RowsTypeVisitor for Builder { - fn visit_type(&mut self) - where - R: Rows + 'static, - C: RowConverter + Send + 'static, - { - let exchange = Arc::new(SortRangeExchange:: { - state: self.state.clone(), - _r: PhantomData, - }); - self.items = iter::repeat_with(|| { - let input = InputPort::create(); - let outputs = iter::repeat_with(OutputPort::create) - .take(self.partitions) - .collect::>(); - - PipeItem::create( - PartitionProcessor::create(input.clone(), outputs.clone(), exchange.clone()), - vec![input], - outputs, - ) - }) - .take(self.inputs) - .collect::>(); - } - - fn schema(&self) -> DataSchemaRef 
{ - self.schema.clone() - } - - fn sort_desc(&self) -> &[SortColumnDescription] { - &self.sort_desc - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_merge.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_merge.rs deleted file mode 100644 index 569a1e54e83b3..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_merge.rs +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Exchange; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::MergePartitionProcessor; -use databend_common_pipeline_core::processors::MultiwayStrategy; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; -use databend_common_pipeline_core::Pipeline; - -pub struct TransformSortRangeMerge {} - -impl Exchange for TransformSortRangeMerge { - const NAME: &'static str = "SortRangeMerge"; - const STRATEGY: MultiwayStrategy = MultiwayStrategy::Custom; - - fn partition(&self, block: DataBlock, _: usize) -> Result> { - Ok(vec![block]) - } - - fn multiway_pick(&self, partitions: &[Option]) -> Result { - Ok(partitions.iter().position(Option::is_some).unwrap()) - } -} - -pub fn add_range_shuffle_merge(pipeline: &mut Pipeline) -> Result<()> { - let inputs = pipeline.output_len(); - let inputs_port = (0..inputs).map(|_| InputPort::create()).collect::>(); - let output = OutputPort::create(); - - let processor = MergePartitionProcessor::create( - inputs_port.clone(), - output.clone(), - Arc::new(TransformSortRangeMerge {}), - ); - - let pipe = Pipe::create(inputs, 1, vec![PipeItem::create( - processor, - inputs_port, - vec![output], - )]); - - pipeline.add_pipe(pipe); - Ok(()) -} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs deleted file mode 100644 index 96c3ace403033..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_sample.rs +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; -use std::sync::RwLock; - -use databend_common_base::base::WatchNotify; -use databend_common_exception::Result; -use databend_common_expression::sampler::FixedSizeSampler; -use databend_common_expression::visitor::ValueVisitor; -use databend_common_expression::Column; -use databend_common_expression::DataBlock; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::SortColumnDescription; -use databend_common_expression::SortCompare; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::ProcessorPtr; -use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; -use databend_common_pipeline_core::Pipeline; -use databend_common_pipeline_transforms::processors::create_multi_sort_merge_processor; -use databend_common_pipeline_transforms::processors::sort::convert_rows; -use databend_common_pipeline_transforms::processors::Transform; -use databend_common_pipeline_transforms::TransformPipelineHelper; -use rand::rngs::StdRng; -use rand::SeedableRng; - -use super::sort_exchange::create_exchange_pipe; -use super::sort_wait::TransformSortSampleWait; - -pub struct SortSampleState { - inner: RwLock, - pub(super) done: WatchNotify, -} - -impl SortSampleState { - pub fn partitions(&self) -> usize { - self.inner.read().unwrap().partitions - } -} - -struct StateInner { - partitions: usize, - // schema for bounds DataBlock - schema: DataSchemaRef, - // sort_desc for bounds DataBlock - sort_desc: Vec, - partial: Vec>, - bounds: Option, -} - -impl StateInner { - fn determine_bounds(&mut self) -> Result<()> { - let partial = std::mem::take(&mut self.partial) - .into_iter() - .filter_map(|b| { - let b = b.unwrap(); - if b.is_empty() { - None - } else { - Some(b) - } - }) - .collect::>(); - - if partial.is_empty() { - let bounds = convert_rows( - self.schema.clone(), - &self.sort_desc, - DataBlock::empty_with_schema(self.schema.clone()), - )?; - - self.bounds = Some(bounds); - return Ok(()); - } - - let candidates = DataBlock::concat(&partial)?; - let rows = candidates.num_rows(); - - let mut sort_compare = SortCompare::with_force_equality(self.sort_desc.clone(), rows); - - for desc in &self.sort_desc { - let array = candidates.get_by_offset(desc.offset).value.clone(); - sort_compare.visit_value(array)?; - sort_compare.increment_column_index(); - } - - let equality = sort_compare.equality_index().to_vec(); - let permutation = sort_compare.take_permutation(); - - let step = permutation.len() as f64 / self.partitions as f64; - let mut target = step; - let mut bounds = Vec::with_capacity(self.partitions - 1); - let mut equals = true; - for (i, (&pos, eq)) in permutation.iter().zip(equality).enumerate() { - if bounds.len() >= self.partitions - 1 { - break; - } - if equals && eq == 0 { - equals = false - } - if i as f64 >= target && (!equals || i != 0) { - bounds.push(pos); - target += step; - equals = true - } - } - - let bounds = convert_rows( - self.schema.clone(), - &self.sort_desc, - candidates.take(&bounds)?, - )?; - self.bounds = Some(bounds); - Ok(()) - } -} - -impl SortSampleState { - pub fn new( - inputs: usize, - partitions: usize, - schema: DataSchemaRef, - sort_desc: Arc<[SortColumnDescription]>, - ) -> Arc { - let columns = sort_desc.iter().map(|desc| desc.offset).collect::>(); - let schema = schema.project(&columns).into(); - let sort_desc = sort_desc - .iter() - .enumerate() - .map(|(i, desc)| SortColumnDescription 
{ - offset: i, - asc: desc.asc, - nulls_first: desc.nulls_first, - }) - .collect::>(); - Arc::new(SortSampleState { - inner: RwLock::new(StateInner { - partitions, - schema, - sort_desc, - partial: vec![None; inputs], - bounds: None, - }), - done: WatchNotify::new(), - }) - } - - pub fn bounds(&self) -> Option { - if let Some(bounds) = &self.inner.read().unwrap().bounds { - return Some(bounds.clone()); - } - None - } - - pub fn commit_sample(&self, id: usize, block: Option) -> Result { - let mut inner = self.inner.write().unwrap(); - - let block = block.unwrap_or(DataBlock::empty_with_schema(inner.schema.clone())); - let x = inner.partial[id].replace(block); - debug_assert!(x.is_none()); - let done = inner.partial.iter().all(|x| x.is_some()); - if done { - inner.determine_bounds()?; - self.done.notify_waiters(); - } - Ok(done) - } -} - -pub struct TransformSortSample { - id: usize, - sampler: FixedSizeSampler, - state: Arc, -} - -unsafe impl Send for TransformSortSample {} - -impl TransformSortSample { - fn new(id: usize, k: usize, columns: Vec, state: Arc) -> Self { - let rng = StdRng::from_rng(rand::thread_rng()).unwrap(); - let sampler = FixedSizeSampler::new(columns, 65536, k, rng); - TransformSortSample { id, sampler, state } - } -} - -impl Transform for TransformSortSample { - const NAME: &'static str = "TransformSortSample"; - - fn transform(&mut self, data: DataBlock) -> Result { - self.sampler.add_block(data.clone()); - Ok(data) - } - - fn on_finish(&mut self) -> Result<()> { - self.sampler.compact_blocks(); - let mut sample = self.sampler.take_blocks(); - assert!(sample.len() <= 1); // Unlikely to sample rows greater than 65536 - self.state.commit_sample( - self.id, - if sample.is_empty() { - None - } else { - Some(sample.remove(0)) - }, - )?; - Ok(()) - } -} - -pub fn add_sort_sample( - pipeline: &mut Pipeline, - state: Arc, - sort_desc: Arc<[SortColumnDescription]>, - k: usize, -) -> Result<()> { - use std::sync::atomic; - let i = atomic::AtomicUsize::new(0); - let columns = sort_desc.iter().map(|desc| desc.offset).collect::>(); - pipeline.add_transformer(|| { - let id = i.fetch_add(1, atomic::Ordering::AcqRel); - TransformSortSample::new(id, k, columns.clone(), state.clone()) - }); - Ok(()) -} - -pub fn add_range_shuffle( - pipeline: &mut Pipeline, - state: Arc, - sort_desc: Arc<[SortColumnDescription]>, - schema: DataSchemaRef, - block_size: usize, - limit: Option, - remove_order_col: bool, - enable_loser_tree: bool, -) -> Result<()> { - pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create(Box::new( - TransformSortSampleWait::new(input, output, state.clone()), - ))) - })?; - - // partition data block - let input_len = pipeline.output_len(); - let n = state.partitions(); - let exchange = create_exchange_pipe(input_len, n, schema.clone(), sort_desc.clone(), state); - pipeline.add_pipe(exchange); - - let reorder_edges = (0..input_len * n) - .map(|index| (index % n) * input_len + (index / n)) - .collect::>(); - - pipeline.reorder_inputs(reorder_edges); - - let mut items = Vec::with_capacity(input_len); - for _ in 0..n { - let output = OutputPort::create(); - let inputs: Vec<_> = (0..input_len).map(|_| InputPort::create()).collect(); - - let proc = create_multi_sort_merge_processor( - inputs.clone(), - output.clone(), - schema.clone(), - block_size, - limit, - sort_desc.clone(), - remove_order_col, - enable_loser_tree, - )?; - - items.push(PipeItem::create(ProcessorPtr::create(proc), inputs, vec![ - output, - ])); - } - - // merge partition - 
pipeline.add_pipe(Pipe::create(input_len * n, n, items)); - - Ok(()) -} - -#[cfg(test)] -mod tests { - use databend_common_expression::types::ArgType; - use databend_common_expression::types::Int32Type; - use databend_common_expression::DataField; - use databend_common_expression::DataSchemaRefExt; - use databend_common_expression::FromData; - - use super::*; - - #[test] - fn test_determine_bounds() { - let partial = vec![vec![1, 2, 3, 4], vec![4, 5, 6, 7], vec![0, 2, 4, 5]] - .into_iter() - .map(|data| { - Some(DataBlock::new_from_columns(vec![Int32Type::from_data( - data, - )])) - }) - .collect::>(); - - let schema = DataSchemaRefExt::create(vec![DataField::new("a", Int32Type::data_type())]); - let mut inner = StateInner { - partitions: 3, - schema, - sort_desc: vec![SortColumnDescription { - offset: 0, - asc: true, - nulls_first: false, - }], - partial, - bounds: None, - }; - - inner.determine_bounds().unwrap(); - - // 0 1 2 2 | 3 4 4 4 | 5 5 6 7 - assert_eq!(Int32Type::from_data(vec![3, 5]), inner.bounds.unwrap()) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs deleted file mode 100644 index 3f1543d6b2760..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_wait.rs +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; -use std::collections::VecDeque; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; - -use super::SortSampleState; - -pub struct TransformSortSampleWait { - input: Arc, - output: Arc, - output_data: VecDeque, - blocks: Vec, - state: Arc, -} - -impl TransformSortSampleWait { - pub fn new( - input: Arc, - output: Arc, - state: Arc, - ) -> Self { - Self { - input, - output, - output_data: VecDeque::new(), - blocks: Vec::new(), - state, - } - } -} - -#[async_trait::async_trait] -impl Processor for TransformSortSampleWait { - fn name(&self) -> String { - "TransformSortSimpleWait".to_string() - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - self.input.set_not_need_data(); - return Ok(Event::NeedConsume); - } - - if let Some(data_block) = self.output_data.pop_front() { - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.input.has_data() { - self.blocks.push(self.input.pull_data().unwrap()?); - self.input.set_need_data(); - return Ok(Event::NeedData); - } - - if self.input.is_finished() { - if self.blocks.is_empty() { - self.output.finish(); - return Ok(Event::Finished); - } - - return if self.state.done.has_notified() { - Ok(Event::Sync) - } else { - Ok(Event::Async) - }; - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - debug_assert!(!self.blocks.is_empty()); - self.output_data = VecDeque::from(std::mem::take(&mut self.blocks)); - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - self.state.done.notified().await; - self.output_data = VecDeque::from(std::mem::take(&mut self.blocks)); - Ok(()) - } -} From 5039da3e028e9c6485be7af9052b7b4444f1b074 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 24 Apr 2025 21:42:49 +0800 Subject: [PATCH 22/33] rename Signed-off-by: coldWater --- .../transforms/transform_multi_sort_merge.rs | 4 ++-- .../processors/transforms/sort/mod.rs | 20 +++++++++---------- .../sort/{builder.rs => sort_builder.rs} | 8 ++++---- .../sort/{collect.rs => sort_collect.rs} | 0 .../sort/{exchange.rs => sort_exchange.rs} | 0 .../sort/{execute.rs => sort_execute.rs} | 0 .../sort/{route.rs => sort_route.rs} | 0 .../sort/{shuffle.rs => sort_shuffle.rs} | 0 src/query/settings/src/settings_default.rs | 2 +- 9 files changed, 17 insertions(+), 17 deletions(-) rename src/query/service/src/pipelines/processors/transforms/sort/{builder.rs => sort_builder.rs} (98%) rename src/query/service/src/pipelines/processors/transforms/sort/{collect.rs => sort_collect.rs} (100%) rename src/query/service/src/pipelines/processors/transforms/sort/{exchange.rs => sort_exchange.rs} (100%) rename src/query/service/src/pipelines/processors/transforms/sort/{execute.rs => sort_execute.rs} (100%) rename src/query/service/src/pipelines/processors/transforms/sort/{route.rs => sort_route.rs} (100%) rename src/query/service/src/pipelines/processors/transforms/sort/{shuffle.rs => sort_shuffle.rs} (100%) diff --git a/src/query/pipeline/transforms/src/processors/transforms/transform_multi_sort_merge.rs 
b/src/query/pipeline/transforms/src/processors/transforms/transform_multi_sort_merge.rs index 5ab82226a9940..573315604e414 100644 --- a/src/query/pipeline/transforms/src/processors/transforms/transform_multi_sort_merge.rs +++ b/src/query/pipeline/transforms/src/processors/transforms/transform_multi_sort_merge.rs @@ -77,7 +77,7 @@ pub fn try_add_multi_sort_merge( } let output_port = OutputPort::create(); - let processor = ProcessorPtr::create(create_multi_sort_merge_processor( + let processor = ProcessorPtr::create(create_processor( inputs_port.clone(), output_port.clone(), schema, @@ -98,7 +98,7 @@ pub fn try_add_multi_sort_merge( } } -pub fn create_multi_sort_merge_processor( +fn create_processor( inputs: Vec>, output: Arc, schema: DataSchemaRef, diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index 7e1e4fdff52bd..f59ae6a767d42 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -25,19 +25,19 @@ use sort_spill::SpillableBlock; use crate::spillers::Spiller; mod bounds; -mod builder; -mod collect; -mod exchange; -mod execute; mod merge_sort; -mod route; -mod shuffle; +mod sort_builder; +mod sort_collect; +mod sort_exchange; +mod sort_execute; +mod sort_route; +mod sort_shuffle; mod sort_spill; -pub use builder::*; -pub use exchange::*; -pub use route::*; -pub use shuffle::*; +pub use sort_builder::*; +pub use sort_exchange::*; +pub use sort_route::*; +pub use sort_shuffle::*; #[derive(Clone)] struct Base { diff --git a/src/query/service/src/pipelines/processors/transforms/sort/builder.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs similarity index 98% rename from src/query/service/src/pipelines/processors/transforms/sort/builder.rs rename to src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs index da9e0e232cf16..c7290b30af271 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs @@ -33,11 +33,11 @@ use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::sort::RowsTypeVisitor; use databend_common_pipeline_transforms::MemorySettings; -use super::collect::TransformSortCollect; -use super::execute::TransformSortExecute; use super::merge_sort::TransformSort; -use super::shuffle::SortSampleState; -use super::shuffle::TransformSortShuffle; +use super::sort_collect::TransformSortCollect; +use super::sort_execute::TransformSortExecute; +use super::sort_shuffle::SortSampleState; +use super::sort_shuffle::TransformSortShuffle; use super::Base; use crate::spillers::Spiller; diff --git a/src/query/service/src/pipelines/processors/transforms/sort/collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/sort/collect.rs rename to src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs diff --git a/src/query/service/src/pipelines/processors/transforms/sort/exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/sort/exchange.rs rename to src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs diff --git 
a/src/query/service/src/pipelines/processors/transforms/sort/execute.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_execute.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/sort/execute.rs rename to src/query/service/src/pipelines/processors/transforms/sort/sort_execute.rs diff --git a/src/query/service/src/pipelines/processors/transforms/sort/route.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_route.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/sort/route.rs rename to src/query/service/src/pipelines/processors/transforms/sort/sort_route.rs diff --git a/src/query/service/src/pipelines/processors/transforms/sort/shuffle.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/sort/shuffle.rs rename to src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 4e8debfbe1e7e..6849efa5819a4 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -606,7 +606,7 @@ impl DefaultSettings { range: Some(SettingRange::Numeric(4 * 1024..=u64::MAX)), }), ("range_shuffle_sort_simple_size", DefaultSettingValue { - value: UserSettingValue::UInt64(20), + value: UserSettingValue::UInt64(0), desc: "Sets the simple size per partition used for range shuffle sorting, 0 to disable range shuffle sorting.", mode: SettingMode::Both, scope: SettingScope::Both, From 61383d0e5eb84dc625110df50ef12b728c87826b Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 25 Apr 2025 11:49:17 +0800 Subject: [PATCH 23/33] fix Signed-off-by: coldWater --- .../src/pipelines/builders/builder_sort.rs | 13 ++- .../src/pipelines/executor/executor_graph.rs | 10 ++- .../pipelines/executor/pipeline_executor.rs | 2 +- .../executor/processor_async_task.rs | 2 +- .../executor/query_pipeline_executor.rs | 2 +- .../processors/transforms/sort/mod.rs | 3 +- .../transforms/sort/sort_builder.rs | 32 ++++++++ .../transforms/sort/sort_combine.rs | 80 +++++++++++++++++++ .../transforms/sort/sort_exchange.rs | 63 +++------------ .../transforms/sort/sort_execute.rs | 16 ++-- .../transforms/sort/sort_shuffle.rs | 54 ++++++++----- .../processors/transforms/sort/sort_spill.rs | 11 ++- src/query/settings/src/settings_default.rs | 8 +- .../settings/src/settings_getter_setter.rs | 4 +- 14 files changed, 200 insertions(+), 100 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/sort/sort_combine.rs diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index e5041bb4a9ed6..34783ff749b31 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -33,8 +33,8 @@ use databend_common_storage::DataOperator; use databend_common_storages_fuse::TableContext; use crate::pipelines::memory_settings::MemorySettingsExt; -use crate::pipelines::processors::transforms::add_range_shuffle_exchange; use crate::pipelines::processors::transforms::add_range_shuffle_route; +use crate::pipelines::processors::transforms::SortRangeExchange; use crate::pipelines::processors::transforms::SortSampleState; use crate::pipelines::processors::transforms::TransformLimit; use 
crate::pipelines::processors::transforms::TransformSortBuilder; @@ -137,8 +137,9 @@ impl PipelineBuilder { None => { // Build for single node mode. // We build the full sort pipeline for it. - let k = self.settings.get_range_shuffle_sort_simple_size()?; - if k > 0 && self.main_pipeline.output_len() > 1 { + if self.settings.get_enable_range_shuffle_sort()? + && self.main_pipeline.output_len() > 1 + { builder .remove_order_col_at_last() .build_range_shuffle_sort_pipeline(&mut self.main_pipeline) @@ -259,7 +260,11 @@ impl SortPipelineBuilder { builder.add_shuffle(pipeline, state.clone())?; - add_range_shuffle_exchange(pipeline, max_threads)?; + pipeline.exchange(max_threads, Arc::new(SortRangeExchange)); + + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(builder.build_combine(input, output)?)) + })?; pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(builder.build_exec(input, output)?)) diff --git a/src/query/service/src/pipelines/executor/executor_graph.rs b/src/query/service/src/pipelines/executor/executor_graph.rs index c7b2c27272aeb..4059501a4ff9a 100644 --- a/src/query/service/src/pipelines/executor/executor_graph.rs +++ b/src/query/service/src/pipelines/executor/executor_graph.rs @@ -789,7 +789,7 @@ impl RunningGraph { true => Ok(()), false => Err(ErrorCode::Internal(format!( "Pipeline graph is not finished, details: {}", - self.format_graph_nodes() + self.format_graph_nodes(true) ))), } } @@ -855,7 +855,7 @@ impl RunningGraph { self.0.finished_notify.clone() } - pub fn format_graph_nodes(&self) -> String { + pub fn format_graph_nodes(&self, pretty: bool) -> String { pub struct NodeDisplay { id: usize, name: String, @@ -955,7 +955,11 @@ impl RunningGraph { } } - format!("{:?}", nodes_display) + if pretty { + format!("{:#?}", nodes_display) + } else { + format!("{:?}", nodes_display) + } } /// Change the priority diff --git a/src/query/service/src/pipelines/executor/pipeline_executor.rs b/src/query/service/src/pipelines/executor/pipeline_executor.rs index 7fef8013bbb51..c0a05425144a4 100644 --- a/src/query/service/src/pipelines/executor/pipeline_executor.rs +++ b/src/query/service/src/pipelines/executor/pipeline_executor.rs @@ -268,7 +268,7 @@ impl PipelineExecutor { pub fn format_graph_nodes(&self) -> String { match self { PipelineExecutor::QueryPipelineExecutor(executor) => executor.format_graph_nodes(), - PipelineExecutor::QueriesPipelineExecutor(v) => v.graph.format_graph_nodes(), + PipelineExecutor::QueriesPipelineExecutor(v) => v.graph.format_graph_nodes(false), } } diff --git a/src/query/service/src/pipelines/executor/processor_async_task.rs b/src/query/service/src/pipelines/executor/processor_async_task.rs index b864e5dab8d60..83a57bb5f8811 100644 --- a/src/query/service/src/pipelines/executor/processor_async_task.rs +++ b/src/query/service/src/pipelines/executor/processor_async_task.rs @@ -157,7 +157,7 @@ impl ProcessorAsyncTask { processor_name, elapsed, active_workers, - graph_clone.format_graph_nodes() + graph_clone.format_graph_nodes(false) ); } }; diff --git a/src/query/service/src/pipelines/executor/query_pipeline_executor.rs b/src/query/service/src/pipelines/executor/query_pipeline_executor.rs index 844a1d8316fd7..94b7d3b353c6e 100644 --- a/src/query/service/src/pipelines/executor/query_pipeline_executor.rs +++ b/src/query/service/src/pipelines/executor/query_pipeline_executor.rs @@ -431,7 +431,7 @@ impl QueryPipelineExecutor { } pub fn format_graph_nodes(&self) -> String { - self.graph.format_graph_nodes() + 
self.graph.format_graph_nodes(false) } pub fn fetch_plans_profile(&self, collect_metrics: bool) -> HashMap { diff --git a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs index f59ae6a767d42..3d7c57e31beb0 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/mod.rs @@ -28,6 +28,7 @@ mod bounds; mod merge_sort; mod sort_builder; mod sort_collect; +mod sort_combine; mod sort_exchange; mod sort_execute; mod sort_route; @@ -60,7 +61,7 @@ local_block_meta_serde!(SortCollectedMeta); impl BlockMetaInfo for SortCollectedMeta {} #[derive(Debug)] -struct SortScatteredMeta(pub Vec); +struct SortScatteredMeta(pub Vec>); local_block_meta_serde!(SortScatteredMeta); diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs index c7290b30af271..1eb032fb1bdc3 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs @@ -31,10 +31,12 @@ use databend_common_pipeline_transforms::sort::utils::ORDER_COL_NAME; use databend_common_pipeline_transforms::sort::RowConverter; use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::sort::RowsTypeVisitor; +use databend_common_pipeline_transforms::AccumulatingTransformer; use databend_common_pipeline_transforms::MemorySettings; use super::merge_sort::TransformSort; use super::sort_collect::TransformSortCollect; +use super::sort_combine::TransformSortCombine; use super::sort_execute::TransformSortExecute; use super::sort_shuffle::SortSampleState; use super::sort_shuffle::TransformSortShuffle; @@ -46,6 +48,7 @@ enum SortType { Collect, Execute, Shuffle, + Combine, } pub struct TransformSortBuilder { @@ -183,6 +186,25 @@ impl TransformSortBuilder { select_row_type(&mut build) } + pub fn build_combine( + &self, + input: Arc, + output: Arc, + ) -> Result> { + self.check(); + + let mut build = Build { + params: self, + input, + output, + typ: SortType::Combine, + id: 0, + state: None, + }; + + select_row_type(&mut build) + } + fn should_use_sort_limit(&self) -> bool { self.limit.map(|limit| limit < 10000).unwrap_or_default() } @@ -288,6 +310,15 @@ impl Build<'_> { self.params.spiller.clone(), ))) } + + fn build_sort_combine(&mut self) -> Result> + where R: Rows + 'static { + Ok(AccumulatingTransformer::create( + self.input.clone(), + self.output.clone(), + TransformSortCombine::::new(self.params.block_size), + )) + } } impl RowsTypeVisitor for Build<'_> { @@ -320,6 +351,7 @@ impl RowsTypeVisitor for Build<'_> { false => self.build_sort_exec::>(), }, SortType::Shuffle => self.build_sort_shuffle::(), + SortType::Combine => self.build_sort_combine::(), } } } diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_combine.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_combine.rs new file mode 100644 index 0000000000000..538bd7aff390b --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_combine.rs @@ -0,0 +1,80 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use databend_common_exception::Result;
+use databend_common_expression::BlockMetaInfoDowncast;
+use databend_common_expression::DataBlock;
+use databend_common_pipeline_transforms::sort::Rows;
+use databend_common_pipeline_transforms::AccumulatingTransform;
+
+use super::bounds::Bounds;
+use super::SortCollectedMeta;
+
+pub struct TransformSortCombine<R: Rows> {
+    batch_rows: usize,
+    metas: Vec<SortCollectedMeta>,
+    _r: std::marker::PhantomData<R>,
+}
+
+impl<R: Rows> TransformSortCombine<R> {
+    pub fn new(batch_rows: usize) -> Self {
+        Self {
+            batch_rows,
+            metas: vec![],
+            _r: Default::default(),
+        }
+    }
+}
+
+impl<R: Rows> AccumulatingTransform for TransformSortCombine<R> {
+    const NAME: &'static str = "TransformSortCombine";
+
+    fn transform(&mut self, mut data: DataBlock) -> Result<Vec<DataBlock>> {
+        self.metas.push(
+            data.take_meta()
+                .and_then(SortCollectedMeta::downcast_from)
+                .expect("require a SortCollectedMeta"),
+        );
+        Ok(vec![])
+    }
+
+    fn on_finish(&mut self, output: bool) -> Result<Vec<DataBlock>> {
+        if !output || self.metas.is_empty() {
+            return Ok(vec![]);
+        }
+
+        let params = self.metas.first().map(|meta| meta.params).unwrap();
+
+        let bounds = self
+            .metas
+            .iter_mut()
+            .map(|meta| std::mem::take(&mut meta.bounds))
+            .collect();
+        let bounds = Bounds::merge::<R>(bounds, self.batch_rows)?;
+
+        let blocks = self
+            .metas
+            .drain(..)
+            .flat_map(|meta| meta.blocks.into_iter())
+            .collect();
+
+        Ok(vec![DataBlock::empty_with_meta(Box::new(
+            SortCollectedMeta {
+                params,
+                bounds,
+                blocks,
+            },
+        ))])
+    }
+}
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs
index cb378244c61d3..700c5a3e0e81a 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_exchange.rs
@@ -12,76 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
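A note on `TransformSortCombine` above: it is an accumulating transform that buffers every incoming `SortCollectedMeta` and, only when its input finishes, merges their bounds (via `Bounds::merge::<R>`) and concatenates their spilled runs into one meta for the downstream execute stage. The shape of that merge, on plain vectors, with `Collected` as an illustrative stand-in; plain sorting stands in for the real bounds merge, which keeps per-input bounds ordered by the row comparator.

    // Stand-in for SortCollectedMeta: per-input bounds plus spilled runs.
    struct Collected {
        bounds: Vec<i32>,
        blocks: Vec<Vec<&'static str>>,
    }

    fn combine(metas: Vec<Collected>) -> Collected {
        let mut bounds = Vec::new();
        let mut blocks = Vec::new();
        for mut m in metas {
            // Pool every input's bounds and keep all runs; nothing is re-sorted
            // at block level, the runs stay independently sorted.
            bounds.append(&mut m.bounds);
            blocks.extend(m.blocks);
        }
        bounds.sort_unstable();
        Collected { bounds, blocks }
    }

    fn main() {
        let a = Collected { bounds: vec![3, 7], blocks: vec![vec!["run-a"]] };
        let b = Collected { bounds: vec![5], blocks: vec![vec!["run-b1"], vec!["run-b2"]] };
        let merged = combine(vec![a, b]);
        assert_eq!(merged.bounds, vec![3, 5, 7]);
        assert_eq!(merged.blocks.len(), 3);
    }
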
-use std::iter; -use std::sync::Arc; - use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_pipeline_core::processors::Exchange; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::PartitionProcessor; -use databend_common_pipeline_core::Pipe; -use databend_common_pipeline_core::PipeItem; -use databend_common_pipeline_core::Pipeline; use super::SortScatteredMeta; -struct SortRangeExchange; +pub struct SortRangeExchange; impl Exchange for SortRangeExchange { const NAME: &'static str = "SortRange"; fn partition(&self, mut data: DataBlock, n: usize) -> Result> { - let Some(meta) = data.take_meta() else { - unreachable!(); - }; - - let Some(SortScatteredMeta(scattered)) = SortScatteredMeta::downcast_from(meta) else { - unreachable!(); - }; - + let scattered = data + .take_meta() + .and_then(SortScatteredMeta::downcast_from) + .expect("require a SortScatteredMeta") + .0; assert!(scattered.len() <= n); let blocks = scattered .into_iter() - .map(|meta| DataBlock::empty_with_meta(Box::new(meta))) + .map(|meta| { + meta.map(|meta| DataBlock::empty_with_meta(Box::new(meta))) + .unwrap_or_else(DataBlock::empty) + }) .collect(); Ok(blocks) } } - -fn create_exchange_pipe(num_input: usize, num_output: usize) -> Pipe { - let items = iter::repeat_with(|| { - let input = InputPort::create(); - let outputs = iter::repeat_with(OutputPort::create) - .take(num_output) - .collect::>(); - - PipeItem::create( - PartitionProcessor::create(input.clone(), outputs.clone(), Arc::new(SortRangeExchange)), - vec![input], - outputs, - ) - }) - .take(num_input) - .collect::>(); - - Pipe::create(num_input, num_input * num_output, items) -} - -pub fn add_range_shuffle_exchange(pipeline: &mut Pipeline, num_output: usize) -> Result<()> { - let num_input = pipeline.output_len(); - - pipeline.add_pipe(create_exchange_pipe(num_input, num_output)); - - let n = num_output; - let reorder_edges = (0..num_input * n) - .map(|i| (i % n) * num_input + (i / n)) - .collect::>(); - - pipeline.reorder_inputs(reorder_edges); - - Ok(()) -} diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_execute.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_execute.rs index 3c5d832aa5973..044858dc4358a 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_execute.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_execute.rs @@ -91,6 +91,11 @@ where return Ok(Event::NeedConsume); } + if self.input.is_finished() && self.inner.is_none() { + self.output.finish(); + return Ok(Event::Finished); + } + if let Some(mut block) = self.input.pull_data().transpose()? 
             assert!(self.inner.is_none());
             let meta = block
@@ -102,12 +107,12 @@ where
             return Ok(Event::Async);
         }
 
-        if self.input.is_finished() {
-            Ok(Event::Async)
-        } else {
-            self.input.set_need_data();
-            Ok(Event::NeedData)
+        if self.inner.is_some() {
+            return Ok(Event::Async);
         }
+
+        self.input.set_need_data();
+        Ok(Event::NeedData)
     }
 
     #[async_backtrace::framed]
@@ -122,6 +127,7 @@
         }
         if finish {
             self.output.finish();
+            self.inner = None;
         }
         Ok(())
     }
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
index 9c9a3b7067019..f617e50ebcdac 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
@@ -39,7 +39,7 @@ use crate::spillers::Spiller;
 enum Step {
     None,
     Meta(Box<SortCollectedMeta>),
-    Scattered(Vec<SortCollectedMeta>),
+    Scattered(Vec<Option<SortCollectedMeta>>),
 }
 
 pub struct TransformSortShuffle<R: Rows> {
@@ -71,25 +71,26 @@ impl<R: Rows> TransformSortShuffle<R> {
         }
     }
 
-    async fn scatter(&mut self) -> Result<()> {
-        let scatter_bounds = self.state.bounds();
-
-        let Step::Meta(box SortCollectedMeta {
+    async fn scatter(&mut self) -> Result<Vec<Option<SortCollectedMeta>>> {
+        let SortCollectedMeta {
             params,
             bounds,
             blocks,
-        }) = std::mem::replace(&mut self.step, Step::None)
-        else {
-            unreachable!()
+        } = match std::mem::replace(&mut self.step, Step::None) {
+            Step::None => {
+                return Ok(vec![]);
+            }
+            Step::Meta(box meta) => meta,
+            _ => unreachable!(),
         };
+        let scatter_bounds = self.state.bounds();
 
         if scatter_bounds.is_empty() {
-            Step::Scattered(vec![SortCollectedMeta {
+            return Ok(vec![Some(SortCollectedMeta {
                 params,
                 bounds,
                 blocks,
-            }]);
-            return Ok(());
+            })]);
         }
 
         let base = {
@@ -102,24 +103,31 @@
             }
         };
 
-        let mut scattered_meta = std::iter::repeat_with(|| SortCollectedMeta {
-            params,
-            bounds: bounds.clone(),
-            blocks: vec![],
-        })
-        .take(scatter_bounds.len() + 1)
-        .collect::<Vec<_>>();
+        let mut scattered_blocks = std::iter::repeat_with(Vec::new)
+            .take(scatter_bounds.len() + 1)
+            .collect::<Vec<_>>();
 
         for blocks in blocks {
             let scattered = base
                 .scatter_stream::<R>(Vec::from(blocks).into(), scatter_bounds.clone())
                 .await?;
             for (i, part) in scattered.into_iter().enumerate() {
-                scattered_meta[i].blocks.push(part.into_boxed_slice());
+                if !part.is_empty() {
+                    scattered_blocks[i].push(part.into_boxed_slice());
+                }
             }
         }
 
-        self.step = Step::Scattered(scattered_meta);
-        Ok(())
+        let scattered_meta = scattered_blocks
+            .into_iter()
+            .map(|blocks| {
+                (!blocks.is_empty()).then_some(SortCollectedMeta {
+                    params,
+                    bounds: bounds.clone(),
+                    blocks,
+                })
+            })
+            .collect();
+        Ok(scattered_meta)
    }
}
@@ -182,12 +190,14 @@ impl<R: Rows> Processor for TransformSortShuffle<R> {
     #[async_backtrace::framed]
     async fn async_process(&mut self) -> Result<()> {
         let bounds = match &self.step {
+            Step::None if self.input.is_finished() => Bounds::default(),
             Step::Meta(meta) => meta.bounds.clone(),
             _ => unreachable!(),
         };
         self.state.commit_sample::<R>(self.id, bounds)?;
         self.state.done.notified().await;
-        self.scatter().await
+        self.step = Step::Scattered(self.scatter().await?);
+        Ok(())
     }
 }
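For orientation, `scatter` above routes every sorted run into `scatter_bounds.len() + 1` range partitions. A standalone sketch of the routing rule over plain integers (one reasonable convention, with values equal to a bound going to the right-hand partition; the actual tie-breaking in the patch follows the `Rows` ordering and operates on spillable block streams rather than single values):

// Route each value into one of bounds.len() + 1 range partitions.
// Partition i receives values v with bounds[i-1] <= v < bounds[i],
// partition 0 everything below the first bound, and the last partition
// everything at or above the final bound.
fn scatter_by_bounds(values: &[i32], bounds: &[i32]) -> Vec<Vec<i32>> {
    let mut parts = vec![Vec::new(); bounds.len() + 1];
    for &v in values {
        // The number of bounds <= v is the index of the target partition.
        let i = bounds.partition_point(|&b| b <= v);
        parts[i].push(v);
    }
    parts
}

fn main() {
    let parts = scatter_by_bounds(&[1, 5, 2, 9, 7, 3], &[3, 7]);
    assert_eq!(parts, vec![vec![1, 2], vec![5, 3], vec![9, 7]]);
}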
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
index 379ea962ed772..588878cd88bd8 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
@@ -105,9 +105,11 @@ where A: SortAlgorithm
         let subsequent = blocks
             .into_iter()
-            .map(|list| base.new_stream(Vec::from(list).into(), None))
-            .collect();
-
+            .filter_map(|list| {
+                (!list.is_empty()).then(|| base.new_stream(Vec::from(list).into(), None))
+            })
+            .collect::<Vec<_>>();
+        debug_assert!(!subsequent.is_empty());
         Self {
             base,
             step: Step::Sort(StepSort {
@@ -235,6 +237,8 @@ impl<A: SortAlgorithm> StepCollect<A> {
             let data = input_data.pop().unwrap();
             vec![base.new_block(data)].into()
         } else {
+            // todo: using multi-threaded cascade two-way merge sorting algorithm to obtain the best performance
+            // also see https://arxiv.org/pdf/1406.2628
            let mut merger = create_memory_merger::<A>(
                input_data,
                base.schema.clone(),
@@ -494,6 +498,7 @@ impl Base {
         blocks: VecDeque<SpillableBlock>,
         bound: Option<Column>,
     ) -> BoundBlockStream<R, Arc<Spiller>> {
+        assert!(!blocks.is_empty());
         BoundBlockStream {
             blocks,
             bound,
diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs
index 6849efa5819a4..d79bf762155e7 100644
--- a/src/query/settings/src/settings_default.rs
+++ b/src/query/settings/src/settings_default.rs
@@ -605,12 +605,12 @@ impl DefaultSettings {
             scope: SettingScope::Both,
             range: Some(SettingRange::Numeric(4 * 1024..=u64::MAX)),
         }),
-        ("range_shuffle_sort_simple_size", DefaultSettingValue {
-            value: UserSettingValue::UInt64(0),
-            desc: "Sets the simple size per partition used for range shuffle sorting, 0 to disable range shuffle sorting.",
+        ("enable_range_shuffle_sort", DefaultSettingValue {
+            value: UserSettingValue::UInt64(1),
+            desc: "Enable range shuffle sort.",
             mode: SettingMode::Both,
             scope: SettingScope::Both,
-            range: Some(SettingRange::Numeric(0..=500)),
+            range: Some(SettingRange::Numeric(0..=1)),
         }),
         ("group_by_shuffle_mode", DefaultSettingValue {
             value: UserSettingValue::String(String::from("before_merge")),
diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs
index f4bbb0b305cb0..c9482937c2838 100644
--- a/src/query/settings/src/settings_getter_setter.rs
+++ b/src/query/settings/src/settings_getter_setter.rs
@@ -486,8 +486,8 @@ impl Settings {
         Ok(self.try_get_u64("sort_spilling_memory_ratio")? as usize)
     }
 
-    pub fn get_range_shuffle_sort_simple_size(&self) -> Result<usize> {
-        Ok(self.try_get_u64("range_shuffle_sort_simple_size")? as usize)
+    pub fn get_enable_range_shuffle_sort(&self) -> Result<bool> {
+        Ok(self.try_get_u64("enable_range_shuffle_sort")? == 1)
     }
 
     pub fn get_group_by_shuffle_mode(&self) -> Result<String> {
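The setting change above follows the usual Databend convention for boolean settings: stored as a 0/1 `UInt64` with range `0..=1`, and decoded with `== 1` in the typed getter. A toy sketch of that pattern (hypothetical store, illustrative only):

use std::collections::HashMap;

// Toy settings store: booleans are kept as 0/1 integers, as in the diff.
struct Settings {
    values: HashMap<&'static str, u64>,
}

impl Settings {
    fn try_get_u64(&self, key: &str) -> Result<u64, String> {
        self.values
            .get(key)
            .copied()
            .ok_or_else(|| format!("unknown setting: {key}"))
    }

    // Mirrors get_enable_range_shuffle_sort: a 0/1 integer read as a bool.
    fn get_enable_range_shuffle_sort(&self) -> Result<bool, String> {
        Ok(self.try_get_u64("enable_range_shuffle_sort")? == 1)
    }
}

fn main() {
    let settings = Settings {
        values: HashMap::from([("enable_range_shuffle_sort", 0_u64)]),
    };
    assert!(!settings.get_enable_range_shuffle_sort().unwrap());
}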
From 28f5adf43b4a1d7c769eb74b5059ffbee25f9286 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 25 Apr 2025 15:54:25 +0800
Subject: [PATCH 24/33] fix

Signed-off-by: coldWater
---
 src/query/pipeline/core/src/processors/mod.rs         |  1 -
 .../pipeline/core/src/processors/shuffle_processor.rs | 10 +---------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/query/pipeline/core/src/processors/mod.rs b/src/query/pipeline/core/src/processors/mod.rs
index 095a9d597be61..c3b0e1772a341 100644
--- a/src/query/pipeline/core/src/processors/mod.rs
+++ b/src/query/pipeline/core/src/processors/mod.rs
@@ -39,6 +39,5 @@ pub use resize_processor::create_resize_item;
 pub use resize_processor::ResizeProcessor;
 pub use shuffle_processor::Exchange;
 pub use shuffle_processor::MergePartitionProcessor;
-pub use shuffle_processor::MultiwayStrategy;
 pub use shuffle_processor::PartitionProcessor;
 pub use shuffle_processor::ShuffleProcessor;
diff --git a/src/query/pipeline/core/src/processors/shuffle_processor.rs b/src/query/pipeline/core/src/processors/shuffle_processor.rs
index dac49ea50b79e..2b57c3b3cc333 100644
--- a/src/query/pipeline/core/src/processors/shuffle_processor.rs
+++ b/src/query/pipeline/core/src/processors/shuffle_processor.rs
@@ -345,10 +345,7 @@ impl<T: Exchange> Processor for MergePartitionProcessor<T> {
                 input.set_need_data();
             }
 
-            if all_inputs_finished
-                && (!matches!(T::STRATEGY, MultiwayStrategy::Custom)
-                    || self.inputs_data.iter().all(Option::is_none))
-            {
+            if all_inputs_finished {
                 self.output.finish();
                 return Ok(Event::Finished);
             }
@@ -360,11 +357,6 @@
                 self.output.push_data(Ok(block));
                 return Ok(Event::NeedConsume);
             }
-
-            if all_inputs_finished && self.inputs_data.iter().all(Option::is_none) {
-                self.output.finish();
-                return Ok(Event::Finished);
-            }
         }
 
         Ok(Event::NeedData)
From 533262d62899ab89dc03b7837b33d805291d7b46 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 25 Apr 2025 17:16:13 +0800
Subject: [PATCH 25/33] fix

Signed-off-by: coldWater
---
 src/query/service/src/pipelines/builders/builder_sort.rs | 3 ++-
 .../pipelines/processors/transforms/sort/sort_builder.rs | 6 +++++-
 .../pipelines/processors/transforms/sort/sort_shuffle.rs | 1 -
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs
index 34783ff749b31..c3c9dc1b3a9ea 100644
--- a/src/query/service/src/pipelines/builders/builder_sort.rs
+++ b/src/query/service/src/pipelines/builders/builder_sort.rs
@@ -256,7 +256,8 @@ impl SortPipelineBuilder {
             Ok(ProcessorPtr::create(builder.build_collect(input, output)?))
         })?;
 
-        let state = SortSampleState::new(inputs, max_threads, self.schema.clone(), max_block_size);
+        let state =
+            SortSampleState::new(inputs, max_threads, builder.inner_schema(), max_block_size);
 
         builder.add_shuffle(pipeline, state.clone())?;
 
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs
index 1eb032fb1bdc3..4bfbe9f9646ff 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_builder.rs
@@ -218,7 +218,7 @@ impl TransformSortBuilder {
     }
 
     fn new_base(&self) -> Base {
-        let schema = add_order_field(self.schema.clone(), &self.sort_desc);
+        let schema = self.inner_schema();
         let sort_row_offset = schema.fields().len() - 1;
         Base {
             sort_row_offset,
@@ -228,6 +228,10 @@
         }
     }
 
+    pub fn inner_schema(&self) -> DataSchemaRef {
+        add_order_field(self.schema.clone(), &self.sort_desc)
+    }
+
     pub fn add_shuffle(&self, pipeline: &mut Pipeline, state: Arc<SortSampleState>) -> Result<()> {
         use std::sync::atomic;
         let i = atomic::AtomicUsize::new(0);
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
index f617e50ebcdac..422537c486f80 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
@@ -251,7 +251,6 @@ impl SortSampleState {
 struct StateInner {
     // target partitions
     partitions: usize,
-    // schema for bounds DataBlock
     schema: DataSchemaRef,
     partial: Vec<Option<Bounds>>,
     bounds: Option<Bounds>,
From 88a11cb5fb258986a58d55267bb485c4f54627b4 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 25 Apr 2025 17:50:05 +0800
Subject: [PATCH 26/33] fix

Signed-off-by: coldWater
---
 tests/sqllogictests/suites/mode/standalone/explain/sort.test | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/sqllogictests/suites/mode/standalone/explain/sort.test b/tests/sqllogictests/suites/mode/standalone/explain/sort.test
index 79adbf6c96efe..12c60372efffa 100644
--- a/tests/sqllogictests/suites/mode/standalone/explain/sort.test
+++ b/tests/sqllogictests/suites/mode/standalone/explain/sort.test
@@ -1,6 +1,9 @@
 statement ok
 create or replace table t1(a int, b int);
 
+statement ok
+set enable_range_shuffle_sort = 0;
+
 query T
 explain select a from (select * from t1 order by a) as t2 where a > 1;
 ----
From 0ee61543c1ddcbfcb36e51febcdcdad3e2c1da98 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 25 Apr 2025 18:04:06 +0800
Subject: [PATCH 27/33] fix

Signed-off-by: coldWater
---
 .../sqllogictests/suites/stage/formats/parquet/read_policy.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/sqllogictests/suites/stage/formats/parquet/read_policy.test b/tests/sqllogictests/suites/stage/formats/parquet/read_policy.test
index df51ce3bb1325..bc32805e36ad3 100644
--- a/tests/sqllogictests/suites/stage/formats/parquet/read_policy.test
+++ b/tests/sqllogictests/suites/stage/formats/parquet/read_policy.test
@@ -122,7 +122,7 @@ select id, t:a from @data/parquet/tuple.parquet where t:a > 1;
 
 # topk does not contain output
 query TT
-select id, t:b from @data/parquet/tuple.parquet order by t:a desc limit 2;
+select id, t:b from @data/parquet/tuple.parquet order by t:a desc, id desc limit 2;
 ----
 3 c
 2 b
From 3906b41596f9155741b740f7cf1712c9658d5ffa Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 25 Apr 2025 23:05:42 +0800
Subject: [PATCH 28/33] update

Signed-off-by: coldWater
---
 .../processors/transforms/sort/bounds.rs      | 91 ++++++++++++++++++-
 .../transforms/sort/sort_shuffle.rs           | 33 ++++++-
 .../processors/transforms/sort/sort_spill.rs  |  2 +-
 .../mode/standalone/explain/window.test       |  3 +
 .../stage/formats/parquet/read_policy.test    |  2 +-
 5 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs
index 741ccca55d66a..c8f2f6bfae7cb 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs
@@ -31,6 +31,10 @@ pub struct Bounds(
 );
 
 impl Bounds {
+    pub fn new_unchecked(column: Column) -> Bounds {
+        Bounds(vec![column])
+    }
+
     pub fn from_column<R: Rows>(column: Column) -> Result<Bounds> {
         let block = DataBlock::sort(
             &DataBlock::new_from_columns(vec![column]),
@@ -100,12 +104,12 @@
         if n == 0 {
             return Some(Self::default());
         }
-        let count = self.len();
-        if n >= count {
+        let total = self.len();
+        if n >= total {
             return None;
         }
 
-        let step = count / n;
+        let step = total / n;
         let offset = step / 2;
         let indices = self
             .0
@@ -131,6 +135,56 @@
             indices.len(),
         )]))
     }
+
+    pub fn dedup_reduce<R: Rows>(&self, n: usize) -> Self {
+        if n == 0 {
+            return Self::default();
+        }
+        let total = self.len();
+        let mut step = total as f64 / n as f64;
+        let mut target = step / 2.0;
+        let mut indices = Vec::with_capacity(n);
+        let mut last: Option<(R, _)> = None;
+        for (i, (b_idx, r_idx)) in self
+            .0
+            .iter()
+            .enumerate()
+            .rev()
+            .flat_map(|(b_idx, col)| std::iter::repeat_n(b_idx, col.len()).zip(0..col.len()))
+            .enumerate()
+        {
+            if indices.len() >= n {
+                break;
+            }
+            if (i as f64) < target {
+                continue;
+            }
+
+            let cur_rows = R::from_column(&self.0[b_idx]).unwrap();
+            if last
+                .as_ref()
+                .map(|(last_rows, last_idx)| cur_rows.row(r_idx) == last_rows.row(*last_idx))
+                .unwrap_or_default()
+            {
+                continue;
+            }
+
+            indices.push((b_idx as u32, r_idx as u32, 1));
+            target += step;
+            if (i as f64) > target && indices.len() < n {
+                step = (total - i) as f64 / (n - indices.len()) as f64;
+                target = i as f64 + step / 2.0;
+            }
+            last = Some((cur_rows, r_idx));
+        }
+
+        Bounds(vec![Column::take_column_indices(
+            &self.0,
+            R::data_type(),
+            &indices,
+            indices.len(),
+        )])
+    }
 }
 
 impl SortedStream for Bounds {
@@ -233,4 +287,35 @@
 
         Ok(())
     }
+
+    #[test]
+    fn test_dedup_reduce() -> Result<()> {
+        let column = Int32Type::from_data(vec![1, 2, 2, 3, 3, 3, 4, 5, 5]);
+        let bounds = Bounds::new_unchecked(column);
+        let reduced = bounds.dedup_reduce::<SimpleRowsAsc<Int32Type>>(3);
+        assert_eq!(reduced, Bounds(vec![Int32Type::from_data(vec![2, 3, 5])]));
+
+        let column = Int32Type::from_data(vec![5, 5, 4, 3, 3, 3, 2, 2, 1]);
+        let bounds = Bounds::new_unchecked(column);
+        let reduced = bounds.dedup_reduce::<SimpleRowsDesc<Int32Type>>(3);
+        assert_eq!(reduced, Bounds(vec![Int32Type::from_data(vec![4, 3, 1])]));
+
+        let bounds_vec = [vec![5, 6, 7, 7], vec![3, 3, 4, 5], vec![1, 2, 2, 3]]
+            .into_iter()
+            .map(|v| Int32Type::from_data(v))
+            .collect::<Vec<_>>();
+        let bounds = Bounds(bounds_vec);
+        let reduced = bounds.dedup_reduce::<SimpleRowsAsc<Int32Type>>(5);
+        assert_eq!(
+            reduced,
+            Bounds(vec![Int32Type::from_data(vec![2, 3, 4, 6, 7])])
+        );
+
+        let column = Int32Type::from_data(vec![1, 1, 1, 1, 1]);
+        let bounds = Bounds(vec![column]);
+        let reduced = bounds.dedup_reduce::<SimpleRowsAsc<Int32Type>>(3);
+        assert_eq!(reduced, Bounds(vec![Int32Type::from_data(vec![1])]));
+
+        Ok(())
+    }
 }
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
index 422537c486f80..41c4ae5360132 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_shuffle.rs
@@ -191,7 +191,7 @@ impl<R: Rows> Processor for TransformSortShuffle<R> {
     async fn async_process(&mut self) -> Result<()> {
         let bounds = match &self.step {
             Step::None if self.input.is_finished() => Bounds::default(),
-            Step::Meta(meta) => meta.bounds.clone(),
+            Step::Meta(meta) => meta.generate_bounds(),
             _ => unreachable!(),
         };
         self.state.commit_sample::<R>(self.id, bounds)?;
@@ -201,6 +201,27 @@ impl<R: Rows> Processor for TransformSortShuffle<R> {
     }
 }
 
+impl SortCollectedMeta {
+    fn generate_bounds(&self) -> Bounds {
+        if self.bounds.len() > 1 {
+            return self.bounds.clone();
+        }
+
+        let Some(blocks) = self.blocks.get(self.blocks.len() / 2) else {
+            return Bounds::default();
+        };
+
+        blocks
+            .get(blocks.len() / 2)
+            .map(|block| match block.domain.len() {
+                0 => Bounds::default(),
+                1 => Bounds::new_unchecked(block.domain.clone()),
+                _ => Bounds::new_unchecked(block.domain.slice(0..1)),
+            })
+            .unwrap_or_default()
+    }
+}
+
 pub struct SortSampleState {
     inner: RwLock<StateInner>,
     pub(super) done: WatchNotify,
@@ -261,9 +282,13 @@ impl StateInner {
     fn determine_bounds<R: Rows>(&mut self) -> Result<()> {
         let v = self.partial.drain(..).map(Option::unwrap).collect();
         let bounds = Bounds::merge::<R>(v, self.batch_rows)?;
-        let bounds = bounds
-            .reduce(self.partitions - 1, R::data_type())
-            .unwrap_or(bounds);
+
+        let n = self.partitions - 1;
+        let bounds = if bounds.len() < n {
+            bounds
+        } else {
+            bounds.dedup_reduce::<R>(n)
+        };
         assert!(bounds.len() < self.partitions);
 
         self.bounds = Some(bounds);
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
index 588878cd88bd8..859a535a23efc 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs
@@ -596,7 +596,7 @@ pub struct SpillableBlock {
     data: Option<DataBlock>,
     rows: usize,
     location: Option<Location>,
-    domain: Column,
+    pub(super) domain: Column,
     processed: usize,
 }
diff --git a/tests/sqllogictests/suites/mode/standalone/explain/window.test b/tests/sqllogictests/suites/mode/standalone/explain/window.test
index 469a7088d4cd9..3b30ff5d865ea 100644
--- a/tests/sqllogictests/suites/mode/standalone/explain/window.test
+++ b/tests/sqllogictests/suites/mode/standalone/explain/window.test
@@ -47,6 +47,9 @@ set sort_spilling_memory_ratio = 0;
 statement ok
 set enable_parallel_multi_merge_sort = 0;
 
+statement ok
+set enable_range_shuffle_sort = 0;
+
 query T
 explain pipeline SELECT depname, empno, salary, sum(salary) OVER (PARTITION BY depname ORDER BY empno) FROM empsalary ORDER BY depname, empno;
 ----
diff --git a/tests/sqllogictests/suites/stage/formats/parquet/read_policy.test b/tests/sqllogictests/suites/stage/formats/parquet/read_policy.test
index bc32805e36ad3..075d2f24d28b9 100644
--- a/tests/sqllogictests/suites/stage/formats/parquet/read_policy.test
+++ b/tests/sqllogictests/suites/stage/formats/parquet/read_policy.test
@@ -135,7 +135,7 @@ select t, t:a from @data/parquet/tuple.parquet order by id desc limit 2;
 
 # topk contains output
 query TT
-select id, t:b, t:a from @data/parquet/tuple.parquet order by t:a desc limit 2;
+select id, t:b, t:a from @data/parquet/tuple.parquet order by t:a desc, id desc limit 2;
 ----
 3 c 3
 2 b 3
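A quick note on the `dedup_reduce` method introduced in the patch above: it walks the concatenated bound columns once, keeping at most n values that are both distinct from the previously kept value and roughly total/n apart, re-stretching the stride whenever duplicates push the cursor past its target. The following is a minimal standalone sketch of that selection loop over a plain ascending slice (illustrative only; the real method iterates column blocks in reverse and compares rows through the `R: Rows` abstraction):

// Pick at most n deduplicated, roughly evenly spaced pivots from a sorted
// slice. Mirrors the selection loop of Bounds::dedup_reduce, for i32 values.
fn dedup_pivots(sorted: &[i32], n: usize) -> Vec<i32> {
    let total = sorted.len();
    let mut step = total as f64 / n as f64;
    let mut target = step / 2.0; // aim for the middle of each stride
    let mut picked = Vec::with_capacity(n);
    for (i, v) in sorted.iter().enumerate() {
        if picked.len() >= n {
            break;
        }
        if (i as f64) < target {
            continue; // not yet at the next sampling point
        }
        if picked.last() == Some(v) {
            continue; // skip duplicates of the last picked value
        }
        picked.push(*v);
        target += step;
        // Duplicates may have pushed us past the target: restretch the
        // remaining strides over the values that are left.
        if (i as f64) > target && picked.len() < n {
            step = (total - i) as f64 / (n - picked.len()) as f64;
            target = i as f64 + step / 2.0;
        }
    }
    picked
}

fn main() {
    // Matches the expectations in test_dedup_reduce above.
    assert_eq!(dedup_pivots(&[1, 2, 2, 3, 3, 3, 4, 5, 5], 3), vec![2, 3, 5]);
    assert_eq!(dedup_pivots(&[1, 1, 1, 1, 1], 3), vec![1]);
}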
From 2c821873cd91b3becb7c77746c88f1ea98b22edf Mon Sep 17 00:00:00 2001
From: coldWater
Date: Sat, 26 Apr 2025 18:12:46 +0800
Subject: [PATCH 29/33] fix

Signed-off-by: coldWater
---
 .../processors/transforms/sort/bounds.rs      | 55 +++++++++----------
 .../processors/transforms/sort/merge_sort.rs  |  4 ++
 .../transforms/sort/sort_collect.rs           |  4 ++
 3 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs
index c8f2f6bfae7cb..6f20fb9faf1ef 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/bounds.rs
@@ -209,6 +209,11 @@ mod tests {
 
     use super::*;
 
+    fn int32_columns<T>(data: T) -> Vec<Column>
+    where T: IntoIterator<Item = Vec<i32>> {
+        data.into_iter().map(Int32Type::from_data).collect()
+    }
+
     #[test]
     fn test_merge() -> Result<()> {
         {
@@ -231,17 +236,12 @@
 
         assert_eq!(
             bounds,
-            Bounds(vec![
-                Int32Type::from_data(vec![6, 7]),
-                Int32Type::from_data(vec![2, 6, 6]),
-                Int32Type::from_data(vec![0, 0, 1]),
-            ])
+            Bounds(int32_columns([vec![6, 7], vec![2, 6, 6], vec![0, 0, 1]]))
         );
     }
 
     {
-        let data = vec![vec![77, -2, 7], vec![3, 8, 6, 1, 1], vec![2]];
-
+        let data = [vec![77, -2, 7], vec![3, 8, 6, 1, 1], vec![2]];
         let data = data
             .into_iter()
             .map(|v| Bounds::from_column::<SimpleRowsAsc<Int32Type>>(Int32Type::from_data(v)))
@@ -250,13 +250,13 @@
 
         assert_eq!(
             bounds,
-            Bounds(vec![
-                Int32Type::from_data(vec![-2]),
-                Int32Type::from_data(vec![1, 1]),
-                Int32Type::from_data(vec![3, 2]),
-                Int32Type::from_data(vec![7, 6]),
-                Int32Type::from_data(vec![77, 8]),
-            ])
+            Bounds(int32_columns([
+                vec![-2],
+                vec![1, 1],
+                vec![3, 2],
+                vec![7, 6],
+                vec![77, 8]
+            ]))
         );
     }
 
@@ -274,16 +274,16 @@
         let bounds = Bounds::merge::<SimpleRowsAsc<Int32Type>>(data, 2)?;
 
         let got = bounds.reduce(4, Int32Type::data_type()).unwrap();
-        assert_eq!(got, Bounds(vec![Int32Type::from_data(vec![8, 6, 2, 1])])); // 77 _8 7 _6 3 _2 1 _1 -2
+        assert_eq!(got, Bounds(int32_columns([vec![8, 6, 2, 1]]))); // 77 _8 7 _6 3 _2 1 _1 -2
 
         let got = bounds.reduce(3, Int32Type::data_type()).unwrap();
-        assert_eq!(got, Bounds(vec![Int32Type::from_data(vec![8, 3, 1])])); // 77 _8 7 6 _3 2 1 _1 -2
+        assert_eq!(got, Bounds(int32_columns([vec![8, 3, 1]]))); // 77 _8 7 6 _3 2 1 _1 -2
 
         let got = bounds.reduce(2, Int32Type::data_type()).unwrap();
-        assert_eq!(got, Bounds(vec![Int32Type::from_data(vec![7, 1])])); // 77 8 _7 6 3 2 _1 1 -2
+        assert_eq!(got, Bounds(int32_columns([vec![7, 1]]))); // 77 8 _7 6 3 2 _1 1 -2
 
         let got = bounds.reduce(1, Int32Type::data_type()).unwrap();
-        assert_eq!(got, Bounds(vec![Int32Type::from_data(vec![3])])); // 77 8 7 6 _3 2 1 1 -2
+        assert_eq!(got, Bounds(int32_columns([vec![3]]))); // 77 8 7 6 _3 2 1 1 -2
 
         Ok(())
     }
@@ -293,28 +293,23 @@
         let column = Int32Type::from_data(vec![1, 2, 2, 3, 3, 3, 4, 5, 5]);
         let bounds = Bounds::new_unchecked(column);
         let reduced = bounds.dedup_reduce::<SimpleRowsAsc<Int32Type>>(3);
-        assert_eq!(reduced, Bounds(vec![Int32Type::from_data(vec![2, 3, 5])]));
+        assert_eq!(reduced, Bounds(int32_columns([vec![2, 3, 5]])));
 
         let column = Int32Type::from_data(vec![5, 5, 4, 3, 3, 3, 2, 2, 1]);
         let bounds = Bounds::new_unchecked(column);
         let reduced = bounds.dedup_reduce::<SimpleRowsDesc<Int32Type>>(3);
-        assert_eq!(reduced, Bounds(vec![Int32Type::from_data(vec![4, 3, 1])]));
+        assert_eq!(reduced, Bounds(int32_columns([vec![4, 3, 1]])));
 
-        let bounds_vec = [vec![5, 6, 7, 7], vec![3, 3, 4, 5], vec![1, 2, 2, 3]]
-            .into_iter()
-            .map(|v| Int32Type::from_data(v))
-            .collect::<Vec<_>>();
-        let bounds = Bounds(bounds_vec);
+        let bounds = Bounds(int32_columns([vec![5, 6, 7, 7], vec![3, 3, 4, 5], vec![
+            1, 2, 2, 3,
+        ]]));
         let reduced = bounds.dedup_reduce::<SimpleRowsAsc<Int32Type>>(5);
-        assert_eq!(
-            reduced,
-            Bounds(vec![Int32Type::from_data(vec![2, 3, 4, 6, 7])])
-        );
+        assert_eq!(reduced, Bounds(int32_columns([vec![2, 3, 4, 6, 7]])));
 
         let column = Int32Type::from_data(vec![1, 1, 1, 1, 1]);
         let bounds = Bounds(vec![column]);
         let reduced = bounds.dedup_reduce::<SimpleRowsAsc<Int32Type>>(3);
-        assert_eq!(reduced, Bounds(vec![Int32Type::from_data(vec![1])]));
+        assert_eq!(reduced, Bounds(int32_columns([vec![1]])));
 
         Ok(())
     }
diff --git 
a/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs index 165664a9df464..7fdd0f4b09c98 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs @@ -195,6 +195,10 @@ where let unit_size = self.memory_settings.spill_unit_size; let num_merge = bytes.div_ceil(unit_size).max(2); let batch_rows = rows.div_ceil(num_merge); + + /// The memory will be doubled during merging. + const MERGE_RATIO: usize = 2; + let num_merge = num_merge.div_ceil(MERGE_RATIO).max(2); log::info!("determine sort spill params, buffer_bytes: {bytes}, buffer_rows: {rows}, spill_unit_size: {unit_size}, batch_rows: {batch_rows}, batch_num_merge {num_merge}"); SortSpillParams { batch_rows, diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs index 2f7330ea2c494..5a620668e02d8 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs @@ -163,6 +163,10 @@ where let unit_size = self.memory_settings.spill_unit_size; let num_merge = bytes.div_ceil(unit_size).max(2); let batch_rows = rows.div_ceil(num_merge); + + /// The memory will be doubled during merging. + const MERGE_RATIO: usize = 2; + let num_merge = num_merge.div_ceil(MERGE_RATIO).max(2); log::info!("determine sort spill params, buffer_bytes: {bytes}, buffer_rows: {rows}, spill_unit_size: {unit_size}, batch_rows: {batch_rows}, batch_num_merge {num_merge}"); SortSpillParams { batch_rows, From cef81b4dcb9d7b975ce05d89202cf1f97e63e7a2 Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 27 Apr 2025 16:33:00 +0800 Subject: [PATCH 30/33] fix Signed-off-by: coldWater --- .../src/pipelines/builders/builder_sort.rs | 7 ++-- .../processors/transforms/sort/merge_sort.rs | 2 +- .../transforms/sort/sort_collect.rs | 37 ++++++++++++++----- .../processors/transforms/sort/sort_spill.rs | 2 +- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index c3c9dc1b3a9ea..902cb1e66b088 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -214,7 +214,7 @@ impl SortPipelineBuilder { fn build_range_shuffle_sort_pipeline(self, pipeline: &mut Pipeline) -> Result<()> { let inputs = pipeline.output_len(); let settings = self.ctx.get_settings(); - let max_threads = settings.get_max_threads()? as usize; + let num_exec = inputs; let max_block_size = settings.get_max_block_size()? 
as usize;
 
         // Partial sort
@@ -256,7 +256,8 @@ impl SortPipelineBuilder {
             Ok(ProcessorPtr::create(builder.build_collect(input, output)?))
         })?;
 
-        let state =
-            SortSampleState::new(inputs, max_threads, builder.inner_schema(), max_block_size);
+        let state = SortSampleState::new(inputs, num_exec, builder.inner_schema(), max_block_size);
 
         builder.add_shuffle(pipeline, state.clone())?;
 
-        pipeline.exchange(max_threads, Arc::new(SortRangeExchange));
+        pipeline.exchange(num_exec, Arc::new(SortRangeExchange));
 
         pipeline.add_transform(|input, output| {
             Ok(ProcessorPtr::create(builder.build_combine(input, output)?))
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs
index 7fdd0f4b09c98..ac0abeec46ec0 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/merge_sort.rs
@@ -435,7 +435,7 @@ where
         if memory_rows > 0 && memory_rows + input > max {
             spill_sort
-                .subsequent_spill_last(memory_rows + input - max)
+                .collect_spill_last(memory_rows + input - max)
                 .await?;
         }
         if input > max || finished && input > 0 {
diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs
index 5a620668e02d8..910f0b76e05aa 100644
--- a/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs
+++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs
@@ -51,6 +51,7 @@ pub struct TransformSortCollect {
     output: Arc<OutputPort>,
     output_data: Option<DataBlock>,
 
+    max_block_size: usize,
     row_converter: C,
     sort_desc: Arc<[SortColumnDescription]>,
     /// If this transform is after an Exchange transform,
@@ -102,6 +103,7 @@ where
             inner,
             aborting: AtomicBool::new(false),
             memory_settings,
+            max_block_size,
         })
     }
 
@@ -117,12 +119,19 @@ where
         Ok((rows, block))
     }
 
-    fn limit_trans_to_spill(&mut self) -> Result<()> {
+    fn limit_trans_to_spill(&mut self, no_spill: bool) -> Result<()> {
         let Inner::Limit(merger) = &self.inner else {
             unreachable!()
         };
         assert!(merger.num_rows() > 0);
-        let params = self.determine_params(merger.num_bytes(), merger.num_rows());
+        let params = if no_spill {
+            SortSpillParams {
+                batch_rows: self.max_block_size,
+                num_merge: merger.num_rows().div_ceil(self.max_block_size),
+            }
+        } else {
+            self.determine_params(merger.num_bytes(), merger.num_rows())
+        };
         let Inner::Limit(merger) = &mut self.inner else {
             unreachable!()
         };
@@ -132,7 +141,7 @@ where
         Ok(())
     }
 
-    fn collect_trans_to_spill(&mut self, input_data: Vec<DataBlock>) {
+    fn collect_trans_to_spill(&mut self, input_data: Vec<DataBlock>, no_spill: bool) {
         let (num_rows, num_bytes) = input_data
             .iter()
             .map(|block| (block.num_rows(), block.memory_size()))
@@ -140,17 +149,24 @@ where
             (acc_rows + rows, acc_bytes + bytes)
         });
         assert!(num_rows > 0);
-        let params = self.determine_params(num_bytes, num_rows);
+        let params = if no_spill {
+            SortSpillParams {
+                batch_rows: self.max_block_size,
+                num_merge: num_rows.div_ceil(self.max_block_size),
+            }
+        } else {
+            self.determine_params(num_bytes, num_rows)
+        };
         let spill_sort = SortSpill::new(self.base.clone(), params);
         self.inner = Inner::Spill(input_data, spill_sort);
     }
 
-    fn trans_to_spill(&mut self) -> Result<()> {
+    fn trans_to_spill(&mut self, no_spill: bool) -> Result<()> {
         match &mut self.inner {
-            Inner::Limit(_) => self.limit_trans_to_spill(),
+            Inner::Limit(_) => self.limit_trans_to_spill(no_spill),
Inner::Collect(input_data) => { let input_data = std::mem::take(input_data); - self.collect_trans_to_spill(input_data); + self.collect_trans_to_spill(input_data, no_spill); Ok(()) } Inner::Spill(_, _) => Ok(()), @@ -322,18 +338,19 @@ where #[async_backtrace::framed] async fn async_process(&mut self) -> Result<()> { let finished = self.input.is_finished(); - self.trans_to_spill()?; + self.trans_to_spill(finished)?; - let input = self.input_rows(); let Inner::Spill(input_data, spill_sort) = &mut self.inner else { unreachable!() }; + + let input = input_data.in_memory_rows(); let memory_rows = spill_sort.collect_memory_rows(); let max = spill_sort.max_rows(); if memory_rows > 0 && memory_rows + input > max { spill_sort - .subsequent_spill_last(memory_rows + input - max) + .collect_spill_last(memory_rows + input - max) .await?; } if input > max || finished && input > 0 { diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs index 859a535a23efc..440932b258da7 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_spill.rs @@ -134,7 +134,7 @@ where A: SortAlgorithm collect.sort_input_data(&self.base, input_data, aborting) } - pub async fn subsequent_spill_last(&mut self, target_rows: usize) -> Result<()> { + pub async fn collect_spill_last(&mut self, target_rows: usize) -> Result<()> { let Step::Collect(collect) = &mut self.step else { unreachable!() }; From f2883684eb737f9fbbadf176d519c4627a2201fc Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 27 Apr 2025 17:22:16 +0800 Subject: [PATCH 31/33] fix Signed-off-by: coldWater --- .../pipelines/processors/transforms/sort/sort_collect.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs index 910f0b76e05aa..5393e569aa54e 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs @@ -216,13 +216,6 @@ where } } - fn input_rows(&self) -> usize { - match &self.inner { - Inner::Collect(input_data) | Inner::Spill(input_data, _) => input_data.in_memory_rows(), - _ => 0, - } - } - fn check_spill(&self) -> bool { if !self.memory_settings.check_spill() { return false; From 5fa4b81de37bda9e5a6ba64cbd01598e6521911b Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 27 Apr 2025 18:25:21 +0800 Subject: [PATCH 32/33] fix Signed-off-by: coldWater --- .../src/pipelines/processors/transforms/sort/sort_collect.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs index 5393e569aa54e..dbf46ca817829 100644 --- a/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/sort/sort_collect.rs @@ -127,7 +127,7 @@ where let params = if no_spill { SortSpillParams { batch_rows: self.max_block_size, - num_merge: merger.num_rows().div_ceil(self.max_block_size), + num_merge: merger.num_rows().div_ceil(self.max_block_size).max(2), } } else { self.determine_params(merger.num_bytes(), merger.num_rows()) @@ -152,7 +152,7 @@ where let params = if no_spill { 
SortSpillParams { batch_rows: self.max_block_size, - num_merge: num_rows.div_ceil(self.max_block_size), + num_merge: num_rows.div_ceil(self.max_block_size).max(2), } } else { self.determine_params(num_bytes, num_rows) From 8f4987333429a33c2cc80af9960055e9612d7b79 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 28 Apr 2025 10:06:01 +0800 Subject: [PATCH 33/33] fix Signed-off-by: coldWater --- src/query/settings/src/settings_default.rs | 2 +- .../suites/mode/standalone/explain/sort.test | 3 - .../mode/standalone/explain/window.test | 117 ++++++++++-------- 3 files changed, 63 insertions(+), 59 deletions(-) diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index d79bf762155e7..a4f314fc0b936 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -606,7 +606,7 @@ impl DefaultSettings { range: Some(SettingRange::Numeric(4 * 1024..=u64::MAX)), }), ("enable_range_shuffle_sort", DefaultSettingValue { - value: UserSettingValue::UInt64(1), + value: UserSettingValue::UInt64(0), desc: "Enable range shuffle sort.", mode: SettingMode::Both, scope: SettingScope::Both, diff --git a/tests/sqllogictests/suites/mode/standalone/explain/sort.test b/tests/sqllogictests/suites/mode/standalone/explain/sort.test index 12c60372efffa..79adbf6c96efe 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/sort.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/sort.test @@ -1,9 +1,6 @@ statement ok create or replace table t1(a int, b int); -statement ok -set enable_range_shuffle_sort = 0; - query T explain select a from (select * from t1 order by a) as t2 where a > 1; ---- diff --git a/tests/sqllogictests/suites/mode/standalone/explain/window.test b/tests/sqllogictests/suites/mode/standalone/explain/window.test index 3b30ff5d865ea..c0a5aa212603a 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/window.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/window.test @@ -44,9 +44,6 @@ set max_threads=4; statement ok set sort_spilling_memory_ratio = 0; -statement ok -set enable_parallel_multi_merge_sort = 0; - statement ok set enable_range_shuffle_sort = 0; @@ -54,17 +51,19 @@ query T explain pipeline SELECT depname, empno, salary, sum(salary) OVER (PARTITION BY depname ORDER BY empno) FROM empsalary ORDER BY depname, empno; ---- CompoundBlockOperator(Project) × 1 - Merge to MultiSortMerge × 1 - TransformSortMerge × 4 - SortPartialTransform × 4 - Merge to Resize × 4 - Transform Window × 1 - TransformWindowPartitionCollect(Sort) × 1 - ShuffleMergePartition(Window) × 1 - ShufflePartition(Window) × 1 - DeserializeDataTransform × 1 - SyncReadParquetDataTransform × 1 - BlockPartitionSource × 1 + Merge to KWayMergeCombiner × 1 + KWayMergeWorker × 4 + KWayMergePartitioner × 1 + TransformSortMerge × 4 + SortPartialTransform × 4 + Merge to Resize × 4 + Transform Window × 1 + TransformWindowPartitionCollect(Sort) × 1 + ShuffleMergePartition(Window) × 1 + ShufflePartition(Window) × 1 + DeserializeDataTransform × 1 + SyncReadParquetDataTransform × 1 + BlockPartitionSource × 1 # Enable sort spilling @@ -75,17 +74,19 @@ query T explain pipeline SELECT depname, empno, salary, sum(salary) OVER (PARTITION BY depname ORDER BY empno) FROM empsalary ORDER BY depname, empno; ---- CompoundBlockOperator(Project) × 1 - Merge to MultiSortMerge × 1 - TransformSortMerge × 4 - SortPartialTransform × 4 - Merge to Resize × 4 - Transform Window × 1 - TransformWindowPartitionCollect(Sort) × 1 - 
ShuffleMergePartition(Window) × 1 - ShufflePartition(Window) × 1 - DeserializeDataTransform × 1 - SyncReadParquetDataTransform × 1 - BlockPartitionSource × 1 + Merge to KWayMergeCombiner × 1 + KWayMergeWorker × 4 + KWayMergePartitioner × 1 + TransformSortMerge × 4 + SortPartialTransform × 4 + Merge to Resize × 4 + Transform Window × 1 + TransformWindowPartitionCollect(Sort) × 1 + ShuffleMergePartition(Window) × 1 + ShufflePartition(Window) × 1 + DeserializeDataTransform × 1 + SyncReadParquetDataTransform × 1 + BlockPartitionSource × 1 statement ok @@ -450,17 +451,19 @@ avg(a) over (order by a rows between unbounded preceding and current row) from t CompoundBlockOperator(Project) × 1 LimitTransform × 1 Transform Window × 1 - Merge to MultiSortMerge × 1 - TransformSortMerge × 4 - SortPartialTransform × 4 - Merge to Resize × 4 - Transform Window × 1 - TransformWindowPartitionCollect(Sort) × 1 - ShuffleMergePartition(Window) × 1 - ShufflePartition(Window) × 1 - DeserializeDataTransform × 1 - SyncReadParquetDataTransform × 1 - BlockPartitionSource × 1 + Merge to KWayMergeCombiner × 1 + KWayMergeWorker × 4 + KWayMergePartitioner × 1 + TransformSortMerge × 4 + SortPartialTransform × 4 + Merge to Resize × 4 + Transform Window × 1 + TransformWindowPartitionCollect(Sort) × 1 + ShuffleMergePartition(Window) × 1 + ShufflePartition(Window) × 1 + DeserializeDataTransform × 1 + SyncReadParquetDataTransform × 1 + BlockPartitionSource × 1 # row fetch with window function(pipeline explain) query T @@ -469,19 +472,21 @@ explain pipeline select *, sum(a) over (partition by a order by a desc rows betw CompoundBlockOperator(Project) × 1 TransformRowsFetcher × 1 LimitTransform × 1 - Merge to MultiSortMerge × 1 - TransformSortMergeLimit × 4 - SortPartialTransform × 4 - Merge to Resize × 4 - Transform Window × 1 - TransformWindowPartitionCollect(Sort) × 1 - ShuffleMergePartition(Window) × 1 - ShufflePartition(Window) × 1 - TransformFilter × 1 - AddInternalColumnsTransform × 1 - DeserializeDataTransform × 1 - SyncReadParquetDataTransform × 1 - BlockPartitionSource × 1 + Merge to KWayMergeCombiner × 1 + KWayMergeWorker × 4 + KWayMergePartitioner × 1 + TransformSortMergeLimit × 4 + SortPartialTransform × 4 + Merge to Resize × 4 + Transform Window × 1 + TransformWindowPartitionCollect(Sort) × 1 + ShuffleMergePartition(Window) × 1 + ShufflePartition(Window) × 1 + TransformFilter × 1 + AddInternalColumnsTransform × 1 + DeserializeDataTransform × 1 + SyncReadParquetDataTransform × 1 + BlockPartitionSource × 1 # row fetch with window function(plan explain) query T @@ -562,12 +567,14 @@ CompoundBlockOperator(Project) × 1 Transform Window × 1 CompoundBlockOperator(Map) × 1 Transform Window × 1 - Merge to MultiSortMerge × 1 - TransformSortMerge × 4 - SortPartialTransform × 4 - Merge to Resize × 4 - CompoundBlockOperator(Map) × 1 - NumbersSourceTransform × 1 + Merge to KWayMergeCombiner × 1 + KWayMergeWorker × 4 + KWayMergePartitioner × 1 + TransformSortMerge × 4 + SortPartialTransform × 4 + Merge to Resize × 4 + CompoundBlockOperator(Map) × 1 + NumbersSourceTransform × 1 # same order same partiton by multi window query T