diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs index ebe94450c1f8..4807dac11efb 100644 --- a/datafusion/core/benches/aggregate_query_sql.rs +++ b/datafusion/core/benches/aggregate_query_sql.rs @@ -148,7 +148,7 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { query( ctx.clone(), - "SELECT utf8, approx_percentile_cont(u64_wide, 0.5, 2500) \ + "SELECT utf8, approx_percentile_cont(0.5, 2500) WITHIN GROUP (ORDER BY u64_wide) \ FROM t GROUP BY utf8", ) }) @@ -158,7 +158,7 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { query( ctx.clone(), - "SELECT utf8, approx_percentile_cont(f32, 0.5, 2500) \ + "SELECT utf8, approx_percentile_cont(0.5, 2500) WITHIN GROUP (ORDER BY f32) \ FROM t GROUP BY utf8", ) }) diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 28c0740ca76b..8ada55d5caa0 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -360,14 +360,29 @@ async fn test_fn_approx_median() -> Result<()> { #[tokio::test] async fn test_fn_approx_percentile_cont() -> Result<()> { - let expr = approx_percentile_cont(col("b"), lit(0.5), None); + let expr = approx_percentile_cont(col("b").sort(true, false), lit(0.5), None); let expected = [ - "+---------------------------------------------+", - "| approx_percentile_cont(test.b,Float64(0.5)) |", - "+---------------------------------------------+", - "| 10 |", - "+---------------------------------------------+", + "+---------------------------------------------------------------------------+", + "| approx_percentile_cont(Float64(0.5)) WITHIN GROUP [test.b ASC NULLS LAST] |", + "+---------------------------------------------------------------------------+", + "| 10 |", + "+---------------------------------------------------------------------------+", + ]; + + let df = create_test_table().await?; + let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; + + assert_batches_eq!(expected, &batches); + + let expr = approx_percentile_cont(col("b").sort(false, false), lit(0.1), None); + + let expected = [ + "+----------------------------------------------------------------------------+", + "| approx_percentile_cont(Float64(0.1)) WITHIN GROUP [test.b DESC NULLS LAST] |", + "+----------------------------------------------------------------------------+", + "| 100 |", + "+----------------------------------------------------------------------------+", ]; let df = create_test_table().await?; @@ -381,27 +396,60 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { None::<&str>, "arg_2".to_string(), )); - let expr = approx_percentile_cont(col("b"), alias_expr, None); + let expr = approx_percentile_cont(col("b").sort(true, false), alias_expr, None); + let df = create_test_table().await?; + let expected = [ + "+--------------------------------------------------------------------+", + "| approx_percentile_cont(arg_2) WITHIN GROUP [test.b ASC NULLS LAST] |", + "+--------------------------------------------------------------------+", + "| 10 |", + "+--------------------------------------------------------------------+", + ]; + let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; + + assert_batches_eq!(expected, &batches); + + let alias_expr = Expr::Alias(Alias::new( + cast(lit(0.1), DataType::Float32), + None::<&str>, + "arg_2".to_string(), + )); + let expr = approx_percentile_cont(col("b").sort(false, false), alias_expr, None); let df = create_test_table().await?; let expected = [ - "+--------------------------------------+", - "| approx_percentile_cont(test.b,arg_2) |", - "+--------------------------------------+", - "| 10 |", - "+--------------------------------------+", + "+---------------------------------------------------------------------+", + "| approx_percentile_cont(arg_2) WITHIN GROUP [test.b DESC NULLS LAST] |", + "+---------------------------------------------------------------------+", + "| 100 |", + "+---------------------------------------------------------------------+", ]; let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; assert_batches_eq!(expected, &batches); // with number of centroids set - let expr = approx_percentile_cont(col("b"), lit(0.5), Some(lit(2))); + let expr = approx_percentile_cont(col("b").sort(true, false), lit(0.5), Some(lit(2))); + let expected = [ + "+------------------------------------------------------------------------------------+", + "| approx_percentile_cont(Float64(0.5),Int32(2)) WITHIN GROUP [test.b ASC NULLS LAST] |", + "+------------------------------------------------------------------------------------+", + "| 30 |", + "+------------------------------------------------------------------------------------+", + ]; + + let df = create_test_table().await?; + let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; + + assert_batches_eq!(expected, &batches); + + let expr = + approx_percentile_cont(col("b").sort(false, false), lit(0.1), Some(lit(2))); let expected = [ - "+------------------------------------------------------+", - "| approx_percentile_cont(test.b,Float64(0.5),Int32(2)) |", - "+------------------------------------------------------+", - "| 30 |", - "+------------------------------------------------------+", + "+-------------------------------------------------------------------------------------+", + "| approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] |", + "+-------------------------------------------------------------------------------------+", + "| 69 |", + "+-------------------------------------------------------------------------------------+", ]; let df = create_test_table().await?; diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index f9039cea2edc..574fcb878ab0 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -308,6 +308,16 @@ impl AggregateUDF { self.inner.default_value(data_type) } + /// See [`AggregateUDFImpl::supports_null_handling_clause`] for more details. + pub fn supports_null_handling_clause(&self) -> bool { + self.inner.supports_null_handling_clause() + } + + /// See [`AggregateUDFImpl::is_ordered_set_aggregate`] for more details. + pub fn is_ordered_set_aggregate(&self) -> bool { + self.inner.is_ordered_set_aggregate() + } + /// Returns the documentation for this Aggregate UDF. /// /// Documentation can be accessed programmatically as well as @@ -425,6 +435,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { null_treatment, } = params; + // exclude the first function argument(= column) in ordered set aggregate function, + // because it is duplicated with the WITHIN GROUP clause in schema name. + let args = if self.is_ordered_set_aggregate() { + &args[1..] + } else { + &args[..] + }; + let mut schema_name = String::new(); schema_name.write_fmt(format_args!( @@ -443,8 +461,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { }; if let Some(order_by) = order_by { + let clause = match self.is_ordered_set_aggregate() { + true => "WITHIN GROUP", + false => "ORDER BY", + }; + schema_name.write_fmt(format_args!( - " ORDER BY [{}]", + " {} [{}]", + clause, schema_name_from_sorts(order_by)? ))?; }; @@ -845,6 +869,18 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { ScalarValue::try_from(data_type) } + /// If this function supports `[IGNORE NULLS | RESPECT NULLS]` clause, return true + /// If the function does not, return false + fn supports_null_handling_clause(&self) -> bool { + true + } + + /// If this function is ordered-set aggregate function, return true + /// If the function is not, return false + fn is_ordered_set_aggregate(&self) -> bool { + false + } + /// Returns the documentation for this Aggregate UDF. /// /// Documentation can be accessed programmatically as well as diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs index 787e08bae286..9a202879d94a 100644 --- a/datafusion/functions-aggregate/src/approx_median.rs +++ b/datafusion/functions-aggregate/src/approx_median.rs @@ -45,7 +45,7 @@ make_udaf_expr_and_func!( /// APPROX_MEDIAN aggregate expression #[user_doc( doc_section(label = "Approximate Functions"), - description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.", + description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY x)`.", syntax_example = "approx_median(expression)", sql_example = r#"```sql > SELECT approx_median(column_name) FROM table_name; diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 1fad5f73703c..41281733f5de 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -34,6 +34,7 @@ use datafusion_common::{ downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err, Result, ScalarValue, }; +use datafusion_expr::expr::{AggregateFunction, Sort}; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; use datafusion_expr::utils::format_state_name; @@ -51,29 +52,39 @@ create_func!(ApproxPercentileCont, approx_percentile_cont_udaf); /// Computes the approximate percentile continuous of a set of numbers pub fn approx_percentile_cont( - expression: Expr, + order_by: Sort, percentile: Expr, centroids: Option, ) -> Expr { + let expr = order_by.expr.clone(); + let args = if let Some(centroids) = centroids { - vec![expression, percentile, centroids] + vec![expr, percentile, centroids] } else { - vec![expression, percentile] + vec![expr, percentile] }; - approx_percentile_cont_udaf().call(args) + + Expr::AggregateFunction(AggregateFunction::new_udf( + approx_percentile_cont_udaf(), + args, + false, + None, + Some(vec![order_by]), + None, + )) } #[user_doc( doc_section(label = "Approximate Functions"), description = "Returns the approximate percentile of input values using the t-digest algorithm.", - syntax_example = "approx_percentile_cont(expression, percentile, centroids)", + syntax_example = "approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)", sql_example = r#"```sql -> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; -+-------------------------------------------------+ -| approx_percentile_cont(column_name, 0.75, 100) | -+-------------------------------------------------+ -| 65.0 | -+-------------------------------------------------+ +> SELECT approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name; ++-----------------------------------------------------------------------+ +| approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) | ++-----------------------------------------------------------------------+ +| 65.0 | ++-----------------------------------------------------------------------+ ```"#, standard_argument(name = "expression",), argument( @@ -130,6 +141,19 @@ impl ApproxPercentileCont { args: AccumulatorArgs, ) -> Result { let percentile = validate_input_percentile_expr(&args.exprs[1])?; + + let is_descending = args + .ordering_req + .first() + .map(|sort_expr| sort_expr.options.descending) + .unwrap_or(false); + + let percentile = if is_descending { + 1.0 - percentile + } else { + percentile + }; + let tdigest_max_size = if args.exprs.len() == 3 { Some(validate_input_max_size_expr(&args.exprs[2])?) } else { @@ -292,6 +316,14 @@ impl AggregateUDFImpl for ApproxPercentileCont { Ok(arg_types[0].clone()) } + fn supports_null_handling_clause(&self) -> bool { + false + } + + fn is_ordered_set_aggregate(&self) -> bool { + true + } + fn documentation(&self) -> Option<&Documentation> { self.doc() } diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs index 16dac2c1b8f0..0316757f26d0 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs @@ -52,14 +52,14 @@ make_udaf_expr_and_func!( #[user_doc( doc_section(label = "Approximate Functions"), description = "Returns the weighted approximate percentile of input values using the t-digest algorithm.", - syntax_example = "approx_percentile_cont_with_weight(expression, weight, percentile)", + syntax_example = "approx_percentile_cont_with_weight(weight, percentile) WITHIN GROUP (ORDER BY expression)", sql_example = r#"```sql -> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name; -+----------------------------------------------------------------------+ -| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) | -+----------------------------------------------------------------------+ -| 78.5 | -+----------------------------------------------------------------------+ +> SELECT approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) FROM table_name; ++---------------------------------------------------------------------------------------------+ +| approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) | ++---------------------------------------------------------------------------------------------+ +| 78.5 | ++---------------------------------------------------------------------------------------------+ ```"#, standard_argument(name = "expression", prefix = "The"), argument( @@ -178,6 +178,14 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight { self.approx_percentile_cont.state_fields(args) } + fn supports_null_handling_clause(&self) -> bool { + false + } + + fn is_ordered_set_aggregate(&self) -> bool { + true + } + fn documentation(&self) -> Option<&Documentation> { self.doc() } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 9cc7514a0d33..7fe586d64e6b 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -970,8 +970,8 @@ async fn roundtrip_expr_api() -> Result<()> { stddev_pop(lit(2.2)), approx_distinct(lit(2)), approx_median(lit(2)), - approx_percentile_cont(lit(2), lit(0.5), None), - approx_percentile_cont(lit(2), lit(0.5), Some(lit(50))), + approx_percentile_cont(lit(2).sort(true, false), lit(0.5), None), + approx_percentile_cont(lit(2).sort(true, false), lit(0.5), Some(lit(50))), approx_percentile_cont_with_weight(lit(2), lit(1), lit(0.5)), grouping(lit(1)), bit_and(lit(2)), diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 54c4946a2c9a..f466155b478c 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -503,7 +503,7 @@ fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> { vec![col("b", &schema)?, lit(0.5)], ) .schema(Arc::clone(&schema)) - .alias("APPROX_PERCENTILE_CONT(b, 0.5)") + .alias("APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY b)") .build() .map(Arc::new)?]; diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index cdf61183eb3d..e8ccca542dfc 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -75,7 +75,7 @@ fn find_closest_match(candidates: Vec, target: &str) -> Option { }) } -/// Arguments to for a function call extracted from the SQL AST +/// Arguments for a function call extracted from the SQL AST #[derive(Debug)] struct FunctionArgs { /// Function name @@ -92,6 +92,8 @@ struct FunctionArgs { null_treatment: Option, /// DISTINCT distinct: bool, + /// WITHIN GROUP clause, if any + within_group: Vec, } impl FunctionArgs { @@ -116,6 +118,7 @@ impl FunctionArgs { filter, null_treatment, distinct: false, + within_group, }); }; @@ -145,6 +148,9 @@ impl FunctionArgs { } FunctionArgumentClause::OrderBy(oby) => { if order_by.is_some() { + if !within_group.is_empty() { + return plan_err!("ORDER BY clause is only permitted in WITHIN GROUP clause when a WITHIN GROUP is used"); + } return not_impl_err!("Calling {name}: Duplicated ORDER BY clause in function arguments"); } order_by = Some(oby); @@ -177,8 +183,10 @@ impl FunctionArgs { } } - if !within_group.is_empty() { - return not_impl_err!("WITHIN GROUP is not supported yet: {within_group:?}"); + if within_group.len() > 1 { + return not_impl_err!( + "Only a single ordering expression is permitted in a WITHIN GROUP clause" + ); } let order_by = order_by.unwrap_or_default(); @@ -191,6 +199,7 @@ impl FunctionArgs { filter, null_treatment, distinct, + within_group, }) } } @@ -211,8 +220,14 @@ impl SqlToRel<'_, S> { filter, null_treatment, distinct, + within_group, } = function_args; + if over.is_some() && !within_group.is_empty() { + return plan_err!("OVER and WITHIN GROUP clause are can not be used together. \ + OVER is for window function, whereas WITHIN GROUP is for ordered set aggregate function"); + } + // If function is a window function (it has an OVER clause), // it shouldn't have ordering requirement as function argument // required ordering should be defined in OVER clause. @@ -349,15 +364,49 @@ impl SqlToRel<'_, S> { } else { // User defined aggregate functions (UDAF) have precedence in case it has the same name as a scalar built-in function if let Some(fm) = self.context_provider.get_aggregate_meta(&name) { - let order_by = self.order_by_to_sort_expr( - order_by, - schema, - planner_context, - true, - None, - )?; - let order_by = (!order_by.is_empty()).then_some(order_by); - let args = self.function_args_to_expr(args, schema, planner_context)?; + if fm.is_ordered_set_aggregate() && within_group.is_empty() { + return plan_err!("WITHIN GROUP clause is required when calling ordered set aggregate function({})", fm.name()); + } + + if null_treatment.is_some() && !fm.supports_null_handling_clause() { + return plan_err!( + "[IGNORE | RESPECT] NULLS are not permitted for {}", + fm.name() + ); + } + + let mut args = + self.function_args_to_expr(args, schema, planner_context)?; + + let order_by = if fm.is_ordered_set_aggregate() { + let within_group = self.order_by_to_sort_expr( + within_group, + schema, + planner_context, + false, + None, + )?; + + // add target column expression in within group clause to function arguments + if !within_group.is_empty() { + args = within_group + .iter() + .map(|sort| sort.expr.clone()) + .chain(args) + .collect::>(); + } + (!within_group.is_empty()).then_some(within_group) + } else { + let order_by = self.order_by_to_sort_expr( + order_by, + schema, + planner_context, + true, + None, + )?; + (!order_by.is_empty()).then_some(order_by) + }; + let filter: Option> = filter .map(|e| self.sql_expr_to_logical_expr(*e, schema, planner_context)) .transpose()? diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index d051cb78a8d5..99c765eb8a9d 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -291,6 +291,7 @@ impl Unparser<'_> { distinct, args, filter, + order_by, .. } = &agg.params; @@ -299,6 +300,16 @@ impl Unparser<'_> { Some(filter) => Some(Box::new(self.expr_to_sql_inner(filter)?)), None => None, }; + let within_group = if agg.func.is_ordered_set_aggregate() { + order_by + .as_ref() + .unwrap_or(&Vec::new()) + .iter() + .map(|sort_expr| self.sort_to_sql(sort_expr)) + .collect::>>()? + } else { + Vec::new() + }; Ok(ast::Expr::Function(Function { name: ObjectName(vec![Ident { value: func_name.to_string(), @@ -314,7 +325,7 @@ impl Unparser<'_> { filter, null_treatment: None, over: None, - within_group: vec![], + within_group, parameters: ast::FunctionArguments::None, uses_odbc_syntax: false, })) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index bc43f6bc8e61..61a34eca6d1a 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -133,36 +133,50 @@ SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_ # csv_query_approx_percentile_cont_with_weight statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function: coercion from \[Utf8, Int8, Float64\] to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont_with_weight(c1, c2, 0.95) FROM aggregate_test_100 +SELECT approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c1) FROM aggregate_test_100 statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function: coercion from \[Int16, Utf8, Float64\] to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont_with_weight(c3, c1, 0.95) FROM aggregate_test_100 +SELECT approx_percentile_cont_with_weight(c1, 0.95) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function: coercion from \[Int16, Int8, Utf8\] to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont_with_weight(c3, c2, c1) FROM aggregate_test_100 +SELECT approx_percentile_cont_with_weight(c2, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 # csv_query_approx_percentile_cont_with_histogram_bins statement error DataFusion error: This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\. -SELECT c1, approx_percentile_cont(c3, 0.95, -1000) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +SELECT c1, approx_percentile_cont(0.95, -1000) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from \[Int16, Float64, Utf8\] to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont(c3, 0.95, c1) FROM aggregate_test_100 +SELECT approx_percentile_cont(0.95, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from \[Int16, Float64, Float64\] to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont(c3, 0.95, 111.1) FROM aggregate_test_100 +SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from \[Float64, Float64, Float64\] to the signature OneOf(.*) failed(.|\n)* -SELECT approx_percentile_cont(c12, 0.95, 111.1) FROM aggregate_test_100 +SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 statement error DataFusion error: This feature is not implemented: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal -SELECT approx_percentile_cont(c12, c12) FROM aggregate_test_100 +SELECT approx_percentile_cont(c12) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 statement error DataFusion error: This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal -SELECT approx_percentile_cont(c12, 0.95, c5) FROM aggregate_test_100 +SELECT approx_percentile_cont(0.95, c5) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 + +statement error DataFusion error: This feature is not implemented: Conflicting ordering requirements in aggregate functions is not supported +SELECT approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c5), approx_percentile_cont(0.2) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: \[IGNORE | RESPECT\] NULLS are not permitted for approx_percentile_cont +SELECT approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c5) IGNORE NULLS FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: \[IGNORE | RESPECT\] NULLS are not permitted for approx_percentile_cont +SELECT approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c5) RESPECT NULLS FROM aggregate_test_100 + +statement error DataFusion error: This feature is not implemented: Only a single ordering expression is permitted in a WITHIN GROUP clause +SELECT approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c5, c12) FROM aggregate_test_100 # Not supported over sliding windows -query error This feature is not implemented: Aggregate can not be used as a sliding accumulator because `retract_batch` is not implemented -SELECT approx_percentile_cont(c3, 0.5) OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) +query error DataFusion error: Error during planning: OVER and WITHIN GROUP clause are can not be used together. OVER is for window function, whereas WITHIN GROUP is for ordered set aggregate function +SELECT approx_percentile_cont(0.5) +WITHIN GROUP (ORDER BY c3) +OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) FROM aggregate_test_100 # array agg can use order by @@ -1233,173 +1247,173 @@ SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS b FROM aggregate_test_10 #csv_query_approx_percentile_cont (c2) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.1) AS DOUBLE) / 1.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c2) AS DOUBLE) / 1.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.5) AS DOUBLE) / 3.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c2) AS DOUBLE) / 3.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.9) AS DOUBLE) / 5.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c2) AS DOUBLE) / 5.0) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c3) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.1) AS DOUBLE) / -95.3) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c3) AS DOUBLE) / -95.3) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.5) AS DOUBLE) / 15.5) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) AS DOUBLE) / 15.5) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.9) AS DOUBLE) / 102.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c3) AS DOUBLE) / 102.0) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c4) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.1) AS DOUBLE) / -22925.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c4) AS DOUBLE) / -22925.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.5) AS DOUBLE) / 4599.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c4) AS DOUBLE) / 4599.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.9) AS DOUBLE) / 25334.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c4) AS DOUBLE) / 25334.0) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c5) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.1) AS DOUBLE) / -1882606710.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c5) AS DOUBLE) / -1882606710.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.5) AS DOUBLE) / 377164262.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c5) AS DOUBLE) / 377164262.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.9) AS DOUBLE) / 1991374996.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c5) AS DOUBLE) / 1991374996.0) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c6) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.1) AS DOUBLE) / -7250000000000000000) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c6) AS DOUBLE) / -7250000000000000000) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.5) AS DOUBLE) / 1130000000000000000) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c6) AS DOUBLE) / 1130000000000000000) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.9) AS DOUBLE) / 7370000000000000000) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c6) AS DOUBLE) / 7370000000000000000) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c7) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.1) AS DOUBLE) / 18.9) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c7) AS DOUBLE) / 18.9) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.5) AS DOUBLE) / 134.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c7) AS DOUBLE) / 134.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.9) AS DOUBLE) / 231.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c7) AS DOUBLE) / 231.0) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c8) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.1) AS DOUBLE) / 2671.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c8) AS DOUBLE) / 2671.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.5) AS DOUBLE) / 30634.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c8) AS DOUBLE) / 30634.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.9) AS DOUBLE) / 57518.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c8) AS DOUBLE) / 57518.0) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c9) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.1) AS DOUBLE) / 472608672.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c9) AS DOUBLE) / 472608672.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.5) AS DOUBLE) / 2365817608.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c9) AS DOUBLE) / 2365817608.0) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.9) AS DOUBLE) / 3776538487.0) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c9) AS DOUBLE) / 3776538487.0) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c10) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.1) AS DOUBLE) / 1830000000000000000) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c10) AS DOUBLE) / 1830000000000000000) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.5) AS DOUBLE) / 9300000000000000000) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c10) AS DOUBLE) / 9300000000000000000) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.9) AS DOUBLE) / 16100000000000000000) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c10) AS DOUBLE) / 16100000000000000000) < 0.05) AS q FROM aggregate_test_100 ---- true # csv_query_approx_percentile_cont (c11) query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.1) AS DOUBLE) / 0.109) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c11) AS DOUBLE) / 0.109) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.5) AS DOUBLE) / 0.491) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c11) AS DOUBLE) / 0.491) < 0.05) AS q FROM aggregate_test_100 ---- true query B -SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.9) AS DOUBLE) / 0.834) < 0.05) AS q FROM aggregate_test_100 +SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c11) AS DOUBLE) / 0.834) < 0.05) AS q FROM aggregate_test_100 ---- true # percentile_cont_with_nulls query I -SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (1), (2), (3), (NULL), (NULL), (NULL)) as t (v); +SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (NULL), (NULL), (NULL)) as t (v); ---- 2 # percentile_cont_with_nulls_only query I -SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (CAST(NULL as INT))) as t (v); +SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v); ---- NULL @@ -1422,7 +1436,7 @@ NaN # ISSUE: https://github.com/apache/datafusion/issues/11870 query R -select APPROX_PERCENTILE_CONT(v2, 0.8) from tmp_percentile_cont; +select APPROX_PERCENTILE_CONT(0.8) WITHIN GROUP (ORDER BY v2) from tmp_percentile_cont; ---- NaN @@ -1430,10 +1444,10 @@ NaN # Note: `approx_percentile_cont_with_weight()` uses the same implementation as `approx_percentile_cont()` query R SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT( - v2, '+Inf'::Double, 0.9 ) +WITHIN GROUP (ORDER BY v2) FROM tmp_percentile_cont; ---- NaN @@ -1452,7 +1466,7 @@ INSERT INTO t1 VALUES (TRUE); # ISSUE: https://github.com/apache/datafusion/issues/12716 # This test verifies that approx_percentile_cont_with_weight does not panic when given 'NaN' and returns 'inf' query R -SELECT approx_percentile_cont_with_weight('NaN'::DOUBLE, 0, 0) FROM t1 WHERE t1.v1; +SELECT approx_percentile_cont_with_weight(0, 0) WITHIN GROUP (ORDER BY 'NaN'::DOUBLE) FROM t1 WHERE t1.v1; ---- Infinity @@ -1679,7 +1693,7 @@ b NULL NULL 7732.315789473684 # csv_query_approx_percentile_cont_with_weight query TI -SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 73 b 68 @@ -1687,9 +1701,18 @@ c 122 d 124 e 115 +query TI +SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a -101 +b -114 +c -109 +d -98 +e -93 + # csv_query_approx_percentile_cont_with_weight (2) query TI -SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 73 b 68 @@ -1697,9 +1720,18 @@ c 122 d 124 e 115 +query TI +SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a -101 +b -114 +c -109 +d -98 +e -93 + # csv_query_approx_percentile_cont_with_histogram_bins query TI -SELECT c1, approx_percentile_cont(c3, 0.95, 200) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +SELECT c1, approx_percentile_cont(0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 73 b 68 @@ -1708,7 +1740,7 @@ d 124 e 115 query TI -SELECT c1, approx_percentile_cont_with_weight(c3, c2, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +SELECT c1, approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 74 b 68 @@ -2998,7 +3030,7 @@ SELECT COUNT(DISTINCT c1) FROM test # test_approx_percentile_cont_decimal_support query TI -SELECT c1, approx_percentile_cont(c2, cast(0.85 as decimal(10,2))) apc FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +SELECT c1, approx_percentile_cont(cast(0.85 as decimal(10,2))) WITHIN GROUP (ORDER BY c2) apc FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 ---- a 4 b 5 @@ -6625,7 +6657,7 @@ group1 0.0003 # median with all nulls statement ok create table group_median_all_nulls( - a STRING NOT NULL, + a STRING NOT NULL, b INT ) AS VALUES ( 'group0', NULL), diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index 7d88d3168d23..2b082229c911 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -787,7 +787,7 @@ approx_distinct(expression) ### `approx_median` -Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`. +Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY x)`. ```sql approx_median(expression) @@ -813,7 +813,7 @@ approx_median(expression) Returns the approximate percentile of input values using the t-digest algorithm. ```sql -approx_percentile_cont(expression, percentile, centroids) +approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression) ``` #### Arguments @@ -825,12 +825,12 @@ approx_percentile_cont(expression, percentile, centroids) #### Example ```sql -> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; -+-------------------------------------------------+ -| approx_percentile_cont(column_name, 0.75, 100) | -+-------------------------------------------------+ -| 65.0 | -+-------------------------------------------------+ +> SELECT approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name; ++-----------------------------------------------------------------------+ +| approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) | ++-----------------------------------------------------------------------+ +| 65.0 | ++-----------------------------------------------------------------------+ ``` ### `approx_percentile_cont_with_weight` @@ -838,7 +838,7 @@ approx_percentile_cont(expression, percentile, centroids) Returns the weighted approximate percentile of input values using the t-digest algorithm. ```sql -approx_percentile_cont_with_weight(expression, weight, percentile) +approx_percentile_cont_with_weight(weight, percentile) WITHIN GROUP (ORDER BY expression) ``` #### Arguments @@ -850,10 +850,10 @@ approx_percentile_cont_with_weight(expression, weight, percentile) #### Example ```sql -> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name; -+----------------------------------------------------------------------+ -| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) | -+----------------------------------------------------------------------+ -| 78.5 | -+----------------------------------------------------------------------+ +> SELECT approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) FROM table_name; ++---------------------------------------------------------------------------------------------+ +| approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) | ++---------------------------------------------------------------------------------------------+ +| 78.5 | ++---------------------------------------------------------------------------------------------+ ```