-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Support WITHIN GROUP syntax to standardize certain existing aggregate functions #13511
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 40 commits
a9b901a
0918000
070a96b
3fd92fd
4082a78
c3be3c6
9fd05a3
8518a59
79669d9
597f4d7
d3b483c
a827c9d
23bdf70
97d96ca
1b61b5b
d0fdde3
3c8bce3
be99a35
f9aa1fc
d5f0b62
d7f2f59
7ef2139
91565b3
d482bff
005a27c
1179bc4
e5fc1a4
cf4faad
fc7d2bc
5469e39
d96b667
293d33e
ecdb21b
d65420e
b6d426a
4b0c52f
36a732d
8d6db85
db0355a
37b783e
124d8c5
3259c95
57c4281
40d7055
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -308,6 +308,16 @@ impl AggregateUDF { | |
self.inner.default_value(data_type) | ||
} | ||
|
||
/// Returns whether this aggregate accepts an `[IGNORE NULLS | RESPECT NULLS]` clause. | ||
/// | ||
/// Delegates to the wrapped implementation. | ||
/// See [`AggregateUDFImpl::supports_null_handling_clause`] for more details. | ||
pub fn supports_null_handling_clause(&self) -> bool { | ||
self.inner.supports_null_handling_clause() | ||
} | ||
|
||
/// Returns whether this aggregate is an ordered-set aggregate function. | ||
/// | ||
/// Delegates to the wrapped implementation. | ||
/// See [`AggregateUDFImpl::is_ordered_set_aggregate`] for more details. | ||
pub fn is_ordered_set_aggregate(&self) -> bool { | ||
self.inner.is_ordered_set_aggregate() | ||
} | ||
|
||
/// Returns the documentation for this Aggregate UDF. | ||
/// | ||
/// Documentation can be accessed programmatically as well as | ||
|
@@ -425,6 +435,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { | |
null_treatment, | ||
} = params; | ||
|
||
// exclude the first function argument(= column) in ordered set aggregate function, | ||
// because it is duplicated with the WITHIN GROUP clause in schema name. | ||
let args = if self.is_ordered_set_aggregate() { | ||
&args[1..] | ||
} else { | ||
&args[..] | ||
}; | ||
|
||
let mut schema_name = String::new(); | ||
|
||
schema_name.write_fmt(format_args!( | ||
|
@@ -443,8 +461,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { | |
}; | ||
|
||
if let Some(order_by) = order_by { | ||
let clause = match self.is_ordered_set_aggregate() { | ||
true => "WITHIN GROUP", | ||
false => "ORDER BY", | ||
}; | ||
|
||
schema_name.write_fmt(format_args!( | ||
" ORDER BY [{}]", | ||
" {} [{}]", | ||
clause, | ||
schema_name_from_sorts(order_by)? | ||
))?; | ||
}; | ||
|
@@ -845,6 +869,18 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { | |
ScalarValue::try_from(data_type) | ||
} | ||
|
||
/// Returns `true` if this function accepts an `[IGNORE NULLS | RESPECT NULLS]` | ||
/// clause, and `false` if it does not. | ||
/// | ||
/// The default implementation returns `true`; implementations that do not | ||
/// accept the clause override this method to return `false`. | ||
fn supports_null_handling_clause(&self) -> bool { | ||
true | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this something we need? From what I know, there aren't any aggregate functions that have options for null handling. At the moment, the 2 overrides you have of this both return […]. Speaking of which, if we do need this, do we need to return an […]? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are some aggregate functions using null handling in current DataFusion. And I refactored the function to just return […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was smelling odd, so I dug a bit deeper. I think you've inadvertently stumbled into something even weirder than you anticipated. The example you've linked is `SELECT FIRST_VALUE(column1) RESPECT NULLS FROM t;`, which I don't think is a valid query because […]. If you try running something like `SELECT first_value(column1) FROM t;` against Postgres, you get an error like […]
The I'm going to file a ticket for the above. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed #15006 |
||
|
||
/// Returns `true` if this function is an ordered-set aggregate function, | ||
/// and `false` otherwise. | ||
/// | ||
/// When this returns `true`, the schema name built by this trait omits the | ||
/// first function argument and labels the ordering clause `WITHIN GROUP` | ||
/// instead of `ORDER BY`. | ||
/// | ||
/// The default implementation returns `false`. | ||
fn is_ordered_set_aggregate(&self) -> bool { | ||
false | ||
} | ||
|
||
/// Returns the documentation for this Aggregate UDF. | ||
/// | ||
/// Documentation can be accessed programmatically as well as | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,6 +34,7 @@ use datafusion_common::{ | |
downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err, | ||
Result, ScalarValue, | ||
}; | ||
use datafusion_expr::expr::{AggregateFunction, Sort}; | ||
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; | ||
use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; | ||
use datafusion_expr::utils::format_state_name; | ||
|
@@ -51,29 +52,39 @@ create_func!(ApproxPercentileCont, approx_percentile_cont_udaf); | |
|
||
/// Computes the approximate percentile continuous of a set of numbers | ||
pub fn approx_percentile_cont( | ||
expression: Expr, | ||
within_group: Sort, | ||
Garamda marked this conversation as resolved.
Show resolved
Hide resolved
|
||
percentile: Expr, | ||
centroids: Option<Expr>, | ||
) -> Expr { | ||
let expr = within_group.expr.clone(); | ||
|
||
let args = if let Some(centroids) = centroids { | ||
vec![expression, percentile, centroids] | ||
vec![expr, percentile, centroids] | ||
} else { | ||
vec![expression, percentile] | ||
vec![expr, percentile] | ||
}; | ||
approx_percentile_cont_udaf().call(args) | ||
|
||
Expr::AggregateFunction(AggregateFunction::new_udf( | ||
approx_percentile_cont_udaf(), | ||
args, | ||
false, | ||
None, | ||
Some(vec![within_group]), | ||
None, | ||
)) | ||
} | ||
|
||
#[user_doc( | ||
doc_section(label = "Approximate Functions"), | ||
description = "Returns the approximate percentile of input values using the t-digest algorithm.", | ||
syntax_example = "approx_percentile_cont(expression, percentile, centroids)", | ||
syntax_example = "approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)", | ||
sql_example = r#"```sql | ||
> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; | ||
+-------------------------------------------------+ | ||
| approx_percentile_cont(column_name, 0.75, 100) | | ||
+-------------------------------------------------+ | ||
| 65.0 | | ||
+-------------------------------------------------+ | ||
> SELECT approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name; | ||
+-----------------------------------------------------------------------+ | ||
| approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) | | ||
+-----------------------------------------------------------------------+ | ||
| 65.0 | | ||
+-----------------------------------------------------------------------+ | ||
```"#, | ||
standard_argument(name = "expression",), | ||
argument( | ||
|
@@ -130,6 +141,19 @@ impl ApproxPercentileCont { | |
args: AccumulatorArgs, | ||
) -> Result<ApproxPercentileAccumulator> { | ||
let percentile = validate_input_percentile_expr(&args.exprs[1])?; | ||
|
||
let is_descending = args | ||
.ordering_req | ||
.first() | ||
.map(|sort_expr| sort_expr.options.descending) | ||
.unwrap_or(false); | ||
|
||
let percentile = if is_descending { | ||
1.0 - percentile | ||
} else { | ||
percentile | ||
}; | ||
Comment on lines
+151
to
+155
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I used floating point subtraction instead of actual sorting in reverse order, for conciseness. If any slight floating point difference is not permitted (even if this branch passed the tests), please let me know. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems reasonable to me, but I don't have that much experience on the execution side of things. |
||
|
||
let tdigest_max_size = if args.exprs.len() == 3 { | ||
Some(validate_input_max_size_expr(&args.exprs[2])?) | ||
} else { | ||
|
@@ -292,6 +316,14 @@ impl AggregateUDFImpl for ApproxPercentileCont { | |
Ok(arg_types[0].clone()) | ||
} | ||
|
||
// This aggregate does not accept an `[IGNORE NULLS | RESPECT NULLS]` clause. | ||
fn supports_null_handling_clause(&self) -> bool { | ||
false | ||
} | ||
|
||
// This aggregate is an ordered-set aggregate: its documented syntax is | ||
// `approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)`. | ||
fn is_ordered_set_aggregate(&self) -> bool { | ||
true | ||
} | ||
|
||
fn documentation(&self) -> Option<&Documentation> { | ||
self.doc() | ||
} | ||
|
Uh oh!
There was an error while loading. Please reload this page.