Support WITHIN GROUP syntax to standardize certain existing aggregate functions #13511

Open. Wants to merge 42 commits into base: main.

Commits (42)
a9b901a - Add within group variable to aggregate function and arguments (Garamda, Nov 21, 2024)
0918000 - Merge branch 'main' into support_within_group_for_existing_aggregate_… (Garamda, Nov 21, 2024)
070a96b - Support within group and disable null handling for ordered set aggreg… (Garamda, Jan 21, 2025)
3fd92fd - Refactored function to match updated signature (Garamda, Jan 21, 2025)
4082a78 - Modify proto to support within group clause (Garamda, Jan 21, 2025)
c3be3c6 - Modify physical planner and accumulator to support ordered set aggreg… (Garamda, Jan 21, 2025)
9fd05a3 - Support session management for ordered set aggregate functions (Garamda, Jan 23, 2025)
8518a59 - Align code, tests, and examples with changes to aggregate function logic (Garamda, Jan 25, 2025)
79669d9 - Fix typo in existing comments (Garamda, Jan 25, 2025)
597f4d7 - Enhance test (Garamda, Jan 27, 2025)
d3b483c - Merge branch 'main' into support_within_group_for_existing_aggregate_… (Garamda, Jan 28, 2025)
a827c9d - Fix bug: handle missing within_group when applying children tree node (Garamda, Jan 30, 2025)
23bdf70 - Change the signature of approx_percentile_cont for consistency (Garamda, Jan 30, 2025)
97d96ca - Add missing within_group for expr display (Garamda, Jan 30, 2025)
1b61b5b - Handle edge case when over and within group clause are used together (Garamda, Jan 31, 2025)
d0fdde3 - Apply clippy advice: avoids too many arguments (Garamda, Feb 1, 2025)
3c8bce3 - Add new test cases using descending order (Garamda, Feb 1, 2025)
be99a35 - Apply cargo fmt (Garamda, Feb 1, 2025)
f9aa1fc - Revert unintended submodule changes (Garamda, Feb 5, 2025)
d5f0b62 - Apply prettier guidance (Garamda, Feb 5, 2025)
d7f2f59 - Apply doc guidance by update_function_doc.sh (Garamda, Feb 7, 2025)
7ef2139 - Merge branch 'main' into support_within_group_for_existing_aggregate_… (Garamda, Feb 7, 2025)
91565b3 - Rollback WITHIN GROUP and related logic after converting it into expr (Garamda, Feb 27, 2025)
d482bff - Rollback ordered set aggregate functions from session to save same in… (Garamda, Feb 27, 2025)
005a27c - Convert within group to order by when converting sql to expr (Garamda, Feb 28, 2025)
1179bc4 - Rollback within group from proto (Garamda, Feb 28, 2025)
e5fc1a4 - Utilize within group as order by in functions-aggregate (Garamda, Feb 28, 2025)
cf4faad - Apply clippy (Garamda, Feb 28, 2025)
fc7d2bc - Merge branch 'main' into support_within_group_for_existing_aggregate_… (Garamda, Feb 28, 2025)
5469e39 - Convert order by to within group (Garamda, Feb 28, 2025)
d96b667 - Apply cargo fmt (Garamda, Feb 28, 2025)
293d33e - Remove plain line breaks (Garamda, Feb 28, 2025)
ecdb21b - Remove duplicated column arg in schema name (Garamda, Mar 1, 2025)
d65420e - Refactor boolean functions to just return primitive type (Garamda, Mar 1, 2025)
b6d426a - Make within group necessary in the signature of existing ordered set … (Garamda, Mar 1, 2025)
4b0c52f - Apply cargo fmt (Garamda, Mar 1, 2025)
36a732d - Support a single ordering expression in the signature (Garamda, Mar 1, 2025)
8d6db85 - Apply cargo fmt (Garamda, Mar 1, 2025)
db0355a - Add dataframe function test cases to verify descending ordering (Garamda, Mar 1, 2025)
37b783e - Apply cargo fmt (Garamda, Mar 1, 2025)
124d8c5 - Apply code reviews (Garamda, Mar 5, 2025)
3259c95 - Update error msg in test as corresponding code changed (Garamda, Mar 5, 2025)
4 changes: 2 additions & 2 deletions datafusion/core/benches/aggregate_query_sql.rs
@@ -148,7 +148,7 @@ fn criterion_benchmark(c: &mut Criterion) {
b.iter(|| {
query(
ctx.clone(),
"SELECT utf8, approx_percentile_cont(u64_wide, 0.5, 2500) \
"SELECT utf8, approx_percentile_cont(0.5, 2500) WITHIN GROUP (ORDER BY u64_wide) \
FROM t GROUP BY utf8",
)
})
@@ -158,7 +158,7 @@ fn criterion_benchmark(c: &mut Criterion) {
b.iter(|| {
query(
ctx.clone(),
"SELECT utf8, approx_percentile_cont(f32, 0.5, 2500) \
"SELECT utf8, approx_percentile_cont(0.5, 2500) WITHIN GROUP (ORDER BY f32) \
FROM t GROUP BY utf8",
)
})
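For context, the benchmark queries above now use the standardized call form: the percentile and centroid count stay as regular arguments, while the ordering column moves into the WITHIN GROUP clause. Below is a minimal end-to-end sketch (not part of this PR) of running the same kind of query through `SessionContext::sql`, assuming a hypothetical `t.csv` with `utf8` and `u64_wide` columns:

```rust
use datafusion::error::Result;
use datafusion::prelude::{CsvReadOptions, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Hypothetical data file; any table with `utf8` and numeric `u64_wide` columns works.
    ctx.register_csv("t", "t.csv", CsvReadOptions::new()).await?;

    // Percentile and centroid count first; the ordering column lives in WITHIN GROUP.
    let df = ctx
        .sql(
            "SELECT utf8, approx_percentile_cont(0.5, 2500) WITHIN GROUP (ORDER BY u64_wide) \
             FROM t GROUP BY utf8",
        )
        .await?;
    df.show().await?;
    Ok(())
}
```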
84 changes: 66 additions & 18 deletions datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -360,14 +360,29 @@ async fn test_fn_approx_median() -> Result<()> {

#[tokio::test]
async fn test_fn_approx_percentile_cont() -> Result<()> {
let expr = approx_percentile_cont(col("b"), lit(0.5), None);
let expr = approx_percentile_cont(col("b").sort(true, false), lit(0.5), None);

let expected = [
"+---------------------------------------------+",
"| approx_percentile_cont(test.b,Float64(0.5)) |",
"+---------------------------------------------+",
"| 10 |",
"+---------------------------------------------+",
"+---------------------------------------------------------------------------+",
"| approx_percentile_cont(Float64(0.5)) WITHIN GROUP [test.b ASC NULLS LAST] |",
"+---------------------------------------------------------------------------+",
"| 10 |",
"+---------------------------------------------------------------------------+",
];

let df = create_test_table().await?;
let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;

assert_batches_eq!(expected, &batches);

let expr = approx_percentile_cont(col("b").sort(false, false), lit(0.1), None);

let expected = [
"+----------------------------------------------------------------------------+",
"| approx_percentile_cont(Float64(0.1)) WITHIN GROUP [test.b DESC NULLS LAST] |",
"+----------------------------------------------------------------------------+",
"| 100 |",
"+----------------------------------------------------------------------------+",
];

let df = create_test_table().await?;
@@ -381,27 +396,60 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
None::<&str>,
"arg_2".to_string(),
));
let expr = approx_percentile_cont(col("b"), alias_expr, None);
let expr = approx_percentile_cont(col("b").sort(true, false), alias_expr, None);
let df = create_test_table().await?;
let expected = [
"+--------------------------------------------------------------------+",
"| approx_percentile_cont(arg_2) WITHIN GROUP [test.b ASC NULLS LAST] |",
"+--------------------------------------------------------------------+",
"| 10 |",
"+--------------------------------------------------------------------+",
];
let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;

assert_batches_eq!(expected, &batches);

let alias_expr = Expr::Alias(Alias::new(
cast(lit(0.1), DataType::Float32),
None::<&str>,
"arg_2".to_string(),
));
let expr = approx_percentile_cont(col("b").sort(false, false), alias_expr, None);
let df = create_test_table().await?;
let expected = [
"+--------------------------------------+",
"| approx_percentile_cont(test.b,arg_2) |",
"+--------------------------------------+",
"| 10 |",
"+--------------------------------------+",
"+---------------------------------------------------------------------+",
"| approx_percentile_cont(arg_2) WITHIN GROUP [test.b DESC NULLS LAST] |",
"+---------------------------------------------------------------------+",
"| 100 |",
"+---------------------------------------------------------------------+",
];
let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;

assert_batches_eq!(expected, &batches);

// with number of centroids set
let expr = approx_percentile_cont(col("b"), lit(0.5), Some(lit(2)));
let expr = approx_percentile_cont(col("b").sort(true, false), lit(0.5), Some(lit(2)));
let expected = [
"+------------------------------------------------------------------------------------+",
"| approx_percentile_cont(Float64(0.5),Int32(2)) WITHIN GROUP [test.b ASC NULLS LAST] |",
"+------------------------------------------------------------------------------------+",
"| 30 |",
"+------------------------------------------------------------------------------------+",
];

let df = create_test_table().await?;
let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;

assert_batches_eq!(expected, &batches);

let expr =
approx_percentile_cont(col("b").sort(false, false), lit(0.1), Some(lit(2)));
let expected = [
"+------------------------------------------------------+",
"| approx_percentile_cont(test.b,Float64(0.5),Int32(2)) |",
"+------------------------------------------------------+",
"| 30 |",
"+------------------------------------------------------+",
"+-------------------------------------------------------------------------------------+",
"| approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] |",
"+-------------------------------------------------------------------------------------+",
"| 69 |",
"+-------------------------------------------------------------------------------------+",
];

let df = create_test_table().await?;
2 changes: 2 additions & 0 deletions datafusion/expr/src/expr.rs
@@ -295,6 +295,8 @@ pub enum Expr {
/// See also [`ExprFunctionExt`] to set these fields.
///
/// [`ExprFunctionExt`]: crate::expr_fn::ExprFunctionExt
///
/// Note: `WITHIN GROUP` is converted to `ORDER BY` internally in `datafusion/sql/src/expr/function.rs`.
AggregateFunction(AggregateFunction),
/// Represents the call of a window function with arguments.
WindowFunction(WindowFunction),
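The note added above captures the final design: WITHIN GROUP never becomes a separate field on the logical plan; the SQL planner folds it into the aggregate's `order_by`. As a rough sketch (not taken from this PR's tests), the expression produced for `approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY b)` could be built by hand like this, mirroring the `new_udf` call used inside `approx_percentile_cont()` later in this diff:

```rust
use datafusion_expr::expr::AggregateFunction;
use datafusion_expr::{col, lit, Expr};
use datafusion_functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf;

/// Roughly what `approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY b)` becomes
/// after SQL planning: the ordering column remains the first argument and the
/// WITHIN GROUP ordering lands in the aggregate's `order_by` slot.
fn within_group_as_order_by() -> Expr {
    Expr::AggregateFunction(AggregateFunction::new_udf(
        approx_percentile_cont_udaf(),
        vec![col("b"), lit(0.5)],
        false,                                  // DISTINCT
        None,                                   // FILTER
        Some(vec![col("b").sort(true, false)]), // ORDER BY b ASC NULLS LAST
        None,                                   // null treatment
    ))
}
```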
38 changes: 37 additions & 1 deletion datafusion/expr/src/udaf.rs
@@ -308,6 +308,16 @@ impl AggregateUDF {
self.inner.default_value(data_type)
}

/// See [`AggregateUDFImpl::supports_null_handling_clause`] for more details.
pub fn supports_null_handling_clause(&self) -> bool {
self.inner.supports_null_handling_clause()
}

/// See [`AggregateUDFImpl::is_ordered_set_aggregate`] for more details.
pub fn is_ordered_set_aggregate(&self) -> bool {
self.inner.is_ordered_set_aggregate()
}

/// Returns the documentation for this Aggregate UDF.
///
/// Documentation can be accessed programmatically as well as
@@ -425,6 +435,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
null_treatment,
} = params;

// Exclude the first function argument (the ordering column) for ordered-set aggregate
// functions, because it would be duplicated by the WITHIN GROUP clause in the schema name.
let args = if self.is_ordered_set_aggregate() {
&args[1..]
} else {
&args[..]
};

let mut schema_name = String::new();

schema_name.write_fmt(format_args!(
@@ -443,8 +461,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
};

if let Some(order_by) = order_by {
let clause = match self.is_ordered_set_aggregate() {
true => "WITHIN GROUP",
false => "ORDER BY",
};

schema_name.write_fmt(format_args!(
" ORDER BY [{}]",
" {} [{}]",
clause,
schema_name_from_sorts(order_by)?
))?;
};
@@ -845,6 +869,18 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
ScalarValue::try_from(data_type)
}

/// Returns true if this function supports the `[IGNORE NULLS | RESPECT NULLS]` clause,
/// and false if it does not.
fn supports_null_handling_clause(&self) -> bool {
true
}

Contributor:
Is this something we need? From what I know, there aren't any aggregate functions that have options for null handling. At the moment, the 2 overrides you have of this both return Some(false), which is what I would consider the default value anyway.

Speaking of which, if we do need this, do we need to return an Optional<bool> or could we just return bool directly?

Author (@Garamda), Mar 1, 2025:
There are some aggregate functions using null handling in current datafusion.
(cf. If this is something we need to discuss/fix, then I can make another git issue. Or, I can refactor it too in this PR. I left this comment because I am not 100% sure about the SQL standard.)

And I refactored the function to just return bool.

Contributor:
This was smelling odd, so I dug a bit deeper. I think you've inadvertently stumbled into something even weirder than you anticipated.

The example you've linked is

SELECT FIRST_VALUE(column1) RESPECT NULLS FROM t;

which I don't think is a valid query, because first_value should not be an aggregate function, or at the very least the above query is not valid in most SQL dialects. first_value is actually a window function in other engines (e.g. Trino, Postgres, MySQL).

If you try running something like

SELECT first_value(column1) FROM t;

against Postgres you get an error like

Query Error: window function first_value requires an OVER clause

dbfiddle

The RESPECT NULLS | IGNORE NULLS option is only a property of certain window functions, hence we shouldn't need to track it for aggregate functions.

I'm going to file a ticket for the above.

Contributor:
Filed #15006


/// Returns true if this function is an ordered-set aggregate function,
/// and false if it is not.
fn is_ordered_set_aggregate(&self) -> bool {
false
}

/// Returns the documentation for this Aggregate UDF.
///
/// Documentation can be accessed programmatically as well as
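Both new trait hooks default to the pre-existing behavior, so third-party UDAFs are unaffected unless they opt in; the built-in ordered-set aggregates override both. A small sketch of how the accessors surface on `AggregateUDF`, assuming the `datafusion-functions-aggregate` crate is on the dependency list:

```rust
use datafusion_functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf;

fn main() {
    let udaf = approx_percentile_cont_udaf();

    // Ordered-set aggregate: the schema name drops the ordering column from the
    // argument list and renders the ordering as `WITHIN GROUP [...]` rather than
    // `ORDER BY [...]`.
    assert!(udaf.is_ordered_set_aggregate());

    // Ordered-set aggregates reject the IGNORE NULLS / RESPECT NULLS clause.
    assert!(!udaf.supports_null_handling_clause());
}
```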
2 changes: 1 addition & 1 deletion datafusion/functions-aggregate/src/approx_median.rs
@@ -45,7 +45,7 @@ make_udaf_expr_and_func!(
/// APPROX_MEDIAN aggregate expression
#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.",
description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY x)`.",
syntax_example = "approx_median(expression)",
sql_example = r#"```sql
> SELECT approx_median(column_name) FROM table_name;
54 changes: 43 additions & 11 deletions datafusion/functions-aggregate/src/approx_percentile_cont.rs
@@ -34,6 +34,7 @@ use datafusion_common::{
downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err,
Result, ScalarValue,
};
use datafusion_expr::expr::{AggregateFunction, Sort};
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
use datafusion_expr::utils::format_state_name;
@@ -51,29 +52,39 @@ create_func!(ApproxPercentileCont, approx_percentile_cont_udaf);

/// Computes the approximate percentile continuous of a set of numbers
pub fn approx_percentile_cont(
expression: Expr,
within_group: Sort,
percentile: Expr,
centroids: Option<Expr>,
) -> Expr {
let expr = within_group.expr.clone();

let args = if let Some(centroids) = centroids {
vec![expression, percentile, centroids]
vec![expr, percentile, centroids]
} else {
vec![expression, percentile]
vec![expr, percentile]
};
approx_percentile_cont_udaf().call(args)

Expr::AggregateFunction(AggregateFunction::new_udf(
approx_percentile_cont_udaf(),
args,
false,
None,
Some(vec![within_group]),
None,
))
}

#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the approximate percentile of input values using the t-digest algorithm.",
syntax_example = "approx_percentile_cont(expression, percentile, centroids)",
syntax_example = "approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)",
sql_example = r#"```sql
> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
+-------------------------------------------------+
| approx_percentile_cont(column_name, 0.75, 100) |
+-------------------------------------------------+
| 65.0 |
+-------------------------------------------------+
> SELECT approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name;
+-----------------------------------------------------------------------+
| approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) |
+-----------------------------------------------------------------------+
| 65.0 |
+-----------------------------------------------------------------------+
```"#,
standard_argument(name = "expression",),
argument(
@@ -130,6 +141,19 @@ impl ApproxPercentileCont {
args: AccumulatorArgs,
) -> Result<ApproxPercentileAccumulator> {
let percentile = validate_input_percentile_expr(&args.exprs[1])?;

let is_descending = args
.ordering_req
.first()
.map(|sort_expr| sort_expr.options.descending)
.unwrap_or(false);

let percentile = if is_descending {
1.0 - percentile
} else {
percentile
};
Comment on lines +151 to +155
Author (@Garamda):
I used floating point subtraction instead of actual sorting in reverse order, for conciseness.

If any slight floating point difference is not permitted (even if this branch passed the tests), please let me know.

Contributor:
This seems reasonable to me, but I don't have that much experience on the execution side of things.


let tdigest_max_size = if args.exprs.len() == 3 {
Some(validate_input_max_size_expr(&args.exprs[2])?)
} else {
@@ -292,6 +316,14 @@ impl AggregateUDFImpl for ApproxPercentileCont {
Ok(arg_types[0].clone())
}

fn supports_null_handling_clause(&self) -> bool {
false
}

fn is_ordered_set_aggregate(&self) -> bool {
true
}

fn documentation(&self) -> Option<&Documentation> {
self.doc()
}
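The descending-order handling in the accumulator relies on a symmetry of the percentile definition rather than re-sorting the input: the q-th percentile under a descending ordering equals the (1 - q)-th percentile under the ascending ordering. A self-contained sketch of that identity using a plain linear-interpolation percentile (the `percentile` helper below is illustrative only, not DataFusion's t-digest):

```rust
/// Linear-interpolation percentile over a slice taken in its given order
/// (illustration only; DataFusion's accumulator uses an approximate t-digest).
fn percentile(ordered: &[f64], q: f64) -> f64 {
    let pos = q * (ordered.len() - 1) as f64;
    let (lo, hi) = (pos.floor() as usize, pos.ceil() as usize);
    ordered[lo] + (ordered[hi] - ordered[lo]) * (pos - lo as f64)
}

fn main() {
    let asc = vec![10.0, 20.0, 30.0, 40.0, 100.0];
    let desc: Vec<f64> = asc.iter().rev().copied().collect();

    let q = 0.1;
    // The 0.1 quantile of the descending ordering equals the 0.9 quantile of the
    // ascending ordering, which is why the accumulator keeps a single ascending
    // digest and simply requests `1.0 - percentile` for descending orderings.
    assert!((percentile(&desc, q) - percentile(&asc, 1.0 - q)).abs() < 1e-9);
    println!("both sides evaluate to {}", percentile(&desc, q));
}
```

This is the design choice discussed in the thread above: a single ascending t-digest plus a flipped percentile, at the cost of at most a tiny floating-point difference, as the author notes.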
datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
@@ -52,14 +52,14 @@ make_udaf_expr_and_func!(
#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the weighted approximate percentile of input values using the t-digest algorithm.",
syntax_example = "approx_percentile_cont_with_weight(expression, weight, percentile)",
syntax_example = "approx_percentile_cont_with_weight(weight, percentile) WITHIN GROUP (ORDER BY expression)",
sql_example = r#"```sql
> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name;
+----------------------------------------------------------------------+
| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) |
+----------------------------------------------------------------------+
| 78.5 |
+----------------------------------------------------------------------+
> SELECT approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) FROM table_name;
+---------------------------------------------------------------------------------------------+
| approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) |
+---------------------------------------------------------------------------------------------+
| 78.5 |
+---------------------------------------------------------------------------------------------+
```"#,
standard_argument(name = "expression", prefix = "The"),
argument(
@@ -178,6 +186,14 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight {
self.approx_percentile_cont.state_fields(args)
}

fn supports_null_handling_clause(&self) -> bool {
false
}

fn is_ordered_set_aggregate(&self) -> bool {
true
}

fn documentation(&self) -> Option<&Documentation> {
self.doc()
}
4 changes: 2 additions & 2 deletions datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -970,8 +970,8 @@ async fn roundtrip_expr_api() -> Result<()> {
stddev_pop(lit(2.2)),
approx_distinct(lit(2)),
approx_median(lit(2)),
approx_percentile_cont(lit(2), lit(0.5), None),
approx_percentile_cont(lit(2), lit(0.5), Some(lit(50))),
approx_percentile_cont(lit(2).sort(true, false), lit(0.5), None),
approx_percentile_cont(lit(2).sort(true, false), lit(0.5), Some(lit(50))),
approx_percentile_cont_with_weight(lit(2), lit(1), lit(0.5)),
grouping(lit(1)),
bit_and(lit(2)),
2 changes: 1 addition & 1 deletion datafusion/proto/tests/cases/roundtrip_physical_plan.rs
@@ -503,7 +503,7 @@ fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> {
vec![col("b", &schema)?, lit(0.5)],
)
.schema(Arc::clone(&schema))
.alias("APPROX_PERCENTILE_CONT(b, 0.5)")
.alias("APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY b)")
.build()
.map(Arc::new)?];
