Skip to content
This repository was archived by the owner on Nov 7, 2025. It is now read-only.

Commit 14c022b

Browse files
authored
[Grafana] Handle min_doc_count also in terms (#1323)
I do that by adding `count() >= min_doc_count DESC` at the beginning of the `ORDER BY` clause. That way, rows with `count() >= min_doc_count` are returned first, and the remaining rows can simply be filtered out afterwards. Before: <img width="1268" alt="Screenshot 2025-03-02 at 21 16 36" src="https://github.com/user-attachments/assets/2d194199-cd5d-4e8e-ac80-bc269d692822" /> After: <img width="1431" alt="Screenshot 2025-03-02 at 21 15 40" src="https://github.com/user-attachments/assets/29f51da1-83b3-4be9-927a-08d02e920baa" />
1 parent ca49adc commit 14c022b

File tree

4 files changed

+287
-11
lines changed

4 files changed

+287
-11
lines changed

platform/model/bucket_aggregations/terms.go

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
type Terms struct {
1717
ctx context.Context
1818
significant bool // true <=> significant_terms, false <=> terms
19+
minDocCount int
1920
// include is either:
2021
// - single value: then for strings, it can be a regex.
2122
// - array: then field must match exactly one of the values (never a regex)
@@ -28,8 +29,8 @@ type Terms struct {
2829
exclude any
2930
}
3031

31-
func NewTerms(ctx context.Context, significant bool, include, exclude any) Terms {
32-
return Terms{ctx: ctx, significant: significant, include: include, exclude: exclude}
32+
func NewTerms(ctx context.Context, significant bool, minDocCount int, include, exclude any) Terms {
33+
return Terms{ctx: ctx, significant: significant, minDocCount: minDocCount, include: include, exclude: exclude}
3334
}
3435

3536
func (query Terms) AggregationType() model.AggregationType {
@@ -45,6 +46,10 @@ func (query Terms) TranslateSqlResponseToJson(rows []model.QueryResultRow) model
4546
return model.JsonMap{"buckets": []model.JsonMap{}}
4647
}
4748

49+
if query.minDocCount > 1 {
50+
rows = query.NewRowsTransformer().Transform(query.ctx, rows)
51+
}
52+
4853
buckets := make([]model.JsonMap, 0, len(rows))
4954
for _, row := range rows {
5055
docCount := query.docCount(row)
@@ -187,8 +192,8 @@ func CheckParamsTerms(ctx context.Context, paramsRaw any) error {
187192
"value_type": "string",
188193
}
189194
logIfYouSeeThemParams := []string{
190-
"shard_size", "min_doc_count", "shard_min_doc_count",
191-
"show_term_doc_count_error", "collect_mode", "execution_hint", "value_type",
195+
"shard_size", "shard_min_doc_count", "show_term_doc_count_error",
196+
"collect_mode", "execution_hint", "value_type",
192197
}
193198

194199
params, ok := paramsRaw.(model.JsonMap)
@@ -245,3 +250,28 @@ func CheckParamsTerms(ctx context.Context, paramsRaw any) error {
245250

246251
return nil
247252
}
253+
254+
func (query Terms) NewRowsTransformer() model.QueryRowsTransformer {
255+
return &TermsRowsTransformer{minDocCount: int64(query.minDocCount)}
256+
}
257+
258+
type TermsRowsTransformer struct {
259+
minDocCount int64
260+
}
261+
262+
// TODO unify with other transformers
263+
func (qt TermsRowsTransformer) Transform(ctx context.Context, rowsFromDB []model.QueryResultRow) []model.QueryResultRow {
264+
postprocessedRows := make([]model.QueryResultRow, 0, len(rowsFromDB))
265+
for _, row := range rowsFromDB {
266+
docCount, err := util.ExtractInt64(row.LastColValue())
267+
if err != nil {
268+
logger.ErrorWithCtx(ctx).Msgf("unexpected type for terms doc_count: %T, value: %v. Returning empty rows.",
269+
row.LastColValue(), row.LastColValue())
270+
return []model.QueryResultRow{}
271+
}
272+
if docCount >= qt.minDocCount {
273+
postprocessedRows = append(postprocessedRows, row)
274+
}
275+
}
276+
return postprocessedRows
277+
}

platform/parsers/elastic_query_dsl/pancake_aggregation_parser_buckets.go

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,14 @@ func (cw *ClickhouseQueryTranslator) parseTermsAggregation(aggregation *pancakeA
159159
return err
160160
}
161161

162+
const (
163+
defaultSize = 10
164+
defaultMinDocCount = 1
165+
)
166+
167+
minDocCount := cw.parseIntField(params, "min_doc_count", defaultMinDocCount)
162168
terms := bucket_aggregations.NewTerms(
163-
cw.Ctx, aggrName == "significant_terms", params["include"], params["exclude"],
169+
cw.Ctx, aggrName == "significant_terms", minDocCount, params["include"], params["exclude"],
164170
)
165171

166172
var didWeAddMissing, didWeUpdateFieldHere bool
@@ -178,18 +184,37 @@ func (cw *ClickhouseQueryTranslator) parseTermsAggregation(aggregation *pancakeA
178184
aggregation.filterOutEmptyKeyBucket = true
179185
}
180186

181-
const defaultSize = 10
182-
size := cw.parseSize(params, defaultSize)
183-
184187
orderBy, err := cw.parseOrder(params, []model.Expr{field})
185188
if err != nil {
186189
return err
187190
}
188191

189192
aggregation.queryType = terms
190193
aggregation.selectedColumns = append(aggregation.selectedColumns, field)
191-
aggregation.limit = size
194+
aggregation.limit = cw.parseSize(params, defaultSize)
192195
aggregation.orderBy = orderBy
196+
if minDocCount > 1 {
197+
// If you have a better solution, feel free to implement. This works, but adds another ORDER BY + we have to filter out rows later.
198+
//
199+
// We only want rows with (count() >= min_doc_count).
200+
// I think we can't do it in WHERE or HAVING clause, as it might affect the aggregations before/after in the aggregation tree.
201+
// Or at least it's not obvious, this solution is much easier.
202+
// We add the condition as the first ORDER BY. This way rows with count() < min_doc_count will be at the end, and we'll filter them out later.
203+
//
204+
// Example:
205+
// Without this trick, if we have rows (key, count): (k1, 1), (k2, 1), ..., (k_n, 1) and (a, 100), (b, 100)
206+
// (they'll be returned this way, because of some order by)
207+
// If we add the condition below, if min_doc_count>1, rows will be returned (a, 100), (b, 100), (k1, 1), (k2, 1), ...
208+
// We filter out (k1, 1), (k2, 1), ..., and we are fine.
209+
// Without this trick, if we do a query with a limit (without filtering in SQL), we could only receive (k1, 1), (k2, 1), ...,
210+
// and after filtering we'd have 0 rows to return.
211+
condition := model.NewInfixExpr(model.NewCountFunc(), ">=", model.NewLiteral(minDocCount))
212+
firstOrderBy := model.NewOrderByExpr(condition, model.DescOrder)
213+
aggregation.orderBy = append(
214+
[]model.OrderByExpr{firstOrderBy},
215+
aggregation.orderBy...,
216+
)
217+
}
193218
return nil
194219
}
195220

platform/parsers/elastic_query_dsl/pancake_sql_query_generation.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,9 @@ func (p *pancakeSqlQueryGenerator) generatePartitionBy(groupByColumns []model.Al
4747

4848
// TODO: Implement more if needed.
4949
func (p *pancakeSqlQueryGenerator) generateAccumAggrFunctions(origExpr model.Expr, queryType model.QueryType) (accumExpr model.Expr, aggrFuncName string, err error) {
50-
switch origFunc := origExpr.(type) {
50+
switch origExprTyped := origExpr.(type) {
5151
case model.FunctionExpr:
52+
origFunc := origExprTyped
5253
switch origFunc.Name {
5354
case "sum", "sumOrNull", "min", "minOrNull", "max", "maxOrNull":
5455
return origExpr, origFunc.Name, nil
@@ -64,6 +65,11 @@ func (p *pancakeSqlQueryGenerator) generateAccumAggrFunctions(origExpr model.Exp
6465
return model.NewFunction(strings.Replace(origFunc.Name, "quantiles", "quantilesState", 1), origFunc.Args...),
6566
strings.Replace(origFunc.Name, "quantiles", "quantilesMerge", 1), nil
6667
}
68+
case model.InfixExpr:
69+
origInfix := origExprTyped
70+
if f, ok := origInfix.Left.(model.FunctionExpr); ok && f.Name == "count" {
71+
return origInfix, "sum", nil
72+
}
6773
}
6874
debugQueryType := "<nil>"
6975
if queryType != nil {

platform/testdata/grafana.go

Lines changed: 216 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import "github.com/QuesmaOrg/quesma/platform/model"
66

77
var GrafanaAggregationTests = []AggregationTestCase{
88
{ // [0]
9-
TestName: "simple max/min aggregation as 2 siblings",
9+
TestName: "format: epoch_millis",
1010
QueryRequestJson: `
1111
{
1212
"aggs": {
@@ -83,4 +83,219 @@ var GrafanaAggregationTests = []AggregationTestCase{
8383
"aggr__2__key_0"
8484
ORDER BY "aggr__2__key_0" ASC`,
8585
},
86+
{ // [1]
87+
TestName: "1x terms with min_doc_count, need to erase some rows with count < min_doc_count",
88+
QueryRequestJson: `
89+
{
90+
"aggs": {
91+
"2": {
92+
"terms": {
93+
"field": "extension.keyword",
94+
"size": 4,
95+
"min_doc_count": 40,
96+
"order": {
97+
"_key": "desc"
98+
}
99+
}
100+
}
101+
},
102+
"size": 0
103+
}`,
104+
ExpectedResponse: `
105+
{
106+
"aggregations": {
107+
"2": {
108+
"doc_count_error_upper_bound": 0,
109+
"sum_other_doc_count": 196,
110+
"buckets": [
111+
{
112+
"key": "zip",
113+
"doc_count": 40
114+
}
115+
]
116+
}
117+
},
118+
"hits": {
119+
"hits": [],
120+
"max_score": null,
121+
"total": {
122+
"relation": "eq",
123+
"value": 234
124+
}
125+
},
126+
"status": 200,
127+
"timed_out": false,
128+
"took": 1
129+
}`,
130+
ExpectedPancakeResults: []model.QueryResultRow{
131+
{Cols: []model.QueryResultCol{
132+
model.NewQueryResultCol("aggr__2__parent_count", int64(236)),
133+
model.NewQueryResultCol("aggr__2__key_0", "zip"),
134+
model.NewQueryResultCol("aggr__2__count", int64(40)),
135+
model.NewQueryResultCol("aggr__2__order_1", int64(1)),
136+
}},
137+
{Cols: []model.QueryResultCol{
138+
model.NewQueryResultCol("aggr__2__parent_count", int64(236)),
139+
model.NewQueryResultCol("aggr__2__key_0", "tar"),
140+
model.NewQueryResultCol("aggr__2__count", int64(30)),
141+
model.NewQueryResultCol("aggr__2__order_1", int64(0)),
142+
}},
143+
},
144+
ExpectedPancakeSQL: `
145+
SELECT sum(count(*)) OVER () AS "aggr__2__parent_count",
146+
"extension" AS "aggr__2__key_0", count(*) AS "aggr__2__count",
147+
count(*)>=40 AS "aggr__2__order_1"
148+
FROM __quesma_table_name
149+
GROUP BY "extension" AS "aggr__2__key_0"
150+
ORDER BY "aggr__2__order_1" DESC, "aggr__2__key_0" DESC
151+
LIMIT 5`,
152+
},
153+
{ // [2]
154+
TestName: "2x terms with min_doc_count",
155+
QueryRequestJson: `
156+
{
157+
"aggs": {
158+
"2": {
159+
"aggs": {
160+
"3": {
161+
"terms": {
162+
"field": "message"
163+
}
164+
}
165+
},
166+
"terms": {
167+
"field": "extension.keyword",
168+
"size": 4,
169+
"min_doc_count": 30,
170+
"order": {
171+
"_key": "desc"
172+
}
173+
}
174+
}
175+
},
176+
"size": 0
177+
}`,
178+
ExpectedResponse: `
179+
{
180+
"aggregations": {
181+
"2": {
182+
"doc_count_error_upper_bound": 0,
183+
"sum_other_doc_count": 164,
184+
"buckets": [
185+
{
186+
"3": {
187+
"doc_count_error_upper_bound": 0,
188+
"sum_other_doc_count": 9,
189+
"buckets": [
190+
{
191+
"key": 0,
192+
"doc_count": 18
193+
},
194+
{
195+
"key": 6680,
196+
"doc_count": 4
197+
}
198+
]
199+
},
200+
"key": "zip",
201+
"doc_count": 31
202+
},
203+
{
204+
"3": {
205+
"doc_count_error_upper_bound": 0,
206+
"sum_other_doc_count": 14,
207+
"buckets": [
208+
{
209+
"key": 0,
210+
"doc_count": 25
211+
},
212+
{
213+
"key": 1873,
214+
"doc_count": 2
215+
}
216+
]
217+
},
218+
"key": "tar",
219+
"doc_count": 41
220+
}
221+
]
222+
}
223+
},
224+
"hits": {
225+
"hits": [],
226+
"max_score": null,
227+
"total": {
228+
"relation": "eq",
229+
"value": 234
230+
}
231+
},
232+
"status": 200,
233+
"timed_out": false,
234+
"took": 1
235+
}`,
236+
ExpectedPancakeResults: []model.QueryResultRow{
237+
{Cols: []model.QueryResultCol{
238+
model.NewQueryResultCol("aggr__2__parent_count", int64(236)),
239+
model.NewQueryResultCol("aggr__2__key_0", "zip"),
240+
model.NewQueryResultCol("aggr__2__count", int64(31)),
241+
model.NewQueryResultCol("aggr__2__order_1", int64(0)),
242+
model.NewQueryResultCol("aggr__2__3__parent_count", int64(31)),
243+
model.NewQueryResultCol("aggr__2__3__key_0", 0),
244+
model.NewQueryResultCol("aggr__2__3__count", int64(18)),
245+
}},
246+
{Cols: []model.QueryResultCol{
247+
model.NewQueryResultCol("aggr__2__parent_count", int64(236)),
248+
model.NewQueryResultCol("aggr__2__key_0", "zip"),
249+
model.NewQueryResultCol("aggr__2__count", int64(31)),
250+
model.NewQueryResultCol("aggr__2__order_1", int64(1)),
251+
model.NewQueryResultCol("aggr__2__3__parent_count", int64(31)),
252+
model.NewQueryResultCol("aggr__2__3__key_0", 6680),
253+
model.NewQueryResultCol("aggr__2__3__count", int64(4)),
254+
}},
255+
{Cols: []model.QueryResultCol{
256+
model.NewQueryResultCol("aggr__2__parent_count", int64(236)),
257+
model.NewQueryResultCol("aggr__2__key_0", "tar"),
258+
model.NewQueryResultCol("aggr__2__count", int64(41)),
259+
model.NewQueryResultCol("aggr__2__order_1", int64(1)),
260+
model.NewQueryResultCol("aggr__2__3__parent_count", int64(41)),
261+
model.NewQueryResultCol("aggr__2__3__key_0", 0),
262+
model.NewQueryResultCol("aggr__2__3__count", int64(25)),
263+
}},
264+
{Cols: []model.QueryResultCol{
265+
model.NewQueryResultCol("aggr__2__parent_count", int64(236)),
266+
model.NewQueryResultCol("aggr__2__key_0", "tar"),
267+
model.NewQueryResultCol("aggr__2__count", int64(41)),
268+
model.NewQueryResultCol("aggr__2__order_1", int64(1)),
269+
model.NewQueryResultCol("aggr__2__3__parent_count", int64(41)),
270+
model.NewQueryResultCol("aggr__2__3__key_0", 1873),
271+
model.NewQueryResultCol("aggr__2__3__count", int64(2)),
272+
}},
273+
},
274+
ExpectedPancakeSQL: `
275+
SELECT "aggr__2__parent_count", "aggr__2__key_0", "aggr__2__count",
276+
"aggr__2__order_1", "aggr__2__3__parent_count", "aggr__2__3__key_0",
277+
"aggr__2__3__count"
278+
FROM (
279+
SELECT "aggr__2__parent_count", "aggr__2__key_0", "aggr__2__count",
280+
"aggr__2__order_1", "aggr__2__3__parent_count", "aggr__2__3__key_0",
281+
"aggr__2__3__count",
282+
dense_rank() OVER (ORDER BY "aggr__2__order_1" DESC, "aggr__2__key_0" DESC)
283+
AS "aggr__2__order_1_rank",
284+
dense_rank() OVER (PARTITION BY "aggr__2__key_0" ORDER BY
285+
"aggr__2__3__count" DESC, "aggr__2__3__key_0" ASC) AS
286+
"aggr__2__3__order_1_rank"
287+
FROM (
288+
SELECT sum(count(*)) OVER () AS "aggr__2__parent_count",
289+
"extension" AS "aggr__2__key_0",
290+
sum(count(*)) OVER (PARTITION BY "aggr__2__key_0") AS "aggr__2__count",
291+
sum(count(*)>=30) OVER (PARTITION BY "aggr__2__key_0") AS
292+
"aggr__2__order_1",
293+
sum(count(*)) OVER (PARTITION BY "aggr__2__key_0") AS
294+
"aggr__2__3__parent_count", "message" AS "aggr__2__3__key_0",
295+
count(*) AS "aggr__2__3__count"
296+
FROM __quesma_table_name
297+
GROUP BY "extension" AS "aggr__2__key_0", "message" AS "aggr__2__3__key_0"))
298+
WHERE ("aggr__2__order_1_rank"<=5 AND "aggr__2__3__order_1_rank"<=11)
299+
ORDER BY "aggr__2__order_1_rank" ASC, "aggr__2__3__order_1_rank" ASC`,
300+
},
86301
}

0 commit comments

Comments (0)