Skip to content

Commit b5026aa

Browse files
authored
Merge pull request #1875 from Kotlin/count-distinct-on-group-by
`countDistinct` overloads on `GroupBy`
2 parents cace8f7 + 400ad0a commit b5026aa

21 files changed

Lines changed: 4830 additions & 45 deletions

core/api/core.api

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1714,6 +1714,10 @@ public final class org/jetbrains/kotlinx/dataframe/api/CountDistinctKt {
17141714
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Ljava/lang/String;)I
17151715
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lkotlin/reflect/KProperty;)I
17161716
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lorg/jetbrains/kotlinx/dataframe/columns/ColumnReference;)I
1717+
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
1718+
public static final fun countDistinct (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;Lkotlin/jvm/functions/Function2;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
1719+
public static synthetic fun countDistinct$default (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
1720+
public static synthetic fun countDistinct$default (Lorg/jetbrains/kotlinx/dataframe/api/Grouped;Ljava/lang/String;Lkotlin/jvm/functions/Function2;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
17171721
}
17181722

17191723
public final class org/jetbrains/kotlinx/dataframe/api/CountKt {

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,18 @@ import org.jetbrains.kotlinx.dataframe.AnyColumnReference
44
import org.jetbrains.kotlinx.dataframe.ColumnsSelector
55
import org.jetbrains.kotlinx.dataframe.DataFrame
66
import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
7+
import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
8+
import org.jetbrains.kotlinx.dataframe.annotations.Refine
9+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMNS_PARAM
10+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMN_SELECTION_DSL
11+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COMPARISON_OBJECT
12+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.EXAMPLE
13+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.SCOPE
714
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
815
import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls
916
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
1017
import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns
18+
import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateValue
1119
import org.jetbrains.kotlinx.dataframe.indices
1220
import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API
1321
import kotlin.reflect.KProperty
@@ -93,3 +101,81 @@ public fun <T> DataFrame<T>.countDistinct(vararg columns: AnyColumnReference): I
93101
countDistinct { columns.toColumnSet() }
94102

95103
// endregion
104+
105+
// region GroupBy
106+
107+
/**
108+
* Aggregates this [GroupBy] by counting the number of distinct {@get [COMPARISON_OBJECT] rows} in each group.
109+
*
110+
* Compares rows in each group based on the values in {@get [SCOPE] all} columns.
111+
* Returns a new [DataFrame] where each row corresponds to a group.
112+
* The resulting [DataFrame] contains:
113+
* - the original group key columns,
114+
* - a new column (named [resultName\], default is `"countDistinct"`)
115+
* that contains the number of distinct {@get [COMPARISON_OBJECT] rows} in each group.
116+
*
117+
* See also:
118+
* - [aggregate][Grouped.aggregate], which aggregates a [GroupBy] using the provided statistics.
119+
* - [count][Grouped.count], which counts the number of rows in each group.
120+
* - [distinct][DataFrame.distinct], which removes duplicate rows and returns a new [DataFrame].
121+
* - [groupBy][DataFrame.groupBy], which groups the rows of a [DataFrame]
122+
* based on the values in one or more specified columns.
123+
*
124+
* For more information: {@include [DocumentationUrls.CountDistinct]}
125+
*
126+
* {@get [COLUMN_SELECTION_DSL]}
127+
*
128+
* ### Example
129+
* ```kotlin
130+
* {@get [EXAMPLE]}
131+
* ```
132+
*
133+
* @param [resultName\] The name of the result column that will store the number
134+
* of distinct {@get [COMPARISON_OBJECT] rows} in each group. Defaults to `"countDistinct"`.
135+
* @get [COLUMNS_PARAM]
136+
* @return A new [DataFrame] with group keys and corresponding numbers of distinct {@get [COMPARISON_OBJECT] rows}.
137+
*/
138+
@ExcludeFromSources
139+
private interface CountDistinctOnGroupByDocs {
140+
typealias COMPARISON_OBJECT = Nothing
141+
typealias SCOPE = Nothing
142+
typealias EXAMPLE = Nothing
143+
typealias COLUMN_SELECTION_DSL = Nothing
144+
typealias COLUMNS_PARAM = Nothing
145+
}
146+
147+
/**
148+
* @include [CountDistinctOnGroupByDocs]
149+
* @set [EXAMPLE]
150+
* // Counts the number of distinct rows for each city, returning
151+
* // a new DataFrame with columns "city" and "countDistinct"
152+
* df.groupBy { city }.countDistinct()
153+
*/
154+
@Refine
155+
@Interpretable("GroupByCountDistinct0")
156+
public fun <T> Grouped<T>.countDistinct(resultName: String = "countDistinct"): DataFrame<T> =
157+
countDistinct(resultName) { all() }
158+
159+
/**
160+
* @include [CountDistinctOnGroupByDocs]
161+
* @set [COMPARISON_OBJECT] combinations of values in the selected [columns]
162+
* @set [SCOPE] the selected
163+
* @set [COLUMN_SELECTION_DSL] {@include [SelectingColumns.ColumnsSelectionDsl]}
164+
* @set [EXAMPLE]
165+
* // Counts unique combinations of values in the "year" and "title" columns
166+
* // for each city, returning a new DataFrame with columns "city" and "countDistinct"
167+
* df.groupBy { city }.countDistinct { year and title }
168+
* @set [COLUMNS_PARAM] @param [columns\] The [ColumnsSelector] used to select columns
169+
* that will be considered for evaluating whether the rows are distinct.
170+
*/
171+
@Refine
172+
@Interpretable("GroupByCountDistinct0")
173+
public fun <T, C> Grouped<T>.countDistinct(
174+
resultName: String = "countDistinct",
175+
columns: ColumnsSelector<T, C>,
176+
): DataFrame<T> =
177+
aggregateValue(resultName) {
178+
countDistinct(columns) default 0
179+
}
180+
181+
// endregion

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@ internal interface GroupByDocs {
156156
* `| `__`.`__[**`count`**][Grouped.count]**`() `**
157157
*
158158
* {@include [Indent]}
159+
* `| `__`.`__[**`countDistinct`**][Grouped.countDistinct]**`() `**
160+
*
161+
* {@include [Indent]}
159162
* `| `__`.`__[**`aggregate`**][Grouped.aggregate]**` { `**`aggregations: `[`AggregateDsl`][AggregateDsl]**` }`**
160163
*
161164
* {@include [Indent]}
@@ -196,6 +199,8 @@ internal interface GroupByDocs {
196199
*
197200
* * [count][Grouped.count] — calculate the number of rows in each group
198201
* (optionally counting only rows that satisfy the given predicate);
202+
* * [countDistinct][Grouped.countDistinct] — calculate the number of distinct rows in each group
203+
* (or distinct combinations of values in selected columns);
199204
* * [max][Grouped.max] / [maxOf][Grouped.maxOf] / [maxFor][Grouped.maxFor] —
200205
* calculate the maximum of all values on the selected columns / by a row expression /
201206
* for each of the selected columns within each group;
@@ -301,6 +306,8 @@ internal interface GroupByDocs {
301306
* from all rows of each group for the selected columns.
302307
* * [count][Grouped.count] — creates a [DataFrame] containing the grouping key columns and an additional column
303308
* with the number of rows in each corresponding group;
309+
* * [countDistinct][Grouped.countDistinct] — creates a [DataFrame] containing the grouping key columns
310+
* and an additional column with the number of distinct rows in each corresponding group;
304311
* * [aggregate][Grouped.aggregate] — performs a set of custom aggregations using [AggregateDsl],
305312
* allowing you to compute one or more derived values per group;
306313
* * [Various aggregation statistics][AggregationStatistics] — predefined shortcuts
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
package org.jetbrains.kotlinx.dataframe.api
2+
3+
import io.kotest.matchers.shouldBe
4+
import org.jetbrains.kotlinx.dataframe.nrow
5+
import org.junit.Test
6+
7+
class CountDistinctTests {
8+
9+
private val df = dataFrameOf(
10+
"name" to columnOf("Alice", "Alice", "Bob", "Charlie"),
11+
"age" to columnOf(15, 15, 20, 25),
12+
"group" to columnOf(1, 1, 1, 2),
13+
)
14+
15+
@Test
16+
fun `countDistinct on GroupBy`() {
17+
val result = df.groupBy("group").countDistinct()
18+
val expected = dataFrameOf(
19+
"group" to columnOf(1, 2),
20+
"countDistinct" to columnOf(2, 1),
21+
)
22+
result shouldBe expected
23+
}
24+
25+
@Test
26+
fun `countDistinct on GroupBy with custom result name`() {
27+
val result = df.groupBy("group").countDistinct("unique")
28+
val expected = dataFrameOf(
29+
"group" to columnOf(1, 2),
30+
"unique" to columnOf(2, 1),
31+
)
32+
result shouldBe expected
33+
}
34+
35+
@Test
36+
fun `countDistinct on GroupBy with one unique row`() {
37+
val df = dataFrameOf(
38+
"name" to columnOf("Alice", "Alice", "Alice"),
39+
"age" to columnOf(15, 15, 15),
40+
"group" to columnOf(1, 1, 1),
41+
)
42+
val result = df.groupBy("group").countDistinct()
43+
val expected = dataFrameOf(
44+
"group" to columnOf(1),
45+
"countDistinct" to columnOf(1),
46+
)
47+
result shouldBe expected
48+
}
49+
50+
// TODO: check columns as well when #1531 is fixed
51+
@Test
52+
fun `countDistinct on empty GroupBy`() {
53+
df
54+
.drop(df.nrow)
55+
.groupBy("group").countDistinct()
56+
.count() shouldBe 0
57+
}
58+
59+
@Test
60+
fun `countDistinct on GroupBy with nulls`() {
61+
val result = df
62+
.append(null, null, 1)
63+
.groupBy("group").countDistinct()
64+
val expected = dataFrameOf(
65+
"group" to columnOf(1, 2),
66+
"countDistinct" to columnOf(3, 1),
67+
)
68+
result shouldBe expected
69+
}
70+
71+
@Test
72+
fun `countDistinct on GroupBy with null group key`() {
73+
val result = df
74+
.append("Dave", 30, null)
75+
.groupBy("group").countDistinct()
76+
val expected = dataFrameOf(
77+
"group" to columnOf(1, 2, null),
78+
"countDistinct" to columnOf(2, 1, 1),
79+
)
80+
result shouldBe expected
81+
}
82+
83+
@Test
84+
fun `countDistinct on GroupBy with columns selector`() {
85+
val result = df.groupBy("group").countDistinct { "name"<String>() }
86+
val expected = dataFrameOf(
87+
"group" to columnOf(1, 2),
88+
"countDistinct" to columnOf(2, 1),
89+
)
90+
result shouldBe expected
91+
}
92+
93+
@Test
94+
fun `countDistinct on GroupBy with columns selector (not distinct only by selected column)`() {
95+
val df = dataFrameOf(
96+
"name" to columnOf("Alice", "Bob", "Charlie"),
97+
"age" to columnOf(15, 15, 20),
98+
"group" to columnOf(1, 1, 2),
99+
)
100+
val result = df.groupBy("group").countDistinct { "age"<Int>() }
101+
val expected = dataFrameOf(
102+
"group" to columnOf(1, 2),
103+
"countDistinct" to columnOf(1, 1),
104+
)
105+
result shouldBe expected
106+
}
107+
108+
@Test
109+
fun `countDistinct on GroupBy with multiple columns selector`() {
110+
val df = dataFrameOf(
111+
"name" to columnOf("Alice", "Alice", "Bob", "Charlie"),
112+
"age" to columnOf(15, 15, 20, 25),
113+
"group" to columnOf(1, 1, 1, 2),
114+
"city" to columnOf("London", "Moscow", "London", "Paris"),
115+
)
116+
val result = df.groupBy("group").countDistinct { "name"<String>() and "age"<Int>() }
117+
val expected = dataFrameOf(
118+
"group" to columnOf(1, 2),
119+
"countDistinct" to columnOf(2, 1),
120+
)
121+
result shouldBe expected
122+
}
123+
124+
@Test
125+
fun `countDistinct on grouped DataFrame with columns selector and custom result name`() {
126+
val result = df.groupBy("group").countDistinct(resultName = "unique") { "name"<String>() }
127+
val expected = dataFrameOf(
128+
"group" to columnOf(1, 2),
129+
"unique" to columnOf(2, 1),
130+
)
131+
result shouldBe expected
132+
}
133+
134+
@Test
135+
fun `countDistinct on grouped DataFrame with multiple columns selector with nulls`() {
136+
val result = df
137+
.append(null, null, 1)
138+
.groupBy("group")
139+
.countDistinct { "name"<String>() and "age"<Int>() }
140+
val expected = dataFrameOf(
141+
"group" to columnOf(1, 2),
142+
"countDistinct" to columnOf(3, 1),
143+
)
144+
result shouldBe expected
145+
}
146+
}

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import org.jetbrains.kotlinx.dataframe.api.add
66
import org.jetbrains.kotlinx.dataframe.api.after
77
import org.jetbrains.kotlinx.dataframe.api.chunked
88
import org.jetbrains.kotlinx.dataframe.api.colsOf
9-
import org.jetbrains.kotlinx.dataframe.api.countDistinct
109
import org.jetbrains.kotlinx.dataframe.api.distinct
1110
import org.jetbrains.kotlinx.dataframe.api.distinctBy
1211
import org.jetbrains.kotlinx.dataframe.api.drop
@@ -431,30 +430,6 @@ class Access : TestBase() {
431430
// SampleEnd
432431
}
433432

434-
@Test
435-
@TransformDataFrameExpressions
436-
fun countDistinct() {
437-
// SampleStart
438-
df.countDistinct()
439-
// SampleEnd
440-
}
441-
442-
@Test
443-
@TransformDataFrameExpressions
444-
fun countDistinctColumns_properties() {
445-
// SampleStart
446-
df.countDistinct { age and name }
447-
// SampleEnd
448-
}
449-
450-
@Test
451-
@TransformDataFrameExpressions
452-
fun countDistinctColumns_strings() {
453-
// SampleStart
454-
df.countDistinct("age", "name")
455-
// SampleEnd
456-
}
457-
458433
@Test
459434
@TransformDataFrameExpressions
460435
fun distinctColumns_strings() {

0 commit comments

Comments
 (0)