Skip to content

Commit 645c9c8

Browse files
Automated commit of generated code
1 parent b5026aa commit 645c9c8

4 files changed

Lines changed: 258 additions & 25 deletions

File tree

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/countDistinct.kt

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,18 @@ import org.jetbrains.kotlinx.dataframe.AnyColumnReference
44
import org.jetbrains.kotlinx.dataframe.ColumnsSelector
55
import org.jetbrains.kotlinx.dataframe.DataFrame
66
import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
7+
import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
8+
import org.jetbrains.kotlinx.dataframe.annotations.Refine
9+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMNS_PARAM
10+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COLUMN_SELECTION_DSL
11+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.COMPARISON_OBJECT
12+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.EXAMPLE
13+
import org.jetbrains.kotlinx.dataframe.api.CountDistinctOnGroupByDocs.SCOPE
714
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
815
import org.jetbrains.kotlinx.dataframe.documentation.DocumentationUrls
916
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
1017
import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns
18+
import org.jetbrains.kotlinx.dataframe.impl.aggregation.modes.aggregateValue
1119
import org.jetbrains.kotlinx.dataframe.indices
1220
import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API
1321
import kotlin.reflect.KProperty
@@ -115,3 +123,100 @@ public fun <T> DataFrame<T>.countDistinct(vararg columns: AnyColumnReference): I
115123
countDistinct { columns.toColumnSet() }
116124

117125
// endregion
126+
127+
// region GroupBy
128+
129+
/**
130+
* Aggregates this [GroupBy][org.jetbrains.kotlinx.dataframe.api.GroupBy] by counting the number of distinct rows in each group.
131+
*
132+
* Compares rows in each group based on the values in all columns.
133+
* Returns a new [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame] where each row corresponds to a group.
134+
* The resulting [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame] contains:
135+
* - the original group key columns,
136+
* - a new column (named [resultName], default is `"countDistinct"`)
137+
* that contains the number of distinct rows in each group.
138+
*
139+
* See also:
140+
* - [aggregate][org.jetbrains.kotlinx.dataframe.api.Grouped.aggregate], which aggregates a [GroupBy][org.jetbrains.kotlinx.dataframe.api.GroupBy] using the provided statistics.
141+
* - [count][org.jetbrains.kotlinx.dataframe.api.Grouped.count], which counts the number of rows in each group.
142+
* - [distinct][org.jetbrains.kotlinx.dataframe.DataFrame.distinct], which removes duplicate rows and returns a new [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame].
143+
* - [groupBy][org.jetbrains.kotlinx.dataframe.DataFrame.groupBy], which groups the rows of a [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame]
144+
* based on the values in one or more specified columns.
145+
*
146+
* For more information: [See `countDistinct` on the documentation website.](https://kotlin.github.io/dataframe/countdistinct.html)
147+
*
148+
*
149+
*
150+
* ### Example
151+
* ```kotlin
152+
* // Counts the number of distinct rows for each city, returning
153+
* // a new DataFrame with columns "city" and "countDistinct"
154+
* df.groupBy { city }.countDistinct()
155+
* ```
156+
*
157+
* @param [resultName] The name of the result column that will store the number
158+
* of distinct rows in each group. Defaults to `"countDistinct"`.
159+
* @return A new [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame] with group keys and corresponding numbers of distinct rows.
160+
*/
161+
@Refine
@Interpretable("GroupByCountDistinct0")
public fun <T> Grouped<T>.countDistinct(resultName: String = "countDistinct"): DataFrame<T> {
    // Delegates to the selector-based overload with every column selected,
    // so rows are compared on the values in all columns.
    return countDistinct(resultName = resultName, columns = { all() })
}
165+
166+
/**
167+
* Aggregates this [GroupBy][org.jetbrains.kotlinx.dataframe.api.GroupBy] by counting the number of distinct combinations of values in the selected [columns] in each group.
168+
*
169+
* Compares rows in each group based on the values in the selected columns.
170+
* Returns a new [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame] where each row corresponds to a group.
171+
* The resulting [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame] contains:
172+
* - the original group key columns,
173+
* - a new column (named [resultName], default is `"countDistinct"`)
174+
* that contains the number of distinct combinations of values in the selected [columns] in each group.
175+
*
176+
* See also:
177+
* - [aggregate][org.jetbrains.kotlinx.dataframe.api.Grouped.aggregate], which aggregates a [GroupBy][org.jetbrains.kotlinx.dataframe.api.GroupBy] using the provided statistics.
178+
* - [count][org.jetbrains.kotlinx.dataframe.api.Grouped.count], which counts the number of rows in each group.
179+
* - [distinct][org.jetbrains.kotlinx.dataframe.DataFrame.distinct], which removes duplicate rows and returns a new [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame].
180+
* - [groupBy][org.jetbrains.kotlinx.dataframe.DataFrame.groupBy], which groups the rows of a [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame]
181+
* based on the values in one or more specified columns.
182+
*
183+
* For more information: [See `countDistinct` on the documentation website.](https://kotlin.github.io/dataframe/countdistinct.html)
184+
*
185+
* Select or express columns using the [Columns Selection DSL][org.jetbrains.kotlinx.dataframe.api.ColumnsSelectionDsl].
186+
*
187+
* This DSL is initiated by a [Columns Selector][org.jetbrains.kotlinx.dataframe.ColumnsSelector] lambda,
188+
* which operates in the context of the [Columns Selection DSL][org.jetbrains.kotlinx.dataframe.api.ColumnsSelectionDsl] and
189+
* expects you to return a [SingleColumn][org.jetbrains.kotlinx.dataframe.columns.SingleColumn] or [ColumnSet][org.jetbrains.kotlinx.dataframe.columns.ColumnSet] (so, a [ColumnsResolver][org.jetbrains.kotlinx.dataframe.columns.ColumnsResolver]).
190+
* This is an entity formed by calling any (combination) of the functions
191+
* in the DSL that is or can be resolved into one or more columns.
192+
*
193+
* Check out: [Columns Selection DSL Grammar][org.jetbrains.kotlinx.dataframe.api.ColumnsSelectionDsl.DslGrammar]
194+
*
195+
* &nbsp;&nbsp;&nbsp;&nbsp;
196+
*
197+
* [See Column Selectors on the documentation website.](https://kotlin.github.io/dataframe/columnselectors.html)
198+
*
199+
* ### Example
200+
* ```kotlin
201+
* // Counts unique combinations of values in the "year" and "title" columns
202+
* // for each city, returning a new DataFrame with columns "city" and "countDistinct"
203+
* df.groupBy { city }.countDistinct { year and title }
204+
* ```
205+
*
206+
* @param [resultName] The name of the result column that will store the number
207+
* of distinct combinations of values in the selected [columns] in each group. Defaults to `"countDistinct"`.
208+
* @param [columns] The [ColumnsSelector] used to select columns
209+
* that will be considered for evaluating whether the rows are distinct.
210+
* @return A new [DataFrame][org.jetbrains.kotlinx.dataframe.DataFrame] with group keys and corresponding numbers of distinct combinations of values in the selected [columns].
211+
*/
212+
@Refine
@Interpretable("GroupByCountDistinct0")
public fun <T, C> Grouped<T>.countDistinct(
    resultName: String = "countDistinct",
    columns: ColumnsSelector<T, C>,
): DataFrame<T> {
    return aggregateValue(resultName) {
        // Number of distinct value combinations in the selected columns, computed inside the aggregation body.
        val distinctInGroup = countDistinct(columns)
        // NOTE(review): `default 0` presumably supplies 0 when no value is available — confirm against `default`.
        distinctInGroup default 0
    }
}
221+
222+
// endregion

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/groupBy.kt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,9 @@ internal interface GroupByDocs {
169169
* `| `__`.`__[**`count`**][Grouped.count]**`() `**
170170
*
171171
* &nbsp;&nbsp;&nbsp;&nbsp;
172+
* `| `__`.`__[**`countDistinct`**][Grouped.countDistinct]**`() `**
173+
*
174+
* &nbsp;&nbsp;&nbsp;&nbsp;
172175
* `| `__`.`__[**`aggregate`**][Grouped.aggregate]**` { `**`aggregations: `[`AggregateDsl`][AggregateDsl]**` }`**
173176
*
174177
* &nbsp;&nbsp;&nbsp;&nbsp;
@@ -258,6 +261,8 @@ internal interface GroupByDocs {
258261
*
259262
* * [count][Grouped.count] — calculate the number of rows in each group
260263
* (optionally counting only rows that satisfy the given predicate);
264+
* * [countDistinct][Grouped.countDistinct] — calculate the number of distinct rows in each group
265+
* (or distinct combinations of values in selected columns);
261266
* * [max][Grouped.max] / [maxOf][Grouped.maxOf] / [maxFor][Grouped.maxFor] —
262267
* calculate the maximum of all values on the selected columns / by a row expression /
263268
* for each of the selected columns within each group;
@@ -363,6 +368,8 @@ internal interface GroupByDocs {
363368
* from all rows of each group for the selected columns.
364369
* * [count][Grouped.count] — creates a [DataFrame] containing the grouping key columns and an additional column
365370
* with the number of rows in each corresponding group;
371+
* * [countDistinct][Grouped.countDistinct] — creates a [DataFrame] containing the grouping key columns
372+
* and an additional column with the number of distinct rows in each corresponding group;
366373
* * [aggregate][Grouped.aggregate] — performs a set of custom aggregations using [AggregateDsl],
367374
* allowing you to compute one or more derived values per group;
368375
* * [Various aggregation statistics][AggregationStatistics] — predefined shortcuts
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
package org.jetbrains.kotlinx.dataframe.api
2+
3+
import io.kotest.matchers.shouldBe
4+
import org.jetbrains.kotlinx.dataframe.nrow
5+
import org.junit.Test
6+
7+
class CountDistinctTests {

    // Shared fixture: group 1 holds a duplicated ("Alice", 15) row plus ("Bob", 20);
    // group 2 holds only ("Charlie", 25).
    private val df = dataFrameOf(
        "name" to columnOf("Alice", "Alice", "Bob", "Charlie"),
        "age" to columnOf(15, 15, 20, 25),
        "group" to columnOf(1, 1, 1, 2),
    )

    @Test
    fun `countDistinct on GroupBy`() {
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(2, 1),
        )

        val actualCounts = df.groupBy("group").countDistinct()

        actualCounts shouldBe expectedCounts
    }

    @Test
    fun `countDistinct on GroupBy with custom result name`() {
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2),
            "unique" to columnOf(2, 1),
        )

        val actualCounts = df.groupBy("group").countDistinct("unique")

        actualCounts shouldBe expectedCounts
    }

    @Test
    fun `countDistinct on GroupBy with one unique row`() {
        // Every row is identical, so the single group has one distinct row.
        val identicalRows = dataFrameOf(
            "name" to columnOf("Alice", "Alice", "Alice"),
            "age" to columnOf(15, 15, 15),
            "group" to columnOf(1, 1, 1),
        )
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1),
            "countDistinct" to columnOf(1),
        )

        val actualCounts = identicalRows.groupBy("group").countDistinct()

        actualCounts shouldBe expectedCounts
    }

    // TODO: check columns as well when #1531 is fixed
    @Test
    fun `countDistinct on empty GroupBy`() {
        // Dropping as many rows as the frame has leaves it empty.
        val emptyFrame = df.drop(df.nrow)

        val actualCounts = emptyFrame.groupBy("group").countDistinct()

        actualCounts.count() shouldBe 0
    }

    @Test
    fun `countDistinct on GroupBy with nulls`() {
        // The appended (null, null) row lands in group 1 and counts as its own distinct row.
        val withNullRow = df.append(null, null, 1)
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(3, 1),
        )

        val actualCounts = withNullRow.groupBy("group").countDistinct()

        actualCounts shouldBe expectedCounts
    }

    @Test
    fun `countDistinct on GroupBy with null group key`() {
        // A null key forms a group of its own with a single row.
        val withNullKey = df.append("Dave", 30, null)
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2, null),
            "countDistinct" to columnOf(2, 1, 1),
        )

        val actualCounts = withNullKey.groupBy("group").countDistinct()

        actualCounts shouldBe expectedCounts
    }

    @Test
    fun `countDistinct on GroupBy with columns selector`() {
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(2, 1),
        )

        val actualCounts = df.groupBy("group").countDistinct { "name"<String>() }

        actualCounts shouldBe expectedCounts
    }

    @Test
    fun `countDistinct on GroupBy with columns selector (not distinct only by selected column)`() {
        // Rows in group 1 differ by name but share the same age, so counting by "age" yields 1.
        val sharedAgeRows = dataFrameOf(
            "name" to columnOf("Alice", "Bob", "Charlie"),
            "age" to columnOf(15, 15, 20),
            "group" to columnOf(1, 1, 2),
        )
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(1, 1),
        )

        val actualCounts = sharedAgeRows.groupBy("group").countDistinct { "age"<Int>() }

        actualCounts shouldBe expectedCounts
    }

    @Test
    fun `countDistinct on GroupBy with multiple columns selector`() {
        // "city" distinguishes the two Alice rows, but it is not part of the selection.
        val rowsWithCity = dataFrameOf(
            "name" to columnOf("Alice", "Alice", "Bob", "Charlie"),
            "age" to columnOf(15, 15, 20, 25),
            "group" to columnOf(1, 1, 1, 2),
            "city" to columnOf("London", "Moscow", "London", "Paris"),
        )
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(2, 1),
        )

        val actualCounts = rowsWithCity.groupBy("group").countDistinct { "name"<String>() and "age"<Int>() }

        actualCounts shouldBe expectedCounts
    }

    @Test
    fun `countDistinct on grouped DataFrame with columns selector and custom result name`() {
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2),
            "unique" to columnOf(2, 1),
        )

        val actualCounts = df.groupBy("group").countDistinct(resultName = "unique") { "name"<String>() }

        actualCounts shouldBe expectedCounts
    }

    @Test
    fun `countDistinct on grouped DataFrame with multiple columns selector with nulls`() {
        // The (null, null) combination is counted as a distinct combination in group 1.
        val withNullRow = df.append(null, null, 1)
        val expectedCounts = dataFrameOf(
            "group" to columnOf(1, 2),
            "countDistinct" to columnOf(3, 1),
        )

        val actualCounts = withNullRow
            .groupBy("group")
            .countDistinct { "name"<String>() and "age"<Int>() }

        actualCounts shouldBe expectedCounts
    }
}

core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Access.kt

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import org.jetbrains.kotlinx.dataframe.api.add
66
import org.jetbrains.kotlinx.dataframe.api.after
77
import org.jetbrains.kotlinx.dataframe.api.chunked
88
import org.jetbrains.kotlinx.dataframe.api.colsOf
9-
import org.jetbrains.kotlinx.dataframe.api.countDistinct
109
import org.jetbrains.kotlinx.dataframe.api.distinct
1110
import org.jetbrains.kotlinx.dataframe.api.distinctBy
1211
import org.jetbrains.kotlinx.dataframe.api.drop
@@ -431,30 +430,6 @@ class Access : TestBase() {
431430
// SampleEnd
432431
}
433432

434-
@Test
435-
@TransformDataFrameExpressions
436-
fun countDistinct() {
437-
// SampleStart
438-
df.countDistinct()
439-
// SampleEnd
440-
}
441-
442-
@Test
443-
@TransformDataFrameExpressions
444-
fun countDistinctColumns_properties() {
445-
// SampleStart
446-
df.countDistinct { age and name }
447-
// SampleEnd
448-
}
449-
450-
@Test
451-
@TransformDataFrameExpressions
452-
fun countDistinctColumns_strings() {
453-
// SampleStart
454-
df.countDistinct("age", "name")
455-
// SampleEnd
456-
}
457-
458433
@Test
459434
@TransformDataFrameExpressions
460435
fun distinctColumns_strings() {

0 commit comments

Comments
 (0)