Skip to content

Commit 112cab3

Browse files
docs(python): Provide additional explanation and examples for the value_counts "normalize" parameter (pola-rs#22756)
1 parent 3a7e226 commit 112cab3

File tree

2 files changed

+82
-57
lines changed

2 files changed

+82
-57
lines changed

py-polars/polars/expr/expr.py

Lines changed: 58 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -9975,81 +9975,97 @@ def value_counts(
99759975
normalize: bool = False,
99769976
) -> Expr:
99779977
"""
9978-
Count the occurrences of unique values.
9978+
Count the occurrence of unique values.
99799979
99809980
Parameters
99819981
----------
99829982
sort
9983-
Sort the output by count in descending order.
9984-
If set to `False` (default), the order of the output is random.
9983+
Sort the output by count, in descending order.
9984+
If set to `False` (default), the order is non-deterministic.
99859985
parallel
99869986
Execute the computation in parallel.
99879987
99889988
.. note::
9989-
This option should likely not be enabled in a group by context,
9990-
as the computation is already parallelized per group.
9989+
This option should likely *not* be enabled in a `group_by` context,
9990+
as the computation will already be parallelized per group.
99919991
name
9992-
Give the resulting count column a specific name;
9993-
if `normalize` is True defaults to "proportion",
9994-
otherwise defaults to "count".
9992+
Give the resulting count column a specific name; if `normalize` is
9993+
True, this defaults to "proportion", otherwise defaults to "count".
99959994
normalize
9996-
If true gives relative frequencies of the unique values
9995+
If True, the count is returned as the relative frequency of unique
9996+
values normalized to 1.0.
99979997
99989998
Returns
99999999
-------
1000010000
Expr
10001-
Expression of data type :class:`Struct` with mapping of unique values to
10002-
their count.
10001+
Expression of type :class:`Struct`, mapping unique values to their
10002+
count (or proportion).
1000310003
1000410004
Examples
1000510005
--------
1000610006
>>> df = pl.DataFrame(
1000710007
... {"color": ["red", "blue", "red", "green", "blue", "blue"]}
1000810008
... )
10009-
>>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT
10009+
>>> df_count = df.select(pl.col("color").value_counts())
10010+
>>> df_count # doctest: +IGNORE_RESULT
1001010011
shape: (3, 1)
1001110012
┌─────────────┐
1001210013
│ color │
1001310014
│ --- │
1001410015
│ struct[2] │
1001510016
╞═════════════╡
10016-
│ {"red",2} │
1001710017
│ {"green",1} │
1001810018
│ {"blue",3} │
10019+
│ {"red",2} │
1001910020
└─────────────┘
1002010021
10021-
Sort the output by (descending) count and customize the count field name.
10022+
>>> df_count.unnest("color") # doctest: +IGNORE_RESULT
10023+
shape: (3, 2)
10024+
┌───────┬───────┐
10025+
│ color ┆ count │
10026+
│ --- ┆ --- │
10027+
│ str ┆ u32 │
10028+
╞═══════╪═══════╡
10029+
│ green ┆ 1 │
10030+
│ blue ┆ 3 │
10031+
│ red ┆ 2 │
10032+
└───────┴───────┘
1002210033
10023-
>>> df = df.select(pl.col("color").value_counts(sort=True, name="n"))
10024-
>>> df
10025-
shape: (3, 1)
10026-
┌─────────────┐
10027-
│ color │
10028-
│ --- │
10029-
│ struct[2] │
10030-
╞═════════════╡
10031-
│ {"blue",3} │
10032-
│ {"red",2} │
10033-
│ {"green",1} │
10034-
└─────────────┘
10034+
Sort the output by (descending) count, customize the field name,
10035+
and normalize the count to its relative proportion (of 1.0).
1003510036
10036-
>>> df.unnest("color")
10037+
>>> df_count = df.select(
10038+
... pl.col("color").value_counts(
10039+
... name="fraction",
10040+
... normalize=True,
10041+
... sort=True,
10042+
... )
10043+
... )
10044+
>>> df_count
10045+
shape: (3, 1)
10046+
┌────────────────────┐
10047+
│ color │
10048+
│ --- │
10049+
│ struct[2] │
10050+
╞════════════════════╡
10051+
│ {"blue",0.5} │
10052+
│ {"red",0.333333} │
10053+
│ {"green",0.166667} │
10054+
└────────────────────┘
10055+
10056+
>>> df_count.unnest("color")
1003710057
shape: (3, 2)
10038-
┌───────┬─────┐
10039-
│ color ┆ n │
10040-
│ --- ┆ --- │
10041-
│ str ┆ u32 │
10042-
╞═══════╪═════╡
10043-
│ blue ┆ 3 │
10044-
│ red ┆ 2 │
10045-
│ green ┆ 1 │
10046-
└───────┴─────┘
10047-
"""
10048-
if name is None:
10049-
if normalize:
10050-
name = "proportion"
10051-
else:
10052-
name = "count"
10058+
┌───────┬──────────┐
10059+
│ color ┆ fraction │
10060+
│ --- ┆ --- │
10061+
│ str ┆ f64 │
10062+
╞═══════╪══════════╡
10063+
│ blue ┆ 0.5 │
10064+
│ red ┆ 0.333333 │
10065+
│ green ┆ 0.166667 │
10066+
└───────┴──────────┘
10067+
"""
10068+
name = name or ("proportion" if normalize else "count")
1005310069
return self._from_pyexpr(
1005410070
self._pyexpr.value_counts(sort, parallel, name, normalize)
1005510071
)

py-polars/polars/series/series.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2614,25 +2614,25 @@ def value_counts(
26142614
Parameters
26152615
----------
26162616
sort
2617-
Sort the output by count in descending order.
2618-
If set to `False` (default), the order of the output is random.
2617+
Sort the output by count, in descending order.
2618+
If set to `False` (default), the order is non-deterministic.
26192619
parallel
26202620
Execute the computation in parallel.
26212621
26222622
.. note::
2623-
This option should likely not be enabled in a group by context,
2624-
as the computation is already parallelized per group.
2623+
This option should likely *not* be enabled in a `group_by` context,
2624+
as the computation will already be parallelized per group.
26252625
name
2626-
Give the resulting count column a specific name;
2627-
if `normalize` is True defaults to "proportion",
2628-
otherwise defaults to "count".
2626+
Give the resulting count column a specific name; if `normalize` is
2627+
True, this defaults to "proportion", otherwise defaults to "count".
26292628
normalize
2630-
If true gives relative frequencies of the unique values
2629+
If True, the count is returned as the relative frequency of unique
2630+
values normalized to 1.0.
26312631
26322632
Returns
26332633
-------
26342634
DataFrame
2635-
Mapping of unique values to their count.
2635+
Columns map the unique values to their count (or proportion).
26362636
26372637
Examples
26382638
--------
@@ -2662,12 +2662,21 @@ def value_counts(
26622662
│ red ┆ 2 │
26632663
│ green ┆ 1 │
26642664
└───────┴─────┘
2665-
"""
2666-
if name is None:
2667-
if normalize:
2668-
name = "proportion"
2669-
else:
2670-
name = "count"
2665+
2666+
Return the count as a relative frequency, normalized to 1.0:
2667+
>>> s.value_counts(sort=True, normalize=True, name="fraction")
2668+
shape: (3, 2)
2669+
┌───────┬──────────┐
2670+
│ color ┆ fraction │
2671+
│ --- ┆ --- │
2672+
│ str ┆ f64 │
2673+
╞═══════╪══════════╡
2674+
│ blue ┆ 0.5 │
2675+
│ red ┆ 0.333333 │
2676+
│ green ┆ 0.166667 │
2677+
└───────┴──────────┘
2678+
"""
2679+
name = name or ("proportion" if normalize else "count")
26712680
return pl.DataFrame._from_pydf(
26722681
self._s.value_counts(
26732682
sort=sort, parallel=parallel, name=name, normalize=normalize

0 commit comments

Comments
 (0)