Fix nanlen with strings (#344)

dcherian · web-flow · commit 307899afba5e · 2024-03-19T15:37:09.000Z
* Fix nanlen with strings

  Closes pydata/xarray#8853

* fix windows
* Silence warnings
diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py
@@ -37,7 +37,8 @@ def _lerp(a, b, *, t, dtype, out=None):
     """
     if out is None:
         out = np.empty_like(a, dtype=dtype)
-    diff_b_a = np.subtract(b, a)
+    with np.errstate(invalid="ignore"):
+        diff_b_a = np.subtract(b, a)
     # asanyarray is a stop-gap until gh-13105
     np.add(a, diff_b_a * t, out=out)
     np.subtract(b, diff_b_a * (1 - t), out=out, where=t >= 0.5)
@@ -95,7 +96,8 @@ def quantile_(array, inv_idx, *, q, axis, skipna, group_idx, dtype=None, out=Non
 
     # partition the complex array in-place
     labels_broadcast = np.broadcast_to(group_idx, array.shape)
-    cmplx = labels_broadcast + 1j * array
+    with np.errstate(invalid="ignore"):
+        cmplx = labels_broadcast + 1j * array
     cmplx.partition(kth=kth, axis=-1)
     if is_scalar_q:
         a_ = cmplx.imag
diff --git a/flox/aggregate_npg.py b/flox/aggregate_npg.py
@@ -88,6 +88,8 @@ def nanprod(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dt
 
 
 def _len(group_idx, array, engine, *, func, axis=-1, size=None, fill_value=None, dtype=None):
+    if array.dtype.kind in "US":
+        array = np.broadcast_to(np.array([1]), array.shape)
     result = _get_aggregate(engine).aggregate(
         group_idx,
         array,
diff --git a/flox/aggregate_numbagg.py b/flox/aggregate_numbagg.py
@@ -105,11 +105,24 @@ def nanstd(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None,
     )
 
 
+def nanlen(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None):
+    if array.dtype.kind in "US":
+        array = np.broadcast_to(np.array([1]), array.shape)
+    return _numbagg_wrapper(
+        group_idx,
+        array,
+        axis=axis,
+        size=size,
+        func="nancount",
+        # fill_value=fill_value,
+        # dtype=dtype,
+    )
+
+
 nansum = partial(_numbagg_wrapper, func="nansum")
 nanmean = partial(_numbagg_wrapper, func="nanmean")
 nanprod = partial(_numbagg_wrapper, func="nanprod")
 nansum_of_squares = partial(_numbagg_wrapper, func="nansum_of_squares")
-nanlen = partial(_numbagg_wrapper, func="nancount")
 nanprod = partial(_numbagg_wrapper, func="nanprod")
 nanfirst = partial(_numbagg_wrapper, func="nanfirst")
 nanlast = partial(_numbagg_wrapper, func="nanlast")
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1127,7 +1127,7 @@ def test_group_by_datetime(engine, method):
 
     edges = pd.date_range("1999-12-31", "2000-12-31", freq="ME").to_series().to_numpy()
     actual, _ = groupby_reduce(daskarray, t.to_numpy(), isbin=True, expected_groups=edges, **kwargs)
-    expected = data.resample("M").mean().to_numpy()
+    expected = data.resample("ME").mean().to_numpy()
     assert_equal(expected, actual)
 
     actual, _ = groupby_reduce(
@@ -1688,3 +1688,12 @@ def test_multiple_quantiles(q, chunk, func, by_ndim):
     if by_ndim == 2:
         expected = expected.squeeze(axis=-2)
     assert_equal(expected, actual, tolerance=1e-14)
+
+
+@pytest.mark.parametrize("dtype", ["U3", "S3"])
+def test_nanlen_string(dtype, engine):
+    array = np.array(["ABC", "DEF", "GHI", "JKL", "MNO", "PQR"], dtype=dtype)
+    by = np.array([0, 0, 1, 2, 1, 0])
+    expected = np.array([3, 2, 1], dtype=np.intp)
+    actual, *_ = groupby_reduce(array, by, func="count", engine=engine)
+    assert_equal(expected, actual)