Convert datetimes to numeric. (#108)

dcherian · web-flow · commit ccbda27cc477 · 2022-06-02T17:23:20.000-06:00
diff --git a/flox/xarray.py b/flox/xarray.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import xarray as xr
 from packaging.version import Version
+from xarray.core.duck_array_ops import _datetime_nanmin
 
 from .aggregations import Aggregation, _atleast_1d
 from .core import (
@@ -15,6 +16,7 @@
     rechunk_for_blockwise as rechunk_array_for_blockwise,
     rechunk_for_cohorts as rechunk_array_for_cohorts,
 )
+from .xrutils import _contains_cftime_datetimes, _to_pytimedelta, datetime_to_numeric
 
 if TYPE_CHECKING:
     from xarray import DataArray, Dataset, Resample
@@ -289,7 +291,27 @@ def wrapper(array, *by, func, skipna, **kwargs):
             if "nan" not in func and func not in ["all", "any", "count"]:
                 func = f"nan{func}"
 
+        requires_numeric = func not in ["count", "any", "all"]
+        if requires_numeric:
+            is_npdatetime = array.dtype.kind in "Mm"
+            is_cftime = _contains_cftime_datetimes(array)
+            if is_npdatetime:
+                offset = _datetime_nanmin(array)
+                # xarray always uses np.datetime64[ns] for np.datetime64 data
+                dtype = "timedelta64[ns]"
+                array = datetime_to_numeric(array, offset)
+            elif _contains_cftime_datetimes(array):
+                offset = min(array)
+                array = datetime_to_numeric(array, offset, datetime_unit="us")
+
         result, *groups = groupby_reduce(array, *by, func=func, **kwargs)
+
+        if requires_numeric:
+            if is_npdatetime:
+                return result.astype(dtype) + offset
+            elif is_cftime:
+                return _to_pytimedelta(result, unit="us") + offset
+
         return result
 
     # These data variables do not have any of the core dimension,
diff --git a/flox/xrutils.py b/flox/xrutils.py
@@ -2,11 +2,18 @@
 # defined in xarray
 
 
+import datetime
 from typing import Any, Iterable
 
 import numpy as np
 import pandas as pd
 
+try:
+    import cftime
+except ImportError:
+    cftime = None
+
+
 try:
     import dask.array
 
@@ -15,6 +22,10 @@
     dask_array_type = ()
 
 
+def asarray(data, xp=np):  # duck arrays pass through untouched; anything else goes to xp.asarray
+    return xp.asarray(data) if not is_duck_array(data) else data
+
+
 def is_duck_array(value: Any) -> bool:
     """Checks if value is a duck array."""
     if isinstance(value, np.ndarray):
@@ -110,3 +121,165 @@ def isnull(data):
             # a null value as well as NaN, but it isn't clear how to do this
             # with duck typing.
             return data != data
+
+
+def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
+    """Convert an array containing datetime-like data to numerical values.
+
+    The datetime array is converted to a timedelta relative to an offset.
+
+    Parameters
+    ----------
+    array : array-like
+        Input data
+    offset : None, datetime or cftime.datetime
+        Datetime offset. If None, this is set by default to the array's minimum
+        value to reduce round off errors.
+    datetime_unit : {None, Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
+        If not None, convert output to a given datetime unit. Note that some
+        conversions are not allowed due to non-linear relationships between units.
+    dtype : dtype
+        Output dtype.
+
+    Returns
+    -------
+    array
+        Numerical representation of datetime object relative to an offset.
+
+    Raises
+    ------
+    TypeError
+        If ``array - offset`` is neither an object nor a datetime/timedelta array.
+
+    Notes
+    -----
+    Some datetime unit conversions won't work, for example from days to years, even
+    though some calendars would allow for them (e.g. no_leap). This is because there
+    is no `cftime.timedelta` object.
+    """
+    # TODO: make this function dask-compatible?
+    if offset is None:
+        if array.dtype.kind in "Mm":
+            from xarray.core.duck_array_ops import _datetime_nanmin
+            offset = _datetime_nanmin(array)
+        else:
+            offset = min(array)
+
+    # NOTE: for np.datetime64 this subtraction can silently yield garbage due to
+    # overflow (one option: enforce 1970-01-01 as the universal offset). map_blocks is
+    # for dask == 2021.04.1, which cannot subtract object arrays (needed for cftime).
+    if is_duck_dask_array(array) and np.issubdtype(array.dtype, object):
+        array = array.map_blocks(lambda a, b: a - b, offset, meta=array._meta)
+    else:
+        array = array - offset
+
+    # Scalar is converted to 0d-array
+    if not hasattr(array, "dtype"):
+        array = np.array(array)
+
+    if array.dtype.kind == "O":
+        return py_timedelta_to_float(array, datetime_unit or "ns").astype(dtype)
+    if array.dtype.kind in "mM":
+        if datetime_unit:
+            array = array / np.timedelta64(1, datetime_unit)
+        # np.NaT becomes np.nan
+        return np.where(isnull(array), np.nan, array.astype(dtype))
+    raise TypeError(f"Cannot convert array of dtype {array.dtype} to a numeric array.")
+
+
+def timedelta_to_numeric(value, datetime_unit="ns", dtype=float):
+    """Convert a timedelta-like object to numerical values.
+
+    Parameters
+    ----------
+    value : datetime.timedelta, numpy.timedelta64, pandas.Timedelta, str
+        Time delta representation. Strings are parsed with pandas.to_timedelta.
+    datetime_unit : {Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
+        The time units of the output values. Note that some conversions are not allowed due to
+        non-linear relationships between units.
+    dtype : type
+        The output data type. Applied to every accepted input type, strings included.
+
+    """
+    # pandas.Timedelta subclasses datetime.timedelta, so it has to be tested first.
+    if isinstance(value, pd.Timedelta):
+        out = pd_timedelta_to_float(value, datetime_unit)
+    elif isinstance(value, np.timedelta64):
+        out = np_timedelta64_to_float(value, datetime_unit)
+    elif isinstance(value, datetime.timedelta):
+        out = py_timedelta_to_float(value, datetime_unit)
+    elif isinstance(value, str):
+        try:
+            parsed = pd.to_timedelta(value)
+        except ValueError as err:
+            raise ValueError(
+                f"Could not convert {value!r} to timedelta64 using pandas.to_timedelta"
+            ) from err
+        # No early return here, so that ``dtype`` is honoured for strings as well.
+        out = py_timedelta_to_float(parsed, datetime_unit)
+    else:
+        raise TypeError(
+            f"Expected value of type str, pandas.Timedelta, datetime.timedelta "
+            f"or numpy.timedelta64, but received {type(value).__name__}"
+        )
+    return out.astype(dtype)
+
+
+def _to_pytimedelta(array, unit="us"):  # cast to timedelta64[unit], then to datetime.timedelta dtype
+    return array.astype("timedelta64[{}]".format(unit)).astype(datetime.timedelta)
+
+
+def np_timedelta64_to_float(array, datetime_unit):
+    """Convert numpy.timedelta64 to float.
+
+    Notes
+    -----
+    The input is first cast to nanosecond resolution and to float64, and is
+    then rescaled from nanoseconds to ``datetime_unit``.
+    """
+    nanoseconds = array.astype("timedelta64[ns]").astype(np.float64)
+    units_per_ns = np.timedelta64(1, "ns") / np.timedelta64(1, datetime_unit)
+    return units_per_ns * nanoseconds
+
+
+def pd_timedelta_to_float(value, datetime_unit):
+    """Convert pandas.Timedelta to float.
+
+    Notes
+    -----
+    The value is turned into a numpy.timedelta64 and handed to
+    ``np_timedelta64_to_float``; pandas timedelta values are assumed to be in
+    nanoseconds, which is also the numpy default resolution.
+    """
+    return np_timedelta64_to_float(value.to_timedelta64(), datetime_unit)
+
+
+def _timedelta_to_seconds(array):  # NOTE: despite the name, the result is in microseconds (x 1e6)
+    return np.array([td.total_seconds() for td in array.ravel()]).reshape(array.shape) * 1e6
+
+
+def py_timedelta_to_float(array, datetime_unit):
+    """Convert a timedelta object to a float, possibly at a loss of resolution."""
+    array = asarray(array)
+    # _timedelta_to_seconds yields microseconds; dask inputs are mapped block by block.
+    if not is_duck_dask_array(array):
+        micros = _timedelta_to_seconds(array)
+    else:
+        micros = array.map_blocks(_timedelta_to_seconds, meta=np.array([], dtype=np.float64))
+    return (np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit)) * micros
+
+
+def _contains_cftime_datetimes(array) -> bool:
+    """Check if an array contains cftime.datetime objects.
+
+    Only the first element of a non-empty object array is inspected; when
+    cftime is not installed the answer is always False.
+    """
+    if cftime is None or array.dtype != np.dtype("O") or not array.size > 0:
+        return False
+    first = array.ravel()[0]
+    if is_duck_dask_array(first):
+        first = first.compute()
+        if isinstance(first, np.ndarray):
+            first = first.item()
+    return isinstance(first, cftime.datetime)
diff --git a/tests/test_xarray.py b/tests/test_xarray.py
@@ -414,13 +414,14 @@ def test_cache():
 
 
 @pytest.mark.parametrize("use_cftime", [True, False])
-def test_datetime_array_reduce(use_cftime):
+@pytest.mark.parametrize("func", ["count", "mean"])
+def test_datetime_array_reduce(use_cftime, func):
 
     time = xr.DataArray(
         xr.date_range("2009-01-01", "2012-12-31", use_cftime=use_cftime),
         dims=("time",),
         name="time",
     )
-    expected = time.resample(time="YS").count()  # fails
-    actual = resample_reduce(time.resample(time="YS"), func="count", engine="flox")
+    expected = getattr(time.resample(time="YS"), func)()
+    actual = resample_reduce(time.resample(time="YS"), func=func, engine="flox")
     assert_equal(expected, actual)