Skip to content

Commit ccbda27

Browse files
authored
Convert datetimes to numeric. (#108)
1 parent 227ce04 commit ccbda27

File tree

3 files changed

+199
-3
lines changed

3 files changed

+199
-3
lines changed

flox/xarray.py

+22
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77
import xarray as xr
88
from packaging.version import Version
9+
from xarray.core.duck_array_ops import _datetime_nanmin
910

1011
from .aggregations import Aggregation, _atleast_1d
1112
from .core import (
@@ -15,6 +16,7 @@
1516
rechunk_for_blockwise as rechunk_array_for_blockwise,
1617
rechunk_for_cohorts as rechunk_array_for_cohorts,
1718
)
19+
from .xrutils import _contains_cftime_datetimes, _to_pytimedelta, datetime_to_numeric
1820

1921
if TYPE_CHECKING:
2022
from xarray import DataArray, Dataset, Resample
@@ -289,7 +291,27 @@ def wrapper(array, *by, func, skipna, **kwargs):
289291
if "nan" not in func and func not in ["all", "any", "count"]:
290292
func = f"nan{func}"
291293

294+
requires_numeric = func not in ["count", "any", "all"]
295+
if requires_numeric:
296+
is_npdatetime = array.dtype.kind in "Mm"
297+
is_cftime = _contains_cftime_datetimes(array)
298+
if is_npdatetime:
299+
offset = _datetime_nanmin(array)
300+
# xarray always uses np.datetime64[ns] for np.datetime64 data
301+
dtype = "timedelta64[ns]"
302+
array = datetime_to_numeric(array, offset)
303+
elif _contains_cftime_datetimes(array):
304+
offset = min(array)
305+
array = datetime_to_numeric(array, offset, datetime_unit="us")
306+
292307
result, *groups = groupby_reduce(array, *by, func=func, **kwargs)
308+
309+
if requires_numeric:
310+
if is_npdatetime:
311+
return result.astype(dtype) + offset
312+
elif is_cftime:
313+
return _to_pytimedelta(result, unit="us") + offset
314+
293315
return result
294316

295317
# These data variables do not have any of the core dimension,

flox/xrutils.py

+173
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,18 @@
22
# defined in xarray
33

44

5+
import datetime
56
from typing import Any, Iterable
67

78
import numpy as np
89
import pandas as pd
910

11+
try:
12+
import cftime
13+
except ImportError:
14+
cftime = None
15+
16+
1017
try:
1118
import dask.array
1219

@@ -15,6 +22,10 @@
1522
dask_array_type = ()
1623

1724

25+
def asarray(data, xp=np):
    """Return ``data`` as an array, leaving duck arrays untouched.

    Parameters
    ----------
    data : Any
        Object to convert.
    xp : module, optional
        Array namespace whose ``asarray`` performs the conversion.
        Defaults to numpy.

    Returns
    -------
    array
        ``data`` itself when ``is_duck_array(data)`` is true, otherwise
        ``xp.asarray(data)``.
    """
    if is_duck_array(data):
        return data
    return xp.asarray(data)
27+
28+
1829
def is_duck_array(value: Any) -> bool:
1930
"""Checks if value is a duck array."""
2031
if isinstance(value, np.ndarray):
@@ -110,3 +121,165 @@ def isnull(data):
110121
# a null value as well as NaN, but it isn't clear how to do this
111122
# with duck typing.
112123
return data != data
124+
125+
126+
def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
    """Express datetime-like data as numbers measured from an offset.

    ``offset`` is subtracted from ``array`` and the resulting timedeltas are
    converted to ``dtype``.

    Parameters
    ----------
    array : array-like
        Input data.
    offset : None, datetime or cftime.datetime
        Reference point subtracted from ``array``. If None, the minimum of
        ``array`` is used, which reduces round-off errors.
    datetime_unit : {None, Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
        If not None, convert output to the given datetime unit. Note that
        some conversions are not allowed due to non-linear relationships
        between units. Object-dtype differences fall back to ``"ns"`` when
        no unit is given.
    dtype : dtype
        Output dtype.

    Returns
    -------
    array
        Numerical representation of the datetime data relative to the
        offset. NOTE(review): when the difference has neither object nor
        datetime64/timedelta64 dtype, no branch matches and None is returned.

    Notes
    -----
    Some datetime unit conversions won't work, for example from days to years,
    even though some calendars would allow for them (e.g. no_leap). This is
    because there is no `cftime.timedelta` object.
    """
    # TODO: make this function dask-compatible?
    from xarray.core.duck_array_ops import _datetime_nanmin

    # Fall back to the minimum of the data when no reference point is given.
    if offset is None:
        offset = _datetime_nanmin(array) if array.dtype.kind in "Mm" else min(array)

    # Compute timedelta object.
    # For np.datetime64, this can silently yield garbage due to overflow.
    # One option is to enforce 1970-01-01 as the universal offset.

    # The map_blocks path exists for backwards compatibility:
    # dask == 2021.04.1 does not support subtracting object arrays,
    # which is required for cftime.
    is_dask_object_array = is_duck_dask_array(array) and np.issubdtype(
        array.dtype, object
    )
    if is_dask_object_array:
        delta = array.map_blocks(lambda a, b: a - b, offset, meta=array._meta)
    else:
        delta = array - offset

    # A scalar difference is promoted to a 0d-array.
    if not hasattr(delta, "dtype"):
        delta = np.array(delta)

    kind = delta.dtype.kind

    # Object arrays hold timedelta objects; they go through microseconds.
    if kind == "O":
        return py_timedelta_to_float(delta, datetime_unit or "ns").astype(dtype)

    if kind in "mM":
        # Convert to the requested timedelta unit.
        if datetime_unit:
            delta = delta / np.timedelta64(1, datetime_unit)
        # np.NaT entries become np.nan.
        return np.where(isnull(delta), np.nan, delta.astype(dtype))

    return None
188+
189+
190+
def timedelta_to_numeric(value, datetime_unit="ns", dtype=float):
    """Convert a timedelta-like object to numerical values.

    Parameters
    ----------
    value : datetime.timedelta, numpy.timedelta64, pandas.Timedelta, str
        Time delta representation.
    datetime_unit : {Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
        The time units of the output values. Note that some conversions are
        not allowed due to non-linear relationships between units.
    dtype : type
        The output data type. Applied to every supported input type,
        including strings.

    Returns
    -------
    numpy scalar or array
        ``value`` expressed in ``datetime_unit`` and cast to ``dtype``.

    Raises
    ------
    ValueError
        If ``value`` is a string that ``pandas.to_timedelta`` cannot parse.
    TypeError
        If ``value`` is not one of the supported types.
    """
    import datetime as dt

    # pandas.Timedelta subclasses datetime.timedelta, so it must be tested
    # first; otherwise its dedicated branch can never be reached.
    if isinstance(value, pd.Timedelta):
        out = pd_timedelta_to_float(value, datetime_unit)
    elif isinstance(value, dt.timedelta):
        out = py_timedelta_to_float(value, datetime_unit)
    elif isinstance(value, np.timedelta64):
        out = np_timedelta64_to_float(value, datetime_unit)
    elif isinstance(value, str):
        try:
            parsed = pd.to_timedelta(value)
        except ValueError as err:
            # Keep the pandas error as the cause so the parse failure is
            # still visible in the traceback.
            raise ValueError(
                f"Could not convert {value!r} to timedelta64 using pandas.to_timedelta"
            ) from err
        # Fall through to the shared cast below so ``dtype`` is honoured for
        # strings as well.
        out = py_timedelta_to_float(parsed, datetime_unit)
    else:
        raise TypeError(
            f"Expected value of type str, pandas.Timedelta, datetime.timedelta "
            f"or numpy.timedelta64, but received {type(value).__name__}"
        )
    return out.astype(dtype)
226+
227+
228+
def _to_pytimedelta(array, unit="us"):
    """Cast ``array`` to ``timedelta64[unit]`` and then to ``datetime.timedelta``.

    Parameters
    ----------
    array : array-like
        Object exposing ``astype``; its values are interpreted as counts of
        ``unit``.
    unit : str, optional
        numpy timedelta unit used for the intermediate cast. Defaults to
        ``"us"``.

    Returns
    -------
    array
        Result of the two consecutive ``astype`` calls.
    """
    as_timedelta64 = array.astype(f"timedelta64[{unit}]")
    return as_timedelta64.astype(datetime.timedelta)
230+
231+
232+
def np_timedelta64_to_float(array, datetime_unit):
    """Convert numpy.timedelta64 values to floats expressed in ``datetime_unit``.

    Parameters
    ----------
    array : numpy.timedelta64 or array of timedelta64
        Values to convert.
    datetime_unit : str
        numpy timedelta unit of the returned numbers.

    Notes
    -----
    The input is first cast to ``timedelta64[ns]`` and then to ``float64``;
    that nanosecond count is finally rescaled to ``datetime_unit``.
    """
    nanoseconds = array.astype("timedelta64[ns]").astype(np.float64)
    # Ratio of one nanosecond to one ``datetime_unit``.
    scale = np.timedelta64(1, "ns") / np.timedelta64(1, datetime_unit)
    return scale * nanoseconds
243+
244+
245+
def pd_timedelta_to_float(value, datetime_unit):
    """Convert pandas.Timedelta to float.

    Parameters
    ----------
    value : pandas.Timedelta
        Value to convert; only its ``to_timedelta64`` method is used.
    datetime_unit : str
        numpy timedelta unit of the returned number.

    Notes
    -----
    Built on the assumption that pandas timedelta values are in nanoseconds,
    which is also the numpy default resolution.
    """
    return np_timedelta64_to_float(value.to_timedelta64(), datetime_unit)
255+
256+
257+
def _timedelta_to_seconds(array):
    """Return the ``total_seconds()`` of every element, scaled by 1e6.

    Despite the name, the result is therefore in microseconds. The output
    has the same shape as ``array``.
    """
    seconds = [delta.total_seconds() for delta in array.ravel()]
    return np.reshape(seconds, array.shape) * 1e6
259+
260+
261+
def py_timedelta_to_float(array, datetime_unit):
    """Convert a timedelta object to a float, possibly at a loss of resolution.

    Parameters
    ----------
    array : array-like of timedelta objects
        Values to convert; coerced with ``asarray`` first.
    datetime_unit : str
        numpy timedelta unit of the returned numbers.

    Returns
    -------
    array
        Microsecond counts from ``_timedelta_to_seconds`` rescaled to
        ``datetime_unit``.
    """
    array = asarray(array)
    microseconds = (
        array.map_blocks(_timedelta_to_seconds, meta=np.array([], dtype=np.float64))
        if is_duck_dask_array(array)
        else _timedelta_to_seconds(array)
    )
    # Ratio of one microsecond to one ``datetime_unit``.
    scale = np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit)
    return scale * microseconds
270+
271+
272+
def _contains_cftime_datetimes(array) -> bool:
    """Check if an array contains cftime.datetime objects.

    Only the first element of ``array`` is inspected. Returns False when
    cftime could not be imported, when ``array`` is not of object dtype, or
    when it has no elements.
    """
    if cftime is None:
        return False

    if not (array.dtype == np.dtype("O") and array.size > 0):
        return False

    first = array.ravel()[0]
    if is_duck_dask_array(first):
        # The sample is lazy: compute it and unwrap a resulting ndarray
        # to the Python object it holds.
        first = first.compute()
        if isinstance(first, np.ndarray):
            first = first.item()
    return isinstance(first, cftime.datetime)

tests/test_xarray.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -414,13 +414,14 @@ def test_cache():
414414

415415

416416
@pytest.mark.parametrize("use_cftime", [True, False])
417-
def test_datetime_array_reduce(use_cftime):
417+
@pytest.mark.parametrize("func", ["count", "mean"])
418+
def test_datetime_array_reduce(use_cftime, func):
418419

419420
time = xr.DataArray(
420421
xr.date_range("2009-01-01", "2012-12-31", use_cftime=use_cftime),
421422
dims=("time",),
422423
name="time",
423424
)
424-
expected = time.resample(time="YS").count() # fails
425-
actual = resample_reduce(time.resample(time="YS"), func="count", engine="flox")
425+
expected = getattr(time.resample(time="YS"), func)()
426+
actual = resample_reduce(time.resample(time="YS"), func=func, engine="flox")
426427
assert_equal(expected, actual)

0 commit comments

Comments
 (0)