diff --git a/CHANGELOG.md b/CHANGELOG.md index c81360df7b..a592e63e5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co **Improved** +- `TimeSeries.from_dataframe()` and `from_series()` now support creating `TimeSeries` from additional backends (Polars, PyArrow, ...). We leverage `narwhals` as the compatibility layer between dataframe libraries. See the `narwhals` [documentation](https://narwhals-dev.github.io/narwhals/) for all supported backends. [#2661](https://github.com/unit8co/darts/pull/2661) by [Jules Authier](https://github.com/authierj) - Added ONNX support for torch-based models with method `TorchForecastingModel.to_onnx()`. Check out [this example](https://unit8co.github.io/darts/userguide/gpu_and_tpu_usage.html#exporting-model-to-onnx-format-for-inference) from the user guide on how to export and load a model for inference. [#2620](https://github.com/unit8co/darts/pull/2620) by [Antoine Madrona](https://github.com/madtoinou) - Made method `ForecastingModel.untrained_model()` public. Use this method to get a new (untrained) model instance created with the same parameters. [#2684](https://github.com/unit8co/darts/pull/2684) by [Timon Erhart](https://github.com/turbotimon) - `TimeSeries.plot()` now supports setting the color for each component in the series. Simply pass a list / sequence of colors with length matching the number of components as parameters "c" or "colors". [#2680](https://github.com/unit8co/darts/pull/2680) by [Jules Authier](https://github.com/authierj) diff --git a/darts/tests/test_timeseries.py b/darts/tests/test_timeseries.py index 41b04aebd4..f4d52477d3 100644 --- a/darts/tests/test_timeseries.py +++ b/darts/tests/test_timeseries.py @@ -2506,7 +2506,16 @@ def test_tail_numeric_time_index(self): class TestTimeSeriesFromDataFrame: - def test_from_dataframe_sunny_day(self): + def pd_to_backend(self, df, backend, index=False): + if backend == "pandas": + return df + # elif backend == "polars": + # if index: + # return pl.from_pandas(df.reset_index()) + # return pl.from_pandas(df) + + @pytest.mark.parametrize("backend", ["pandas"]) + def test_from_dataframe_sunny_day(self, backend): data_dict = {"Time": pd.date_range(start="20180501", end="20200301", freq="MS")} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) @@ -2520,40 +2529,55 @@ def test_from_dataframe_sunny_day(self): data_pd2["Time"] = data_pd2["Time"].apply(lambda date: str(date)) data_pd3 = data_pd1.set_index("Time") - data_darts1 = TimeSeries.from_dataframe(df=data_pd1, time_col="Time") - data_darts2 = TimeSeries.from_dataframe(df=data_pd2, time_col="Time") - data_darts3 = TimeSeries.from_dataframe(df=data_pd3) + data_darts1 = TimeSeries.from_dataframe( + df=self.pd_to_backend(data_pd1, backend), time_col="Time" + ) + data_darts2 = TimeSeries.from_dataframe( + df=self.pd_to_backend(data_pd2, backend), time_col="Time" + ) + data_darts3 = TimeSeries.from_dataframe( + df=self.pd_to_backend(data_pd3, backend, index=True), + time_col=None if backend == "pandas" else "Time", + ) assert data_darts1 == data_darts2 assert data_darts1 == data_darts3 - def test_time_col_convert_string_integers(self): + @pytest.mark.parametrize("backend", ["pandas"]) + def test_time_col_convert_string_integers(self, backend): expected = np.array(list(range(3, 10))) data_dict = {"Time": expected.astype(str)} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) ) df = 
pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) assert set(ts.time_index.values.tolist()) == set(expected) assert ts.time_index.dtype == int assert ts.time_index.name == "Time" - def test_time_col_convert_integers(self): + @pytest.mark.parametrize("backend", ["pandas"]) + def test_time_col_convert_integers(self, backend): expected = np.array(list(range(10))) data_dict = {"Time": expected} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) ) + df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) assert set(ts.time_index.values.tolist()) == set(expected) assert ts.time_index.dtype == int assert ts.time_index.name == "Time" - def test_fail_with_bad_integer_time_col(self): + @pytest.mark.parametrize("backend", ["pandas"]) + def test_fail_with_bad_integer_time_col(self, backend): bad_time_col_vals = np.array([4, 0, 1, 2]) data_dict = {"Time": bad_time_col_vals} data_dict["Values1"] = np.random.uniform( @@ -2561,9 +2585,12 @@ def test_fail_with_bad_integer_time_col(self): ) df = pd.DataFrame(data_dict) with pytest.raises(ValueError): - TimeSeries.from_dataframe(df=df, time_col="Time") + TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) - def test_time_col_convert_rangeindex(self): + @pytest.mark.parametrize("backend", ["pandas"]) + def test_time_col_convert_rangeindex(self, backend): for expected_l, step in zip([[4, 0, 2, 3, 1], [8, 0, 4, 6, 2]], [1, 2]): expected = np.array(expected_l) data_dict = {"Time": expected} @@ -2571,7 +2598,9 @@ def test_time_col_convert_rangeindex(self): low=-10, high=10, size=len(data_dict["Time"]) ) df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) # check type (should convert to RangeIndex): assert type(ts.time_index) is pd.RangeIndex @@ -2586,31 +2615,38 @@ def test_time_col_convert_rangeindex(self): ] assert np.all(ar1 == ar2) - def test_time_col_convert_datetime(self): + @pytest.mark.parametrize("backend", ["pandas"]) + def test_time_col_convert_datetime(self, backend): expected = pd.date_range(start="20180501", end="20200301", freq="MS") data_dict = {"Time": expected} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) ) df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) assert ts.time_index.dtype == "datetime64[ns]" assert ts.time_index.name == "Time" - def test_time_col_convert_datetime_strings(self): + @pytest.mark.parametrize("backend", ["pandas"]) + def test_time_col_convert_datetime_strings(self, backend): expected = pd.date_range(start="20180501", end="20200301", freq="MS") data_dict = {"Time": expected.values.astype(str)} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) ) df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) assert ts.time_index.dtype == "datetime64[ns]" assert ts.time_index.name == "Time" - def test_time_col_with_tz(self): + @pytest.mark.parametrize("backend", ["pandas"]) + 
def test_time_col_with_tz_df(self, backend): # numpy and xarray don't support "timezone aware" pd.DatetimeIndex # the BUGFIX removes timezone information without conversion @@ -2621,13 +2657,10 @@ def test_time_col_with_tz(self): # pd.DataFrame loses the tz information unless it is contained in its index # (other columns are silently converted to UTC, with tz attribute set to None) df = pd.DataFrame(data=values, index=time_range_MS) - ts = TimeSeries.from_dataframe(df=df) - assert list(ts.time_index) == list(time_range_MS.tz_localize(None)) - assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS) - assert ts.time_index.tz is None - - serie = pd.Series(data=values, index=time_range_MS) - ts = TimeSeries.from_series(pd_series=serie) + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend, index=True), + time_col=None if backend == "pandas" else "index", + ) assert list(ts.time_index) == list(time_range_MS.tz_localize(None)) assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS) assert ts.time_index.tz is None @@ -2643,23 +2676,42 @@ def test_time_col_with_tz(self): values = np.random.uniform(low=-10, high=10, size=len(time_range_H)) df = pd.DataFrame(data=values, index=time_range_H) - ts = TimeSeries.from_dataframe(df=df) + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend, index=True), + time_col=None if backend == "pandas" else "index", + ) assert list(ts.time_index) == list(time_range_H.tz_localize(None)) assert list(ts.time_index.tz_localize("CET")) == list(time_range_H) assert ts.time_index.tz is None - series = pd.Series(data=values, index=time_range_H) - ts = TimeSeries.from_series(pd_series=series) + ts = TimeSeries.from_times_and_values(times=time_range_H, values=values) assert list(ts.time_index) == list(time_range_H.tz_localize(None)) assert list(ts.time_index.tz_localize("CET")) == list(time_range_H) assert ts.time_index.tz is None - ts = TimeSeries.from_times_and_values(times=time_range_H, values=values) + def test_time_col_with_tz_series(self): + time_range_MS = pd.date_range( + start="20180501", end="20200301", freq="MS", tz="CET" + ) + values = np.random.uniform(low=-10, high=10, size=len(time_range_MS)) + serie = pd.Series(data=values, index=time_range_MS) + ts = TimeSeries.from_series(pd_series=serie) + assert list(ts.time_index) == list(time_range_MS.tz_localize(None)) + assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS) + assert ts.time_index.tz is None + + time_range_H = pd.date_range( + start="20200518", end="20200521", freq=freqs["h"], tz="CET" + ) + values = np.random.uniform(low=-10, high=10, size=len(time_range_H)) + series = pd.Series(data=values, index=time_range_H) + ts = TimeSeries.from_series(pd_series=series) assert list(ts.time_index) == list(time_range_H.tz_localize(None)) assert list(ts.time_index.tz_localize("CET")) == list(time_range_H) assert ts.time_index.tz is None - def test_time_col_convert_garbage(self): + @pytest.mark.parametrize("backend", ["pandas"]) + def test_time_col_convert_garbage(self, backend): expected = [ "2312312asdfdw", "asdfsdf432sdf", @@ -2674,9 +2726,12 @@ def test_time_col_convert_garbage(self): df = pd.DataFrame(data_dict) with pytest.raises(AttributeError): - TimeSeries.from_dataframe(df=df, time_col="Time") + TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) - def test_df_named_columns_index(self): + @pytest.mark.parametrize("backend", ["pandas"]) + def test_df_named_columns_index(self, backend): time_index = 
generate_index( start=pd.Timestamp("2000-01-01"), length=4, freq="D", name="index" ) @@ -2686,7 +2741,10 @@ def test_df_named_columns_index(self): columns=["y"], ) df.columns.name = "id" - ts = TimeSeries.from_dataframe(df) + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend, index=True), + time_col=None if backend == "pandas" else "index", + ) exp_ts = TimeSeries.from_times_and_values( times=time_index, diff --git a/darts/timeseries.py b/darts/timeseries.py index 0804c40133..2a1b336bcf 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -33,6 +33,7 @@ ``TimeSeries`` documentation for more information on covariates. """ +import contextlib import itertools import pickle import re @@ -46,9 +47,11 @@ import matplotlib.axes import matplotlib.pyplot as plt +import narwhals as nw import numpy as np import pandas as pd import xarray as xr +from narwhals.typing import IntoDataFrame, IntoSeries from pandas.tseries.frequencies import to_offset from scipy.stats import kurtosis, skew @@ -569,7 +572,7 @@ def from_csv( @classmethod def from_dataframe( cls, - df: pd.DataFrame, + df: IntoDataFrame, time_col: Optional[str] = None, value_cols: Optional[Union[list[str], str]] = None, fill_missing_dates: Optional[bool] = False, @@ -586,7 +589,10 @@ def from_dataframe( Parameters ---------- df - The DataFrame + The DataFrame, or anything which can be converted to a narwhals DataFrame (e.g. pandas.DataFrame, + polars.DataFrame, ...). See the `narwhals documentation + `_ for more + information. time_col The time column name. If set, the column will be cast to a pandas DatetimeIndex (if it contains timestamps) or a RangeIndex (if it contains integers). @@ -649,12 +655,14 @@ def from_dataframe( TimeSeries A univariate or multivariate deterministic TimeSeries constructed from the inputs. """ + df = nw.from_native(df, eager_only=True, pass_through=False) + time_zone = None # get values if value_cols is None: - series_df = df.loc[:, df.columns != time_col] + series_df = df.drop(time_col) if time_col else df else: - if isinstance(value_cols, str): + if isinstance(value_cols, (str, int)): value_cols = [value_cols] series_df = df[value_cols] @@ -663,77 +671,90 @@ def from_dataframe( if time_col not in df.columns: raise_log(AttributeError(f"time_col='{time_col}' is not present.")) - time_index = pd.Index([]) - time_col_vals = df[time_col] + time_col_vals = df.get_column(time_col) - if np.issubdtype(time_col_vals.dtype, object): + if time_col_vals.dtype == nw.String: # Try to convert to integers if needed - try: - time_col_vals = time_col_vals.astype(int) - except ValueError: - pass - - if np.issubdtype(time_col_vals.dtype, np.integer): - # We have to check all integers appear only once to have a valid index - raise_if( - time_col_vals.duplicated().any(), - "The provided integer time index column contains duplicate values.", - ) + with contextlib.suppress(Exception): + time_col_vals = time_col_vals.cast(nw.Int64) + if time_col_vals.dtype.is_integer(): + if time_col_vals.is_duplicated().any(): + raise_log( + ValueError( + "The provided integer time index column contains duplicate values." 
+ ) + ) # Temporarily use an integer Index to sort the values, and replace by a # RangeIndex in `TimeSeries.from_xarray()` time_index = pd.Index(time_col_vals) - elif np.issubdtype(time_col_vals.dtype, object): + elif isinstance(time_col_vals.dtype, nw.String): # The integer conversion failed; try datetimes try: time_index = pd.DatetimeIndex(time_col_vals) except ValueError: raise_log( AttributeError( - "'time_col' is of 'object' dtype but doesn't contain valid timestamps" + "'time_col' is of 'String' dtype but doesn't contain valid timestamps" ) ) - elif np.issubdtype(time_col_vals.dtype, np.datetime64): + elif isinstance(time_col_vals.dtype, nw.Datetime): + # remember time zone here as polars converts to UTC + time_zone = time_col_vals.dtype.time_zone + if time_zone is not None: + time_col_vals = time_col_vals.dt.replace_time_zone(None) time_index = pd.DatetimeIndex(time_col_vals) else: raise_log( AttributeError( - "Invalid type of `time_col`: it needs to be of either 'str', 'datetime' or 'int' dtype." + "Invalid type of `time_col`: it needs to be of either 'String', 'Datetime' or 'Int' dtype." ) ) time_index.name = time_col else: - raise_if_not( - isinstance(df.index, VALID_INDEX_TYPES) - or np.issubdtype(df.index.dtype, np.integer), - "If time_col is not specified, the DataFrame must be indexed either with " - "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex", - logger, - ) - # BUGFIX : force time-index to be timezone naive as xarray doesn't support it - # pandas.DataFrame loses the tz information if it's not its index - if isinstance(df.index, pd.DatetimeIndex) and df.index.tz is not None: - logger.warning( - "The provided DatetimeIndex was associated with a timezone, which is currently not supported " - "by xarray. To avoid unexpected behaviour, the tz information was removed. Consider calling " - f"`ts.time_index.tz_localize({df.index.tz})` when exporting the results." - "To plot the series with the right time steps, consider setting the matplotlib.pyplot " - "`rcParams['timezone']` parameter to automatically convert the time axis back to the " - "original timezone." + time_index = nw.maybe_get_index(df) + if time_index is None: + time_index = pd.RangeIndex(len(df)) + logger.info( + "No time column specified (`time_col=None`) and no index found in the DataFrame. Defaulting to " + "`pandas.RangeIndex(len(df))`. If this is not desired consider adding a time column " + "to your dataframe and defining `time_col`." ) - time_index = df.index.tz_localize(None) - else: - time_index = df.index + # if we are here, the dataframe was pandas + elif not ( + isinstance(time_index, VALID_INDEX_TYPES) + or np.issubdtype(time_index.dtype, np.integer) + ): + raise_log( + ValueError( + "If time_col is not specified, the DataFrame must be indexed either with " + "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex" + ), + logger, + ) + if isinstance(time_index, pd.DatetimeIndex): + time_zone = time_index.tz + if time_zone is not None: + # remove and remember time zone here as pandas converts to UTC + time_index = time_index.tz_localize(None) + + # BUGFIX : force time-index to be timezone naive as xarray doesn't support it + if time_zone is not None: + logger.warning( + "The provided DatetimeIndex was associated with a timezone, which is currently not supported " + "by xarray. To avoid unexpected behaviour, the tz information was removed. 
Consider calling " + f"`ts.time_index.tz_localize({time_zone})` when exporting the results." + "To plot the series with the right time steps, consider setting the matplotlib.pyplot " + "`rcParams['timezone']` parameter to automatically convert the time axis back to the " + "original timezone." + ) if not time_index.name: time_index.name = time_col if time_col else DIMS[0] - if series_df.columns.name: - series_df.columns.name = None - xa = xr.DataArray( - series_df.values[:, :, np.newaxis], + series_df.to_numpy()[:, :, np.newaxis], dims=(time_index.name,) + DIMS[-2:], coords={time_index.name: time_index, DIMS[1]: series_df.columns}, attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy}, @@ -960,14 +981,14 @@ def from_group(static_cov_vals, group): @classmethod def from_series( cls, - pd_series: pd.Series, + pd_series: IntoSeries, fill_missing_dates: Optional[bool] = False, freq: Optional[Union[str, int]] = None, fillna_value: Optional[float] = None, static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, ) -> Self: """ - Build a univariate deterministic series from a pandas Series. + Build a univariate deterministic TimeSeries from a Series. The series must contain an index that is either a pandas DatetimeIndex, a pandas RangeIndex, or a pandas Index that can be converted into a RangeIndex. It is better if the index has no holes; alternatively setting @@ -977,7 +998,10 @@ def from_series( Parameters ---------- pd_series - The pandas Series instance. + The Series, or anything which can be converted to a narwhals Series (e.g. pandas.Series, ...). See the + `narwhals documentation + `_ for more + information. fill_missing_dates Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index) with NaN values. This requires either a provided `freq` or the possibility to infer the frequency from the @@ -1001,7 +1025,8 @@ def from_series( TimeSeries A univariate and deterministic TimeSeries constructed from the inputs. 
""" - df = pd.DataFrame(pd_series) + nw_series = nw.from_native(pd_series, series_only=True, pass_through=False) + df = nw_series.to_frame() return cls.from_dataframe( df, time_col=None, @@ -1563,18 +1588,16 @@ def pd_series(self, copy=True) -> pd.Series: """ self._assert_univariate() self._assert_deterministic() + + data = self._xa[:, 0, 0].values + index = self._time_index + name = self.components[0] + if copy: - return pd.Series( - self._xa[:, 0, 0].values.copy(), - index=self._time_index.copy(), - name=self.components[0], - ) - else: - return pd.Series( - self._xa[:, 0, 0].values, - index=self._time_index, - name=self.components[0], - ) + data = data.copy() + index = index.copy() + + return pd.Series(data=data, index=index, name=name) def pd_dataframe(self, copy=True, suppress_warnings=False) -> pd.DataFrame: """ @@ -1606,36 +1629,22 @@ def pd_dataframe(self, copy=True, suppress_warnings=False) -> pd.DataFrame: comp_name = list(self.components) samples = range(self.n_samples) - df_col_names = [ + columns = [ "_s".join((comp_name, str(sample_id))) for comp_name, sample_id in itertools.product(comp_name, samples) ] - - if copy: - return pd.DataFrame( - self._xa.stack(data=(DIMS[1], DIMS[2])).values.copy(), - index=self._time_index.copy(), - columns=df_col_names.copy(), - ) - else: - return pd.DataFrame( - self._xa.stack(data=(DIMS[1], DIMS[2])).values, - index=self._time_index, - columns=df_col_names, - ) + data = self._xa.stack(data=(DIMS[1], DIMS[2])).values else: - if copy: - return pd.DataFrame( - self._xa[:, :, 0].values.copy(), - index=self._time_index.copy(), - columns=self._xa.get_index(DIMS[1]).copy(), - ) - else: - return pd.DataFrame( - self._xa[:, :, 0].values, - index=self._time_index, - columns=self._xa.get_index(DIMS[1]), - ) + columns = self._xa.get_index(DIMS[1]) + data = self._xa[:, :, 0].values + index = self._time_index + + if copy: + columns = columns.copy() + data = data.copy() + index = index.copy() + + return pd.DataFrame(data=data, index=index, columns=columns) def quantile_df(self, quantile=0.5) -> pd.DataFrame: """ diff --git a/requirements/core.txt b/requirements/core.txt index 0245c46194..21abb02049 100644 --- a/requirements/core.txt +++ b/requirements/core.txt @@ -1,6 +1,7 @@ holidays>=0.11.1 joblib>=0.16.0 matplotlib>=3.3.0 +narwhals>=1.25.1 nfoursid>=1.0.0 numpy>=1.19.0,<2.0.0 pandas>=1.0.5