From 28a929810bb960e2476ab420c7c2d4f174d47e84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cauthierj=E2=80=9D?= <“jules.authier@unit8.co”> Date: Fri, 31 Jan 2025 10:16:19 +0100 Subject: [PATCH 01/29] narwhals implementation for and test benchmark --- darts/timeseries.py | 173 ++++++++++++++++++++++++++++++++++++++++++ narwhals_test_time.py | 123 ++++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) create mode 100644 narwhals_test_time.py diff --git a/darts/timeseries.py b/darts/timeseries.py index 4b7940e91f..392727c932 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -46,9 +46,11 @@ import matplotlib.axes import matplotlib.pyplot as plt +import narwhals as nw import numpy as np import pandas as pd import xarray as xr +from narwhals.typing import DataFrameT from pandas.tseries.frequencies import to_offset from scipy.stats import kurtosis, skew @@ -745,6 +747,177 @@ def from_dataframe( fillna_value=fillna_value, ) + @classmethod + def from_narwhals_dataframe( + cls, + df: DataFrameT, + time_col: Optional[str] = None, + value_cols: Optional[Union[list[str], str]] = None, + fill_missing_dates: Optional[bool] = False, + freq: Optional[Union[str, int]] = None, + fillna_value: Optional[float] = None, + static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, + hierarchy: Optional[dict] = None, + ) -> Self: + """ + Build a deterministic TimeSeries instance built from a selection of columns of a DataFrame. + One column (or the DataFrame index) has to represent the time, + and a list of columns `value_cols` has to represent the values for this time series. + + Parameters + ---------- + df + The DataFrame + time_col + The time column name. If set, the column will be cast to a pandas DatetimeIndex (if it contains + timestamps) or a RangeIndex (if it contains integers). + If not set, the DataFrame index will be used. 
In this case the DataFrame must contain an index that is + either a pandas DatetimeIndex, a pandas RangeIndex, or a pandas Index that can be converted to a + RangeIndex. It is better if the index has no holes; alternatively setting `fill_missing_dates` can in some + cases solve these issues (filling holes with NaN, or with the provided `fillna_value` numeric value, if + any). + value_cols + A string or list of strings representing the value column(s) to be extracted from the DataFrame. If set to + `None`, the whole DataFrame will be used. + fill_missing_dates + Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index) + with NaN values. This requires either a provided `freq` or the possibility to infer the frequency from the + provided timestamps. See :meth:`_fill_missing_dates() ` for more info. + freq + Optionally, a string or integer representing the frequency of the underlying index. This is useful in order + to fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`. + If a string, represents the frequency of the pandas DatetimeIndex (see `offset aliases + `_ for more info on + supported frequencies). + If an integer, represents the step size of the pandas Index or pandas RangeIndex. + fillna_value + Optionally, a numeric value to fill missing values (NaNs) with. + static_covariates + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas + DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied' + to all components of the TimeSeries. If a DataFrame, the columns represent the static variables and the + rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates + are globally 'applied' to all components of the TimeSeries. 
If a multi-row DataFrame, the number of + rows must match the number of components of the TimeSeries (in this case, the number of columns in + ``value_cols``). This adds control for component-specific static covariates. + hierarchy + Optionally, a dictionary describing the grouping(s) of the time series. The keys are component names, and + for a given component name `c`, the value is a list of component names that `c` "belongs" to. For instance, + if there is a `total` component, split both in two divisions `d1` and `d2` and in two regions `r1` and `r2`, + and four products `d1r1` (in division `d1` and region `r1`), `d2r1`, `d1r2` and `d2r2`, the hierarchy would + be encoded as follows. + + .. highlight:: python + .. code-block:: python + + hierarchy={ + "d1r1": ["d1", "r1"], + "d1r2": ["d1", "r2"], + "d2r1": ["d2", "r1"], + "d2r2": ["d2", "r2"], + "d1": ["total"], + "d2": ["total"], + "r1": ["total"], + "r2": ["total"] + } + .. + The hierarchy can be used to reconcile forecasts (so that the sums of the forecasts at + different levels are consistent), see `hierarchical reconciliation + `_. + + Returns + ------- + TimeSeries + A univariate or multivariate deterministic TimeSeries constructed from the inputs. 
+ """ + df = nw.from_native(df) + + # get values + if value_cols is None: + if time_col is not None: + series_df = df.drop(time_col) + else: + series_df = df + else: + if isinstance(value_cols, str): + value_cols = [value_cols] + series_df = df[value_cols] + + # get time index + if time_col: + if time_col not in df.columns: + raise_log(AttributeError(f"time_col='{time_col}' is not present.")) + time_col_vals = df[time_col] + + if time_col_vals.dtype == nw.String: + # Try to convert to integers if needed + try: + time_col_vals = time_col_vals.cast(nw.Int64) + except Exception: + pass + + if time_col_vals.dtype == nw.Int64 or time_col_vals.dtype == np.integer: + # We have to check all integers appear only once to have a valid index + if time_col_vals.is_duplicated().any(): + raise_log( + ValueError( + "The provided integer time index column contains duplicate values." + ) + ) + + # Temporarily use an integer Index to sort the values, and replace by a + # RangeIndex in `TimeSeries.from_xarray()` + time_index = time_col_vals.to_list() + + elif time_col_vals.dtype == nw.String: + # The integer conversion failed; try datetimes + try: + time_index = nw.Datetime(time_col_vals) + except Exception: + raise_log( + AttributeError( + "'time_col' is of 'Utf8' dtype but doesn't contain valid timestamps" + ) + ) + elif time_col_vals.dtype == nw.Datetime: + time_index = time_col_vals.to_list() + else: + raise_log( + AttributeError( + "Invalid type of `time_col`: it needs to be of either 'Utf8', 'Datetime' or 'Int64' dtype." 
+ ) + ) + else: + time_col_vals = nw.maybe_get_index(df) + if time_col_vals is None: + raise_log(ValueError("No time column or index found in the DataFrame.")) + # if we are here, the dataframe was pandas + raise_if_not( + isinstance(time_col_vals, VALID_INDEX_TYPES) + or np.issubdtype(time_col_vals.dtype, np.integer), + "If time_col is not specified, the DataFrame must be indexed either with " + "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex", + logger, + ) + time_index = time_col_vals.to_list() + + xa = xr.DataArray( + series_df.to_numpy()[:, :, np.newaxis], + dims=(time_col if time_col else DIMS[0],) + DIMS[-2:], + coords={ + time_col if time_col else DIMS[0]: time_index, + DIMS[1]: series_df.columns, + }, + attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy}, + ) + + return cls.from_xarray( + xa=xa, + fill_missing_dates=fill_missing_dates, + freq=freq, + fillna_value=fillna_value, + ) + @classmethod def from_group_dataframe( cls, diff --git a/narwhals_test_time.py b/narwhals_test_time.py new file mode 100644 index 0000000000..de34a90552 --- /dev/null +++ b/narwhals_test_time.py @@ -0,0 +1,123 @@ +import time +import warnings +from itertools import product + +import numpy as np +import pandas as pd + +from darts.timeseries import TimeSeries + +# Suppress all warnings +warnings.filterwarnings("ignore") + + +def create_random_dataframes( + num_rows: int = 10, + num_columns: int = 3, + index: bool = True, + start_date: str = "2023-01-01", + freq: str = "D", +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Create three pandas DataFrames with random data and dates as the index or as a column. + + Parameters: + - num_rows (int): The number of rows in the DataFrames. + - num_columns (int): The number of columns in the DataFrames. + - index (bool): If True, the date is the index of the DataFrame. If False, the date is a column named 'date'. 
+ - start_date (str): The start date for the date range (used only if date_format is 'date'). + - freq (str): The frequency of the date range (used only if date_format is 'date'). + + Returns: + - tuple: A tuple containing three DataFrames (df_date, df_numpy, df_integer). + """ + # Set a random seed for reproducibility + np.random.seed(42) + + # Generate a date range or integer list based on the date_format parameter + date_values = pd.date_range(start=start_date, periods=num_rows, freq=freq) + integer_values = list(range(1, num_rows + 1)) + numpy_values = np.array( + pd.date_range(start=start_date, periods=num_rows, freq=freq), + dtype="datetime64[D]", + ) + + # Create random data for the DataFrames + data = {f"col_{i}": np.random.randn(num_rows) for i in range(num_columns)} + + # Create the DataFrames + df_date = pd.DataFrame(data) + df_numpy = pd.DataFrame(data) + df_integer = pd.DataFrame(data) + + col_names = df_date.columns.values + + # Set the date as index or as a column based on the index parameter + if index: + df_date.index = date_values + df_numpy.index = numpy_values + df_integer.index = integer_values + else: + df_date["date"] = date_values + df_numpy["date"] = numpy_values + df_integer["date"] = integer_values + + if index: + time_col = None + else: + time_col = "date" + + return [ + [df_date, col_names, time_col], + [df_numpy, col_names, time_col], + [df_integer, col_names, time_col], + ] + + +def test_dataframes() -> list: + test_config = product( + [10, 100, 1000, 10000], + [10, 100, 500, 1000], + [True, False], + ) + + dataframes_list = [ + create_random_dataframes( + num_rows=num_rows, num_columns=num_columns, index=index + ) + for num_rows, num_columns, index in test_config + ] + + return dataframes_list + + +df_list = test_dataframes() + +############ PANDAS ############ +pandas_timer = time.time() +for df_config in df_list: + for df, col_names, time_col in df_config: + _ = TimeSeries.from_dataframe( + df, value_cols=col_names, 
time_col=time_col, freq=None + ) + df_shuffle = df.sample(frac=1) + _ = TimeSeries.from_dataframe( + df_shuffle, value_cols=col_names, time_col=time_col, freq=None + ) +pandas_timer = time.time() - pandas_timer + +############ NARWHALS ############ +narwhals_timer = time.time() +for df_config in df_list: + for df, col_names, time_col in df_config: + _ = TimeSeries.from_narwhals_dataframe( + df, value_cols=col_names, time_col=time_col, freq=None + ) + df_shuffle = df.sample(frac=1) + _ = TimeSeries.from_narwhals_dataframe( + df_shuffle, value_cols=col_names, time_col=time_col, freq=None + ) +narwhals_timer = time.time() - narwhals_timer + +print("pandas processing time: ", pandas_timer) +print("narwhals processing time: ", narwhals_timer) From 0041203af98b103b97ed91ecca651a299473e3d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cauthierj=E2=80=9D?= <“jules.authier@unit8.co”> Date: Tue, 4 Feb 2025 09:52:47 +0100 Subject: [PATCH 02/29] changes from MarcoGorelli incorporated --- darts/timeseries.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 39c63ee418..cdd4cc0d18 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -857,7 +857,7 @@ def from_narwhals_dataframe( except Exception: pass - if time_col_vals.dtype == nw.Int64 or time_col_vals.dtype == np.integer: + if time_col_vals.dtype.is_integer(): # We have to check all integers appear only once to have a valid index if time_col_vals.is_duplicated().any(): raise_log( @@ -873,7 +873,7 @@ def from_narwhals_dataframe( elif time_col_vals.dtype == nw.String: # The integer conversion failed; try datetimes try: - time_index = nw.Datetime(time_col_vals) + time_index = time_col_vals.str.to_datetime() except Exception: raise_log( AttributeError( From 576e88e20fbd7e4af5713eec58408feeb163b50e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cauthierj=E2=80=9D?= <“jules.authier@unit8.co”> Date: Thu, 6 Feb 2025 09:27:11 +0100 Subject: [PATCH 03/29] 
improvement thanks to reviewers --- darts/timeseries.py | 5 ++- narwhals_test_time.py | 93 +++++++++++++++++++++++++++++-------------- 2 files changed, 67 insertions(+), 31 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index cdd4cc0d18..5062696784 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -836,7 +836,8 @@ def from_narwhals_dataframe( # get values if value_cols is None: if time_col is not None: - series_df = df.drop(time_col) + # series_df = df.drop(time_col) + series_df = df.loc[:, df.columns != time_col] else: series_df = df else: @@ -869,6 +870,7 @@ def from_narwhals_dataframe( # Temporarily use an integer Index to sort the values, and replace by a # RangeIndex in `TimeSeries.from_xarray()` time_index = time_col_vals.to_list() + # time_index = pd.Index(time_col_vals) elif time_col_vals.dtype == nw.String: # The integer conversion failed; try datetimes @@ -882,6 +884,7 @@ def from_narwhals_dataframe( ) elif time_col_vals.dtype == nw.Datetime: time_index = time_col_vals.to_list() + # time_index = pd.DatetimeIndex(time_col_vals) else: raise_log( AttributeError( diff --git a/narwhals_test_time.py b/narwhals_test_time.py index de34a90552..f699b8851d 100644 --- a/narwhals_test_time.py +++ b/narwhals_test_time.py @@ -76,8 +76,8 @@ def create_random_dataframes( def test_dataframes() -> list: test_config = product( - [10, 100, 1000, 10000], - [10, 100, 500, 1000], + [10, 100, 1000], + [10, 100, 500], [True, False], ) @@ -93,31 +93,64 @@ def test_dataframes() -> list: df_list = test_dataframes() -############ PANDAS ############ -pandas_timer = time.time() -for df_config in df_list: - for df, col_names, time_col in df_config: - _ = TimeSeries.from_dataframe( - df, value_cols=col_names, time_col=time_col, freq=None - ) - df_shuffle = df.sample(frac=1) - _ = TimeSeries.from_dataframe( - df_shuffle, value_cols=col_names, time_col=time_col, freq=None - ) -pandas_timer = time.time() - pandas_timer - -############ NARWHALS ############ 
-narwhals_timer = time.time() -for df_config in df_list: - for df, col_names, time_col in df_config: - _ = TimeSeries.from_narwhals_dataframe( - df, value_cols=col_names, time_col=time_col, freq=None - ) - df_shuffle = df.sample(frac=1) - _ = TimeSeries.from_narwhals_dataframe( - df_shuffle, value_cols=col_names, time_col=time_col, freq=None - ) -narwhals_timer = time.time() - narwhals_timer - -print("pandas processing time: ", pandas_timer) -print("narwhals processing time: ", narwhals_timer) +num_iter = 5 +pandas_global_timer = 0 +narwhals_global_timer = 0 + +for _ in range(num_iter): + pandas_timer = 0 + narwhals_timer = 0 + for df_config in df_list: + for df, col_names, time_col in df_config: + for i in range(2): + # on the second run we shuffle the data + if i == 1: + df = df.sample(frac=1) + + # pandas processing time + begin = time.time() + pandas_timeseries = TimeSeries.from_dataframe( + df, value_cols=col_names, time_col=time_col, freq=None + ) + end = time.time() + pandas_timer += end - begin + + # narwhals processing time + begin_nw = time.time() + narwhals_timeseries = TimeSeries.from_narwhals_dataframe( + df, value_cols=col_names, time_col=time_col, freq=None + ) + end_nw = time.time() + narwhals_timer += end_nw - begin_nw + + # Check if the TimeSeries objects are equal + try: + assert pandas_timeseries.time_index.equals( + narwhals_timeseries.time_index + ) + except AssertionError as e: + print( + f"Index assertion failed for DataFrame with columns {col_names} and time_col {time_col}: {e}" + ) + try: + np.testing.assert_array_almost_equal( + pandas_timeseries.all_values(), narwhals_timeseries.all_values() + ) + except AssertionError as e: + print( + f"Equal assertion failed for DataFrame with columns {col_names} and time_col {time_col}: {e}" + ) + + print("pandas processing time: ", pandas_timer) + print("narwhals processing time: ", narwhals_timer, "\n") + pandas_global_timer += pandas_timer + narwhals_global_timer += narwhals_timer + 
+pandas_global_timer /= num_iter +narwhals_global_timer /= num_iter + +print("Average pandas processing time: ", pandas_global_timer) +print("Average narwhals processing time: ", narwhals_global_timer) + +diff_in_fraction = (-pandas_global_timer + narwhals_global_timer) / pandas_global_timer +print(f"Average processing time difference: {diff_in_fraction:.2%}") From dbe2cd9259a0e84ff4ba6e9e84fccda111910ce2 Mon Sep 17 00:00:00 2001 From: authierj Date: Fri, 7 Feb 2025 14:54:50 +0100 Subject: [PATCH 04/29] added comments about slow and fast parts of the code --- darts/timeseries.py | 14 +++++++------- narwhals_test_time.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 5062696784..844f2129b7 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -658,7 +658,7 @@ def from_dataframe( else: if isinstance(value_cols, str): value_cols = [value_cols] - series_df = df[value_cols] + series_df = df[value_cols] # slow # get time index if time_col: @@ -734,14 +734,14 @@ def from_dataframe( if series_df.columns.name: series_df.columns.name = None - xa = xr.DataArray( + xa = xr.DataArray( # fast series_df.values[:, :, np.newaxis], dims=(time_index.name,) + DIMS[-2:], coords={time_index.name: time_index, DIMS[1]: series_df.columns}, attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy}, ) - return cls.from_xarray( + return cls.from_xarray( # slow xa=xa, fill_missing_dates=fill_missing_dates, freq=freq, @@ -843,7 +843,7 @@ def from_narwhals_dataframe( else: if isinstance(value_cols, str): value_cols = [value_cols] - series_df = df[value_cols] + series_df = df[value_cols] # quite slow # get time index if time_col: @@ -903,9 +903,9 @@ def from_narwhals_dataframe( "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex", logger, ) - time_index = time_col_vals.to_list() + time_index = time_col_vals.to_list() # slow - xa = xr.DataArray( + xa = xr.DataArray( # 
really slow series_df.to_numpy()[:, :, np.newaxis], dims=(time_col if time_col else DIMS[0],) + DIMS[-2:], coords={ @@ -915,7 +915,7 @@ def from_narwhals_dataframe( attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy}, ) - return cls.from_xarray( + return cls.from_xarray( # really slow xa=xa, fill_missing_dates=fill_missing_dates, freq=freq, diff --git a/narwhals_test_time.py b/narwhals_test_time.py index f699b8851d..c55f19c231 100644 --- a/narwhals_test_time.py +++ b/narwhals_test_time.py @@ -76,8 +76,8 @@ def create_random_dataframes( def test_dataframes() -> list: test_config = product( - [10, 100, 1000], - [10, 100, 500], + [10, 100, 1000, 10000], + [10, 100, 500, 1000], [True, False], ) From b2ffc674b3453e634814564784089d3928577228 Mon Sep 17 00:00:00 2001 From: authierj Date: Mon, 10 Feb 2025 15:09:18 +0100 Subject: [PATCH 05/29] using pandas index to avoid .to_list() --- darts/timeseries.py | 13 +++++++------ narwhals_test_time.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 844f2129b7..cb8f8bc57b 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -849,6 +849,8 @@ def from_narwhals_dataframe( if time_col: if time_col not in df.columns: raise_log(AttributeError(f"time_col='{time_col}' is not present.")) + + time_index = pd.Index([]) time_col_vals = df[time_col] if time_col_vals.dtype == nw.String: @@ -869,13 +871,13 @@ def from_narwhals_dataframe( # Temporarily use an integer Index to sort the values, and replace by a # RangeIndex in `TimeSeries.from_xarray()` - time_index = time_col_vals.to_list() - # time_index = pd.Index(time_col_vals) + time_index = pd.Index(time_col_vals) elif time_col_vals.dtype == nw.String: # The integer conversion failed; try datetimes try: time_index = time_col_vals.str.to_datetime() + time_index = pd.DatetimeIndex(time_index) except Exception: raise_log( AttributeError( @@ -883,8 +885,7 @@ def from_narwhals_dataframe( 
) ) elif time_col_vals.dtype == nw.Datetime: - time_index = time_col_vals.to_list() - # time_index = pd.DatetimeIndex(time_col_vals) + time_index = pd.DatetimeIndex(time_col_vals) else: raise_log( AttributeError( @@ -903,9 +904,9 @@ def from_narwhals_dataframe( "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex", logger, ) - time_index = time_col_vals.to_list() # slow + time_index = time_col_vals - xa = xr.DataArray( # really slow + xa = xr.DataArray( series_df.to_numpy()[:, :, np.newaxis], dims=(time_col if time_col else DIMS[0],) + DIMS[-2:], coords={ diff --git a/narwhals_test_time.py b/narwhals_test_time.py index c55f19c231..799fe4ef68 100644 --- a/narwhals_test_time.py +++ b/narwhals_test_time.py @@ -97,7 +97,7 @@ def test_dataframes() -> list: pandas_global_timer = 0 narwhals_global_timer = 0 -for _ in range(num_iter): +for iter in range(num_iter + 1): pandas_timer = 0 narwhals_timer = 0 for df_config in df_list: @@ -140,17 +140,18 @@ def test_dataframes() -> list: print( f"Equal assertion failed for DataFrame with columns {col_names} and time_col {time_col}: {e}" ) - - print("pandas processing time: ", pandas_timer) - print("narwhals processing time: ", narwhals_timer, "\n") - pandas_global_timer += pandas_timer - narwhals_global_timer += narwhals_timer + # throw first iteration away, memory initialization + if iter > 0: + print(f"pandas processing time: {pandas_timer:.4f}") + print(f"narwhals processing time: {narwhals_timer:.4f} \n") + pandas_global_timer += pandas_timer + narwhals_global_timer += narwhals_timer pandas_global_timer /= num_iter narwhals_global_timer /= num_iter -print("Average pandas processing time: ", pandas_global_timer) -print("Average narwhals processing time: ", narwhals_global_timer) +print(f"Average pandas processing time: {pandas_global_timer:.4f}") +print(f"Average narwhals processing time: {narwhals_global_timer:.4f} \n") diff_in_fraction = (-pandas_global_timer + narwhals_global_timer) / 
pandas_global_timer print(f"Average processing time difference: {diff_in_fraction:.2%}") From 79312c91b68ef8546f57cab3639caa62600310a7 Mon Sep 17 00:00:00 2001 From: authierj Date: Mon, 10 Feb 2025 17:43:58 +0100 Subject: [PATCH 06/29] bug fix added --- darts/timeseries.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index cb8f8bc57b..9ee0c63a42 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -836,8 +836,7 @@ def from_narwhals_dataframe( # get values if value_cols is None: if time_col is not None: - # series_df = df.drop(time_col) - series_df = df.loc[:, df.columns != time_col] + series_df = df.drop(time_col) else: series_df = df else: @@ -904,15 +903,31 @@ def from_narwhals_dataframe( "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex", logger, ) - time_index = time_col_vals + # BUGFIX : force time-index to be timezone naive as xarray doesn't support it + # pandas.DataFrame loses the tz information if it's not its index + if ( + isinstance(time_col_vals, pd.DatetimeIndex) + and time_col_vals.tz is not None + ): + logger.warning( + "The provided DatetimeIndex was associated with a timezone, which is currently not supported " + "by xarray. To avoid unexpected behaviour, the tz information was removed. Consider calling " + f"`ts.time_index.tz_localize({time_col_vals.tz})` when exporting the results." + "To plot the series with the right time steps, consider setting the matplotlib.pyplot " + "`rcParams['timezone']` parameter to automatically convert the time axis back to the " + "original timezone." 
+ ) + time_index = time_col_vals.tz_localize(None) + else: + time_index = time_col_vals + + if not time_index.name: + time_index.name = time_col if time_col else DIMS[0] xa = xr.DataArray( series_df.to_numpy()[:, :, np.newaxis], - dims=(time_col if time_col else DIMS[0],) + DIMS[-2:], - coords={ - time_col if time_col else DIMS[0]: time_index, - DIMS[1]: series_df.columns, - }, + dims=(time_index.name,) + DIMS[-2:], + coords={time_index.name: time_index, DIMS[1]: series_df.columns}, attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy}, ) From b08a74ffa360e028417c8fc73a3e16dcb34026fd Mon Sep 17 00:00:00 2001 From: authierj Date: Tue, 11 Feb 2025 15:52:20 +0100 Subject: [PATCH 07/29] updated test script --- narwhals_test_time.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/narwhals_test_time.py b/narwhals_test_time.py index 799fe4ef68..80fd76323b 100644 --- a/narwhals_test_time.py +++ b/narwhals_test_time.py @@ -15,6 +15,7 @@ def create_random_dataframes( num_rows: int = 10, num_columns: int = 3, index: bool = True, + col_names_given: bool = True, start_date: str = "2023-01-01", freq: str = "D", ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: @@ -50,7 +51,10 @@ def create_random_dataframes( df_numpy = pd.DataFrame(data) df_integer = pd.DataFrame(data) - col_names = df_date.columns.values + if col_names_given: + col_names = df_date.columns.values + else: + col_names = None # Set the date as index or as a column based on the index parameter if index: @@ -79,13 +83,17 @@ def test_dataframes() -> list: [10, 100, 1000, 10000], [10, 100, 500, 1000], [True, False], + [True, False], ) dataframes_list = [ create_random_dataframes( - num_rows=num_rows, num_columns=num_columns, index=index + num_rows=num_rows, + num_columns=num_columns, + index=index, + col_names_given=col_names_given, ) - for num_rows, num_columns, index in test_config + for num_rows, num_columns, index, col_names_given in test_config ] return 
dataframes_list From 2425fbe51cdbf80b7c482a76f1a33e17db7b670a Mon Sep 17 00:00:00 2001 From: authierj Date: Wed, 12 Feb 2025 11:56:42 +0100 Subject: [PATCH 08/29] narwhals timeseries added --- darts/timeseries.py | 60 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 9ee0c63a42..9732076404 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -50,7 +50,7 @@ import numpy as np import pandas as pd import xarray as xr -from narwhals.typing import DataFrameT +from narwhals.typing import IntoDataFrame, IntoSeries from pandas.tseries.frequencies import to_offset from scipy.stats import kurtosis, skew @@ -751,7 +751,7 @@ def from_dataframe( @classmethod def from_narwhals_dataframe( cls, - df: DataFrameT, + df: IntoDataFrame, time_col: Optional[str] = None, value_cols: Optional[Union[list[str], str]] = None, fill_missing_dates: Optional[bool] = False, @@ -1204,6 +1204,62 @@ def from_series( static_covariates=static_covariates, ) + @classmethod + def from_narwhals_series( + cls, + pd_series: IntoSeries, + fill_missing_dates: Optional[bool] = False, + freq: Optional[Union[str, int]] = None, + fillna_value: Optional[float] = None, + static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> Self: + """ + Build a univariate deterministic series from a pandas Series. + + The series must contain an index that is either a pandas DatetimeIndex, a pandas RangeIndex, or a pandas Index + that can be converted into a RangeIndex. It is better if the index has no holes; alternatively setting + `fill_missing_dates` can in some cases solve these issues (filling holes with NaN, or with the provided + `fillna_value` numeric value, if any). + + Parameters + ---------- + pd_series + The pandas Series instance. + fill_missing_dates + Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index) + with NaN values. 
This requires either a provided `freq` or the possibility to infer the frequency from the + provided timestamps. See :meth:`_fill_missing_dates() ` for more info. + freq + Optionally, a string or integer representing the frequency of the underlying index. This is useful in order + to fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`. + If a string, represents the frequency of the pandas DatetimeIndex (see `offset aliases + `_ for more info on + supported frequencies). + If an integer, represents the step size of the pandas Index or pandas RangeIndex. + fillna_value + Optionally, a numeric value to fill missing values (NaNs) with. + static_covariates + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a + single-row pandas DataFrame. If a Series, the index represents the static variables. If a DataFrame, the + columns represent the static variables and the single row represents the univariate TimeSeries component. + + Returns + ------- + TimeSeries + A univariate and deterministic TimeSeries constructed from the inputs. 
+ """ + nw_series = nw.from_native(pd_series, allow_series=True) + df = nw_series.to_frame() + return cls.from_dataframe( + df, + time_col=None, + value_cols=None, + fill_missing_dates=fill_missing_dates, + freq=freq, + fillna_value=fillna_value, + static_covariates=static_covariates, + ) + @classmethod def from_times_and_values( cls, From 36300f2849c8460e763a606f7cd4024b57fe6fd5 Mon Sep 17 00:00:00 2001 From: authierj Date: Fri, 14 Feb 2025 13:14:27 +0100 Subject: [PATCH 09/29] from_series changed, names changed --- darts/timeseries.py | 239 +------------------------------------------- 1 file changed, 2 insertions(+), 237 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 9732076404..877d849a9d 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -570,186 +570,6 @@ def from_csv( @classmethod def from_dataframe( - cls, - df: pd.DataFrame, - time_col: Optional[str] = None, - value_cols: Optional[Union[list[str], str]] = None, - fill_missing_dates: Optional[bool] = False, - freq: Optional[Union[str, int]] = None, - fillna_value: Optional[float] = None, - static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, - hierarchy: Optional[dict] = None, - ) -> Self: - """ - Build a deterministic TimeSeries instance built from a selection of columns of a DataFrame. - One column (or the DataFrame index) has to represent the time, - and a list of columns `value_cols` has to represent the values for this time series. - - Parameters - ---------- - df - The DataFrame - time_col - The time column name. If set, the column will be cast to a pandas DatetimeIndex (if it contains - timestamps) or a RangeIndex (if it contains integers). - If not set, the DataFrame index will be used. In this case the DataFrame must contain an index that is - either a pandas DatetimeIndex, a pandas RangeIndex, or a pandas Index that can be converted to a - RangeIndex. 
It is better if the index has no holes; alternatively setting `fill_missing_dates` can in some - cases solve these issues (filling holes with NaN, or with the provided `fillna_value` numeric value, if - any). - value_cols - A string or list of strings representing the value column(s) to be extracted from the DataFrame. If set to - `None`, the whole DataFrame will be used. - fill_missing_dates - Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index) - with NaN values. This requires either a provided `freq` or the possibility to infer the frequency from the - provided timestamps. See :meth:`_fill_missing_dates() ` for more info. - freq - Optionally, a string or integer representing the frequency of the underlying index. This is useful in order - to fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`. - If a string, represents the frequency of the pandas DatetimeIndex (see `offset aliases - `_ for more info on - supported frequencies). - If an integer, represents the step size of the pandas Index or pandas RangeIndex. - fillna_value - Optionally, a numeric value to fill missing values (NaNs) with. - static_covariates - Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas - DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied' - to all components of the TimeSeries. If a DataFrame, the columns represent the static variables and the - rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates - are globally 'applied' to all components of the TimeSeries. If a multi-row DataFrame, the number of - rows must match the number of components of the TimeSeries (in this case, the number of columns in - ``value_cols``). This adds control for component-specific static covariates. 
- hierarchy - Optionally, a dictionary describing the grouping(s) of the time series. The keys are component names, and - for a given component name `c`, the value is a list of component names that `c` "belongs" to. For instance, - if there is a `total` component, split both in two divisions `d1` and `d2` and in two regions `r1` and `r2`, - and four products `d1r1` (in division `d1` and region `r1`), `d2r1`, `d1r2` and `d2r2`, the hierarchy would - be encoded as follows. - - .. highlight:: python - .. code-block:: python - - hierarchy={ - "d1r1": ["d1", "r1"], - "d1r2": ["d1", "r2"], - "d2r1": ["d2", "r1"], - "d2r2": ["d2", "r2"], - "d1": ["total"], - "d2": ["total"], - "r1": ["total"], - "r2": ["total"] - } - .. - The hierarchy can be used to reconcile forecasts (so that the sums of the forecasts at - different levels are consistent), see `hierarchical reconciliation - `_. - - Returns - ------- - TimeSeries - A univariate or multivariate deterministic TimeSeries constructed from the inputs. 
- """ - - # get values - if value_cols is None: - series_df = df.loc[:, df.columns != time_col] - else: - if isinstance(value_cols, str): - value_cols = [value_cols] - series_df = df[value_cols] # slow - - # get time index - if time_col: - if time_col not in df.columns: - raise_log(AttributeError(f"time_col='{time_col}' is not present.")) - - time_index = pd.Index([]) - time_col_vals = df[time_col] - - if np.issubdtype(time_col_vals.dtype, object): - # Try to convert to integers if needed - try: - time_col_vals = time_col_vals.astype(int) - except ValueError: - pass - - if np.issubdtype(time_col_vals.dtype, np.integer): - # We have to check all integers appear only once to have a valid index - raise_if( - time_col_vals.duplicated().any(), - "The provided integer time index column contains duplicate values.", - ) - - # Temporarily use an integer Index to sort the values, and replace by a - # RangeIndex in `TimeSeries.from_xarray()` - time_index = pd.Index(time_col_vals) - - elif np.issubdtype(time_col_vals.dtype, object): - # The integer conversion failed; try datetimes - try: - time_index = pd.DatetimeIndex(time_col_vals) - except ValueError: - raise_log( - AttributeError( - "'time_col' is of 'object' dtype but doesn't contain valid timestamps" - ) - ) - elif np.issubdtype(time_col_vals.dtype, np.datetime64): - time_index = pd.DatetimeIndex(time_col_vals) - else: - raise_log( - AttributeError( - "Invalid type of `time_col`: it needs to be of either 'str', 'datetime' or 'int' dtype." 
- ) - ) - time_index.name = time_col - else: - raise_if_not( - isinstance(df.index, VALID_INDEX_TYPES) - or np.issubdtype(df.index.dtype, np.integer), - "If time_col is not specified, the DataFrame must be indexed either with " - "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex", - logger, - ) - # BUGFIX : force time-index to be timezone naive as xarray doesn't support it - # pandas.DataFrame loses the tz information if it's not its index - if isinstance(df.index, pd.DatetimeIndex) and df.index.tz is not None: - logger.warning( - "The provided DatetimeIndex was associated with a timezone, which is currently not supported " - "by xarray. To avoid unexpected behaviour, the tz information was removed. Consider calling " - f"`ts.time_index.tz_localize({df.index.tz})` when exporting the results." - "To plot the series with the right time steps, consider setting the matplotlib.pyplot " - "`rcParams['timezone']` parameter to automatically convert the time axis back to the " - "original timezone." 
- ) - time_index = df.index.tz_localize(None) - else: - time_index = df.index - - if not time_index.name: - time_index.name = time_col if time_col else DIMS[0] - - if series_df.columns.name: - series_df.columns.name = None - - xa = xr.DataArray( # fast - series_df.values[:, :, np.newaxis], - dims=(time_index.name,) + DIMS[-2:], - coords={time_index.name: time_index, DIMS[1]: series_df.columns}, - attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy}, - ) - - return cls.from_xarray( # slow - xa=xa, - fill_missing_dates=fill_missing_dates, - freq=freq, - fillna_value=fillna_value, - ) - - @classmethod - def from_narwhals_dataframe( cls, df: IntoDataFrame, time_col: Optional[str] = None, @@ -1151,61 +971,6 @@ def from_group(static_cov_vals, group): @classmethod def from_series( - cls, - pd_series: pd.Series, - fill_missing_dates: Optional[bool] = False, - freq: Optional[Union[str, int]] = None, - fillna_value: Optional[float] = None, - static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, - ) -> Self: - """ - Build a univariate deterministic series from a pandas Series. - - The series must contain an index that is either a pandas DatetimeIndex, a pandas RangeIndex, or a pandas Index - that can be converted into a RangeIndex. It is better if the index has no holes; alternatively setting - `fill_missing_dates` can in some cases solve these issues (filling holes with NaN, or with the provided - `fillna_value` numeric value, if any). - - Parameters - ---------- - pd_series - The pandas Series instance. - fill_missing_dates - Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index) - with NaN values. This requires either a provided `freq` or the possibility to infer the frequency from the - provided timestamps. See :meth:`_fill_missing_dates() ` for more info. - freq - Optionally, a string or integer representing the frequency of the underlying index. 
This is useful in order - to fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`. - If a string, represents the frequency of the pandas DatetimeIndex (see `offset aliases - `_ for more info on - supported frequencies). - If an integer, represents the step size of the pandas Index or pandas RangeIndex. - fillna_value - Optionally, a numeric value to fill missing values (NaNs) with. - static_covariates - Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a - single-row pandas DataFrame. If a Series, the index represents the static variables. If a DataFrame, the - columns represent the static variables and the single row represents the univariate TimeSeries component. - - Returns - ------- - TimeSeries - A univariate and deterministic TimeSeries constructed from the inputs. - """ - df = pd.DataFrame(pd_series) - return cls.from_dataframe( - df, - time_col=None, - value_cols=None, - fill_missing_dates=fill_missing_dates, - freq=freq, - fillna_value=fillna_value, - static_covariates=static_covariates, - ) - - @classmethod - def from_narwhals_series( cls, pd_series: IntoSeries, fill_missing_dates: Optional[bool] = False, @@ -1224,7 +989,7 @@ def from_narwhals_series( Parameters ---------- pd_series - The pandas Series instance. + A Series instance. fill_missing_dates Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index) with NaN values. This requires either a provided `freq` or the possibility to infer the frequency from the @@ -1248,7 +1013,7 @@ def from_narwhals_series( TimeSeries A univariate and deterministic TimeSeries constructed from the inputs. 
""" - nw_series = nw.from_native(pd_series, allow_series=True) + nw_series = nw.from_native(pd_series, series_only=True) df = nw_series.to_frame() return cls.from_dataframe( df, From ba01df1245090b60762f6afa1435a83d27f9ac75 Mon Sep 17 00:00:00 2001 From: authierj Date: Fri, 14 Feb 2025 13:22:27 +0100 Subject: [PATCH 10/29] changelog updated --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7df8c0969..b69b2c2a33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co **Improved** +- Implemented the `from_dataframe()` and `from_series()` methods with [Narwhals](https://github.com/narwhals-dev/narwhals), a compatibility layer between dataframe libraries. From now on, Darts can transform pandas, polars, arrow and many other dataframes into `TimeSeries`. [#2661](https://github.com/unit8co/darts/pull/2661) by [Jules Authier](https://github.com/authierj) - New model: `StatsForecastAutoTBATS`. This model offers the [AutoTBATS](https://nixtlaverse.nixtla.io/statsforecast/src/core/models.html#autotbats) model from Nixtla's `statsforecasts` library. [#2611](https://github.com/unit8co/darts/pull/2611) by [He Weilin](https://github.com/cnhwl). - Added the `title` attribute to `TimeSeries.plot()`. This allows to set a title for the plot. [#2639](https://github.com/unit8co/darts/pull/2639) by [Jonathan Koch](https://github.com/jonathankoch99). - Added parameter `component_wise` to `show_anomalies()` to separately plot each component in multivariate series. [#2544](https://github.com/unit8co/darts/pull/2544) by [He Weilin](https://github.com/cnhwl). 
From 2e39269a1713d75b5ac36643bd12b24700f85baa Mon Sep 17 00:00:00 2001 From: authierj Date: Mon, 17 Feb 2025 15:40:19 +0100 Subject: [PATCH 11/29] small improvement --- darts/timeseries.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 877d849a9d..b771b22a5b 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -695,8 +695,8 @@ def from_dataframe( elif time_col_vals.dtype == nw.String: # The integer conversion failed; try datetimes try: - time_index = time_col_vals.str.to_datetime() - time_index = pd.DatetimeIndex(time_index) + # time_index = time_col_vals.str.to_datetime() + time_index = pd.DatetimeIndex(time_col_vals) except Exception: raise_log( AttributeError( From 1a9a266247187b3c09c1d21c9bb22d993f6d5a0d Mon Sep 17 00:00:00 2001 From: authierj Date: Mon, 17 Feb 2025 17:01:18 +0100 Subject: [PATCH 12/29] clean test scripts added --- from_df_timing.py | 174 +++++++++++++++++++++++++++++++++++++++++ from_df_timing_col.py | 176 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 350 insertions(+) create mode 100644 from_df_timing.py create mode 100644 from_df_timing_col.py diff --git a/from_df_timing.py b/from_df_timing.py new file mode 100644 index 0000000000..823e018105 --- /dev/null +++ b/from_df_timing.py @@ -0,0 +1,174 @@ +import argparse +import json +import time +import warnings +from itertools import product + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from darts.timeseries import TimeSeries + +# Suppress all warnings +warnings.filterwarnings("ignore") + + +def test_from_dataframe(f_name: str): + return getattr(TimeSeries, f_name) + + +def create_random_dataframes( + num_rows: int = 10, + num_columns: int = 3, + index: bool = True, + col_names_given: bool = True, + start_date: str = "1900-01-01", + freq: str = "D", +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Create three pandas DataFrames with random data and dates as the index 
or as a column. + + Parameters: + - num_rows (int): The number of rows in the DataFrames. + - num_columns (int): The number of columns in the DataFrames. + - index (bool): If True, the date is the index of the DataFrame. If False, the date is a column named 'date'. + - start_date (str): The start date for the date range (used only if date_format is 'date'). + - freq (str): The frequency of the date range (used only if date_format is 'date'). + + Returns: + - tuple: A tuple containing three DataFrames (df_date, df_numpy, df_integer). + """ + # Set a random seed for reproducibility + np.random.seed(42) + + # Generate a date range or integer list based on the date_format parameter + date_values = pd.date_range(start=start_date, periods=num_rows, freq=freq) + integer_values = list(range(1, num_rows + 1)) + numpy_values = np.array( + pd.date_range(start=start_date, periods=num_rows, freq=freq), + dtype="datetime64[D]", + ) + + # Create random data for the DataFrames + data = {f"col_{i}": np.random.randn(num_rows) for i in range(num_columns)} + + # Create the DataFrames + df_date = pd.DataFrame(data) + df_numpy = pd.DataFrame(data) + df_integer = pd.DataFrame(data) + + if col_names_given: + col_names = df_date.columns.values + else: + col_names = None + + # Set the date as index or as a column based on the index parameter + if index: + df_date.index = date_values + df_numpy.index = numpy_values + df_integer.index = integer_values + else: + df_date["date"] = date_values + df_numpy["date"] = numpy_values + df_integer["date"] = integer_values + + if index: + time_col = None + else: + time_col = "date" + + return [ + [df_date, col_names, time_col], + [df_numpy, col_names, time_col], + [df_integer, col_names, time_col], + ] + + +def test_dataframes() -> list: + test_config = product( + [10, 100, 1000, 10000, 100000], + [100], + [True, False], + [True, False], + ) + + dataframes_list = [ + create_random_dataframes( + num_rows=num_rows, + num_columns=num_columns, + index=index, 
+ col_names_given=col_names_given, + ) + for num_rows, num_columns, index, col_names_given in test_config + ] + + return dataframes_list + + +def calculate_processing_time( + f_name: str, + num_iter: int, + save_path="/Users/julesauthier/Documents/darts/from_df_times/data/", +): + df_list = test_dataframes() + df_func = test_from_dataframe(f_name) + + # Initialize dictionaries to store processing times + times = {} + + # Initialize the progress bar + total_iterations = ( + len(df_list) * 2 * 3 + ) # 2 iterations per dataframe configuration, 3 df per config + progress_bar = tqdm(total=total_iterations, desc="Processing DataFrames") + + for df_config in df_list: + for df, col_names, time_col in df_config: + num_rows = len(df) + dict_entry = str(num_rows) + + for i in range(2): + # on the second run we shuffle the data + if i == 1: + df = df.sample(frac=1) + dict_entry += "_shuffled" + + begin = time.time() + for _ in range(num_iter): + _ = df_func(df, value_cols=col_names, time_col=time_col, freq=None) + end = time.time() + timer = (end - begin) / num_iter + + if dict_entry not in times: + times[dict_entry] = timer + else: + times[dict_entry] += timer + + # Update the progress bar + progress_bar.update(1) + + file_name = f_name + "_avg_time_" + str(num_iter) + "_iter.json" + + # Store the average times in separate JSON files + with open(save_path + file_name, "w") as f: + json.dump(times, f, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="The function to test and the number of iter can " + ) + parser.add_argument( + "--f_name", type=str, default="from_dataframe", help="method to time" + ) + parser.add_argument( + "--n_iter", type=int, default=100, help="number of function call" + ) + + args = parser.parse_args() + + f_name = args.f_name + n_iter = args.n_iter + + calculate_processing_time(f_name, n_iter) diff --git a/from_df_timing_col.py b/from_df_timing_col.py new file mode 100644 index 0000000000..4487654ea4 --- 
/dev/null +++ b/from_df_timing_col.py @@ -0,0 +1,176 @@ +import argparse +import json +import time +import warnings +from itertools import product + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from darts.timeseries import TimeSeries + +# Suppress all warnings +warnings.filterwarnings("ignore") + + +def test_from_dataframe(f_name: str): + return getattr(TimeSeries, f_name) + + +def create_random_dataframes( + num_rows: int = 10, + num_columns: int = 3, + index: bool = True, + col_names_given: bool = True, + start_date: str = "1900-01-01", + freq: str = "D", +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Create three pandas DataFrames with random data and dates as the index or as a column. + + Parameters: + - num_rows (int): The number of rows in the DataFrames. + - num_columns (int): The number of columns in the DataFrames. + - index (bool): If True, the date is the index of the DataFrame. If False, the date is a column named 'date'. + - start_date (str): The start date for the date range (used only if date_format is 'date'). + - freq (str): The frequency of the date range (used only if date_format is 'date'). + + Returns: + - tuple: A tuple containing three DataFrames (df_date, df_numpy, df_integer). 
+ """ + # Set a random seed for reproducibility + np.random.seed(42) + + # Generate a date range or integer list based on the date_format parameter + date_values = pd.date_range(start=start_date, periods=num_rows, freq=freq) + integer_values = list(range(1, num_rows + 1)) + numpy_values = np.array( + pd.date_range(start=start_date, periods=num_rows, freq=freq), + dtype="datetime64[D]", + ) + + # Create random data for the DataFrames + data = {f"col_{i}": np.random.randn(num_rows) for i in range(num_columns)} + + # Create the DataFrames + df_date = pd.DataFrame(data) + df_numpy = pd.DataFrame(data) + df_integer = pd.DataFrame(data) + + if col_names_given: + col_names = df_date.columns.values + else: + col_names = None + + # Set the date as index or as a column based on the index parameter + if index: + df_date.index = date_values + df_numpy.index = numpy_values + df_integer.index = integer_values + else: + df_date["date"] = date_values + df_numpy["date"] = numpy_values + df_integer["date"] = integer_values + + if index: + time_col = None + else: + time_col = "date" + + return [ + [df_date, col_names, time_col], + [df_numpy, col_names, time_col], + [df_integer, col_names, time_col], + ] + + +def test_dataframes() -> list: + test_config = product( + [1000], + [1, 10, 100, 1000], + [True, False], + [True, False], + ) + + dataframes_list = [ + create_random_dataframes( + num_rows=num_rows, + num_columns=num_columns, + index=index, + col_names_given=col_names_given, + ) + for num_rows, num_columns, index, col_names_given in test_config + ] + + return dataframes_list + + +def calculate_processing_time( + f_name: str, + num_iter: int, + save_path="/Users/julesauthier/Documents/darts/from_df_times/data/", +): + df_list = test_dataframes() + df_func = test_from_dataframe(f_name) + + # Initialize dictionaries to store processing times + times = {} + + # Initialize the progress bar + total_iterations = ( + len(df_list) * 2 * 3 + ) # 2 iterations per dataframe configuration, 3 
df per config + progress_bar = tqdm(total=total_iterations, desc="Processing DataFrames") + + for df_config in df_list: + for df, col_names, time_col in df_config: + num_cols = df.shape[1] + if num_cols > 1 and (num_cols % 2 == 1 or num_cols == 2): + num_cols -= 1 + dict_entry = str(num_cols) + + for i in range(2): + # on the second run we shuffle the data + if i == 1: + df = df.sample(frac=1) + dict_entry += "_shuffled" + + begin = time.time() + for _ in range(num_iter): + _ = df_func(df, value_cols=col_names, time_col=time_col, freq=None) + end = time.time() + timer = (end - begin) / num_iter + + if dict_entry not in times: + times[dict_entry] = timer + else: + times[dict_entry] += timer + + # Update the progress bar + progress_bar.update(1) + + file_name = f_name + "_avg_time_cols_" + str(num_iter) + "_iter.json" + + # Store the average times in separate JSON files + with open(save_path + file_name, "w") as f: + json.dump(times, f, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="The function to test and the number of iter can " + ) + parser.add_argument( + "--f_name", type=str, default="from_dataframe", help="method to time" + ) + parser.add_argument( + "--n_iter", type=int, default=100, help="number of function call" + ) + + args = parser.parse_args() + + f_name = args.f_name + n_iter = args.n_iter + + calculate_processing_time(f_name, n_iter) From 2c24a395ed69ea0e52803a3a4e5a5395c406dd6f Mon Sep 17 00:00:00 2001 From: authierj Date: Wed, 19 Feb 2025 10:47:38 +0100 Subject: [PATCH 13/29] BUGFIX added for non_pandas df --- darts/timeseries.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index b771b22a5b..3326c41ea1 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -655,10 +655,7 @@ def from_dataframe( # get values if value_cols is None: - if time_col is not None: - series_df = df.drop(time_col) - else: - series_df = df + 
series_df = df.drop(time_col) if time_col else df else: if isinstance(value_cols, str): value_cols = [value_cols] @@ -669,7 +666,6 @@ def from_dataframe( if time_col not in df.columns: raise_log(AttributeError(f"time_col='{time_col}' is not present.")) - time_index = pd.Index([]) time_col_vals = df[time_col] if time_col_vals.dtype == nw.String: @@ -680,14 +676,12 @@ def from_dataframe( pass if time_col_vals.dtype.is_integer(): - # We have to check all integers appear only once to have a valid index if time_col_vals.is_duplicated().any(): raise_log( ValueError( "The provided integer time index column contains duplicate values." ) ) - # Temporarily use an integer Index to sort the values, and replace by a # RangeIndex in `TimeSeries.from_xarray()` time_index = pd.Index(time_col_vals) @@ -695,7 +689,6 @@ def from_dataframe( elif time_col_vals.dtype == nw.String: # The integer conversion failed; try datetimes try: - # time_index = time_col_vals.str.to_datetime() time_index = pd.DatetimeIndex(time_col_vals) except Exception: raise_log( @@ -704,6 +697,18 @@ def from_dataframe( ) ) elif time_col_vals.dtype == nw.Datetime: + # BUGFIX : force time-index to be timezone naive as xarray doesn't support it + # pandas.DataFrame loses the tz information if it's not its index + if time_col_vals.dtype.time_zone is not None: + logger.warning( + "The provided Datetime data was associated with a timezone, which is currently not supported " + "by xarray. To avoid unexpected behaviour, the tz information was removed. Consider calling " + f"`ts.time_index.tz_localize({time_col_vals.dtype.time_zone})` when exporting the results." + "To plot the series with the right time steps, consider setting the matplotlib.pyplot " + "`rcParams['timezone']` parameter to automatically convert the time axis back to the " + "original timezone." 
+ ) + time_col_vals = time_col_vals.dt.replace_time_zone(None) time_index = pd.DatetimeIndex(time_col_vals) else: raise_log( From 89f23fb68f3dd959914a4d00e18961344dd5aa79 Mon Sep 17 00:00:00 2001 From: authierj Date: Wed, 19 Feb 2025 10:48:01 +0100 Subject: [PATCH 14/29] tests added for polars df --- darts/tests/test_timeseries.py | 123 ++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 32 deletions(-) diff --git a/darts/tests/test_timeseries.py b/darts/tests/test_timeseries.py index 41b04aebd4..d8f1505751 100644 --- a/darts/tests/test_timeseries.py +++ b/darts/tests/test_timeseries.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import polars as pl import pytest import xarray as xr from scipy.stats import kurtosis, skew @@ -2506,7 +2507,16 @@ def test_tail_numeric_time_index(self): class TestTimeSeriesFromDataFrame: - def test_from_dataframe_sunny_day(self): + def pd_to_backend(self, df, backend, index=False): + if backend == "pandas": + return df + elif backend == "polars": + if index: + return pl.from_pandas(df.reset_index()) + return pl.from_pandas(df) + + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_from_dataframe_sunny_day(self, backend): data_dict = {"Time": pd.date_range(start="20180501", end="20200301", freq="MS")} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) @@ -2520,40 +2530,55 @@ def test_from_dataframe_sunny_day(self): data_pd2["Time"] = data_pd2["Time"].apply(lambda date: str(date)) data_pd3 = data_pd1.set_index("Time") - data_darts1 = TimeSeries.from_dataframe(df=data_pd1, time_col="Time") - data_darts2 = TimeSeries.from_dataframe(df=data_pd2, time_col="Time") - data_darts3 = TimeSeries.from_dataframe(df=data_pd3) + data_darts1 = TimeSeries.from_dataframe( + df=self.pd_to_backend(data_pd1, backend), time_col="Time" + ) + data_darts2 = TimeSeries.from_dataframe( + df=self.pd_to_backend(data_pd2, backend), time_col="Time" + ) + data_darts3 = 
TimeSeries.from_dataframe( + df=self.pd_to_backend(data_pd3, backend, index=True), + time_col=None if backend == "pandas" else "Time", + ) assert data_darts1 == data_darts2 assert data_darts1 == data_darts3 - def test_time_col_convert_string_integers(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_time_col_convert_string_integers(self, backend): expected = np.array(list(range(3, 10))) data_dict = {"Time": expected.astype(str)} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) ) df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) assert set(ts.time_index.values.tolist()) == set(expected) assert ts.time_index.dtype == int assert ts.time_index.name == "Time" - def test_time_col_convert_integers(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_time_col_convert_integers(self, backend): expected = np.array(list(range(10))) data_dict = {"Time": expected} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) ) + df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) assert set(ts.time_index.values.tolist()) == set(expected) assert ts.time_index.dtype == int assert ts.time_index.name == "Time" - def test_fail_with_bad_integer_time_col(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_fail_with_bad_integer_time_col(self, backend): bad_time_col_vals = np.array([4, 0, 1, 2]) data_dict = {"Time": bad_time_col_vals} data_dict["Values1"] = np.random.uniform( @@ -2561,9 +2586,12 @@ def test_fail_with_bad_integer_time_col(self): ) df = pd.DataFrame(data_dict) with pytest.raises(ValueError): - TimeSeries.from_dataframe(df=df, time_col="Time") + TimeSeries.from_dataframe( + 
df=self.pd_to_backend(df, backend), time_col="Time" + ) - def test_time_col_convert_rangeindex(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_time_col_convert_rangeindex(self, backend): for expected_l, step in zip([[4, 0, 2, 3, 1], [8, 0, 4, 6, 2]], [1, 2]): expected = np.array(expected_l) data_dict = {"Time": expected} @@ -2571,7 +2599,9 @@ def test_time_col_convert_rangeindex(self): low=-10, high=10, size=len(data_dict["Time"]) ) df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) # check type (should convert to RangeIndex): assert type(ts.time_index) is pd.RangeIndex @@ -2586,31 +2616,38 @@ def test_time_col_convert_rangeindex(self): ] assert np.all(ar1 == ar2) - def test_time_col_convert_datetime(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_time_col_convert_datetime(self, backend): expected = pd.date_range(start="20180501", end="20200301", freq="MS") data_dict = {"Time": expected} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) ) df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) assert ts.time_index.dtype == "datetime64[ns]" assert ts.time_index.name == "Time" - def test_time_col_convert_datetime_strings(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_time_col_convert_datetime_strings(self, backend): expected = pd.date_range(start="20180501", end="20200301", freq="MS") data_dict = {"Time": expected.values.astype(str)} data_dict["Values1"] = np.random.uniform( low=-10, high=10, size=len(data_dict["Time"]) ) df = pd.DataFrame(data_dict) - ts = TimeSeries.from_dataframe(df=df, time_col="Time") + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) 
assert ts.time_index.dtype == "datetime64[ns]" assert ts.time_index.name == "Time" - def test_time_col_with_tz(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_time_col_with_tz_df(self, backend): # numpy and xarray don't support "timezone aware" pd.DatetimeIndex # the BUGFIX removes timezone information without conversion @@ -2621,13 +2658,10 @@ def test_time_col_with_tz(self): # pd.DataFrame loses the tz information unless it is contained in its index # (other columns are silently converted to UTC, with tz attribute set to None) df = pd.DataFrame(data=values, index=time_range_MS) - ts = TimeSeries.from_dataframe(df=df) - assert list(ts.time_index) == list(time_range_MS.tz_localize(None)) - assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS) - assert ts.time_index.tz is None - - serie = pd.Series(data=values, index=time_range_MS) - ts = TimeSeries.from_series(pd_series=serie) + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend, index=True), + time_col=None if backend == "pandas" else "index", + ) assert list(ts.time_index) == list(time_range_MS.tz_localize(None)) assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS) assert ts.time_index.tz is None @@ -2643,23 +2677,42 @@ def test_time_col_with_tz(self): values = np.random.uniform(low=-10, high=10, size=len(time_range_H)) df = pd.DataFrame(data=values, index=time_range_H) - ts = TimeSeries.from_dataframe(df=df) + ts = TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend, index=True), + time_col=None if backend == "pandas" else "index", + ) assert list(ts.time_index) == list(time_range_H.tz_localize(None)) assert list(ts.time_index.tz_localize("CET")) == list(time_range_H) assert ts.time_index.tz is None - series = pd.Series(data=values, index=time_range_H) - ts = TimeSeries.from_series(pd_series=series) + ts = TimeSeries.from_times_and_values(times=time_range_H, values=values) assert list(ts.time_index) == 
list(time_range_H.tz_localize(None)) assert list(ts.time_index.tz_localize("CET")) == list(time_range_H) assert ts.time_index.tz is None - ts = TimeSeries.from_times_and_values(times=time_range_H, values=values) + def test_time_col_with_tz_series(self): + time_range_MS = pd.date_range( + start="20180501", end="20200301", freq="MS", tz="CET" + ) + values = np.random.uniform(low=-10, high=10, size=len(time_range_MS)) + serie = pd.Series(data=values, index=time_range_MS) + ts = TimeSeries.from_series(pd_series=serie) + assert list(ts.time_index) == list(time_range_MS.tz_localize(None)) + assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS) + assert ts.time_index.tz is None + + time_range_H = pd.date_range( + start="20200518", end="20200521", freq=freqs["h"], tz="CET" + ) + values = np.random.uniform(low=-10, high=10, size=len(time_range_H)) + series = pd.Series(data=values, index=time_range_H) + ts = TimeSeries.from_series(pd_series=series) assert list(ts.time_index) == list(time_range_H.tz_localize(None)) assert list(ts.time_index.tz_localize("CET")) == list(time_range_H) assert ts.time_index.tz is None - def test_time_col_convert_garbage(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_time_col_convert_garbage(self, backend): expected = [ "2312312asdfdw", "asdfsdf432sdf", @@ -2674,9 +2727,12 @@ def test_time_col_convert_garbage(self): df = pd.DataFrame(data_dict) with pytest.raises(AttributeError): - TimeSeries.from_dataframe(df=df, time_col="Time") + TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend), time_col="Time" + ) - def test_df_named_columns_index(self): + @pytest.mark.parametrize("backend", ["pandas", "polars"]) + def test_df_named_columns_index(self, backend): time_index = generate_index( start=pd.Timestamp("2000-01-01"), length=4, freq="D", name="index" ) @@ -2686,7 +2742,10 @@ def test_df_named_columns_index(self): columns=["y"], ) df.columns.name = "id" - ts = TimeSeries.from_dataframe(df) + ts = 
TimeSeries.from_dataframe( + df=self.pd_to_backend(df, backend, index=True), + time_col=None if backend == "pandas" else "index", + ) exp_ts = TimeSeries.from_times_and_values( times=time_index, From de0a32d7d07e67989f03d57b0bb5e5a54ca7013b Mon Sep 17 00:00:00 2001 From: authierj Date: Wed, 19 Feb 2025 11:25:04 +0100 Subject: [PATCH 15/29] polars and narwhals added to dependencies. Ideally, polars should be an optional dependency. --- requirements/core.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/core.txt b/requirements/core.txt index 0245c46194..3081922e93 100644 --- a/requirements/core.txt +++ b/requirements/core.txt @@ -1,9 +1,11 @@ holidays>=0.11.1 joblib>=0.16.0 matplotlib>=3.3.0 +narwhals>=1.25.1 nfoursid>=1.0.0 numpy>=1.19.0,<2.0.0 pandas>=1.0.5 +polars>=1.0.0 pmdarima>=1.8.0 pyod>=0.9.5 requests>=2.22.0 From 16bac00cf7f25623808c5492818529255723eff6 Mon Sep 17 00:00:00 2001 From: authierj Date: Thu, 20 Feb 2025 15:25:59 +0100 Subject: [PATCH 16/29] refactoring pd_series and pd_dataframe --- darts/timeseries.py | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 3326c41ea1..5d2c9a0946 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -1581,18 +1581,15 @@ def pd_series(self, copy=True) -> pd.Series: """ self._assert_univariate() self._assert_deterministic() + + data = self._xa[:, 0, 0].values + index = self._time_index + name = self.components[0] + if copy: - return pd.Series( - self._xa[:, 0, 0].values.copy(), - index=self._time_index.copy(), - name=self.components[0], - ) + return pd.Series(data=data.copy(), index=index.copy(), name=name) else: - return pd.Series( - self._xa[:, 0, 0].values, - index=self._time_index, - name=self.components[0], - ) + return pd.Series(data=data, index=index, name=name) def pd_dataframe(self, copy=True, suppress_warnings=False) -> pd.DataFrame: """ @@ -1613,6 +1610,7 @@ def 
pd_dataframe(self, copy=True, suppress_warnings=False) -> pd.DataFrame: pandas.DataFrame The Pandas DataFrame representation of this time series """ + if not self.is_deterministic: if not suppress_warnings: logger.warning( @@ -1628,32 +1626,20 @@ def pd_dataframe(self, copy=True, suppress_warnings=False) -> pd.DataFrame: "_s".join((comp_name, str(sample_id))) for comp_name, sample_id in itertools.product(comp_name, samples) ] + data = self._xa.stack(data=(DIMS[1], DIMS[2])) + index = self._time_index if copy: return pd.DataFrame( - self._xa.stack(data=(DIMS[1], DIMS[2])).values.copy(), - index=self._time_index.copy(), - columns=df_col_names.copy(), + data=data.copy(), index=index.copy(), columns=df_col_names.copy() ) else: - return pd.DataFrame( - self._xa.stack(data=(DIMS[1], DIMS[2])).values, - index=self._time_index, - columns=df_col_names, - ) + return pd.DataFrame(data=data, index=index, columns=df_col_names) else: if copy: - return pd.DataFrame( - self._xa[:, :, 0].values.copy(), - index=self._time_index.copy(), - columns=self._xa.get_index(DIMS[1]).copy(), - ) + return self._xa[:, :, 0].copy().to_pandas() else: - return pd.DataFrame( - self._xa[:, :, 0].values, - index=self._time_index, - columns=self._xa.get_index(DIMS[1]), - ) + return self._xa[:, :, 0].to_pandas() def quantile_df(self, quantile=0.5) -> pd.DataFrame: """ From 09509109996d48991edf276e1a1b87a0614c0294 Mon Sep 17 00:00:00 2001 From: authierj Date: Fri, 21 Feb 2025 10:07:21 +0100 Subject: [PATCH 17/29] removed test scripts from git repo --- from_df_timing.py | 174 ----------------------------------------- from_df_timing_col.py | 176 ------------------------------------------ 2 files changed, 350 deletions(-) delete mode 100644 from_df_timing.py delete mode 100644 from_df_timing_col.py diff --git a/from_df_timing.py b/from_df_timing.py deleted file mode 100644 index 823e018105..0000000000 --- a/from_df_timing.py +++ /dev/null @@ -1,174 +0,0 @@ -import argparse -import json -import time -import 
warnings -from itertools import product - -import numpy as np -import pandas as pd -from tqdm import tqdm - -from darts.timeseries import TimeSeries - -# Suppress all warnings -warnings.filterwarnings("ignore") - - -def test_from_dataframe(f_name: str): - return getattr(TimeSeries, f_name) - - -def create_random_dataframes( - num_rows: int = 10, - num_columns: int = 3, - index: bool = True, - col_names_given: bool = True, - start_date: str = "1900-01-01", - freq: str = "D", -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Create three pandas DataFrames with random data and dates as the index or as a column. - - Parameters: - - num_rows (int): The number of rows in the DataFrames. - - num_columns (int): The number of columns in the DataFrames. - - index (bool): If True, the date is the index of the DataFrame. If False, the date is a column named 'date'. - - start_date (str): The start date for the date range (used only if date_format is 'date'). - - freq (str): The frequency of the date range (used only if date_format is 'date'). - - Returns: - - tuple: A tuple containing three DataFrames (df_date, df_numpy, df_integer). 
- """ - # Set a random seed for reproducibility - np.random.seed(42) - - # Generate a date range or integer list based on the date_format parameter - date_values = pd.date_range(start=start_date, periods=num_rows, freq=freq) - integer_values = list(range(1, num_rows + 1)) - numpy_values = np.array( - pd.date_range(start=start_date, periods=num_rows, freq=freq), - dtype="datetime64[D]", - ) - - # Create random data for the DataFrames - data = {f"col_{i}": np.random.randn(num_rows) for i in range(num_columns)} - - # Create the DataFrames - df_date = pd.DataFrame(data) - df_numpy = pd.DataFrame(data) - df_integer = pd.DataFrame(data) - - if col_names_given: - col_names = df_date.columns.values - else: - col_names = None - - # Set the date as index or as a column based on the index parameter - if index: - df_date.index = date_values - df_numpy.index = numpy_values - df_integer.index = integer_values - else: - df_date["date"] = date_values - df_numpy["date"] = numpy_values - df_integer["date"] = integer_values - - if index: - time_col = None - else: - time_col = "date" - - return [ - [df_date, col_names, time_col], - [df_numpy, col_names, time_col], - [df_integer, col_names, time_col], - ] - - -def test_dataframes() -> list: - test_config = product( - [10, 100, 1000, 10000, 100000], - [100], - [True, False], - [True, False], - ) - - dataframes_list = [ - create_random_dataframes( - num_rows=num_rows, - num_columns=num_columns, - index=index, - col_names_given=col_names_given, - ) - for num_rows, num_columns, index, col_names_given in test_config - ] - - return dataframes_list - - -def calculate_processing_time( - f_name: str, - num_iter: int, - save_path="/Users/julesauthier/Documents/darts/from_df_times/data/", -): - df_list = test_dataframes() - df_func = test_from_dataframe(f_name) - - # Initialize dictionaries to store processing times - times = {} - - # Initialize the progress bar - total_iterations = ( - len(df_list) * 2 * 3 - ) # 2 iterations per dataframe 
configuration, 3 df per config - progress_bar = tqdm(total=total_iterations, desc="Processing DataFrames") - - for df_config in df_list: - for df, col_names, time_col in df_config: - num_rows = len(df) - dict_entry = str(num_rows) - - for i in range(2): - # on the second run we shuffle the data - if i == 1: - df = df.sample(frac=1) - dict_entry += "_shuffled" - - begin = time.time() - for _ in range(num_iter): - _ = df_func(df, value_cols=col_names, time_col=time_col, freq=None) - end = time.time() - timer = (end - begin) / num_iter - - if dict_entry not in times: - times[dict_entry] = timer - else: - times[dict_entry] += timer - - # Update the progress bar - progress_bar.update(1) - - file_name = f_name + "_avg_time_" + str(num_iter) + "_iter.json" - - # Store the average times in separate JSON files - with open(save_path + file_name, "w") as f: - json.dump(times, f, indent=4) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="The function to test and the number of iter can " - ) - parser.add_argument( - "--f_name", type=str, default="from_dataframe", help="method to time" - ) - parser.add_argument( - "--n_iter", type=int, default=100, help="number of function call" - ) - - args = parser.parse_args() - - f_name = args.f_name - n_iter = args.n_iter - - calculate_processing_time(f_name, n_iter) diff --git a/from_df_timing_col.py b/from_df_timing_col.py deleted file mode 100644 index 4487654ea4..0000000000 --- a/from_df_timing_col.py +++ /dev/null @@ -1,176 +0,0 @@ -import argparse -import json -import time -import warnings -from itertools import product - -import numpy as np -import pandas as pd -from tqdm import tqdm - -from darts.timeseries import TimeSeries - -# Suppress all warnings -warnings.filterwarnings("ignore") - - -def test_from_dataframe(f_name: str): - return getattr(TimeSeries, f_name) - - -def create_random_dataframes( - num_rows: int = 10, - num_columns: int = 3, - index: bool = True, - col_names_given: bool = True, - 
start_date: str = "1900-01-01", - freq: str = "D", -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Create three pandas DataFrames with random data and dates as the index or as a column. - - Parameters: - - num_rows (int): The number of rows in the DataFrames. - - num_columns (int): The number of columns in the DataFrames. - - index (bool): If True, the date is the index of the DataFrame. If False, the date is a column named 'date'. - - start_date (str): The start date for the date range (used only if date_format is 'date'). - - freq (str): The frequency of the date range (used only if date_format is 'date'). - - Returns: - - tuple: A tuple containing three DataFrames (df_date, df_numpy, df_integer). - """ - # Set a random seed for reproducibility - np.random.seed(42) - - # Generate a date range or integer list based on the date_format parameter - date_values = pd.date_range(start=start_date, periods=num_rows, freq=freq) - integer_values = list(range(1, num_rows + 1)) - numpy_values = np.array( - pd.date_range(start=start_date, periods=num_rows, freq=freq), - dtype="datetime64[D]", - ) - - # Create random data for the DataFrames - data = {f"col_{i}": np.random.randn(num_rows) for i in range(num_columns)} - - # Create the DataFrames - df_date = pd.DataFrame(data) - df_numpy = pd.DataFrame(data) - df_integer = pd.DataFrame(data) - - if col_names_given: - col_names = df_date.columns.values - else: - col_names = None - - # Set the date as index or as a column based on the index parameter - if index: - df_date.index = date_values - df_numpy.index = numpy_values - df_integer.index = integer_values - else: - df_date["date"] = date_values - df_numpy["date"] = numpy_values - df_integer["date"] = integer_values - - if index: - time_col = None - else: - time_col = "date" - - return [ - [df_date, col_names, time_col], - [df_numpy, col_names, time_col], - [df_integer, col_names, time_col], - ] - - -def test_dataframes() -> list: - test_config = product( - [1000], 
- [1, 10, 100, 1000], - [True, False], - [True, False], - ) - - dataframes_list = [ - create_random_dataframes( - num_rows=num_rows, - num_columns=num_columns, - index=index, - col_names_given=col_names_given, - ) - for num_rows, num_columns, index, col_names_given in test_config - ] - - return dataframes_list - - -def calculate_processing_time( - f_name: str, - num_iter: int, - save_path="/Users/julesauthier/Documents/darts/from_df_times/data/", -): - df_list = test_dataframes() - df_func = test_from_dataframe(f_name) - - # Initialize dictionaries to store processing times - times = {} - - # Initialize the progress bar - total_iterations = ( - len(df_list) * 2 * 3 - ) # 2 iterations per dataframe configuration, 3 df per config - progress_bar = tqdm(total=total_iterations, desc="Processing DataFrames") - - for df_config in df_list: - for df, col_names, time_col in df_config: - num_cols = df.shape[1] - if num_cols > 1 and (num_cols % 2 == 1 or num_cols == 2): - num_cols -= 1 - dict_entry = str(num_cols) - - for i in range(2): - # on the second run we shuffle the data - if i == 1: - df = df.sample(frac=1) - dict_entry += "_shuffled" - - begin = time.time() - for _ in range(num_iter): - _ = df_func(df, value_cols=col_names, time_col=time_col, freq=None) - end = time.time() - timer = (end - begin) / num_iter - - if dict_entry not in times: - times[dict_entry] = timer - else: - times[dict_entry] += timer - - # Update the progress bar - progress_bar.update(1) - - file_name = f_name + "_avg_time_cols_" + str(num_iter) + "_iter.json" - - # Store the average times in separate JSON files - with open(save_path + file_name, "w") as f: - json.dump(times, f, indent=4) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="The function to test and the number of iter can " - ) - parser.add_argument( - "--f_name", type=str, default="from_dataframe", help="method to time" - ) - parser.add_argument( - "--n_iter", type=int, default=100, help="number of 
function call" - ) - - args = parser.parse_args() - - f_name = args.f_name - n_iter = args.n_iter - - calculate_processing_time(f_name, n_iter) From 5afc7210d77fd1fc57d84fbdbc1f53f8f8aad7cd Mon Sep 17 00:00:00 2001 From: Jules Authier <55801833+authierj@users.noreply.github.com> Date: Fri, 21 Feb 2025 19:59:32 +0100 Subject: [PATCH 18/29] Update CHANGELOG.md Co-authored-by: Dennis Bader --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59d0f2565f..c9f9287295 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co **Improved** -- Implemented the `from_dataframe()` and `from_series()` methods with [Narwhals](https://github.com/narwhals-dev/narwhals), a compatibility layer between dataframe librairies. From now on, Darts can transform pandas, polars, arrows and many other dataframes into `TimeSeries`. [#2661](https://github.com/unit8co/darts/pull/2661) by [Jules Authier](https://github.com/authierj) +- `TimeSeries.from_dataframe()` and `from_series()` now support creating `TimeSeries` from additional backends (Polars, PyArrow, ...). We leverage `narwhals` as the compatibility layer between dataframe libraries. See the `narwhals` [documentation](https://narwhals-dev.github.io/narwhals/) for all supported backends. [#2661](https://github.com/unit8co/darts/pull/2661) by [Jules Authier](https://github.com/authierj) - Added ONNX support for torch-based models with method `TorchForecastingModel.to_onnx()`. Check out [this example](https://unit8co.github.io/darts/userguide/gpu_and_tpu_usage.html#exporting-model-to-onnx-format-for-inference) from the user guide on how to export and load a model for inference. [#2620](https://github.com/unit8co/darts/pull/2620) by [Antoine Madrona](https://github.com/madtoinou) - Made method `ForecastingModel.untrained_model()` public. 
Use this method to get a new (untrained) model instance created with the same parameters. [#2684](https://github.com/unit8co/darts/pull/2684) by [Timon Erhart](https://github.com/turbotimon) - Made it possbile to run the quickstart notebook `00-quickstart.ipynb` locally. [#2691](https://github.com/unit8co/darts/pull/2691) by [Jules Authier](https://github.com/authierj) From 7877dd6e986faaaf7121be6d40285cd8137416bb Mon Sep 17 00:00:00 2001 From: Jules Authier <55801833+authierj@users.noreply.github.com> Date: Fri, 21 Feb 2025 20:00:32 +0100 Subject: [PATCH 19/29] Update darts/timeseries.py Co-authored-by: Dennis Bader --- darts/timeseries.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 5d2c9a0946..8d2c8601c6 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -994,7 +994,10 @@ def from_series( Parameters ---------- pd_series - A Series instance. + The Series, or anything which can be converted to a narwhals Series (e.g. pandas.Series, ...) + polars.DataFrame, ...). See the `narwhals documentation + `_ for more + information. fill_missing_dates Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index) with NaN values. 
This requires either a provided `freq` or the possibility to infer the frequency from the From 102a26c7c084380392a6b777271320614ceaeaf3 Mon Sep 17 00:00:00 2001 From: authierj Date: Fri, 21 Feb 2025 20:15:59 +0100 Subject: [PATCH 20/29] easy corrections applied --- darts/timeseries.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 5d2c9a0946..e03a12456d 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -690,10 +690,10 @@ def from_dataframe( # The integer conversion failed; try datetimes try: time_index = pd.DatetimeIndex(time_col_vals) - except Exception: + except ValueError: raise_log( AttributeError( - "'time_col' is of 'Utf8' dtype but doesn't contain valid timestamps" + "'time_col' is of 'String' dtype but doesn't contain valid timestamps" ) ) elif time_col_vals.dtype == nw.Datetime: @@ -713,13 +713,20 @@ def from_dataframe( else: raise_log( AttributeError( - "Invalid type of `time_col`: it needs to be of either 'Utf8', 'Datetime' or 'Int64' dtype." + "Invalid type of `time_col`: it needs to be of either 'String', 'Datetime' or 'Int' dtype." ) ) else: time_col_vals = nw.maybe_get_index(df) if time_col_vals is None: - raise_log(ValueError("No time column or index found in the DataFrame.")) + raise_log( + ValueError( + "No time column or index found in the DataFrame. 
`time_col=None` " + "is only supported for pandas DataFrame which is indexed with one of the " + "supported index types: a DatetimeIndex, a RangeIndex, or an integer " + "Index that can be converted into a RangeIndex.", + ), + ) # if we are here, the dataframe was pandas raise_if_not( isinstance(time_col_vals, VALID_INDEX_TYPES) @@ -756,7 +763,7 @@ def from_dataframe( attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy}, ) - return cls.from_xarray( # really slow + return cls.from_xarray( xa=xa, fill_missing_dates=fill_missing_dates, freq=freq, @@ -984,7 +991,7 @@ def from_series( static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, ) -> Self: """ - Build a univariate deterministic series from a pandas Series. + Build a univariate deterministic TimeSeries from a Series The series must contain an index that is either a pandas DatetimeIndex, a pandas RangeIndex, or a pandas Index that can be converted into a RangeIndex. It is better if the index has no holes; alternatively setting From 56a20c1e41162a35e4e914f19b2729a6135d5ae1 Mon Sep 17 00:00:00 2001 From: authierj Date: Thu, 27 Feb 2025 08:17:44 +0100 Subject: [PATCH 21/29] narwhals_test_time removed --- narwhals_test_time.py | 165 ------------------------------------------ 1 file changed, 165 deletions(-) delete mode 100644 narwhals_test_time.py diff --git a/narwhals_test_time.py b/narwhals_test_time.py deleted file mode 100644 index 80fd76323b..0000000000 --- a/narwhals_test_time.py +++ /dev/null @@ -1,165 +0,0 @@ -import time -import warnings -from itertools import product - -import numpy as np -import pandas as pd - -from darts.timeseries import TimeSeries - -# Suppress all warnings -warnings.filterwarnings("ignore") - - -def create_random_dataframes( - num_rows: int = 10, - num_columns: int = 3, - index: bool = True, - col_names_given: bool = True, - start_date: str = "2023-01-01", - freq: str = "D", -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Create three pandas 
DataFrames with random data and dates as the index or as a column. - - Parameters: - - num_rows (int): The number of rows in the DataFrames. - - num_columns (int): The number of columns in the DataFrames. - - index (bool): If True, the date is the index of the DataFrame. If False, the date is a column named 'date'. - - start_date (str): The start date for the date range (used only if date_format is 'date'). - - freq (str): The frequency of the date range (used only if date_format is 'date'). - - Returns: - - tuple: A tuple containing three DataFrames (df_date, df_numpy, df_integer). - """ - # Set a random seed for reproducibility - np.random.seed(42) - - # Generate a date range or integer list based on the date_format parameter - date_values = pd.date_range(start=start_date, periods=num_rows, freq=freq) - integer_values = list(range(1, num_rows + 1)) - numpy_values = np.array( - pd.date_range(start=start_date, periods=num_rows, freq=freq), - dtype="datetime64[D]", - ) - - # Create random data for the DataFrames - data = {f"col_{i}": np.random.randn(num_rows) for i in range(num_columns)} - - # Create the DataFrames - df_date = pd.DataFrame(data) - df_numpy = pd.DataFrame(data) - df_integer = pd.DataFrame(data) - - if col_names_given: - col_names = df_date.columns.values - else: - col_names = None - - # Set the date as index or as a column based on the index parameter - if index: - df_date.index = date_values - df_numpy.index = numpy_values - df_integer.index = integer_values - else: - df_date["date"] = date_values - df_numpy["date"] = numpy_values - df_integer["date"] = integer_values - - if index: - time_col = None - else: - time_col = "date" - - return [ - [df_date, col_names, time_col], - [df_numpy, col_names, time_col], - [df_integer, col_names, time_col], - ] - - -def test_dataframes() -> list: - test_config = product( - [10, 100, 1000, 10000], - [10, 100, 500, 1000], - [True, False], - [True, False], - ) - - dataframes_list = [ - create_random_dataframes( - 
num_rows=num_rows, - num_columns=num_columns, - index=index, - col_names_given=col_names_given, - ) - for num_rows, num_columns, index, col_names_given in test_config - ] - - return dataframes_list - - -df_list = test_dataframes() - -num_iter = 5 -pandas_global_timer = 0 -narwhals_global_timer = 0 - -for iter in range(num_iter + 1): - pandas_timer = 0 - narwhals_timer = 0 - for df_config in df_list: - for df, col_names, time_col in df_config: - for i in range(2): - # on the second run we shuffle the data - if i == 1: - df = df.sample(frac=1) - - # pandas processing time - begin = time.time() - pandas_timeseries = TimeSeries.from_dataframe( - df, value_cols=col_names, time_col=time_col, freq=None - ) - end = time.time() - pandas_timer += end - begin - - # narwhals processing time - begin_nw = time.time() - narwhals_timeseries = TimeSeries.from_narwhals_dataframe( - df, value_cols=col_names, time_col=time_col, freq=None - ) - end_nw = time.time() - narwhals_timer += end_nw - begin_nw - - # Check if the TimeSeries objects are equal - try: - assert pandas_timeseries.time_index.equals( - narwhals_timeseries.time_index - ) - except AssertionError as e: - print( - f"Index assertion failed for DataFrame with columns {col_names} and time_col {time_col}: {e}" - ) - try: - np.testing.assert_array_almost_equal( - pandas_timeseries.all_values(), narwhals_timeseries.all_values() - ) - except AssertionError as e: - print( - f"Equal assertion failed for DataFrame with columns {col_names} and time_col {time_col}: {e}" - ) - # throw first iteration away, memory initialization - if iter > 0: - print(f"pandas processing time: {pandas_timer:.4f}") - print(f"narwhals processing time: {narwhals_timer:.4f} \n") - pandas_global_timer += pandas_timer - narwhals_global_timer += narwhals_timer - -pandas_global_timer /= num_iter -narwhals_global_timer /= num_iter - -print(f"Average pandas processing time: {pandas_global_timer:.4f}") -print(f"Average narwhals processing time: 
{narwhals_global_timer:.4f} \n") - -diff_in_fraction = (-pandas_global_timer + narwhals_global_timer) / pandas_global_timer -print(f"Average processing time difference: {diff_in_fraction:.2%}") From f764e198e9732565dc2228dbac4771557764750a Mon Sep 17 00:00:00 2001 From: Jules Authier <55801833+authierj@users.noreply.github.com> Date: Thu, 27 Feb 2025 08:18:33 +0100 Subject: [PATCH 22/29] Update requirements/core.txt Co-authored-by: Dennis Bader --- requirements/core.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/core.txt b/requirements/core.txt index 3081922e93..21abb02049 100644 --- a/requirements/core.txt +++ b/requirements/core.txt @@ -5,7 +5,6 @@ narwhals>=1.25.1 nfoursid>=1.0.0 numpy>=1.19.0,<2.0.0 pandas>=1.0.5 -polars>=1.0.0 pmdarima>=1.8.0 pyod>=0.9.5 requests>=2.22.0 From 319a48f7f171155447356372fb300fbcb0ad590e Mon Sep 17 00:00:00 2001 From: Jules Authier <55801833+authierj@users.noreply.github.com> Date: Thu, 27 Feb 2025 08:40:38 +0100 Subject: [PATCH 23/29] Update darts/timeseries.py Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- darts/timeseries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index ff018572e2..cb17052938 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -651,7 +651,7 @@ def from_dataframe( TimeSeries A univariate or multivariate deterministic TimeSeries constructed from the inputs. 
""" - df = nw.from_native(df) + df = nw.from_native(df, eager_only=True, pass_through=False) # get values if value_cols is None: From e8925f107e3bd6bd0ba172dcc9325159eca07acd Mon Sep 17 00:00:00 2001 From: authierj Date: Thu, 27 Feb 2025 14:50:34 +0100 Subject: [PATCH 24/29] most corrections added --- darts/timeseries.py | 124 +++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 64 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index ff018572e2..ae353656c7 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -33,6 +33,7 @@ ``TimeSeries`` documentation for more information on covariates. """ +import contextlib import itertools import pickle import re @@ -588,7 +589,10 @@ def from_dataframe( Parameters ---------- df - The DataFrame + The DataFrame, or anything which can be converted to a narwhals DataFrame (e.g. pandas.DataFrame, + polars.DataFrame, ...). See the `narwhals documentation + `_ for more + information. time_col The time column name. If set, the column will be cast to a pandas DatetimeIndex (if it contains timestamps) or a RangeIndex (if it contains integers). @@ -651,29 +655,28 @@ def from_dataframe( TimeSeries A univariate or multivariate deterministic TimeSeries constructed from the inputs. 
""" - df = nw.from_native(df) + df = nw.from_native(df, eager_only=True, pass_through=False) + time_zone = None # get values if value_cols is None: series_df = df.drop(time_col) if time_col else df else: - if isinstance(value_cols, str): + if isinstance(value_cols, (str, int)): value_cols = [value_cols] - series_df = df[value_cols] # quite slow + series_df = df[value_cols] # get time index if time_col: if time_col not in df.columns: raise_log(AttributeError(f"time_col='{time_col}' is not present.")) - time_col_vals = df[time_col] + time_col_vals = df.get_column(time_col) if time_col_vals.dtype == nw.String: # Try to convert to integers if needed - try: + with contextlib.suppress(Exception): time_col_vals = time_col_vals.cast(nw.Int64) - except Exception: - pass if time_col_vals.dtype.is_integer(): if time_col_vals.is_duplicated().any(): @@ -686,7 +689,7 @@ def from_dataframe( # RangeIndex in `TimeSeries.from_xarray()` time_index = pd.Index(time_col_vals) - elif time_col_vals.dtype == nw.String: + elif isinstance(time_col_vals.dtype, nw.String): # The integer conversion failed; try datetimes try: time_index = pd.DatetimeIndex(time_col_vals) @@ -696,18 +699,10 @@ def from_dataframe( "'time_col' is of 'String' dtype but doesn't contain valid timestamps" ) ) - elif time_col_vals.dtype == nw.Datetime: - # BUGFIX : force time-index to be timezone naive as xarray doesn't support it - # pandas.DataFrame loses the tz information if it's not its index - if time_col_vals.dtype.time_zone is not None: - logger.warning( - "The provided Datetime data was associated with a timezone, which is currently not supported " - "by xarray. To avoid unexpected behaviour, the tz information was removed. Consider calling " - f"`ts.time_index.tz_localize({time_col_vals.dtype.time_zone})` when exporting the results." 
- "To plot the series with the right time steps, consider setting the matplotlib.pyplot " - "`rcParams['timezone']` parameter to automatically convert the time axis back to the " - "original timezone." - ) + elif isinstance(time_col_vals.dtype, nw.Datetime): + # remember time zone here as polars converts to UTC + time_zone = time_col_vals.dtype.time_zone + if time_zone is not None: time_col_vals = time_col_vals.dt.replace_time_zone(None) time_index = pd.DatetimeIndex(time_col_vals) else: @@ -717,8 +712,8 @@ def from_dataframe( ) ) else: - time_col_vals = nw.maybe_get_index(df) - if time_col_vals is None: + time_index = nw.maybe_get_index(df) + if time_index is None: raise_log( ValueError( "No time column or index found in the DataFrame. `time_col=None` " @@ -728,30 +723,33 @@ def from_dataframe( ), ) # if we are here, the dataframe was pandas - raise_if_not( - isinstance(time_col_vals, VALID_INDEX_TYPES) - or np.issubdtype(time_col_vals.dtype, np.integer), - "If time_col is not specified, the DataFrame must be indexed either with " - "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex", - logger, - ) - # BUGFIX : force time-index to be timezone naive as xarray doesn't support it - # pandas.DataFrame loses the tz information if it's not its index - if ( - isinstance(time_col_vals, pd.DatetimeIndex) - and time_col_vals.tz is not None + if not ( + isinstance(time_index, VALID_INDEX_TYPES) + or np.issubdtype(time_index.dtype, np.integer) ): - logger.warning( - "The provided DatetimeIndex was associated with a timezone, which is currently not supported " - "by xarray. To avoid unexpected behaviour, the tz information was removed. Consider calling " - f"`ts.time_index.tz_localize({time_col_vals.tz})` when exporting the results." - "To plot the series with the right time steps, consider setting the matplotlib.pyplot " - "`rcParams['timezone']` parameter to automatically convert the time axis back to the " - "original timezone." 
+ raise_log( + ValueError( + "If time_col is not specified, the DataFrame must be indexed either with " + "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex" + ), + logger, ) - time_index = time_col_vals.tz_localize(None) - else: - time_index = time_col_vals + if isinstance(time_index, pd.DatetimeIndex): + time_zone = time_index.tz + if time_zone is not None: + # remove and remember time zone here as pandas converts to UTC + time_index = time_index.tz_localize(None) + + # BUGFIX : force time-index to be timezone naive as xarray doesn't support it + if time_zone is not None: + logger.warning( + "The provided DatetimeIndex was associated with a timezone, which is currently not supported " + "by xarray. To avoid unexpected behaviour, the tz information was removed. Consider calling " + f"`ts.time_index.tz_localize({time_zone})` when exporting the results." + "To plot the series with the right time steps, consider setting the matplotlib.pyplot " + "`rcParams['timezone']` parameter to automatically convert the time axis back to the " + "original timezone." + ) if not time_index.name: time_index.name = time_col if time_col else DIMS[0] @@ -1028,7 +1026,7 @@ def from_series( TimeSeries A univariate and deterministic TimeSeries constructed from the inputs. 
""" - nw_series = nw.from_native(pd_series, series_only=True) + nw_series = nw.from_native(pd_series, series_only=True, pass_through=False) df = nw_series.to_frame() return cls.from_dataframe( df, @@ -1597,9 +1595,10 @@ def pd_series(self, copy=True) -> pd.Series: name = self.components[0] if copy: - return pd.Series(data=data.copy(), index=index.copy(), name=name) - else: - return pd.Series(data=data, index=index, name=name) + data = data.copy() + index = index.copy() + + return pd.Series(data=data, index=index, name=name) def pd_dataframe(self, copy=True, suppress_warnings=False) -> pd.DataFrame: """ @@ -1620,7 +1619,6 @@ def pd_dataframe(self, copy=True, suppress_warnings=False) -> pd.DataFrame: pandas.DataFrame The Pandas DataFrame representation of this time series """ - if not self.is_deterministic: if not suppress_warnings: logger.warning( @@ -1632,24 +1630,22 @@ def pd_dataframe(self, copy=True, suppress_warnings=False) -> pd.DataFrame: comp_name = list(self.components) samples = range(self.n_samples) - df_col_names = [ + columns = [ "_s".join((comp_name, str(sample_id))) for comp_name, sample_id in itertools.product(comp_name, samples) ] - data = self._xa.stack(data=(DIMS[1], DIMS[2])) - index = self._time_index - - if copy: - return pd.DataFrame( - data=data.copy(), index=index.copy(), columns=df_col_names.copy() - ) - else: - return pd.DataFrame(data=data, index=index, columns=df_col_names) + data = self._xa.stack(data=(DIMS[1], DIMS[2])).values else: - if copy: - return self._xa[:, :, 0].copy().to_pandas() - else: - return self._xa[:, :, 0].to_pandas() + columns = self._xa.get_index(DIMS[1]) + data = self._xa[:, :, 0].values + index = self._time_index + + if copy: + columns = columns.copy() + data = data.copy() + index = index.copy() + + return pd.DataFrame(data=data, index=index, columns=columns) def quantile_df(self, quantile=0.5) -> pd.DataFrame: """ From 11d17c1e7c45036e6f3048043a9d6a369d7491ec Mon Sep 17 00:00:00 2001 From: authierj Date: Thu, 27 
Feb 2025 15:17:54 +0100 Subject: [PATCH 25/29] polars tests removed --- darts/tests/test_timeseries.py | 37 +++++++++++++++++----------------- darts/timeseries.py | 1 + 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/darts/tests/test_timeseries.py b/darts/tests/test_timeseries.py index d8f1505751..8245efc194 100644 --- a/darts/tests/test_timeseries.py +++ b/darts/tests/test_timeseries.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd -import polars as pl import pytest import xarray as xr from scipy.stats import kurtosis, skew @@ -2507,15 +2506,15 @@ def test_tail_numeric_time_index(self): class TestTimeSeriesFromDataFrame: - def pd_to_backend(self, df, backend, index=False): - if backend == "pandas": - return df - elif backend == "polars": - if index: - return pl.from_pandas(df.reset_index()) - return pl.from_pandas(df) - - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + # def pd_to_backend(self, df, backend, index=False): + # if backend == "pandas": + # return df + # elif backend == "polars": + # if index: + # return pl.from_pandas(df.reset_index()) + # return pl.from_pandas(df) + + @pytest.mark.parametrize("backend", "pandas") def test_from_dataframe_sunny_day(self, backend): data_dict = {"Time": pd.date_range(start="20180501", end="20200301", freq="MS")} data_dict["Values1"] = np.random.uniform( @@ -2544,7 +2543,7 @@ def test_from_dataframe_sunny_day(self, backend): assert data_darts1 == data_darts2 assert data_darts1 == data_darts3 - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", "pandas") def test_time_col_convert_string_integers(self, backend): expected = np.array(list(range(3, 10))) data_dict = {"Time": expected.astype(str)} @@ -2560,7 +2559,7 @@ def test_time_col_convert_string_integers(self, backend): assert ts.time_index.dtype == int assert ts.time_index.name == "Time" - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", 
"pandas") def test_time_col_convert_integers(self, backend): expected = np.array(list(range(10))) data_dict = {"Time": expected} @@ -2577,7 +2576,7 @@ def test_time_col_convert_integers(self, backend): assert ts.time_index.dtype == int assert ts.time_index.name == "Time" - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", "pandas") def test_fail_with_bad_integer_time_col(self, backend): bad_time_col_vals = np.array([4, 0, 1, 2]) data_dict = {"Time": bad_time_col_vals} @@ -2590,7 +2589,7 @@ def test_fail_with_bad_integer_time_col(self, backend): df=self.pd_to_backend(df, backend), time_col="Time" ) - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", "pandas") def test_time_col_convert_rangeindex(self, backend): for expected_l, step in zip([[4, 0, 2, 3, 1], [8, 0, 4, 6, 2]], [1, 2]): expected = np.array(expected_l) @@ -2616,7 +2615,7 @@ def test_time_col_convert_rangeindex(self, backend): ] assert np.all(ar1 == ar2) - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", "pandas") def test_time_col_convert_datetime(self, backend): expected = pd.date_range(start="20180501", end="20200301", freq="MS") data_dict = {"Time": expected} @@ -2631,7 +2630,7 @@ def test_time_col_convert_datetime(self, backend): assert ts.time_index.dtype == "datetime64[ns]" assert ts.time_index.name == "Time" - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", "pandas") def test_time_col_convert_datetime_strings(self, backend): expected = pd.date_range(start="20180501", end="20200301", freq="MS") data_dict = {"Time": expected.values.astype(str)} @@ -2646,7 +2645,7 @@ def test_time_col_convert_datetime_strings(self, backend): assert ts.time_index.dtype == "datetime64[ns]" assert ts.time_index.name == "Time" - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", "pandas") def 
test_time_col_with_tz_df(self, backend): # numpy and xarray don't support "timezone aware" pd.DatetimeIndex # the BUGFIX removes timezone information without conversion @@ -2711,7 +2710,7 @@ def test_time_col_with_tz_series(self): assert list(ts.time_index.tz_localize("CET")) == list(time_range_H) assert ts.time_index.tz is None - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", "pandas") def test_time_col_convert_garbage(self, backend): expected = [ "2312312asdfdw", @@ -2731,7 +2730,7 @@ def test_time_col_convert_garbage(self, backend): df=self.pd_to_backend(df, backend), time_col="Time" ) - @pytest.mark.parametrize("backend", ["pandas", "polars"]) + @pytest.mark.parametrize("backend", "pandas") def test_df_named_columns_index(self, backend): time_index = generate_index( start=pd.Timestamp("2000-01-01"), length=4, freq="D", name="index" diff --git a/darts/timeseries.py b/darts/timeseries.py index ae353656c7..dd80f780b1 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -711,6 +711,7 @@ def from_dataframe( "Invalid type of `time_col`: it needs to be of either 'String', 'Datetime' or 'Int' dtype." 
) ) + time_index.name = time_col else: time_index = nw.maybe_get_index(df) if time_index is None: From f9f5aa8f164600fd136080fe5a1f5d251c85fcce Mon Sep 17 00:00:00 2001 From: authierj Date: Thu, 27 Feb 2025 16:09:36 +0100 Subject: [PATCH 26/29] tests corrected --- darts/tests/test_timeseries.py | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/darts/tests/test_timeseries.py b/darts/tests/test_timeseries.py index 8245efc194..f4d52477d3 100644 --- a/darts/tests/test_timeseries.py +++ b/darts/tests/test_timeseries.py @@ -2506,15 +2506,15 @@ def test_tail_numeric_time_index(self): class TestTimeSeriesFromDataFrame: - # def pd_to_backend(self, df, backend, index=False): - # if backend == "pandas": - # return df - # elif backend == "polars": - # if index: - # return pl.from_pandas(df.reset_index()) - # return pl.from_pandas(df) - - @pytest.mark.parametrize("backend", "pandas") + def pd_to_backend(self, df, backend, index=False): + if backend == "pandas": + return df + # elif backend == "polars": + # if index: + # return pl.from_pandas(df.reset_index()) + # return pl.from_pandas(df) + + @pytest.mark.parametrize("backend", ["pandas"]) def test_from_dataframe_sunny_day(self, backend): data_dict = {"Time": pd.date_range(start="20180501", end="20200301", freq="MS")} data_dict["Values1"] = np.random.uniform( @@ -2543,7 +2543,7 @@ def test_from_dataframe_sunny_day(self, backend): assert data_darts1 == data_darts2 assert data_darts1 == data_darts3 - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) def test_time_col_convert_string_integers(self, backend): expected = np.array(list(range(3, 10))) data_dict = {"Time": expected.astype(str)} @@ -2559,7 +2559,7 @@ def test_time_col_convert_string_integers(self, backend): assert ts.time_index.dtype == int assert ts.time_index.name == "Time" - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) 
def test_time_col_convert_integers(self, backend): expected = np.array(list(range(10))) data_dict = {"Time": expected} @@ -2576,7 +2576,7 @@ def test_time_col_convert_integers(self, backend): assert ts.time_index.dtype == int assert ts.time_index.name == "Time" - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) def test_fail_with_bad_integer_time_col(self, backend): bad_time_col_vals = np.array([4, 0, 1, 2]) data_dict = {"Time": bad_time_col_vals} @@ -2589,7 +2589,7 @@ def test_fail_with_bad_integer_time_col(self, backend): df=self.pd_to_backend(df, backend), time_col="Time" ) - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) def test_time_col_convert_rangeindex(self, backend): for expected_l, step in zip([[4, 0, 2, 3, 1], [8, 0, 4, 6, 2]], [1, 2]): expected = np.array(expected_l) @@ -2615,7 +2615,7 @@ def test_time_col_convert_rangeindex(self, backend): ] assert np.all(ar1 == ar2) - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) def test_time_col_convert_datetime(self, backend): expected = pd.date_range(start="20180501", end="20200301", freq="MS") data_dict = {"Time": expected} @@ -2630,7 +2630,7 @@ def test_time_col_convert_datetime(self, backend): assert ts.time_index.dtype == "datetime64[ns]" assert ts.time_index.name == "Time" - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) def test_time_col_convert_datetime_strings(self, backend): expected = pd.date_range(start="20180501", end="20200301", freq="MS") data_dict = {"Time": expected.values.astype(str)} @@ -2645,7 +2645,7 @@ def test_time_col_convert_datetime_strings(self, backend): assert ts.time_index.dtype == "datetime64[ns]" assert ts.time_index.name == "Time" - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) def test_time_col_with_tz_df(self, backend): # numpy and xarray 
don't support "timezone aware" pd.DatetimeIndex # the BUGFIX removes timezone information without conversion @@ -2710,7 +2710,7 @@ def test_time_col_with_tz_series(self): assert list(ts.time_index.tz_localize("CET")) == list(time_range_H) assert ts.time_index.tz is None - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) def test_time_col_convert_garbage(self, backend): expected = [ "2312312asdfdw", @@ -2730,7 +2730,7 @@ def test_time_col_convert_garbage(self, backend): df=self.pd_to_backend(df, backend), time_col="Time" ) - @pytest.mark.parametrize("backend", "pandas") + @pytest.mark.parametrize("backend", ["pandas"]) def test_df_named_columns_index(self, backend): time_index = generate_index( start=pd.Timestamp("2000-01-01"), length=4, freq="D", name="index" From c13cc1d3352571f4e6bae67f4e6e19f575842369 Mon Sep 17 00:00:00 2001 From: Jules Authier <55801833+authierj@users.noreply.github.com> Date: Fri, 28 Feb 2025 16:28:49 +0100 Subject: [PATCH 27/29] Update darts/timeseries.py Co-authored-by: Dennis Bader --- darts/timeseries.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index dd80f780b1..e1590b22be 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -1000,9 +1000,9 @@ def from_series( Parameters ---------- pd_series - The Series, or anything which can be converted to a narwhals Series (e.g. pandas.Series, ...) - polars.DataFrame, ...). See the `narwhals documentation - `_ for more + The Series, or anything which can be converted to a narwhals Series (e.g. pandas.Series, ...). See the + `narwhals documentation + `_ for more information. 
fill_missing_dates Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index) From 370d7619d8bb773fb4ff667df0e1e3d7a8b5852a Mon Sep 17 00:00:00 2001 From: Jules Authier <55801833+authierj@users.noreply.github.com> Date: Fri, 28 Feb 2025 16:29:35 +0100 Subject: [PATCH 28/29] Update darts/timeseries.py Co-authored-by: Dennis Bader --- darts/timeseries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index e1590b22be..317a053ad0 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -990,7 +990,7 @@ def from_series( static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, ) -> Self: """ - Build a univariate deterministic TimeSeries from a Series + Build a univariate deterministic TimeSeries from a Series. The series must contain an index that is either a pandas DatetimeIndex, a pandas RangeIndex, or a pandas Index that can be converted into a RangeIndex. It is better if the index has no holes; alternatively setting From 3fa924f429b38b0526e658a3ddcd27ca7169a1b4 Mon Sep 17 00:00:00 2001 From: authierj Date: Fri, 28 Feb 2025 16:36:59 +0100 Subject: [PATCH 29/29] no time_col, define one --- darts/timeseries.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index 317a053ad0..2a1b336bcf 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -715,16 +715,14 @@ def from_dataframe( else: time_index = nw.maybe_get_index(df) if time_index is None: - raise_log( - ValueError( - "No time column or index found in the DataFrame. 
`time_col=None` " - "is only supported for pandas DataFrame which is indexed with one of the " - "supported index types: a DatetimeIndex, a RangeIndex, or an integer " - "Index that can be converted into a RangeIndex.", - ), + time_index = pd.RangeIndex(len(df)) + logger.info( + "No time column specified (`time_col=None`) and no index found in the DataFrame. Defaulting to " + "`pandas.RangeIndex(len(df))`. If this is not desired consider adding a time column " + "to your dataframe and defining `time_col`." ) # if we are here, the dataframe was pandas - if not ( + elif not ( isinstance(time_index, VALID_INDEX_TYPES) or np.issubdtype(time_index.dtype, np.integer) ): @@ -1000,9 +998,9 @@ def from_series( Parameters ---------- pd_series - The Series, or anything which can be converted to a narwhals Series (e.g. pandas.Series, ...). See the - `narwhals documentation - `_ for more + The Series, or anything which can be converted to a narwhals Series (e.g. pandas.Series, ...). See the + `narwhals documentation + `_ for more information. fill_missing_dates Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index)