unit8co · dennisbader · Feb 28, 2025 · Jan 31, 2025 · Jan 31, 2025 · Feb 4, 2025
@@ -46,9 +46,11 @@
 
 import matplotlib.axes
 import matplotlib.pyplot as plt
+import narwhals as nw
 import numpy as np
 import pandas as pd
 import xarray as xr
+from narwhals.typing import DataFrameT
 from pandas.tseries.frequencies import to_offset
 from scipy.stats import kurtosis, skew
 
@@ -746,6 +748,177 @@ def from_dataframe(
             fillna_value=fillna_value,
         )
 
+    @classmethod
+    def from_narwhals_dataframe(
+        cls,
+        df: DataFrameT,
+        time_col: Optional[str] = None,
+        value_cols: Optional[Union[list[str], str]] = None,
+        fill_missing_dates: Optional[bool] = False,
+        freq: Optional[Union[str, int]] = None,
+        fillna_value: Optional[float] = None,
+        static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None,
+        hierarchy: Optional[dict] = None,
+    ) -> Self:
+        """
+        Build a deterministic TimeSeries instance from a selection of columns of a DataFrame.
+        One column (or the DataFrame index) has to represent the time,
+        and a list of columns `value_cols` has to represent the values for this time series.
+
+        Parameters
+        ----------
+        df
+            The DataFrame. It is wrapped with ``narwhals.from_native(..., eager_only=True)``, so it must be an
+            eager DataFrame of a backend supported by narwhals.
+        time_col
+            The time column name. If set, the column must contain integers, datetimes, or strings that can be
+            converted to integers or to datetimes.
+            If not set, the DataFrame index will be used. In this case the DataFrame must expose an index (as
+            returned by ``narwhals.maybe_get_index``) that is either a pandas DatetimeIndex, a pandas RangeIndex, or
+            a pandas Index with an integer dtype. It is better if the index has no holes; alternatively setting
+            `fill_missing_dates` can in some cases solve these issues (filling holes with NaN, or with the provided
+            `fillna_value` numeric value, if any).
+        value_cols
+            A string or list of strings representing the value column(s) to be extracted from the DataFrame. If set to
+            `None`, the whole DataFrame will be used (without `time_col`, if one is given).
+        fill_missing_dates
+            Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index)
+            with NaN values. This requires either a provided `freq` or the possibility to infer the frequency from the
+            provided timestamps. See :meth:`_fill_missing_dates() <TimeSeries._fill_missing_dates>` for more info.
+        freq
+            Optionally, a string or integer representing the frequency of the underlying index. This is useful in order
+            to fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`.
+            If a string, represents the frequency of the pandas DatetimeIndex (see `offset aliases
+            <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_ for more info on
+            supported frequencies).
+            If an integer, represents the step size of the pandas Index or pandas RangeIndex.
+        fillna_value
+            Optionally, a numeric value to fill missing values (NaNs) with.
+        static_covariates
+            Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas
+            DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied'
+            to all components of the TimeSeries. If a DataFrame, the columns represent the static variables and the
+            rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates
+            are globally 'applied' to all components of the TimeSeries. If a multi-row DataFrame, the number of
+            rows must match the number of components of the TimeSeries (in this case, the number of columns in
+            ``value_cols``). This adds control for component-specific static covariates.
+        hierarchy
+            Optionally, a dictionary describing the grouping(s) of the time series. The keys are component names, and
+            for a given component name `c`, the value is a list of component names that `c` "belongs" to. For instance,
+            if there is a `total` component, split both in two divisions `d1` and `d2` and in two regions `r1` and `r2`,
+            and four products `d1r1` (in division `d1` and region `r1`), `d2r1`, `d1r2` and `d2r2`, the hierarchy would
+            be encoded as follows.
+
+            .. highlight:: python
+            .. code-block:: python
+
+                hierarchy={
+                    "d1r1": ["d1", "r1"],
+                    "d1r2": ["d1", "r2"],
+                    "d2r1": ["d2", "r1"],
+                    "d2r2": ["d2", "r2"],
+                    "d1": ["total"],
+                    "d2": ["total"],
+                    "r1": ["total"],
+                    "r2": ["total"]
+                }
+            ..
+            The hierarchy can be used to reconcile forecasts (so that the sums of the forecasts at
+            different levels are consistent), see `hierarchical reconciliation
+            <https://unit8co.github.io/darts/generated_api/darts.dataprocessing.transformers.reconciliation.html>`_.
+
+        Returns
+        -------
+        TimeSeries
+            A univariate or multivariate deterministic TimeSeries constructed from the inputs.
+
+        Raises
+        ------
+        AttributeError
+            Passed to `raise_log` if `time_col` is not a column of `df`, if it is a string column that can be
+            converted neither to integers nor to datetimes, or if it has an unsupported dtype.
+        ValueError
+            Passed to `raise_log` if an integer `time_col` contains duplicate values, or if `time_col` is not given
+            and `df` has no index.
+        """
+        # `eager_only=True` rejects lazy frames up front: the column access and `to_numpy()` calls below need
+        # materialized data.
+        df = nw.from_native(df, eager_only=True)
+
+        # Validate the time column before it is used to select the value columns, so that a missing column is
+        # reported with the same error for every backend.
+        if time_col is not None and time_col not in df.columns:
+            raise_log(AttributeError(f"time_col='{time_col}' is not present."))
+
+        # get values
+        if value_cols is None:
+            series_df = df.drop(time_col) if time_col is not None else df
+        else:
+            if isinstance(value_cols, str):
+                value_cols = [value_cols]
+            series_df = df[value_cols]
+
+        # get time index
+        if time_col is not None:
+            time_col_vals = df[time_col]
+
+            if time_col_vals.dtype == nw.String:
+                # Deliberate best effort: strings that hold integers become an integer index. The exception type
+                # of a failed cast depends on the backend, so any failure simply leaves the strings untouched
+                # and the datetime conversion is tried further below.
+                try:
+                    time_col_vals = time_col_vals.cast(nw.Int64)
+                except Exception:
+                    pass
+
+            if time_col_vals.dtype.is_integer():
+                # We have to check all integers appear only once to have a valid index
+                if time_col_vals.is_duplicated().any():
+                    raise_log(
+                        ValueError(
+                            "The provided integer time index column contains duplicate values."
+                        )
+                    )
+
+                # Temporarily use an integer Index to sort the values, and replace by a
+                # RangeIndex in `TimeSeries.from_xarray()`
+                time_index = time_col_vals.to_list()
+
+            elif time_col_vals.dtype == nw.String:
+                # The integer conversion failed; try datetimes
+                try:
+                    time_index = time_col_vals.str.to_datetime().to_list()
+                except Exception:
+                    raise_log(
+                        AttributeError(
+                            "'time_col' is of 'Utf8' dtype but doesn't contain valid timestamps"
+                        )
+                    )
+            elif time_col_vals.dtype == nw.Datetime:
+                time_index = time_col_vals.to_list()
+            else:
+                raise_log(
+                    AttributeError(
+                        "Invalid type of `time_col`: it needs to be of either 'Utf8', 'Datetime' or 'Int64' dtype."
+                    )
+                )
+        else:
+            # `maybe_get_index` returns None when the DataFrame has no index to offer
+            time_col_vals = nw.maybe_get_index(df)
+            if time_col_vals is None:
+                raise_log(ValueError("No time column or index found in the DataFrame."))
+            # if we are here, the dataframe was pandas
+            raise_if_not(
+                isinstance(time_col_vals, VALID_INDEX_TYPES)
+                or np.issubdtype(time_col_vals.dtype, np.integer),
+                "If time_col is not specified, the DataFrame must be indexed either with "
+                "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex",
+                logger,
+            )
+            time_index = time_col_vals.to_list()
+
+        # The time dimension is named after the time column when there is one
+        time_dim = time_col if time_col else DIMS[0]
+
+        xa = xr.DataArray(
+            # add a trailing axis of length one after the (time, component) axes
+            series_df.to_numpy()[:, :, np.newaxis],
+            dims=(time_dim,) + DIMS[-2:],
+            coords={
+                time_dim: time_index,
+                DIMS[1]: series_df.columns,
+            },
+            attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy},
+        )
+
+        return cls.from_xarray(
+            xa=xa,
+            fill_missing_dates=fill_missing_dates,
+            freq=freq,
+            fillna_value=fillna_value,
+        )
+
     @classmethod
     def from_group_dataframe(
         cls,

diff --git a/narwhals_test_time.py b/narwhals_test_time.py
@@ -0,0 +1,123 @@
+import time
+import warnings
+from itertools import product
+
+import numpy as np
+import pandas as pd
+
+from darts.timeseries import TimeSeries
+
+# Suppress all warnings: the "ignore" action is installed without any category or
+# module restriction, so it applies to every warning raised after this point.
+# NOTE(review): presumably done to keep the timing output uncluttered — confirm.
+warnings.filterwarnings("ignore")
+
+
+def create_random_dataframes(
+    num_rows: int = 10,
+    num_columns: int = 3,
+    index: bool = True,
+    start_date: str = "2023-01-01",
+    freq: str = "D",
+) -> list[list]:
+    """
+    Create three pandas DataFrames holding the same random data but different time axes.
+
+    The time axes are, in order: a pandas date range, the same dates as a numpy
+    ``datetime64[D]`` array, and the integers ``1..num_rows``.
+
+    Parameters:
+    - num_rows (int): The number of rows in the DataFrames.
+    - num_columns (int): The number of value columns in the DataFrames.
+    - index (bool): If True, the time axis is the index of the DataFrame. If False, it is a column named 'date'.
+    - start_date (str): The start date of the date range.
+    - freq (str): The frequency of the date range.
+
+    Returns:
+    - list: Three ``[dataframe, col_names, time_col]`` entries (date, numpy and integer time axis), where
+      ``col_names`` holds the value column names and ``time_col`` is ``None`` if ``index`` is True, else 'date'.
+    """
+    # Set a random seed for reproducibility (this reseeds numpy's global generator)
+    np.random.seed(42)
+
+    # The three time axes; the numpy one is the date range cast to day resolution
+    date_values = pd.date_range(start=start_date, periods=num_rows, freq=freq)
+    numpy_values = np.array(date_values, dtype="datetime64[D]")
+    integer_values = list(range(1, num_rows + 1))
+
+    # Create random data for the DataFrames
+    data = {f"col_{i}": np.random.randn(num_rows) for i in range(num_columns)}
+
+    # One DataFrame per time axis, all built from the same data
+    frames = [pd.DataFrame(data) for _ in range(3)]
+
+    # Capture the value column names before a 'date' column may be added
+    col_names = frames[0].columns.values
+
+    # Attach the time axis either as the index or as the 'date' column
+    time_col = None if index else "date"
+    for frame, time_values in zip(frames, (date_values, numpy_values, integer_values)):
+        if index:
+            frame.index = time_values
+        else:
+            frame["date"] = time_values
+
+    return [[frame, col_names, time_col] for frame in frames]
+
+
+def test_dataframes() -> list:
+    """
+    Generate the benchmark inputs for every combination of row count, column count and
+    time axis placement (index or column).
+
+    Returns:
+    - list: One `create_random_dataframes()` result per combination, in `itertools.product` order.
+    """
+    row_counts = [10, 100, 1000, 10000]
+    column_counts = [10, 100, 500, 1000]
+    index_flags = [True, False]
+
+    generated = []
+    for n_rows, n_cols, use_index in product(row_counts, column_counts, index_flags):
+        generated.append(
+            create_random_dataframes(
+                num_rows=n_rows, num_columns=n_cols, index=use_index
+            )
+        )
+    return generated
+
+
+def _time_constructor(constructor, df_configs) -> float:
+    """
+    Return the seconds spent building a TimeSeries from every DataFrame in `df_configs`.
+
+    Each DataFrame is converted twice with `constructor`: once as given and once after
+    shuffling its rows with `df.sample(frac=1)`. The shuffling is part of the measured time.
+    `time.perf_counter()` is used because it is meant for measuring intervals, unlike
+    `time.time()`, which follows the adjustable system clock.
+    """
+    start = time.perf_counter()
+    for df_config in df_configs:
+        for df, col_names, time_col in df_config:
+            _ = constructor(df, value_cols=col_names, time_col=time_col, freq=None)
+            df_shuffle = df.sample(frac=1)
+            _ = constructor(
+                df_shuffle, value_cols=col_names, time_col=time_col, freq=None
+            )
+    return time.perf_counter() - start
+
+
+df_list = test_dataframes()
+
+############ PANDAS ############
+pandas_timer = _time_constructor(TimeSeries.from_dataframe, df_list)
+
+############ NARWHALS ############
+narwhals_timer = _time_constructor(TimeSeries.from_narwhals_dataframe, df_list)
+
+print("pandas processing time: ", pandas_timer)
+print("narwhals processing time: ", narwhals_timer)