Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Narwhals implementation of from_dataframe and performance benchmark #2661

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
28a9298
narwhals implementation for and test benchmark
Jan 31, 2025
6382082
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Jan 31, 2025
0041203
changes from MarcoGorelli incorporated
Feb 4, 2025
576e88e
improvement thanks to reviewers
Feb 6, 2025
e013a42
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Feb 6, 2025
dbe2cd9
added comments about slow and fast parts of the code
authierj Feb 7, 2025
b2ffc67
using pandas index to avoid .to_list()
authierj Feb 10, 2025
c5fa503
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Feb 10, 2025
79312c9
bug fix added
authierj Feb 10, 2025
fc8bda4
Merge branch 'feature/add_timeseries_from_polars' of https://github.c…
authierj Feb 10, 2025
b08a74f
updated test script
authierj Feb 11, 2025
2425fbe
narwhals timeseries added
authierj Feb 12, 2025
36300f2
from_series changed, names changed
authierj Feb 14, 2025
ba01df1
changelog updated
authierj Feb 14, 2025
ffd1202
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Feb 14, 2025
2e39269
small improvement
authierj Feb 17, 2025
1a9a266
clean test scripts added
authierj Feb 17, 2025
a030ea5
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Feb 17, 2025
2c24a39
BUGFIX added for non_pandas df
authierj Feb 19, 2025
89f23fb
tests added for polars df
authierj Feb 19, 2025
de0a32d
polars and narwhals added to dependencies. Ideally, polars should be …
authierj Feb 19, 2025
66b770d
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Feb 20, 2025
16bac00
refactoring pd_series and pd_dataframe
authierj Feb 20, 2025
0950910
removed test scripts from git repo
authierj Feb 21, 2025
042f9fb
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Feb 21, 2025
5afc721
Update CHANGELOG.md
authierj Feb 21, 2025
7877dd6
Update darts/timeseries.py
authierj Feb 21, 2025
102a26c
easy corrections applied
authierj Feb 21, 2025
9d66c06
Merge branch 'feature/add_timeseries_from_polars' of https://github.c…
authierj Feb 21, 2025
f629089
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Feb 21, 2025
56a20c1
narwhals_test_time removed
authierj Feb 27, 2025
f764e19
Update requirements/core.txt
authierj Feb 27, 2025
319a48f
Update darts/timeseries.py
authierj Feb 27, 2025
e8925f1
most corrections added
authierj Feb 27, 2025
05a7215
merged
authierj Feb 27, 2025
11d17c1
polars tests removed
authierj Feb 27, 2025
a720bb4
Merge branch 'master' into feature/add_timeseries_from_polars
authierj Feb 27, 2025
f9f5aa8
tests corrected
authierj Feb 27, 2025
e0b4984
Merge branch 'master' into feature/add_timeseries_from_polars
dennisbader Feb 28, 2025
c13cc1d
Update darts/timeseries.py
authierj Feb 28, 2025
370d761
Update darts/timeseries.py
authierj Feb 28, 2025
3fa924f
no time_col, define one
authierj Feb 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions darts/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,11 @@

import matplotlib.axes
import matplotlib.pyplot as plt
import narwhals as nw
import numpy as np
import pandas as pd
import xarray as xr
from narwhals.typing import DataFrameT
from pandas.tseries.frequencies import to_offset
from scipy.stats import kurtosis, skew

Expand Down Expand Up @@ -746,6 +748,177 @@ def from_dataframe(
fillna_value=fillna_value,
)

@classmethod
def from_narwhals_dataframe(
    cls,
    df: DataFrameT,
    time_col: Optional[str] = None,
    value_cols: Optional[Union[list[str], str]] = None,
    fill_missing_dates: Optional[bool] = False,
    freq: Optional[Union[str, int]] = None,
    fillna_value: Optional[float] = None,
    static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None,
    hierarchy: Optional[dict] = None,
) -> Self:
    """
    Build a deterministic TimeSeries instance built from a selection of columns of a DataFrame.
    One column (or the DataFrame index) has to represent the time,
    and a list of columns `value_cols` has to represent the values for this time series.

    The DataFrame is wrapped with ``narwhals.from_native()``, so any backend accepted by that function
    can be passed.

    Parameters
    ----------
    df
        The DataFrame
    time_col
        The time column name. If set, the column must contain integers, datetimes, or strings that can be
        converted to integers or datetimes. Integer values must be unique.
        If not set (``None`` or an empty string), the DataFrame index will be used. In this case the DataFrame
        must expose an index that is either a pandas DatetimeIndex, a pandas RangeIndex, or a pandas Index that
        can be converted to a RangeIndex. It is better if the index has no holes; alternatively setting
        `fill_missing_dates` can in some cases solve these issues (filling holes with NaN, or with the provided
        `fillna_value` numeric value, if any).
    value_cols
        A string or list of strings representing the value column(s) to be extracted from the DataFrame. If set to
        `None`, the whole DataFrame will be used (without `time_col`, if given).
    fill_missing_dates
        Optionally, a boolean value indicating whether to fill missing dates (or indices in case of integer index)
        with NaN values. This requires either a provided `freq` or the possibility to infer the frequency from the
        provided timestamps. See :meth:`_fill_missing_dates() <TimeSeries._fill_missing_dates>` for more info.
    freq
        Optionally, a string or integer representing the frequency of the underlying index. This is useful in order
        to fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`.
        If a string, represents the frequency of the pandas DatetimeIndex (see `offset aliases
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_ for more info on
        supported frequencies).
        If an integer, represents the step size of the pandas Index or pandas RangeIndex.
    fillna_value
        Optionally, a numeric value to fill missing values (NaNs) with.
    static_covariates
        Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas
        DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied'
        to all components of the TimeSeries. If a DataFrame, the columns represent the static variables and the
        rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates
        are globally 'applied' to all components of the TimeSeries. If a multi-row DataFrame, the number of
        rows must match the number of components of the TimeSeries (in this case, the number of columns in
        ``value_cols``). This adds control for component-specific static covariates.
    hierarchy
        Optionally, a dictionary describing the grouping(s) of the time series. The keys are component names, and
        for a given component name `c`, the value is a list of component names that `c` "belongs" to. For instance,
        if there is a `total` component, split both in two divisions `d1` and `d2` and in two regions `r1` and `r2`,
        and four products `d1r1` (in division `d1` and region `r1`), `d2r1`, `d1r2` and `d2r2`, the hierarchy would
        be encoded as follows.

        .. highlight:: python
        .. code-block:: python

            hierarchy={
                "d1r1": ["d1", "r1"],
                "d1r2": ["d1", "r2"],
                "d2r1": ["d2", "r1"],
                "d2r2": ["d2", "r2"],
                "d1": ["total"],
                "d2": ["total"],
                "r1": ["total"],
                "r2": ["total"]
            }
        ..
        The hierarchy can be used to reconcile forecasts (so that the sums of the forecasts at
        different levels are consistent), see `hierarchical reconciliation
        <https://unit8co.github.io/darts/generated_api/darts.dataprocessing.transformers.reconciliation.html>`_.

    Returns
    -------
    TimeSeries
        A univariate or multivariate deterministic TimeSeries constructed from the inputs.

    Raises
    ------
    AttributeError
        Passed to ``raise_log`` when `time_col` is not a column of `df`, when its dtype is not supported, or
        when it holds strings that cannot be parsed as timestamps.
    ValueError
        Passed to ``raise_log`` when an integer `time_col` contains duplicates, or when no `time_col` is given
        and the DataFrame exposes no index.
    """
    df = nw.from_native(df)

    # `None` and the empty string both mean "use the DataFrame index"; the same test is used
    # everywhere below so that values, time index and dimension name always agree.
    use_time_col = bool(time_col)

    # Validate before touching the column, so that a missing column is always reported with
    # this message rather than with a backend-specific error from `drop()`.
    if use_time_col and time_col not in df.columns:
        raise_log(AttributeError(f"time_col='{time_col}' is not present."))

    # get values
    if value_cols is None:
        series_df = df.drop(time_col) if use_time_col else df
    else:
        if isinstance(value_cols, str):
            value_cols = [value_cols]
        series_df = df[value_cols]

    # get time index
    if use_time_col:
        time_col_vals = df[time_col]

        if time_col_vals.dtype == nw.String:
            # Try to convert to integers if needed. A failure is the expected outcome for
            # timestamp strings (handled further down); the exception type depends on the
            # backend, hence the broad `except`.
            try:
                time_col_vals = time_col_vals.cast(nw.Int64)
            except Exception:
                pass

        time_dtype = time_col_vals.dtype
        if time_dtype.is_integer():
            # We have to check all integers appear only once to have a valid index
            if time_col_vals.is_duplicated().any():
                raise_log(
                    ValueError(
                        "The provided integer time index column contains duplicate values."
                    )
                )

            # Temporarily use an integer Index to sort the values, and replace by a
            # RangeIndex in `TimeSeries.from_xarray()`
            time_index = time_col_vals.to_list()

        elif time_dtype == nw.String:
            # The integer conversion failed; try datetimes. `nw.Datetime` is a dtype, not a
            # converter, so the parsing has to go through the string namespace.
            try:
                time_index = time_col_vals.str.to_datetime().to_list()
            except Exception:
                raise_log(
                    AttributeError(
                        "'time_col' is of 'Utf8' dtype but doesn't contain valid timestamps"
                    )
                )
        elif time_dtype == nw.Datetime:
            time_index = time_col_vals.to_list()
        else:
            raise_log(
                AttributeError(
                    "Invalid type of `time_col`: it needs to be of either 'Utf8', 'Datetime' or 'Int64' dtype."
                )
            )
    else:
        time_col_vals = nw.maybe_get_index(df)
        if time_col_vals is None:
            raise_log(ValueError("No time column or index found in the DataFrame."))
        # if we are here, the dataframe was pandas
        raise_if_not(
            isinstance(time_col_vals, VALID_INDEX_TYPES)
            or np.issubdtype(time_col_vals.dtype, np.integer),
            "If time_col is not specified, the DataFrame must be indexed either with "
            "a DatetimeIndex, a RangeIndex, or an integer Index that can be converted into a RangeIndex",
            logger,
        )
        time_index = time_col_vals.to_list()

    # The time dimension is named after `time_col` when one is given, else after the default.
    time_dim = time_col if use_time_col else DIMS[0]

    xa = xr.DataArray(
        # add a trailing axis of length one to the 2-D values array
        series_df.to_numpy()[:, :, np.newaxis],
        dims=(time_dim,) + DIMS[-2:],
        coords={
            time_dim: time_index,
            DIMS[1]: series_df.columns,
        },
        attrs={STATIC_COV_TAG: static_covariates, HIERARCHY_TAG: hierarchy},
    )

    return cls.from_xarray(
        xa=xa,
        fill_missing_dates=fill_missing_dates,
        freq=freq,
        fillna_value=fillna_value,
    )

@classmethod
def from_group_dataframe(
cls,
Expand Down
123 changes: 123 additions & 0 deletions narwhals_test_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import time
import warnings
from itertools import product

import numpy as np
import pandas as pd

from darts.timeseries import TimeSeries

# Suppress all warnings: with no category or module given, this ignores every warning
# raised in the process from this point on (presumably to keep the timing output clean).
warnings.filterwarnings("ignore")


def create_random_dataframes(
    num_rows: int = 10,
    num_columns: int = 3,
    index: bool = True,
    start_date: str = "2023-01-01",
    freq: str = "D",
) -> list[list]:
    """
    Create three pandas DataFrames holding the same random data but different time axes.

    The time axis is a pandas date range, the same dates as a NumPy ``datetime64[D]`` array,
    and the integers ``1..num_rows``. It is stored either as the index or as a column named
    'date'.

    Parameters:
    - num_rows (int): The number of rows in the DataFrames.
    - num_columns (int): The number of value columns ('col_0', 'col_1', ...) in the DataFrames.
    - index (bool): If True, the time axis is the index of the DataFrame. If False, it is a column named 'date'.
    - start_date (str): The start date for the date range.
    - freq (str): The frequency of the date range.

    Returns:
    - list: Three ``[dataframe, col_names, time_col]`` entries, in the order date, numpy, integer.
      ``col_names`` is the array of value column names and ``time_col`` is ``None`` when the
      time axis is the index, else 'date'.
    """
    # Set a random seed for reproducibility (note: this reseeds NumPy's global generator)
    np.random.seed(42)

    # Build the three flavours of time axis from a single date range
    date_values = pd.date_range(start=start_date, periods=num_rows, freq=freq)
    integer_values = list(range(1, num_rows + 1))
    numpy_values = np.array(date_values, dtype="datetime64[D]")

    # Create random data for the DataFrames; all three frames share the same values
    data = {f"col_{i}": np.random.randn(num_rows) for i in range(num_columns)}

    df_date = pd.DataFrame(data)
    df_numpy = pd.DataFrame(data)
    df_integer = pd.DataFrame(data)

    # Captured before a 'date' column is possibly added, so it only lists value columns
    col_names = df_date.columns.values

    # Attach the time axis as index or as a column, and remember which one was used
    if index:
        df_date.index = date_values
        df_numpy.index = numpy_values
        df_integer.index = integer_values
        time_col = None
    else:
        df_date["date"] = date_values
        df_numpy["date"] = numpy_values
        df_integer["date"] = integer_values
        time_col = "date"

    return [
        [df_date, col_names, time_col],
        [df_numpy, col_names, time_col],
        [df_integer, col_names, time_col],
    ]


def test_dataframes() -> list:
    """
    Build the benchmark inputs for every combination of row count, column count and
    index/column placement of the time axis.

    Returns:
    - list: One result of `create_random_dataframes()` per combination, in the order
      produced by `itertools.product` over (rows, columns, index).
    """
    row_counts = [10, 100, 1000, 10000]
    column_counts = [10, 100, 500, 1000]
    index_modes = [True, False]

    generated = []
    for n_rows, n_cols, as_index in product(row_counts, column_counts, index_modes):
        generated.append(
            create_random_dataframes(
                num_rows=n_rows, num_columns=n_cols, index=as_index
            )
        )
    return generated


def _time_constructor(constructor, configs) -> float:
    """
    Return the seconds spent building a series from every DataFrame in `configs`.

    Each DataFrame is converted twice with `constructor`: once as given and once after its
    rows were shuffled with `sample(frac=1)`. The shuffling is part of the measured time.
    `time.perf_counter()` is used because it is monotonic, unlike the wall clock.
    """
    start = time.perf_counter()
    for df_config in configs:
        for df, col_names, time_col in df_config:
            _ = constructor(df, value_cols=col_names, time_col=time_col, freq=None)
            df_shuffle = df.sample(frac=1)
            _ = constructor(
                df_shuffle, value_cols=col_names, time_col=time_col, freq=None
            )
    return time.perf_counter() - start


df_list = test_dataframes()

############ PANDAS ############
pandas_timer = _time_constructor(TimeSeries.from_dataframe, df_list)

############ NARWHALS ############
narwhals_timer = _time_constructor(TimeSeries.from_narwhals_dataframe, df_list)

print("pandas processing time: ", pandas_timer)
print("narwhals processing time: ", narwhals_timer)
Loading