Skip to content

Commit 24cec52

Browse files
authierj, “authierj”, dennisbader, FBruzzesi
authored
Narwhals implementation of from_dataframe and performance benchmark (#2661)
* narwhals implementation for and test benchmark
* changes from MarcoGorelli incorporated
* improvement thanks to reviewers
* added comments about slow and fast parts of the code
* using pandas index to avoid .to_list()
* bug fix added
* updated test script
* narwhals timeseries added
* from_series changed, names changed
* changelog updated
* small improvement
* clean test scripts added
* BUGFIX added for non_pandas df
* tests added for polars df
* polars and narwhals added to dependencies. Ideally, polars should be an optional dependency.
* refactoring pd_series and pd_dataframe
* removed test scripts from git repo
* Update CHANGELOG.md Co-authored-by: Dennis Bader <[email protected]>
* Update darts/timeseries.py Co-authored-by: Dennis Bader <[email protected]>
* easy corrections applied
* narwhals_test_time removed
* Update requirements/core.txt Co-authored-by: Dennis Bader <[email protected]>
* Update darts/timeseries.py Co-authored-by: Francesco Bruzzesi <[email protected]>
* most corrections added
* polars tests removed
* tests corrected
* Update darts/timeseries.py Co-authored-by: Dennis Bader <[email protected]>
* Update darts/timeseries.py Co-authored-by: Dennis Bader <[email protected]>
* no time_col, define one

---------

Co-authored-by: “authierj” <“[email protected]”>
Co-authored-by: Dennis Bader <[email protected]>
Co-authored-by: Francesco Bruzzesi <[email protected]>
1 parent e086582 commit 24cec52

File tree

4 files changed

+188
-119
lines changed

4 files changed

+188
-119
lines changed

CHANGELOG.md

+1
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
1111

1212
**Improved**
1313

14+
- `TimeSeries.from_dataframe()` and `from_series()` now support creating `TimeSeries` from additional backends (Polars, PyArrow, ...). We leverage `narwhals` as the compatibility layer between dataframe libraries. See the `narwhals` [documentation](https://narwhals-dev.github.io/narwhals/) for all supported backends. [#2661](https://github.com/unit8co/darts/pull/2661) by [Jules Authier](https://github.com/authierj)
1415
- Added ONNX support for torch-based models with method `TorchForecastingModel.to_onnx()`. Check out [this example](https://unit8co.github.io/darts/userguide/gpu_and_tpu_usage.html#exporting-model-to-onnx-format-for-inference) from the user guide on how to export and load a model for inference. [#2620](https://github.com/unit8co/darts/pull/2620) by [Antoine Madrona](https://github.com/madtoinou)
1516
- Made method `ForecastingModel.untrained_model()` public. Use this method to get a new (untrained) model instance created with the same parameters. [#2684](https://github.com/unit8co/darts/pull/2684) by [Timon Erhart](https://github.com/turbotimon)
1617
- `TimeSeries.plot()` now supports setting the color for each component in the series. Simply pass a list / sequence of colors with length matching the number of components as parameters "c" or "colors". [#2680](https://github.com/unit8co/darts/pull/2680) by [Jules Authier](https://github.com/authierj)

darts/tests/test_timeseries.py

+90 -32
Original file line number | Diff line number | Diff line change
@@ -2506,7 +2506,16 @@ def test_tail_numeric_time_index(self):
25062506

25072507

25082508
class TestTimeSeriesFromDataFrame:
2509-
def test_from_dataframe_sunny_day(self):
2509+
def pd_to_backend(self, df, backend, index=False):
2510+
if backend == "pandas":
2511+
return df
2512+
# elif backend == "polars":
2513+
# if index:
2514+
# return pl.from_pandas(df.reset_index())
2515+
# return pl.from_pandas(df)
2516+
2517+
@pytest.mark.parametrize("backend", ["pandas"])
2518+
def test_from_dataframe_sunny_day(self, backend):
25102519
data_dict = {"Time": pd.date_range(start="20180501", end="20200301", freq="MS")}
25112520
data_dict["Values1"] = np.random.uniform(
25122521
low=-10, high=10, size=len(data_dict["Time"])
@@ -2520,58 +2529,78 @@ def test_from_dataframe_sunny_day(self):
25202529
data_pd2["Time"] = data_pd2["Time"].apply(lambda date: str(date))
25212530
data_pd3 = data_pd1.set_index("Time")
25222531

2523-
data_darts1 = TimeSeries.from_dataframe(df=data_pd1, time_col="Time")
2524-
data_darts2 = TimeSeries.from_dataframe(df=data_pd2, time_col="Time")
2525-
data_darts3 = TimeSeries.from_dataframe(df=data_pd3)
2532+
data_darts1 = TimeSeries.from_dataframe(
2533+
df=self.pd_to_backend(data_pd1, backend), time_col="Time"
2534+
)
2535+
data_darts2 = TimeSeries.from_dataframe(
2536+
df=self.pd_to_backend(data_pd2, backend), time_col="Time"
2537+
)
2538+
data_darts3 = TimeSeries.from_dataframe(
2539+
df=self.pd_to_backend(data_pd3, backend, index=True),
2540+
time_col=None if backend == "pandas" else "Time",
2541+
)
25262542

25272543
assert data_darts1 == data_darts2
25282544
assert data_darts1 == data_darts3
25292545

2530-
def test_time_col_convert_string_integers(self):
2546+
@pytest.mark.parametrize("backend", ["pandas"])
2547+
def test_time_col_convert_string_integers(self, backend):
25312548
expected = np.array(list(range(3, 10)))
25322549
data_dict = {"Time": expected.astype(str)}
25332550
data_dict["Values1"] = np.random.uniform(
25342551
low=-10, high=10, size=len(data_dict["Time"])
25352552
)
25362553
df = pd.DataFrame(data_dict)
2537-
ts = TimeSeries.from_dataframe(df=df, time_col="Time")
2554+
ts = TimeSeries.from_dataframe(
2555+
df=self.pd_to_backend(df, backend), time_col="Time"
2556+
)
25382557

25392558
assert set(ts.time_index.values.tolist()) == set(expected)
25402559
assert ts.time_index.dtype == int
25412560
assert ts.time_index.name == "Time"
25422561

2543-
def test_time_col_convert_integers(self):
2562+
@pytest.mark.parametrize("backend", ["pandas"])
2563+
def test_time_col_convert_integers(self, backend):
25442564
expected = np.array(list(range(10)))
25452565
data_dict = {"Time": expected}
25462566
data_dict["Values1"] = np.random.uniform(
25472567
low=-10, high=10, size=len(data_dict["Time"])
25482568
)
2569+
25492570
df = pd.DataFrame(data_dict)
2550-
ts = TimeSeries.from_dataframe(df=df, time_col="Time")
2571+
ts = TimeSeries.from_dataframe(
2572+
df=self.pd_to_backend(df, backend), time_col="Time"
2573+
)
25512574

25522575
assert set(ts.time_index.values.tolist()) == set(expected)
25532576
assert ts.time_index.dtype == int
25542577
assert ts.time_index.name == "Time"
25552578

2556-
def test_fail_with_bad_integer_time_col(self):
2579+
@pytest.mark.parametrize("backend", ["pandas"])
2580+
def test_fail_with_bad_integer_time_col(self, backend):
25572581
bad_time_col_vals = np.array([4, 0, 1, 2])
25582582
data_dict = {"Time": bad_time_col_vals}
25592583
data_dict["Values1"] = np.random.uniform(
25602584
low=-10, high=10, size=len(data_dict["Time"])
25612585
)
25622586
df = pd.DataFrame(data_dict)
25632587
with pytest.raises(ValueError):
2564-
TimeSeries.from_dataframe(df=df, time_col="Time")
2588+
TimeSeries.from_dataframe(
2589+
df=self.pd_to_backend(df, backend), time_col="Time"
2590+
)
25652591

2566-
def test_time_col_convert_rangeindex(self):
2592+
@pytest.mark.parametrize("backend", ["pandas"])
2593+
def test_time_col_convert_rangeindex(self, backend):
25672594
for expected_l, step in zip([[4, 0, 2, 3, 1], [8, 0, 4, 6, 2]], [1, 2]):
25682595
expected = np.array(expected_l)
25692596
data_dict = {"Time": expected}
25702597
data_dict["Values1"] = np.random.uniform(
25712598
low=-10, high=10, size=len(data_dict["Time"])
25722599
)
25732600
df = pd.DataFrame(data_dict)
2574-
ts = TimeSeries.from_dataframe(df=df, time_col="Time")
2601+
ts = TimeSeries.from_dataframe(
2602+
df=self.pd_to_backend(df, backend), time_col="Time"
2603+
)
25752604

25762605
# check type (should convert to RangeIndex):
25772606
assert type(ts.time_index) is pd.RangeIndex
@@ -2586,31 +2615,38 @@ def test_time_col_convert_rangeindex(self):
25862615
]
25872616
assert np.all(ar1 == ar2)
25882617

2589-
def test_time_col_convert_datetime(self):
2618+
@pytest.mark.parametrize("backend", ["pandas"])
2619+
def test_time_col_convert_datetime(self, backend):
25902620
expected = pd.date_range(start="20180501", end="20200301", freq="MS")
25912621
data_dict = {"Time": expected}
25922622
data_dict["Values1"] = np.random.uniform(
25932623
low=-10, high=10, size=len(data_dict["Time"])
25942624
)
25952625
df = pd.DataFrame(data_dict)
2596-
ts = TimeSeries.from_dataframe(df=df, time_col="Time")
2626+
ts = TimeSeries.from_dataframe(
2627+
df=self.pd_to_backend(df, backend), time_col="Time"
2628+
)
25972629

25982630
assert ts.time_index.dtype == "datetime64[ns]"
25992631
assert ts.time_index.name == "Time"
26002632

2601-
def test_time_col_convert_datetime_strings(self):
2633+
@pytest.mark.parametrize("backend", ["pandas"])
2634+
def test_time_col_convert_datetime_strings(self, backend):
26022635
expected = pd.date_range(start="20180501", end="20200301", freq="MS")
26032636
data_dict = {"Time": expected.values.astype(str)}
26042637
data_dict["Values1"] = np.random.uniform(
26052638
low=-10, high=10, size=len(data_dict["Time"])
26062639
)
26072640
df = pd.DataFrame(data_dict)
2608-
ts = TimeSeries.from_dataframe(df=df, time_col="Time")
2641+
ts = TimeSeries.from_dataframe(
2642+
df=self.pd_to_backend(df, backend), time_col="Time"
2643+
)
26092644

26102645
assert ts.time_index.dtype == "datetime64[ns]"
26112646
assert ts.time_index.name == "Time"
26122647

2613-
def test_time_col_with_tz(self):
2648+
@pytest.mark.parametrize("backend", ["pandas"])
2649+
def test_time_col_with_tz_df(self, backend):
26142650
# numpy and xarray don't support "timezone aware" pd.DatetimeIndex
26152651
# the BUGFIX removes timezone information without conversion
26162652

@@ -2621,13 +2657,10 @@ def test_time_col_with_tz(self):
26212657
# pd.DataFrame loses the tz information unless it is contained in its index
26222658
# (other columns are silently converted to UTC, with tz attribute set to None)
26232659
df = pd.DataFrame(data=values, index=time_range_MS)
2624-
ts = TimeSeries.from_dataframe(df=df)
2625-
assert list(ts.time_index) == list(time_range_MS.tz_localize(None))
2626-
assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS)
2627-
assert ts.time_index.tz is None
2628-
2629-
serie = pd.Series(data=values, index=time_range_MS)
2630-
ts = TimeSeries.from_series(pd_series=serie)
2660+
ts = TimeSeries.from_dataframe(
2661+
df=self.pd_to_backend(df, backend, index=True),
2662+
time_col=None if backend == "pandas" else "index",
2663+
)
26312664
assert list(ts.time_index) == list(time_range_MS.tz_localize(None))
26322665
assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS)
26332666
assert ts.time_index.tz is None
@@ -2643,23 +2676,42 @@ def test_time_col_with_tz(self):
26432676
values = np.random.uniform(low=-10, high=10, size=len(time_range_H))
26442677

26452678
df = pd.DataFrame(data=values, index=time_range_H)
2646-
ts = TimeSeries.from_dataframe(df=df)
2679+
ts = TimeSeries.from_dataframe(
2680+
df=self.pd_to_backend(df, backend, index=True),
2681+
time_col=None if backend == "pandas" else "index",
2682+
)
26472683
assert list(ts.time_index) == list(time_range_H.tz_localize(None))
26482684
assert list(ts.time_index.tz_localize("CET")) == list(time_range_H)
26492685
assert ts.time_index.tz is None
26502686

2651-
series = pd.Series(data=values, index=time_range_H)
2652-
ts = TimeSeries.from_series(pd_series=series)
2687+
ts = TimeSeries.from_times_and_values(times=time_range_H, values=values)
26532688
assert list(ts.time_index) == list(time_range_H.tz_localize(None))
26542689
assert list(ts.time_index.tz_localize("CET")) == list(time_range_H)
26552690
assert ts.time_index.tz is None
26562691

2657-
ts = TimeSeries.from_times_and_values(times=time_range_H, values=values)
2692+
def test_time_col_with_tz_series(self):
2693+
time_range_MS = pd.date_range(
2694+
start="20180501", end="20200301", freq="MS", tz="CET"
2695+
)
2696+
values = np.random.uniform(low=-10, high=10, size=len(time_range_MS))
2697+
serie = pd.Series(data=values, index=time_range_MS)
2698+
ts = TimeSeries.from_series(pd_series=serie)
2699+
assert list(ts.time_index) == list(time_range_MS.tz_localize(None))
2700+
assert list(ts.time_index.tz_localize("CET")) == list(time_range_MS)
2701+
assert ts.time_index.tz is None
2702+
2703+
time_range_H = pd.date_range(
2704+
start="20200518", end="20200521", freq=freqs["h"], tz="CET"
2705+
)
2706+
values = np.random.uniform(low=-10, high=10, size=len(time_range_H))
2707+
series = pd.Series(data=values, index=time_range_H)
2708+
ts = TimeSeries.from_series(pd_series=series)
26582709
assert list(ts.time_index) == list(time_range_H.tz_localize(None))
26592710
assert list(ts.time_index.tz_localize("CET")) == list(time_range_H)
26602711
assert ts.time_index.tz is None
26612712

2662-
def test_time_col_convert_garbage(self):
2713+
@pytest.mark.parametrize("backend", ["pandas"])
2714+
def test_time_col_convert_garbage(self, backend):
26632715
expected = [
26642716
"2312312asdfdw",
26652717
"asdfsdf432sdf",
@@ -2674,9 +2726,12 @@ def test_time_col_convert_garbage(self):
26742726
df = pd.DataFrame(data_dict)
26752727

26762728
with pytest.raises(AttributeError):
2677-
TimeSeries.from_dataframe(df=df, time_col="Time")
2729+
TimeSeries.from_dataframe(
2730+
df=self.pd_to_backend(df, backend), time_col="Time"
2731+
)
26782732

2679-
def test_df_named_columns_index(self):
2733+
@pytest.mark.parametrize("backend", ["pandas"])
2734+
def test_df_named_columns_index(self, backend):
26802735
time_index = generate_index(
26812736
start=pd.Timestamp("2000-01-01"), length=4, freq="D", name="index"
26822737
)
@@ -2686,7 +2741,10 @@ def test_df_named_columns_index(self):
26862741
columns=["y"],
26872742
)
26882743
df.columns.name = "id"
2689-
ts = TimeSeries.from_dataframe(df)
2744+
ts = TimeSeries.from_dataframe(
2745+
df=self.pd_to_backend(df, backend, index=True),
2746+
time_col=None if backend == "pandas" else "index",
2747+
)
26902748

26912749
exp_ts = TimeSeries.from_times_and_values(
26922750
times=time_index,

0 commit comments

Comments
 (0)