Commit e4fdd0a

0.3.6

2 parents: d20e3bb + 1613c78

40 files changed: +756 −192 lines

TODO.md

Lines changed: 6 additions & 10 deletions
@@ -15,16 +15,12 @@
 * Forecasts are desired for the future immediately following the most recent data.

 # Latest
-* New Transformer ScipyFilter
-* New models UnivariateMotif and MultivariateMotif
-* 'midhinge' and 'weighted_mean' added to AverageValueNaive
-* Added passing regressors to WindowRegression and made window generation more efficient
-* More plotting methods: plot_horizontal_transformers
-* For most -Regression type models, `model_params` is now treated as kwargs and can accept any args for that model
-* ExtraTrees and RadiusRegressor added to -Regression type models
-* Bug fix in generate_score_per_series
-* 'Generation' now tracked in the results table, plus a plotting method for generation loss
-
+* back_forecast for forecasting on the training data
+* Mosaic ensembles can now be used beyond the training forecast_length, and for shorter lengths too
+* best_model_name, best_model_params, and best_model_transformation_params AutoTS attributes now available
+* mean, median, and ffill NaN handling now deal with fully-NaN series by returning 0
+* Fixed a bug that caused mosaic generalization to fail when ffill/bfill handled all missing values
+* STLFilter, HPFilter, and convolution_filter Transformers added

 # Errors:
 DynamicFactor holidays Exceptions 'numpy.ndarray' object has no attribute 'values'

autots/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 from autots.tools.regressor import create_lagged_regressor
 from autots.evaluator.auto_model import model_forecast

-__version__ = '0.3.5'
+__version__ = '0.3.6'

 TransformTS = GeneralTransformer

autots/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
     load_hourly,
     load_weekly,
     load_weekdays,
+    load_zeroes,
 )

 __all__ = [
@@ -19,4 +20,5 @@
     'load_weekly',
     'load_weekdays',
     'load_live_daily',
+    'load_zeroes',
 ]

autots/datasets/_base.py

Lines changed: 18 additions & 0 deletions
@@ -3,6 +3,7 @@
 import datetime
 import io
 import requests
+import numpy as np
 import pandas as pd


@@ -373,3 +374,20 @@ def load_live_daily(
         id_vars=['datetime'], var_name='series_id', value_name='value'
     )
     return df_long
+
+
+def load_zeroes(long=False, shape=None, start_date: str = "2021-01-01"):
+    """Create a dataset of all zeroes, for testing edge cases."""
+    if shape is None:
+        shape = (200, 5)
+    df_wide = pd.DataFrame(
+        np.zeros(shape), index=pd.date_range(start_date, periods=shape[0], freq="D")
+    )
+    if not long:
+        return df_wide
+    else:
+        df_wide.index.name = "datetime"
+        df_long = df_wide.reset_index(drop=False).melt(
+            id_vars=['datetime'], var_name='series_id', value_name='value'
+        )
+        return df_long
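A minimal usage sketch of the new loader (the integer column names are just pandas defaults, since load_zeroes assigns none):

```python
from autots.datasets import load_zeroes

# wide: 200 daily rows x 5 all-zero series on a DatetimeIndex
df_wide = load_zeroes()
print(df_wide.shape)               # (200, 5)
print(float(df_wide.sum().sum()))  # 0.0

# long: melted into datetime / series_id / value rows
df_long = load_zeroes(long=True)
print(df_long.columns.tolist())  # ['datetime', 'series_id', 'value']
print(len(df_long))              # 1000
```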

autots/evaluator/auto_model.py

Lines changed: 127 additions & 0 deletions
@@ -1,6 +1,7 @@
 """Mid-level helper functions for AutoTS."""
 import sys
 import random
+from math import ceil
 import numpy as np
 import pandas as pd
 import datetime
@@ -1630,6 +1631,11 @@ def generate_score(
     # generate minimizing scores, where smaller = better accuracy
     try:
         model_results = model_results.replace([np.inf, -np.inf], np.nan)
+        # not sure why there are negative SMAPE values, but make sure they get dealt with
+        if model_results['smape'].min() < 0:
+            model_results['smape'] = model_results['smape'].where(
+                model_results['smape'] >= 0, model_results['smape'].max()
+            )
         # handle NaN in scores...
         # model_results = model_results.fillna(value=model_results.max(axis=0))
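To make the new guard concrete, here is a tiny sketch (toy numbers, not from the source) of what that `where` clause does: any negative SMAPE is replaced with the worst observed SMAPE, so anomalous rows sink to the bottom of the minimizing score instead of spuriously winning.

```python
import pandas as pd

smape = pd.Series([12.5, -3.0, 40.0, 7.1])
# keep values >= 0; replace the rest with the column maximum
cleaned = smape.where(smape >= 0, smape.max())
print(cleaned.tolist())  # [12.5, 40.0, 40.0, 7.1]
```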

@@ -1738,3 +1744,124 @@ def generate_score_per_series(results_object, metric_weighting, total_validation
     # take the average score across validations
     overall_score = overall_score.groupby(level=0).mean()
     return overall_score
+
+
+def back_forecast(
+    df,
+    model_name,
+    model_param_dict,
+    model_transform_dict,
+    future_regressor_train=None,
+    n_splits: int = "auto",
+    forecast_length=14,
+    frequency="infer",
+    prediction_interval=0.9,
+    no_negatives=False,
+    constraint=None,
+    holiday_country="US",
+    random_seed=123,
+    n_jobs="auto",
+    verbose=0,
+):
+    """Create forecasts for the historical training data, i.e. backcast or back forecast.
+
+    This actually forecasts on historical data; these are not fit model values, as are often returned by other packages.
+    As such, this will be slower, but more representative of real-world model performance.
+    There may be jumps in the data between chunks.
+
+    Args are the same as for model_forecast except...
+        n_splits (int): how many pieces to split the data into. Pass 2 for fastest, or "auto" for best accuracy.
+
+    Returns a standard prediction object (access .forecast, .lower_forecast, .upper_forecast).
+    """
+    max_chunk = int(ceil(df.index.shape[0] / forecast_length))
+    if not str(n_splits).isdigit():
+        n_splits = max_chunk
+    elif n_splits > max_chunk or n_splits < 2:
+        n_splits = max_chunk
+    else:
+        n_splits = int(n_splits)
+
+    chunk_size = df.index.shape[0] / n_splits
+    b_forecast, b_forecast_up, b_forecast_low = (
+        pd.DataFrame(),
+        pd.DataFrame(),
+        pd.DataFrame(),
+    )
+    for n in range(n_splits):
+        int_idx = int(n * chunk_size)
+        int_idx_1 = int((n + 1) * chunk_size)
+        inner_forecast_length = int_idx_1 - int_idx
+        # flip to forecast backwards for the first split
+        if n == 0:
+            df_split = df.iloc[int_idx_1:].copy()
+            df_split = df_split.iloc[::-1]
+            df_split.index = df_split.index[::-1]
+            result_idx = df.iloc[0:int_idx_1].index
+        else:
+            df_split = df.iloc[0:int_idx].copy()
+        # handle appropriate regressors
+        if isinstance(future_regressor_train, pd.DataFrame):
+            if n == 0:
+                split_regr = future_regressor_train.reindex(df_split.index[::-1])
+                split_regr_future = future_regressor_train.reindex(result_idx)
+            else:
+                split_regr = future_regressor_train.reindex(df_split.index)
+                split_regr_future = future_regressor_train.reindex(
+                    df.index[int_idx:int_idx_1]
+                )
+        else:
+            split_regr = []
+            split_regr_future = []
+        try:
+            df_forecast = model_forecast(
+                model_name=model_name,
+                model_param_dict=model_param_dict,
+                model_transform_dict=model_transform_dict,
+                df_train=df_split,
+                forecast_length=inner_forecast_length,
+                frequency=frequency,
+                prediction_interval=prediction_interval,
+                no_negatives=no_negatives,
+                constraint=constraint,
+                future_regressor_train=split_regr,
+                future_regressor_forecast=split_regr_future,
+                holiday_country=holiday_country,
+                random_seed=random_seed,
+                verbose=verbose,
+                n_jobs=n_jobs,
+            )
+            b_forecast = pd.concat([b_forecast, df_forecast.forecast])
+            b_forecast_up = pd.concat([b_forecast_up, df_forecast.upper_forecast])
+            b_forecast_low = pd.concat([b_forecast_low, df_forecast.lower_forecast])
+            # handle index being wrong for the flipped forecast which comes first
+            if n == 0:
+                b_forecast = b_forecast.iloc[::-1]
+                b_forecast_up = b_forecast_up.iloc[::-1]
+                b_forecast_low = b_forecast_low.iloc[::-1]
+                b_forecast.index = result_idx
+                b_forecast_up.index = result_idx
+                b_forecast_low.index = result_idx
+        except Exception as e:
+            print(f"back_forecast split {n} failed with {repr(e)}")
+            b_df = pd.DataFrame(
+                np.nan, index=df.index[int_idx:int_idx_1], columns=df.columns
+            )
+            b_forecast = pd.concat([b_forecast, b_df])
+            b_forecast_up = pd.concat([b_forecast_up, b_df])
+            b_forecast_low = pd.concat([b_forecast_low, b_df])
+
+    df_forecast.forecast = b_forecast
+    df_forecast.upper_forecast = b_forecast_up
+    df_forecast.lower_forecast = b_forecast_low
+    return df_forecast
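A quick numeric sketch of the splitting arithmetic in the function above (toy values, not from the source): with 100 rows and n_splits=4, each chunk covers 25 rows; split 0 trains on the later data reversed and forecasts the start of the series "backwards", while every later split trains on all data before its chunk.

```python
n_rows, n_splits = 100, 4
chunk_size = n_rows / n_splits
for n in range(n_splits):
    int_idx = int(n * chunk_size)
    int_idx_1 = int((n + 1) * chunk_size)
    if n == 0:
        # first chunk has no history before it, so flip and forecast backwards
        print(f"split {n}: train on rows {int_idx_1}-{n_rows - 1} flipped, predict rows 0-{int_idx_1 - 1}")
    else:
        print(f"split {n}: train on rows 0-{int_idx - 1}, predict rows {int_idx}-{int_idx_1 - 1}")
# split 0: train on rows 25-99 flipped, predict rows 0-24
# split 1: train on rows 0-24, predict rows 25-49
# ...
```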
+def remove_leading_zeros(df):
+    """Accepts a wide dataframe; returns the dataframe with zeroes preceding any non-zero value set to NaN."""
+    # keep the last row unaltered to keep metrics happier if all zeroes
+    temp = df.head(df.shape[0] - 1)
+    temp = temp.abs().cumsum(axis=0).replace(0, np.nan)
+    temp = df[~temp.isna()]
+    temp = temp.head(df.shape[0] - 1)
+    return pd.concat([temp, df.tail(1)], axis=0)
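A small worked example of the refactored helper (toy frame; assumes remove_leading_zeros is importable from autots.evaluator.auto_model as added above). Only zeroes before a series' first non-zero value are masked; interior zeroes and the final row are kept.

```python
import pandas as pd
from autots.evaluator.auto_model import remove_leading_zeros

df = pd.DataFrame({"a": [0.0, 0.0, 1.0, 0.0, 2.0], "b": [3.0, 0.0, 0.0, 4.0, 0.0]})
print(remove_leading_zeros(df))
#      a    b
# 0  NaN  3.0   <- leading zeroes of 'a' masked
# 1  NaN  0.0   <- 'b' starts non-zero, so untouched
# 2  1.0  0.0
# 3  0.0  4.0   <- interior zero of 'a' kept
# 4  2.0  0.0   <- last row always kept as-is
```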

autots/evaluator/auto_ts.py

Lines changed: 89 additions & 9 deletions
@@ -24,6 +24,8 @@
     generate_score_per_series,
     model_forecast,
     validation_aggregation,
+    back_forecast,
+    remove_leading_zeros,
 )
 from autots.models.ensemble import (
     EnsembleTemplateGenerator,
@@ -312,6 +314,9 @@ def __init__(
             else ['ID'] + self.template_cols
         )
         self.initial_results = TemplateEvalObject()
+        self.best_model_name = ""
+        self.best_model_params = ""
+        self.best_model_transformation_params = ""

         if verbose > 2:
             print('"Hello. Would you like to destroy some evil today?" - Sanderson')
@@ -322,7 +327,7 @@ def __repr__(self):
             return "Uninitiated AutoTS object"
         else:
            try:
-                return f"Initiated AutoTS object with best model: \n{self.best_model['Model'].iloc[0]}\n{self.best_model['TransformationParameters'].iloc[0]}\n{self.best_model['ModelParameters'].iloc[0]}"
+                return f"Initiated AutoTS object with best model: \n{self.best_model_name}\n{self.best_model_transformation_params}\n{self.best_model_params}"
             except Exception:
                 return "Initiated AutoTS object"
@@ -481,12 +486,7 @@ def fit(

         # replace any zeroes that occur prior to all non-zero values
         if self.remove_leading_zeroes:
-            # keep the last row unaltered to keep metrics happier if all zeroes
-            temp = df_wide_numeric.head(df_wide_numeric.shape[0] - 1)
-            temp = temp.abs().cumsum(axis=0).replace(0, np.nan)
-            temp = df_wide_numeric[~temp.isna()]
-            temp = temp.head(df_wide_numeric.shape[0] - 1)
-            df_wide_numeric = pd.concat([temp, df_wide_numeric.tail(1)], axis=0)
+            df_wide_numeric = remove_leading_zeros(df_wide_numeric)

         # remove other ensembling types if univariate
         if df_wide_numeric.shape[1] == 1:
@@ -1096,6 +1096,12 @@ def fit(
             self.ensemble_check = int((self.best_model['Ensemble'].iloc[0]) > 0)
         except IndexError:
             raise ValueError(error_msg_template)
+        # give a more convenient dict option
+        self.best_model_name = self.best_model['Model'].iloc[0]
+        self.best_model_params = json.loads(self.best_model['ModelParameters'].iloc[0])
+        self.best_model_transformation_params = json.loads(
+            self.best_model['TransformationParameters'].iloc[0]
+        )

         # set flags to check if regressors or ensemble used in final model.
         param_dict = json.loads(self.best_model.iloc[0]['ModelParameters'])
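In user code this means the winning model can be inspected without manual json.loads calls; a brief sketch (the dataset and fit settings here are placeholders, not from this commit):

```python
from autots import AutoTS
from autots.datasets import load_daily  # any long or wide dataset works

model = AutoTS(forecast_length=14, max_generations=2)
model = model.fit(load_daily(long=False))

# new in 0.3.6: plain-Python views of the chosen model
print(model.best_model_name)                   # a string, e.g. "ETS"
print(model.best_model_params)                 # already a dict, no json.loads needed
print(model.best_model_transformation_params)  # dict of transformer settings
```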
@@ -1330,6 +1336,9 @@ def export_template(
             export_template = export_template.nsmallest(n, columns=['Score'])
             if not include_results:
                 export_template = export_template[self.template_cols]
+                export_template = pd.concat(
+                    [self.best_model, export_template]
+                ).drop_duplicates()
         else:
             raise ValueError("`models` must be 'all' or 'best'")
         try:
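Because the best model is now prepended to 'best' exports, a round trip like the following sketch will always carry the winner along (the filename and the import_template method argument are assumptions based on the surrounding AutoTS API, not shown in this diff):

```python
# export the top templates; the chosen best model is now always included
model.export_template("my_template.csv", models="best", n=10)

# start a fresh search seeded only from that template
new_model = AutoTS(forecast_length=14)
new_model = new_model.import_template("my_template.csv", method="only")
```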
@@ -1448,13 +1457,56 @@ def import_results(self, filename):
             self.initial_results = self.initial_results.concat(new_obj)
         return self

+    def back_forecast(
+        self, column=None, n_splits: int = 3, tail: int = None, verbose: int = 0
+    ):
+        """Create forecasts for the historical training data, i.e. backcast or back forecast.
+
+        This actually forecasts on historical data; these are not fit model values, as are often returned by other packages.
+        As such, this will be slower, but more representative of real-world model performance.
+        There may be jumps in the data between chunks.
+
+        Args are the same as for model_forecast except...
+            n_splits (int): how many pieces to split the data into. Pass 2 for fastest, or "auto" for best accuracy.
+            column (str): to run on only one column, pass the column name; faster than the full dataset.
+            tail (int): df.tail() of the dataset; back_forecast is run only on the n most recent observations.
+
+        Returns a standard prediction object (access .forecast, .lower_forecast, .upper_forecast).
+        """
+        if self.best_model.empty:
+            raise ValueError("No best_model. AutoTS .fit() needs to be run.")
+        if column is not None:
+            input_df = pd.DataFrame(self.df_wide_numeric[column])
+        else:
+            input_df = self.df_wide_numeric
+        if tail is not None:
+            input_df = input_df.tail(tail)
+        result = back_forecast(
+            df=input_df,
+            model_name=self.best_model_name,
+            model_param_dict=self.best_model_params,
+            model_transform_dict=self.best_model_transformation_params,
+            future_regressor_train=self.future_regressor_train,
+            n_splits=n_splits,
+            forecast_length=self.forecast_length,
+            frequency=self.frequency,
+            prediction_interval=self.prediction_interval,
+            no_negatives=self.no_negatives,
+            constraint=self.constraint,
+            holiday_country=self.holiday_country,
+            random_seed=self.random_seed,
+            n_jobs=self.n_jobs,
+            verbose=verbose,
+        )
+        return result
+
     def horizontal_to_df(self):
         """helper function for plotting."""
         if self.best_model.empty:
             raise ValueError("No best_model. AutoTS .fit() needs to be run.")
         if self.best_model['Ensemble'].iloc[0] != 2:
             raise ValueError("Only works on horizontal ensemble type models.")
-        ModelParameters = json.loads(self.best_model['ModelParameters'].iloc[0])
+        ModelParameters = self.best_model_params
         series = ModelParameters['series']
         series = pd.DataFrame.from_dict(series, orient="index").reset_index(drop=False)
         if series.shape[1] > 2:
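A minimal usage sketch of the new method (assumes `model` is an already-fit AutoTS instance; the column name is a placeholder):

```python
# forecast over the training history itself, in 3 chunks
prediction = model.back_forecast(n_splits=3)
backcast = prediction.forecast      # point forecasts aligned to the training index
lower = prediction.lower_forecast   # prediction interval bounds
upper = prediction.upper_forecast

# or restrict to one series and the 90 most recent observations for speed
fast = model.back_forecast(column="series_1", tail=90).forecast
```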
@@ -1496,7 +1548,7 @@ def mosaic_to_df(self):
             raise ValueError("No best_model. AutoTS .fit() needs to be run.")
         if self.best_model['Ensemble'].iloc[0] != 2:
             raise ValueError("Only works on horizontal ensemble type models.")
-        ModelParameters = json.loads(self.best_model['ModelParameters'].iloc[0])
+        ModelParameters = self.best_model_params
         if str(ModelParameters['model_name']).lower() != 'mosaic':
             raise ValueError("Only works on mosaic ensembles.")
         series = pd.DataFrame.from_dict(ModelParameters['series'])
@@ -1565,6 +1617,32 @@ def plot_generation_loss(self, **kwargs):
             ylabel="Lowest Score", **kwargs
         )

+    def plot_backforecast(
+        self, series=None, n_splits: int = 3, start_date=None, **kwargs
+    ):
+        """Plot the historical data and the fit forecast on that history.
+
+        Args:
+            series (str or list): column names of time series
+            n_splits (int or str): "auto", or a number > 2; higher is more accurate but slower
+            **kwargs passed to pd.DataFrame.plot()
+        """
+        if series is None:
+            series = random.choice(self.df_wide_numeric.columns)
+        b_df = self.back_forecast(column=series, n_splits=n_splits, verbose=0).forecast
+        b_df = b_df.rename(columns=lambda x: str(x) + "_forecast")
+        plot_df = pd.concat(
+            [
+                pd.DataFrame(self.df_wide_numeric[series]),
+                b_df,
+            ],
+            axis=1,
+        )
+        if start_date is not None:
+            plot_df = plot_df[plot_df.index >= start_date]
+        plot_df = remove_leading_zeros(plot_df)
+        plot_df.plot(**kwargs)
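And a short plotting sketch (again assuming an already-fit AutoTS instance `model`; the series name and date are placeholders):

```python
import matplotlib.pyplot as plt

model.plot_backforecast(
    series="series_1", n_splits=3, start_date="2021-01-01", figsize=(10, 4)
)
plt.show()
```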

 colors_list = [
@@ -1607,6 +1685,8 @@
     '#EE82EE',
     '#00008B',
     '#4B0082',
+    '#0403A7',
+    "#000000",
 ]