Commit e4fdd0a

0.3.6

2 parents: d20e3bb + 1613c78

40 files changed: +756 −192 lines

TODO.md

Lines changed: 6 additions & 10 deletions
@@ -15,16 +15,12 @@
 * Forecasts are desired for the future immediately following the most recent data.

 # Latest
-* New Transformer ScipyFilter
-* New models UnivariateMotif and MultivariateMotif
-* 'midhinge' and 'weighted_mean' added to AverageValueNaive
-* Added passing regressors to WindowRegression and made window generation more efficient
-* More plotting methods: plot_horizontal_transformers
-* For most -Regression type models, `model_params` is now treated as kwargs and can accept any args for that model
-* ExtraTrees and RadiusRegressor added to -Regression type models
-* Bug fix in generate_score_per_series
-* 'Generation' now tracked in the results table, plus a plotting method for generation loss
-
+* back_forecast for forecasting on the training data
+* Mosaic ensembles can now be used beyond the training forecast_length, and for shorter lengths too
+* best_model_name, best_model_params, and best_model_transformation_params AutoTS attributes now available
+* mean, median, and ffill NaN handling now deal with fully-NaN series by returning 0
+* Fixed a bug that caused mosaic generalization to fail when ffill/bfill handled all missing values
+* STLFilter, HPFilter, and convolution_filter Transformers added

 # Errors:
 DynamicFactor holidays Exceptions 'numpy.ndarray' object has no attribute 'values'

autots/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 from autots.tools.regressor import create_lagged_regressor
 from autots.evaluator.auto_model import model_forecast

-__version__ = '0.3.5'
+__version__ = '0.3.6'

 TransformTS = GeneralTransformer

autots/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
     load_hourly,
     load_weekly,
     load_weekdays,
+    load_zeroes,
 )

 __all__ = [
@@ -19,4 +20,5 @@
     'load_weekly',
     'load_weekdays',
     'load_live_daily',
+    'load_zeroes',
 ]

autots/datasets/_base.py

Lines changed: 18 additions & 0 deletions
@@ -3,6 +3,7 @@
 import datetime
 import io
 import requests
+import numpy as np
 import pandas as pd


@@ -373,3 +374,20 @@ def load_live_daily(
         id_vars=['datetime'], var_name='series_id', value_name='value'
     )
     return df_long
+
+
+def load_zeroes(long=False, shape=None, start_date: str = "2021-01-01"):
+    """Create a dataset of all zeroes, for testing edge cases."""
+    if shape is None:
+        shape = (200, 5)
+    df_wide = pd.DataFrame(
+        np.zeros(shape), index=pd.date_range(start_date, periods=shape[0], freq="D")
+    )
+    if not long:
+        return df_wide
+    else:
+        df_wide.index.name = "datetime"
+        df_long = df_wide.reset_index(drop=False).melt(
+            id_vars=['datetime'], var_name='series_id', value_name='value'
+        )
+        return df_long
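A minimal usage sketch of the new loader (the integer column names are just pandas defaults, since load_zeroes assigns none):

```python
from autots.datasets import load_zeroes

# wide: 200 daily rows x 5 all-zero series on a DatetimeIndex
df_wide = load_zeroes()
print(df_wide.shape)               # (200, 5)
print(float(df_wide.sum().sum()))  # 0.0

# long: melted into datetime / series_id / value rows
df_long = load_zeroes(long=True)
print(df_long.columns.tolist())  # ['datetime', 'series_id', 'value']
print(len(df_long))              # 1000
```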

autots/evaluator/auto_model.py

Lines changed: 127 additions & 0 deletions
@@ -1,6 +1,7 @@
 """Mid-level helper functions for AutoTS."""
 import sys
 import random
+from math import ceil
 import numpy as np
 import pandas as pd
 import datetime
@@ -1630,6 +1631,11 @@ def generate_score(
     # generate minimizing scores, where smaller = better accuracy
     try:
         model_results = model_results.replace([np.inf, -np.inf], np.nan)
+        # not sure why there are negative SMAPE values, but make sure they get dealt with
+        if model_results['smape'].min() < 0:
+            model_results['smape'] = model_results['smape'].where(
+                model_results['smape'] >= 0, model_results['smape'].max()
+            )
         # handle NaN in scores...
         # model_results = model_results.fillna(value=model_results.max(axis=0))
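To make the new guard concrete, here is a tiny sketch (toy numbers, not from the source) of what that `where` clause does: any negative SMAPE is replaced with the worst observed SMAPE, so anomalous rows sink to the bottom of the minimizing score instead of spuriously winning.

```python
import pandas as pd

smape = pd.Series([12.5, -3.0, 40.0, 7.1])
# keep values >= 0; replace the rest with the column maximum
cleaned = smape.where(smape >= 0, smape.max())
print(cleaned.tolist())  # [12.5, 40.0, 40.0, 7.1]
```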

@@ -1738,3 +1744,124 @@ def generate_score_per_series(results_object, metric_weighting, total_validation
     # take the average score across validations
     overall_score = overall_score.groupby(level=0).mean()
     return overall_score
+
+
+def back_forecast(
+    df,
+    model_name,
+    model_param_dict,
+    model_transform_dict,
+    future_regressor_train=None,
+    n_splits: int = "auto",
+    forecast_length=14,
+    frequency="infer",
+    prediction_interval=0.9,
+    no_negatives=False,
+    constraint=None,
+    holiday_country="US",
+    random_seed=123,
+    n_jobs="auto",
+    verbose=0,
+):
+    """Create forecasts for the historical training data, i.e. backcast or back forecast.
+
+    This actually forecasts on historical data; these are not fit model values, as are often returned by other packages.
+    As such, this will be slower, but more representative of real-world model performance.
+    There may be jumps in the data between chunks.
+
+    Args are the same as for model_forecast except...
+        n_splits (int): how many pieces to split the data into. Pass 2 for fastest, or "auto" for best accuracy.
+
+    Returns a standard prediction object (access .forecast, .lower_forecast, .upper_forecast).
+    """
+    max_chunk = int(ceil(df.index.shape[0] / forecast_length))
+    if not str(n_splits).isdigit():
+        n_splits = max_chunk
+    elif n_splits > max_chunk or n_splits < 2:
+        n_splits = max_chunk
+    else:
+        n_splits = int(n_splits)
+
+    chunk_size = df.index.shape[0] / n_splits
+    b_forecast, b_forecast_up, b_forecast_low = (
+        pd.DataFrame(),
+        pd.DataFrame(),
+        pd.DataFrame(),
+    )
+    for n in range(n_splits):
+        int_idx = int(n * chunk_size)
+        int_idx_1 = int((n + 1) * chunk_size)
+        inner_forecast_length = int_idx_1 - int_idx
+        # flip to forecast backwards for the first split
+        if n == 0:
+            df_split = df.iloc[int_idx_1:].copy()
+            df_split = df_split.iloc[::-1]
+            df_split.index = df_split.index[::-1]
+            result_idx = df.iloc[0:int_idx_1].index
+        else:
+            df_split = df.iloc[0:int_idx].copy()
+        # handle appropriate regressors
+        if isinstance(future_regressor_train, pd.DataFrame):
+            if n == 0:
+                split_regr = future_regressor_train.reindex(df_split.index[::-1])
+                split_regr_future = future_regressor_train.reindex(result_idx)
+            else:
+                split_regr = future_regressor_train.reindex(df_split.index)
+                split_regr_future = future_regressor_train.reindex(
+                    df.index[int_idx:int_idx_1]
+                )
+        else:
+            split_regr = []
+            split_regr_future = []
+        try:
+            df_forecast = model_forecast(
+                model_name=model_name,
+                model_param_dict=model_param_dict,
+                model_transform_dict=model_transform_dict,
+                df_train=df_split,
+                forecast_length=inner_forecast_length,
+                frequency=frequency,
+                prediction_interval=prediction_interval,
+                no_negatives=no_negatives,
+                constraint=constraint,
+                future_regressor_train=split_regr,
+                future_regressor_forecast=split_regr_future,
+                holiday_country=holiday_country,
+                random_seed=random_seed,
+                verbose=verbose,
+                n_jobs=n_jobs,
+            )
+            b_forecast = pd.concat([b_forecast, df_forecast.forecast])
+            b_forecast_up = pd.concat([b_forecast_up, df_forecast.upper_forecast])
+            b_forecast_low = pd.concat([b_forecast_low, df_forecast.lower_forecast])
+            # handle index being wrong for the flipped forecast which comes first
+            if n == 0:
+                b_forecast = b_forecast.iloc[::-1]
+                b_forecast_up = b_forecast_up.iloc[::-1]
+                b_forecast_low = b_forecast_low.iloc[::-1]
+                b_forecast.index = result_idx
+                b_forecast_up.index = result_idx
+                b_forecast_low.index = result_idx
+        except Exception as e:
+            print(f"back_forecast split {n} failed with {repr(e)}")
+            b_df = pd.DataFrame(
+                np.nan, index=df.index[int_idx:int_idx_1], columns=df.columns
+            )
+            b_forecast = pd.concat([b_forecast, b_df])
+            b_forecast_up = pd.concat([b_forecast_up, b_df])
+            b_forecast_low = pd.concat([b_forecast_low, b_df])
+
+    df_forecast.forecast = b_forecast
+    df_forecast.upper_forecast = b_forecast_up
+    df_forecast.lower_forecast = b_forecast_low
+    return df_forecast
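A quick numeric sketch of the splitting arithmetic in the function above (toy values, not from the source): with 100 rows and n_splits=4, each chunk covers 25 rows; split 0 trains on the later data reversed and forecasts the start of the series "backwards", while every later split trains on all data before its chunk.

```python
n_rows, n_splits = 100, 4
chunk_size = n_rows / n_splits
for n in range(n_splits):
    int_idx = int(n * chunk_size)
    int_idx_1 = int((n + 1) * chunk_size)
    if n == 0:
        # first chunk has no history before it, so flip and forecast backwards
        print(f"split {n}: train on rows {int_idx_1}-{n_rows - 1} flipped, predict rows 0-{int_idx_1 - 1}")
    else:
        print(f"split {n}: train on rows 0-{int_idx - 1}, predict rows {int_idx}-{int_idx_1 - 1}")
# split 0: train on rows 25-99 flipped, predict rows 0-24
# split 1: train on rows 0-24, predict rows 25-49
# ...
```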
+def remove_leading_zeros(df):
+    """Accepts a wide dataframe; returns the dataframe with zeroes preceding any non-zero value set to NaN."""
+    # keep the last row unaltered to keep metrics happier if all zeroes
+    temp = df.head(df.shape[0] - 1)
+    temp = temp.abs().cumsum(axis=0).replace(0, np.nan)
+    temp = df[~temp.isna()]
+    temp = temp.head(df.shape[0] - 1)
+    return pd.concat([temp, df.tail(1)], axis=0)
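A small worked example of the refactored helper (toy frame; assumes remove_leading_zeros is importable from autots.evaluator.auto_model as added above). Only zeroes before a series' first non-zero value are masked; interior zeroes and the final row are kept.

```python
import pandas as pd
from autots.evaluator.auto_model import remove_leading_zeros

df = pd.DataFrame({"a": [0.0, 0.0, 1.0, 0.0, 2.0], "b": [3.0, 0.0, 0.0, 4.0, 0.0]})
print(remove_leading_zeros(df))
#      a    b
# 0  NaN  3.0   <- leading zeroes of 'a' masked
# 1  NaN  0.0   <- 'b' starts non-zero, so untouched
# 2  1.0  0.0
# 3  0.0  4.0   <- interior zero of 'a' kept
# 4  2.0  0.0   <- last row always kept as-is
```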

autots/evaluator/auto_ts.py

Lines changed: 89 additions & 9 deletions
@@ -24,6 +24,8 @@
     generate_score_per_series,
     model_forecast,
     validation_aggregation,
+    back_forecast,
+    remove_leading_zeros,
 )
 from autots.models.ensemble import (
     EnsembleTemplateGenerator,
@@ -312,6 +314,9 @@ def __init__(
             else ['ID'] + self.template_cols
         )
         self.initial_results = TemplateEvalObject()
+        self.best_model_name = ""
+        self.best_model_params = ""
+        self.best_model_transformation_params = ""

         if verbose > 2:
             print('"Hello. Would you like to destroy some evil today?" - Sanderson')
@@ -322,7 +327,7 @@ def __repr__(self):
             return "Uninitiated AutoTS object"
         else:
            try:
-                return f"Initiated AutoTS object with best model: \n{self.best_model['Model'].iloc[0]}\n{self.best_model['TransformationParameters'].iloc[0]}\n{self.best_model['ModelParameters'].iloc[0]}"
+                return f"Initiated AutoTS object with best model: \n{self.best_model_name}\n{self.best_model_transformation_params}\n{self.best_model_params}"
             except Exception:
                 return "Initiated AutoTS object"
@@ -481,12 +486,7 @@ def fit(

         # replace any zeroes that occur prior to all non-zero values
         if self.remove_leading_zeroes:
-            # keep the last row unaltered to keep metrics happier if all zeroes
-            temp = df_wide_numeric.head(df_wide_numeric.shape[0] - 1)
-            temp = temp.abs().cumsum(axis=0).replace(0, np.nan)
-            temp = df_wide_numeric[~temp.isna()]
-            temp = temp.head(df_wide_numeric.shape[0] - 1)
-            df_wide_numeric = pd.concat([temp, df_wide_numeric.tail(1)], axis=0)
+            df_wide_numeric = remove_leading_zeros(df_wide_numeric)

         # remove other ensembling types if univariate
         if df_wide_numeric.shape[1] == 1:
@@ -1096,6 +1096,12 @@ def fit(
             self.ensemble_check = int((self.best_model['Ensemble'].iloc[0]) > 0)
         except IndexError:
             raise ValueError(error_msg_template)
+        # give a more convenient dict option
+        self.best_model_name = self.best_model['Model'].iloc[0]
+        self.best_model_params = json.loads(self.best_model['ModelParameters'].iloc[0])
+        self.best_model_transformation_params = json.loads(
+            self.best_model['TransformationParameters'].iloc[0]
+        )

         # set flags to check if regressors or ensemble used in final model.
         param_dict = json.loads(self.best_model.iloc[0]['ModelParameters'])
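In user code this means the winning model can be inspected without manual json.loads calls; a brief sketch (the dataset and fit settings here are placeholders, not from this commit):

```python
from autots import AutoTS
from autots.datasets import load_daily  # any long or wide dataset works

model = AutoTS(forecast_length=14, max_generations=2)
model = model.fit(load_daily(long=False))

# new in 0.3.6: plain-Python views of the chosen model
print(model.best_model_name)                   # a string, e.g. "ETS"
print(model.best_model_params)                 # already a dict, no json.loads needed
print(model.best_model_transformation_params)  # dict of transformer settings
```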
@@ -1330,6 +1336,9 @@ def export_template(
             export_template = export_template.nsmallest(n, columns=['Score'])
             if not include_results:
                 export_template = export_template[self.template_cols]
+                export_template = pd.concat(
+                    [self.best_model, export_template]
+                ).drop_duplicates()
         else:
             raise ValueError("`models` must be 'all' or 'best'")
         try:
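Because the best model is now prepended to 'best' exports, a round trip like the following sketch will always carry the winner along (the filename and the import_template method argument are assumptions based on the surrounding AutoTS API, not shown in this diff):

```python
# export the top templates; the chosen best model is now always included
model.export_template("my_template.csv", models="best", n=10)

# start a fresh search seeded only from that template
new_model = AutoTS(forecast_length=14)
new_model = new_model.import_template("my_template.csv", method="only")
```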
@@ -1448,13 +1457,56 @@ def import_results(self, filename):
             self.initial_results = self.initial_results.concat(new_obj)
         return self

+    def back_forecast(
+        self, column=None, n_splits: int = 3, tail: int = None, verbose: int = 0
+    ):
+        """Create forecasts for the historical training data, i.e. backcast or back forecast.
+
+        This actually forecasts on historical data; these are not fit model values, as are often returned by other packages.
+        As such, this will be slower, but more representative of real-world model performance.
+        There may be jumps in the data between chunks.
+
+        Args are the same as for model_forecast except...
+            n_splits (int): how many pieces to split the data into. Pass 2 for fastest, or "auto" for best accuracy.
+            column (str): to run on only one column, pass the column name; faster than the full dataset.
+            tail (int): df.tail() of the dataset; back_forecast is run only on the n most recent observations.
+
+        Returns a standard prediction object (access .forecast, .lower_forecast, .upper_forecast).
+        """
+        if self.best_model.empty:
+            raise ValueError("No best_model. AutoTS .fit() needs to be run.")
+        if column is not None:
+            input_df = pd.DataFrame(self.df_wide_numeric[column])
+        else:
+            input_df = self.df_wide_numeric
+        if tail is not None:
+            input_df = input_df.tail(tail)
+        result = back_forecast(
+            df=input_df,
+            model_name=self.best_model_name,
+            model_param_dict=self.best_model_params,
+            model_transform_dict=self.best_model_transformation_params,
+            future_regressor_train=self.future_regressor_train,
+            n_splits=n_splits,
+            forecast_length=self.forecast_length,
+            frequency=self.frequency,
+            prediction_interval=self.prediction_interval,
+            no_negatives=self.no_negatives,
+            constraint=self.constraint,
+            holiday_country=self.holiday_country,
+            random_seed=self.random_seed,
+            n_jobs=self.n_jobs,
+            verbose=verbose,
+        )
+        return result
+
     def horizontal_to_df(self):
         """helper function for plotting."""
         if self.best_model.empty:
             raise ValueError("No best_model. AutoTS .fit() needs to be run.")
         if self.best_model['Ensemble'].iloc[0] != 2:
             raise ValueError("Only works on horizontal ensemble type models.")
-        ModelParameters = json.loads(self.best_model['ModelParameters'].iloc[0])
+        ModelParameters = self.best_model_params
         series = ModelParameters['series']
         series = pd.DataFrame.from_dict(series, orient="index").reset_index(drop=False)
         if series.shape[1] > 2:
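A minimal usage sketch of the new method (assumes `model` is an already-fit AutoTS instance; the column name is a placeholder):

```python
# forecast over the training history itself, in 3 chunks
prediction = model.back_forecast(n_splits=3)
backcast = prediction.forecast      # point forecasts aligned to the training index
lower = prediction.lower_forecast   # prediction interval bounds
upper = prediction.upper_forecast

# or restrict to one series and the 90 most recent observations for speed
fast = model.back_forecast(column="series_1", tail=90).forecast
```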
@@ -1496,7 +1548,7 @@ def mosaic_to_df(self):
             raise ValueError("No best_model. AutoTS .fit() needs to be run.")
         if self.best_model['Ensemble'].iloc[0] != 2:
             raise ValueError("Only works on horizontal ensemble type models.")
-        ModelParameters = json.loads(self.best_model['ModelParameters'].iloc[0])
+        ModelParameters = self.best_model_params
         if str(ModelParameters['model_name']).lower() != 'mosaic':
             raise ValueError("Only works on mosaic ensembles.")
         series = pd.DataFrame.from_dict(ModelParameters['series'])
@@ -1565,6 +1617,32 @@ def plot_generation_loss(self, **kwargs):
             ylabel="Lowest Score", **kwargs
         )

+    def plot_backforecast(
+        self, series=None, n_splits: int = 3, start_date=None, **kwargs
+    ):
+        """Plot the historical data and the fit forecast on that history.
+
+        Args:
+            series (str or list): column names of time series
+            n_splits (int or str): "auto", or a number > 2; higher is more accurate but slower
+            **kwargs passed to pd.DataFrame.plot()
+        """
+        if series is None:
+            series = random.choice(self.df_wide_numeric.columns)
+        b_df = self.back_forecast(column=series, n_splits=n_splits, verbose=0).forecast
+        b_df = b_df.rename(columns=lambda x: str(x) + "_forecast")
+        plot_df = pd.concat(
+            [
+                pd.DataFrame(self.df_wide_numeric[series]),
+                b_df,
+            ],
+            axis=1,
+        )
+        if start_date is not None:
+            plot_df = plot_df[plot_df.index >= start_date]
+        plot_df = remove_leading_zeros(plot_df)
+        plot_df.plot(**kwargs)
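And a short plotting sketch (again assuming an already-fit AutoTS instance `model`; the series name and date are placeholders):

```python
import matplotlib.pyplot as plt

model.plot_backforecast(
    series="series_1", n_splits=3, start_date="2021-01-01", figsize=(10, 4)
)
plt.show()
```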

 colors_list = [
@@ -1607,6 +1685,8 @@
     '#EE82EE',
     '#00008B',
     '#4B0082',
+    '#0403A7',
+    "#000000",
 ]