Skip to content

Commit 4e4f6bd

Browse files
authored
Merge pull request #222 from winedarksea/dev
0.6.7
2 parents 1c28035 + a5c4746 commit 4e4f6bd

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+545
-110
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ pip install autots
3434
```
3535
This includes dependencies for basic models, but [additional packages](https://github.com/winedarksea/AutoTS/blob/master/extended_tutorial.md#installation-and-dependency-versioning) are required for some models and methods.
3636

37+
Be advised there are several other projects that have chosen similar names, so make sure you are on the right AutoTS code, papers, and documentation.
38+
3739
## Basic Use
3840

3941
Input data for AutoTS is expected to come in either a *long* or a *wide* format:

TODO.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
* The most recent data will generally be the most important
1313
* Forecasts are desired for the future immediately following the most recent data.
1414

15-
# 0.6.6 🐌🐌🐌
16-
* bug fixes, particularly compatibility for the archaic pandas 1.0.3 still used at a certain big tech company
15+
# 0.6.7 🇺🇦 🇺🇦 🇺🇦
16+
* Cassandra bug fix
17+
* isolated_only to anomaly methods
18+
* matse metric is possibly temporary and not added to per series weighting options
19+
* added HistoricValues transformer
1720

1821
### Unstable Upstream Packages (those that are frequently broken by maintainers)
1922
* Pytorch-Forecasting

autots/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from autots.models.cassandra import Cassandra
2727

2828

29-
__version__ = '0.6.6'
29+
__version__ = '0.6.7'
3030

3131
TransformTS = GeneralTransformer
3232

autots/datasets/_base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ def load_live_daily(
239239
caiso_query: str = "ENE_SLRS",
240240
timeout: float = 300.05,
241241
sleep_seconds: int = 2,
242+
**kwargs,
242243
):
243244
"""Generates a dataframe of data up to the present day. Requires active internet connection.
244245
Try to be respectful of these free data sources by not calling too much too heavily.

autots/evaluator/anomaly_detector.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,12 @@ def __init__(
4040
forecast_params=None,
4141
method_params={},
4242
eval_period=None,
43+
isolated_only=False,
4344
n_jobs=1,
4445
):
4546
"""Detect anomalies on a historic dataset.
4647
Note anomaly score patterns vary by method.
47-
Anomaly flag is standard -1 = anomaly; 1 = regular
48+
Anomaly flag is standard -1 = anomaly; 1 = regular as per sklearn
4849
4950
Args:
5051
output (str): 'multivariate' (each series unique outliers), or 'univariate' (all series together for one outlier flag per timestamp)
@@ -53,6 +54,7 @@ def __init__(
5354
forecast_params (dict): used to backcast and identify 'unforecastable' values, required only for predict_interval method
5455
method_params (dict): parameters specific to the method, use `.get_new_params()` to see potential models
5556
eval_period (int): only use this length tail of data, currently only implemented for forecast_params forecasting if used
57+
isolated_only (bool): if True, only standalone anomalies reported
5658
n_jobs (int): multiprocessing jobs, used by some methods
5759
5860
Methods:
@@ -71,6 +73,7 @@ def __init__(
7173
self.forecast_params = forecast_params
7274
self.method_params = method_params
7375
self.eval_period = eval_period
76+
self.isolated_only = isolated_only
7477
self.n_jobs = n_jobs
7578
self.anomaly_classifier = None
7679

@@ -86,7 +89,7 @@ def detect(self, df):
8689
self.df_anomaly = df.copy()
8790
if self.transform_dict is not None:
8891
model = GeneralTransformer(
89-
**self.transform_dict
92+
verbose=2, **self.transform_dict
9093
) # DATEPART, LOG, SMOOTHING, DIFF, CLIP OUTLIERS with high z score
9194
self.df_anomaly = model.fit_transform(self.df_anomaly)
9295

@@ -109,6 +112,10 @@ def detect(self, df):
109112
else:
110113
self.df_anomaly = self.df_anomaly - backcast.forecast
111114

115+
if len(self.df_anomaly.columns) != len(df.columns):
116+
raise ValueError(
117+
f"anomaly returned a column mismatch from params {self.method_params} and {self.transform_dict}"
118+
)
112119
if not all(self.df_anomaly.columns == df.columns):
113120
self.df_anomaly.columns = df.columns
114121

@@ -130,6 +137,13 @@ def detect(self, df):
130137
eval_period=self.eval_period,
131138
n_jobs=self.n_jobs,
132139
)
140+
if self.isolated_only:
141+
# replace all anomalies (-1) except those which are isolated (1 before and after)
142+
mask_minus_one = self.anomalies == -1
143+
mask_prev_one = self.anomalies.shift(1) == 1
144+
mask_next_one = self.anomalies.shift(-1) == 1
145+
mask_replace = mask_minus_one & ~(mask_prev_one & mask_next_one)
146+
self.anomalies[mask_replace] = 1
133147
return self.anomalies, self.scores
134148

135149
def plot(self, series_name=None, title=None, plot_kwargs={}):
@@ -286,6 +300,8 @@ def __init__(
286300
def detect(self, df):
287301
"""Run holiday detection. Input wide-style pandas time series."""
288302
self.anomaly_model.detect(df)
303+
self.df = df
304+
self.df_cols = df.columns
289305
if np.min(self.anomaly_model.anomalies.values) != -1:
290306
print("No anomalies detected.")
291307
(
@@ -312,8 +328,6 @@ def detect(self, df):
312328
use_islamic_holidays=self.use_islamic_holidays,
313329
use_hebrew_holidays=self.use_hebrew_holidays,
314330
)
315-
self.df = df
316-
self.df_cols = df.columns
317331

318332
def plot_anomaly(self, kwargs={}):
319333
self.anomaly_model.plot(**kwargs)

autots/evaluator/auto_model.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2432,6 +2432,7 @@ def validation_aggregation(
24322432
'mate': 'mean',
24332433
'wasserstein': 'mean',
24342434
'dwd': 'mean',
2435+
'matse': 'mean',
24352436
'smape_weighted': 'mean',
24362437
'mae_weighted': 'mean',
24372438
'rmse_weighted': 'mean',
@@ -2451,6 +2452,7 @@ def validation_aggregation(
24512452
'mate_weighted': 'mean',
24522453
'wasserstein_weighted': 'mean',
24532454
'dwd_weighted': 'mean',
2455+
'matse_weighted': 'mean',
24542456
'containment_weighted': 'mean',
24552457
'contour_weighted': 'mean',
24562458
'TotalRuntimeSeconds': 'mean',
@@ -2535,6 +2537,7 @@ def generate_score(
25352537
mate_weighting = metric_weighting.get('mate_weighting', 0)
25362538
wasserstein_weighting = metric_weighting.get('wasserstein_weighting', 0)
25372539
dwd_weighting = metric_weighting.get('dwd_weighting', 0)
2540+
matse_weighting = metric_weighting.get('matse_weighting', 0)
25382541
# handle various runtime information records
25392542
if 'TotalRuntimeSeconds' in model_results.columns:
25402543
model_results['TotalRuntimeSeconds'] = np.where(
@@ -2652,6 +2655,12 @@ def generate_score(
26522655
].min()
26532656
dwd_score = model_results['dwd_weighted'] / dwd_scaler
26542657
overall_score = overall_score + (dwd_score * dwd_weighting)
2658+
if matse_weighting != 0:
2659+
matse_scaler = model_results['matse_weighted'][
2660+
model_results['matse_weighted'] != 0
2661+
].min()
2662+
matse_score = model_results['matse_weighted'] / matse_scaler
2663+
overall_score = overall_score + (matse_score * matse_weighting)
26552664
if smoothness_weighting != 0:
26562665
smoothness_scaler = model_results['smoothness_weighted'][
26572666
model_results['smoothness_weighted'] != 0

autots/evaluator/auto_ts.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,7 +2021,8 @@ def _run_template(
20212021
)
20222022
else:
20232023
# trying to catch a rare and sneaky bug (perhaps some variety of beetle?)
2024-
print(f"TotalRuntime missing in {current_generation}!")
2024+
if verbose >= 0:
2025+
print(f"TotalRuntime missing in {current_generation}!")
20252026
self.template_result_error = template_result.model_results.copy()
20262027
self.template_error = template.copy()
20272028
# gather results of template run
@@ -2665,7 +2666,6 @@ def _generate_mosaic_template(self, df_subset=None, models_to_use=None):
26652666
for mos in mosaic_ensembles:
26662667
try:
26672668
mosaic_config = parse_mosaic(mos)
2668-
print(mosaic_config)
26692669
# choose metric to optimize on
26702670
met = mosaic_config.get("metric", "mae")
26712671
if met in ["spl", "pl"]:
@@ -3662,6 +3662,38 @@ def plot_metric_corr(self, cols=None, percent_best=0.1):
36623662
plt.title("Correlogram of Metric Correlations from Optimized Forecasts")
36633663
return ax
36643664

3665+
def plot_transformer_failure_rate(self):
3666+
"""Failure Rate per Transformer type (ignoring ensembles), failure may be due to other model or transformer."""
3667+
initial_results = self.results()
3668+
failures = []
3669+
successes = []
3670+
for idx, row in initial_results.iterrows():
3671+
failed = not pd.isnull(row['Exceptions'])
3672+
transforms = list(
3673+
json.loads(row['TransformationParameters'])
3674+
.get('transformations', {})
3675+
.values()
3676+
)
3677+
if failed:
3678+
failures = failures + transforms
3679+
else:
3680+
successes = successes + transforms
3681+
total = pd.concat(
3682+
[
3683+
pd.Series(failures).value_counts().rename("failures").to_frame(),
3684+
pd.Series(successes).value_counts().rename("successes"),
3685+
],
3686+
axis=1,
3687+
).fillna(0)
3688+
total['failure_rate'] = total['failures'] / (
3689+
total['successes'] + total['failures']
3690+
)
3691+
return (
3692+
total.sort_values("failure_rate", ascending=False)['failure_rate']
3693+
.iloc[0:20]
3694+
.plot(kind='bar', title='Transformers by Failure Rate', color='forestgreen')
3695+
)
3696+
36653697
def diagnose_params(self, target='runtime', waterfall_plots=True):
36663698
"""Attempt to explain params causing measured outcomes using shap and linear regression coefficients.
36673699

autots/evaluator/metrics.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -681,6 +681,8 @@ def full_metric_evaluation(
681681
mate = np.abs(np.nansum(full_errors, axis=0))
682682
else:
683683
mate = np.abs(np.sum(full_errors, axis=0))
684+
# possibly temporary
685+
matse = mate / np.sum(A, axis=0)
684686

685687
direc_sign = np.sign(F - last_of_array) == np.sign(A - last_of_array)
686688
weights = np.geomspace(1, 10, full_mae_errors.shape[0])[:, np.newaxis]
@@ -707,6 +709,7 @@ def full_metric_evaluation(
707709
# aggregate error
708710
'mage': mage, # Gandalf approved
709711
'mate': mate, # the British version, of course
712+
'matse': matse, # pronounced like the painter 'Matisse'
710713
'underestimate': np.nansum(np.where(~ovm, full_errors, 0), axis=0),
711714
'mle': msle(full_errors, full_mae_errors, log_errors, nan_flag=nan_flag),
712715
'overestimate': np.nansum(np.where(ovm, full_errors, 0), axis=0),

autots/models/cassandra.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -798,19 +798,25 @@ def rolling_trend(self, trend_residuals, t):
798798
axis=1,
799799
)
800800
wind = 30 if self.trend_window is None else self.trend_window
801+
# the uneven fraction of the window goes at the end
802+
# and minus one is because there will always be at least one real point
801803
w_1 = wind - 1
802804
steps_ahd = int(w_1 / 2)
803805
y0 = np.repeat(np.array(trend_residuals[0:1]), steps_ahd, axis=0)
804806
# d0 = -1 * dates_2d[1 : y0.shape[0] + 1][::-1]
805807
start_pt = dates_2d[0, 0]
806808
step = dates_2d[1, 0] - start_pt
809+
extra_step = y0.shape[0] + 1
810+
# there's some weird float thing that can happen here I still don't understand
811+
# when it produces one more step than expected
807812
d0 = np_2d_arange(
808813
start_pt,
809-
stop=start_pt - ((y0.shape[0] + 1) * step),
814+
stop=start_pt - (extra_step * step),
810815
step=-step,
811816
num_columns=dates_2d.shape[1],
812-
)[1:][::-1]
817+
)[1:extra_step][::-1]
813818
shape2 = (w_1 - steps_ahd, y0.shape[1])
819+
# these combine a fake first half and fake last half window with real data in between
814820
y2 = np.concatenate(
815821
[
816822
y0,

autots/models/sklearn.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -398,13 +398,17 @@ def retrieve_regressor(
398398
elif model_class in ['xgboost', 'XGBRegressor']:
399399
import xgboost as xgb
400400

401+
smaller_n_jobs = int(n_jobs / 2) if n_jobs > 3 else n_jobs
402+
401403
if False: # this is no longer necessary in 1.6 and beyond
402404
regr = MultiOutputRegressor(
403405
xgb.XGBRegressor(verbosity=0, **model_param_dict, n_jobs=1),
404-
n_jobs=n_jobs,
406+
n_jobs=smaller_n_jobs,
405407
)
406408
else:
407-
regr = xgb.XGBRegressor(verbosity=0, **model_param_dict, n_jobs=n_jobs)
409+
regr = xgb.XGBRegressor(
410+
verbosity=0, **model_param_dict, n_jobs=smaller_n_jobs
411+
)
408412
return regr
409413
elif model_class == 'SVM':
410414
from sklearn.svm import LinearSVR
@@ -672,16 +676,16 @@ def retrieve_classifier(
672676
# these are models that are relatively fast with large multioutput Y, small n obs
673677
datepart_model_dict: dict = {
674678
# 'RandomForest': 0.05, # crashes sometimes at scale for unclear reasons
675-
'ElasticNet': 0.05,
676-
'xgboost': 0.01,
679+
'ElasticNet': 0.1,
680+
'xgboost': 0.001, # excess memory at scale
677681
'MLP': 0.05,
678682
'DecisionTree': 0.02,
679683
'Adaboost': 0.05,
680684
'SVM': 0.01,
681685
'KerasRNN': 0.02,
682686
'Transformer': 0.02, # slow
683687
'ExtraTrees': 0.00001, # some params cause RAM crash?
684-
'RadiusNeighbors': 0.05,
688+
'RadiusNeighbors': 0.1,
685689
'MultioutputGPR': 0.00001,
686690
}
687691
gpu = ['Transformer', 'KerasRNN', 'MLP'] # or more accurately, no dnn
@@ -888,15 +892,21 @@ def generate_regressor_params(
888892
param_dict = {
889893
"model": 'xgboost',
890894
"model_params": {
895+
"booster": random.choices(['gbtree', 'gblinear'], [0.7, 0.3])[
896+
0
897+
],
891898
"objective": objective,
899+
"max_depth": random.choices(
900+
[6, 3, 2, 8], [0.6, 0.4, 0.2, 0.01]
901+
)[0],
892902
"eta": random.choices(
893903
[1.0, 0.3, 0.01, 0.03, 0.05, 0.003],
894904
[0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
895905
)[
896906
0
897907
], # aka learning_rate
898908
"min_child_weight": random.choices(
899-
[0.05, 0.5, 1, 2, 5], [0.1, 0.2, 0.8, 0.1, 0.1]
909+
[0.05, 0.5, 1, 2, 5, 10], [0.01, 0.05, 0.8, 0.1, 0.1, 0.1]
900910
)[0],
901911
"subsample": random.choices(
902912
[1, 0.9, 0.7, 0.5], [0.9, 0.05, 0.05, 0.05]
@@ -2317,11 +2327,16 @@ def predict(
23172327
)
23182328
self.X_pred.columns = [str(xc) for xc in self.X_pred.columns]
23192329

2320-
forecast = pd.DataFrame(
2321-
self.model.predict(self.X_pred.astype(float)),
2322-
index=index,
2323-
columns=self.column_names,
2324-
)
2330+
try:
2331+
forecast = pd.DataFrame(
2332+
self.model.predict(self.X_pred.astype(float)),
2333+
index=index,
2334+
columns=self.column_names,
2335+
)
2336+
except Exception as e:
2337+
raise ValueError(
2338+
f"Datepart prediction with params {self.get_params()} failed"
2339+
) from e
23252340

23262341
if just_point_forecast:
23272342
return forecast

0 commit comments

Comments
 (0)