Skip to content

Commit 4e4f6bd

Browse files
authored
Merge pull request #222 from winedarksea/dev
0.6.7
2 parents 1c28035 + a5c4746 commit 4e4f6bd

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+545
-110
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ pip install autots
3434
```
3535
This includes dependencies for basic models, but [additional packages](https://github.com/winedarksea/AutoTS/blob/master/extended_tutorial.md#installation-and-dependency-versioning) are required for some models and methods.
3636

37+
Be advised there are several other projects that have chosen similar names, so make sure you are on the right AutoTS code, papers, and documentation.
38+
3739
## Basic Use
3840

3941
Input data for AutoTS is expected to come in either a *long* or a *wide* format:

TODO.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
* The most recent data will generally be the most important
1313
* Forecasts are desired for the future immediately following the most recent data.
1414

15-
# 0.6.6 🐌🐌🐌
16-
* bug fixes, particularly compatibility for the archaic pandas 1.0.3 still used at a certain big tech company
15+
# 0.6.7 🇺🇦 🇺🇦 🇺🇦
16+
* Cassandra bug fix
17+
* isolated_only to anomaly methods
18+
* matse metric is possibly temporary and not added to per series weighting options
19+
* added HistoricValues transformer
1720

1821
### Unstable Upstream Packages (those that are frequently broken by maintainers)
1922
* Pytorch-Forecasting

autots/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from autots.models.cassandra import Cassandra
2727

2828

29-
__version__ = '0.6.6'
29+
__version__ = '0.6.7'
3030

3131
TransformTS = GeneralTransformer
3232

autots/datasets/_base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ def load_live_daily(
239239
caiso_query: str = "ENE_SLRS",
240240
timeout: float = 300.05,
241241
sleep_seconds: int = 2,
242+
**kwargs,
242243
):
243244
"""Generates a dataframe of data up to the present day. Requires active internet connection.
244245
Try to be respectful of these free data sources by not calling too much too heavily.

autots/evaluator/anomaly_detector.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,12 @@ def __init__(
4040
forecast_params=None,
4141
method_params={},
4242
eval_period=None,
43+
isolated_only=False,
4344
n_jobs=1,
4445
):
4546
"""Detect anomalies on a historic dataset.
4647
Note anomaly score patterns vary by method.
47-
Anomaly flag is standard -1 = anomaly; 1 = regular
48+
Anomaly flag is standard -1 = anomaly; 1 = regular as per sklearn
4849
4950
Args:
5051
output (str): 'multivariate' (each series unique outliers), or 'univariate' (all series together for one outlier flag per timestamp)
@@ -53,6 +54,7 @@ def __init__(
5354
forecast_params (dict): used to backcast and identify 'unforecastable' values, required only for predict_interval method
5455
method_params (dict): parameters specific to the method, use `.get_new_params()` to see potential models
5556
eval_period (int): only use this length tail of data, currently only implemented for forecast_params forecasting if used
57+
isolated_only (bool): if True, only standalone anomalies reported
5658
n_jobs (int): multiprocessing jobs, used by some methods
5759
5860
Methods:
@@ -71,6 +73,7 @@ def __init__(
7173
self.forecast_params = forecast_params
7274
self.method_params = method_params
7375
self.eval_period = eval_period
76+
self.isolated_only = isolated_only
7477
self.n_jobs = n_jobs
7578
self.anomaly_classifier = None
7679

@@ -86,7 +89,7 @@ def detect(self, df):
8689
self.df_anomaly = df.copy()
8790
if self.transform_dict is not None:
8891
model = GeneralTransformer(
89-
**self.transform_dict
92+
verbose=2, **self.transform_dict
9093
) # DATEPART, LOG, SMOOTHING, DIFF, CLIP OUTLIERS with high z score
9194
self.df_anomaly = model.fit_transform(self.df_anomaly)
9295

@@ -109,6 +112,10 @@ def detect(self, df):
109112
else:
110113
self.df_anomaly = self.df_anomaly - backcast.forecast
111114

115+
if len(self.df_anomaly.columns) != len(df.columns):
116+
raise ValueError(
117+
f"anomaly returned a column mismatch from params {self.method_params} and {self.transform_dict}"
118+
)
112119
if not all(self.df_anomaly.columns == df.columns):
113120
self.df_anomaly.columns = df.columns
114121

@@ -130,6 +137,13 @@ def detect(self, df):
130137
eval_period=self.eval_period,
131138
n_jobs=self.n_jobs,
132139
)
140+
if self.isolated_only:
141+
# replace all anomalies (-1) except those which are isolated (1 before and after)
142+
mask_minus_one = self.anomalies == -1
143+
mask_prev_one = self.anomalies.shift(1) == 1
144+
mask_next_one = self.anomalies.shift(-1) == 1
145+
mask_replace = mask_minus_one & ~(mask_prev_one & mask_next_one)
146+
self.anomalies[mask_replace] = 1
133147
return self.anomalies, self.scores
134148

135149
def plot(self, series_name=None, title=None, plot_kwargs={}):
@@ -286,6 +300,8 @@ def __init__(
286300
def detect(self, df):
287301
"""Run holiday detection. Input wide-style pandas time series."""
288302
self.anomaly_model.detect(df)
303+
self.df = df
304+
self.df_cols = df.columns
289305
if np.min(self.anomaly_model.anomalies.values) != -1:
290306
print("No anomalies detected.")
291307
(
@@ -312,8 +328,6 @@ def detect(self, df):
312328
use_islamic_holidays=self.use_islamic_holidays,
313329
use_hebrew_holidays=self.use_hebrew_holidays,
314330
)
315-
self.df = df
316-
self.df_cols = df.columns
317331

318332
def plot_anomaly(self, kwargs={}):
319333
self.anomaly_model.plot(**kwargs)

autots/evaluator/auto_model.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2432,6 +2432,7 @@ def validation_aggregation(
24322432
'mate': 'mean',
24332433
'wasserstein': 'mean',
24342434
'dwd': 'mean',
2435+
'matse': 'mean',
24352436
'smape_weighted': 'mean',
24362437
'mae_weighted': 'mean',
24372438
'rmse_weighted': 'mean',
@@ -2451,6 +2452,7 @@ def validation_aggregation(
24512452
'mate_weighted': 'mean',
24522453
'wasserstein_weighted': 'mean',
24532454
'dwd_weighted': 'mean',
2455+
'matse_weighted': 'mean',
24542456
'containment_weighted': 'mean',
24552457
'contour_weighted': 'mean',
24562458
'TotalRuntimeSeconds': 'mean',
@@ -2535,6 +2537,7 @@ def generate_score(
25352537
mate_weighting = metric_weighting.get('mate_weighting', 0)
25362538
wasserstein_weighting = metric_weighting.get('wasserstein_weighting', 0)
25372539
dwd_weighting = metric_weighting.get('dwd_weighting', 0)
2540+
matse_weighting = metric_weighting.get('matse_weighting', 0)
25382541
# handle various runtime information records
25392542
if 'TotalRuntimeSeconds' in model_results.columns:
25402543
model_results['TotalRuntimeSeconds'] = np.where(
@@ -2652,6 +2655,12 @@ def generate_score(
26522655
].min()
26532656
dwd_score = model_results['dwd_weighted'] / dwd_scaler
26542657
overall_score = overall_score + (dwd_score * dwd_weighting)
2658+
if matse_weighting != 0:
2659+
matse_scaler = model_results['matse_weighted'][
2660+
model_results['matse_weighted'] != 0
2661+
].min()
2662+
matse_score = model_results['matse_weighted'] / matse_scaler
2663+
overall_score = overall_score + (matse_score * matse_weighting)
26552664
if smoothness_weighting != 0:
26562665
smoothness_scaler = model_results['smoothness_weighted'][
26572666
model_results['smoothness_weighted'] != 0

autots/evaluator/auto_ts.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,7 +2021,8 @@ def _run_template(
20212021
)
20222022
else:
20232023
# trying to catch a rare and sneaky bug (perhaps some variety of beetle?)
2024-
print(f"TotalRuntime missing in {current_generation}!")
2024+
if verbose >= 0:
2025+
print(f"TotalRuntime missing in {current_generation}!")
20252026
self.template_result_error = template_result.model_results.copy()
20262027
self.template_error = template.copy()
20272028
# gather results of template run
@@ -2665,7 +2666,6 @@ def _generate_mosaic_template(self, df_subset=None, models_to_use=None):
26652666
for mos in mosaic_ensembles:
26662667
try:
26672668
mosaic_config = parse_mosaic(mos)
2668-
print(mosaic_config)
26692669
# choose metric to optimize on
26702670
met = mosaic_config.get("metric", "mae")
26712671
if met in ["spl", "pl"]:
@@ -3662,6 +3662,38 @@ def plot_metric_corr(self, cols=None, percent_best=0.1):
36623662
plt.title("Correlogram of Metric Correlations from Optimized Forecasts")
36633663
return ax
36643664

3665+
def plot_transformer_failure_rate(self):
3666+
"""Failure Rate per Transformer type (ignoring ensembles), failure may be due to other model or transformer."""
3667+
initial_results = self.results()
3668+
failures = []
3669+
successes = []
3670+
for idx, row in initial_results.iterrows():
3671+
failed = not pd.isnull(row['Exceptions'])
3672+
transforms = list(
3673+
json.loads(row['TransformationParameters'])
3674+
.get('transformations', {})
3675+
.values()
3676+
)
3677+
if failed:
3678+
failures = failures + transforms
3679+
else:
3680+
successes = successes + transforms
3681+
total = pd.concat(
3682+
[
3683+
pd.Series(failures).value_counts().rename("failures").to_frame(),
3684+
pd.Series(successes).value_counts().rename("successes"),
3685+
],
3686+
axis=1,
3687+
).fillna(0)
3688+
total['failure_rate'] = total['failures'] / (
3689+
total['successes'] + total['failures']
3690+
)
3691+
return (
3692+
total.sort_values("failure_rate", ascending=False)['failure_rate']
3693+
.iloc[0:20]
3694+
.plot(kind='bar', title='Transformers by Failure Rate', color='forestgreen')
3695+
)
3696+
36653697
def diagnose_params(self, target='runtime', waterfall_plots=True):
36663698
"""Attempt to explain params causing measured outcomes using shap and linear regression coefficients.
36673699

autots/evaluator/metrics.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -681,6 +681,8 @@ def full_metric_evaluation(
681681
mate = np.abs(np.nansum(full_errors, axis=0))
682682
else:
683683
mate = np.abs(np.sum(full_errors, axis=0))
684+
# possibly temporary
685+
matse = mate / np.sum(A, axis=0)
684686

685687
direc_sign = np.sign(F - last_of_array) == np.sign(A - last_of_array)
686688
weights = np.geomspace(1, 10, full_mae_errors.shape[0])[:, np.newaxis]
@@ -707,6 +709,7 @@ def full_metric_evaluation(
707709
# aggregate error
708710
'mage': mage, # Gandalf approved
709711
'mate': mate, # the British version, of course
712+
'matse': matse, # pronounced like the painter 'Matisse'
710713
'underestimate': np.nansum(np.where(~ovm, full_errors, 0), axis=0),
711714
'mle': msle(full_errors, full_mae_errors, log_errors, nan_flag=nan_flag),
712715
'overestimate': np.nansum(np.where(ovm, full_errors, 0), axis=0),

autots/models/cassandra.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -798,19 +798,25 @@ def rolling_trend(self, trend_residuals, t):
798798
axis=1,
799799
)
800800
wind = 30 if self.trend_window is None else self.trend_window
801+
# the uneven fraction of the window goes at the end
802+
# and minus one is because there will always be at least one real point
801803
w_1 = wind - 1
802804
steps_ahd = int(w_1 / 2)
803805
y0 = np.repeat(np.array(trend_residuals[0:1]), steps_ahd, axis=0)
804806
# d0 = -1 * dates_2d[1 : y0.shape[0] + 1][::-1]
805807
start_pt = dates_2d[0, 0]
806808
step = dates_2d[1, 0] - start_pt
809+
extra_step = y0.shape[0] + 1
810+
# there's some weird float thing that can happen here I still don't understand
811+
# when it produces one more step than expected
807812
d0 = np_2d_arange(
808813
start_pt,
809-
stop=start_pt - ((y0.shape[0] + 1) * step),
814+
stop=start_pt - (extra_step * step),
810815
step=-step,
811816
num_columns=dates_2d.shape[1],
812-
)[1:][::-1]
817+
)[1:extra_step][::-1]
813818
shape2 = (w_1 - steps_ahd, y0.shape[1])
819+
# these combine a fake first half and fake last half window with real data in between
814820
y2 = np.concatenate(
815821
[
816822
y0,

autots/models/sklearn.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -398,13 +398,17 @@ def retrieve_regressor(
398398
elif model_class in ['xgboost', 'XGBRegressor']:
399399
import xgboost as xgb
400400

401+
smaller_n_jobs = int(n_jobs / 2) if n_jobs > 3 else n_jobs
402+
401403
if False: # this is no longer necessary in 1.6 and beyond
402404
regr = MultiOutputRegressor(
403405
xgb.XGBRegressor(verbosity=0, **model_param_dict, n_jobs=1),
404-
n_jobs=n_jobs,
406+
n_jobs=smaller_n_jobs,
405407
)
406408
else:
407-
regr = xgb.XGBRegressor(verbosity=0, **model_param_dict, n_jobs=n_jobs)
409+
regr = xgb.XGBRegressor(
410+
verbosity=0, **model_param_dict, n_jobs=smaller_n_jobs
411+
)
408412
return regr
409413
elif model_class == 'SVM':
410414
from sklearn.svm import LinearSVR
@@ -672,16 +676,16 @@ def retrieve_classifier(
672676
# these are models that are relatively fast with large multioutput Y, small n obs
673677
datepart_model_dict: dict = {
674678
# 'RandomForest': 0.05, # crashes sometimes at scale for unclear reasons
675-
'ElasticNet': 0.05,
676-
'xgboost': 0.01,
679+
'ElasticNet': 0.1,
680+
'xgboost': 0.001, # excess memory at scale
677681
'MLP': 0.05,
678682
'DecisionTree': 0.02,
679683
'Adaboost': 0.05,
680684
'SVM': 0.01,
681685
'KerasRNN': 0.02,
682686
'Transformer': 0.02, # slow
683687
'ExtraTrees': 0.00001, # some params cause RAM crash?
684-
'RadiusNeighbors': 0.05,
688+
'RadiusNeighbors': 0.1,
685689
'MultioutputGPR': 0.00001,
686690
}
687691
gpu = ['Transformer', 'KerasRNN', 'MLP'] # or more accurately, no dnn
@@ -888,15 +892,21 @@ def generate_regressor_params(
888892
param_dict = {
889893
"model": 'xgboost',
890894
"model_params": {
895+
"booster": random.choices(['gbtree', 'gblinear'], [0.7, 0.3])[
896+
0
897+
],
891898
"objective": objective,
899+
"max_depth": random.choices(
900+
[6, 3, 2, 8], [0.6, 0.4, 0.2, 0.01]
901+
)[0],
892902
"eta": random.choices(
893903
[1.0, 0.3, 0.01, 0.03, 0.05, 0.003],
894904
[0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
895905
)[
896906
0
897907
], # aka learning_rate
898908
"min_child_weight": random.choices(
899-
[0.05, 0.5, 1, 2, 5], [0.1, 0.2, 0.8, 0.1, 0.1]
909+
[0.05, 0.5, 1, 2, 5, 10], [0.01, 0.05, 0.8, 0.1, 0.1, 0.1]
900910
)[0],
901911
"subsample": random.choices(
902912
[1, 0.9, 0.7, 0.5], [0.9, 0.05, 0.05, 0.05]
@@ -2317,11 +2327,16 @@ def predict(
23172327
)
23182328
self.X_pred.columns = [str(xc) for xc in self.X_pred.columns]
23192329

2320-
forecast = pd.DataFrame(
2321-
self.model.predict(self.X_pred.astype(float)),
2322-
index=index,
2323-
columns=self.column_names,
2324-
)
2330+
try:
2331+
forecast = pd.DataFrame(
2332+
self.model.predict(self.X_pred.astype(float)),
2333+
index=index,
2334+
columns=self.column_names,
2335+
)
2336+
except Exception as e:
2337+
raise ValueError(
2338+
f"Datepart prediction with params {self.get_params()} failed"
2339+
) from e
23252340

23262341
if just_point_forecast:
23272342
return forecast

0 commit comments

Comments
 (0)