# Source: scripts/experimentation/hwes_window.py (main branch) — yajing03/Capstone_SatCast_Trilemma on GitHub
# hwes_window.py
# author: Jenny Zhang
# date: 2025-06-18
"""
hwes_window.py

Runs Holt-Winters Exponential Smoothing (HWES) back-tests on Bitcoin-fee data
using **custom-defined weekly windowing schemes**, since HWES does not natively
support expanding/sliding windows.

Modes:
------
1. **Reverse expanding window** – fixes the *last* 24 hours (96 steps) as test set
   and expands training data backward in 7-day blocks, moving one week earlier
   in each fold.

2. **Weekly expanding window** – standard walk-forward CV: starts with one week of data,
   adds a week in each fold, and always predicts the next day (96 steps).

3. **Weekly sliding window** – rolls a fixed 7-day training window forward by 1 week
   per fold, always predicting the next day.

Workflow
--------
1. Load the 15-minute-resampled Parquet file created in the preprocessing phase.
2. Define window folds based on the selected mode.
3. For each fold:
   a. Fit a Holt-Winters model using `ExponentialSmoothing`
   b. Predict the next 96 time steps (1 day)
   c. Score against the test set using `eval_metrics()` → MAE, RMSE, MAPE, custom loss, etc.
4. Aggregate fold-wise metrics into a tidy DataFrame (`fold` as index).
5. Write the results to a CSV file under `results/tables/hwes/`.

Key Features
------------
- **Daily horizon** = 96 × 15-minute intervals (1 day).
- Uses `statsmodels`' ExponentialSmoothing (supports additive/multiplicative trend/seasonality).
- Fold generation is handled manually since statsmodels lacks native splitters.
- Automatically saves one of:
  - `expanding_window_reverse_weekly_predictions.csv`
  - `expanding_window_weekly_predictions.csv`
  - `sliding_window_weekly_predictions.csv`

Typical Usage
-------------
1. Reverse expanding:
python scripts/experimentation/hwes_window.py \
  --parquet-path ./data/raw/mar_5_may_12.parquet \
  --mode reverse

2. Weekly expanding:
python scripts/experimentation/hwes_window.py \
  --parquet-path ./data/raw/mar_5_may_12.parquet \
  --mode expanding

3. Weekly sliding:
python scripts/experimentation/hwes_window.py \
  --parquet-path ./data/raw/mar_5_may_12.parquet \
  --mode sliding
"""

import os
import sys
import click
import warnings
import pandas as pd
import numpy as np
from pathlib import Path
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Silence every warning category process-wide so the per-fold progress output
# stays readable (this also hides any warnings raised while fitting models).
warnings.filterwarnings("ignore")

# This file lives in <root>/scripts/experimentation/, so three `.parent` hops
# from the resolved file path give the project root. Its `src` directory is
# appended to sys.path so the helper modules below can be imported by bare name.
project_root = Path(__file__).resolve().parent.parent.parent
sys.path.append(str(project_root / "src"))

# Project helpers; these imports must stay below the sys.path change above.
from preprocess_raw_parquet import preprocess_raw_parquet
from custom_loss_eval import eval_metrics

# Constants
DAILY = 96  # 24 hours × 4 (15-min intervals) = 96 steps; forecast/test horizon
WEEK = 96 * 7  # 1 week of 15-min intervals; step size between folds
RESULTS_DIR = project_root / "results"  # Root for saving results


def get_folds(y, mode):
    """
    Build positional train/test index splits for the chosen windowing scheme.

    Parameters:
    -----------
    y : pd.Series - full time series (only its length is used)
    mode : str - one of ['reverse', 'expanding', 'sliding']

    Returns:
    --------
    list of (train_idx, test_idx) tuples, each a list of integer positions

    Raises:
    -------
    ValueError - if `mode` is not one of the supported strategies
    """
    # Reject unknown strategies before touching the series at all.
    if mode not in ("reverse", "expanding", "sliding"):
        raise ValueError("Invalid mode. Choose from reverse, expanding, sliding.")

    n_obs = len(y)

    if mode == "reverse":
        # The last DAILY points are always the test set; the training block
        # ends right before it and reaches one more week back per fold.
        holdout_start = n_obs - DAILY
        full_weeks = holdout_start // WEEK
        return [
            (
                list(range(max(0, holdout_start - weeks_back * WEEK), holdout_start)),
                list(range(holdout_start, n_obs)),
            )
            for weeks_back in range(1, full_weeks + 1)
        ]

    if mode == "expanding":
        # Training always starts at position 0 and gains one week per fold;
        # the test set is the DAILY points right after the training block.
        max_weeks = (n_obs - DAILY) // WEEK
        cut_points = (week_no * WEEK for week_no in range(1, max_weeks + 1))
        return [
            (list(range(cut)), list(range(cut, cut + DAILY)))
            for cut in cut_points
            if cut + DAILY <= n_obs  # guard kept from the original logic
        ]

    # mode == "sliding": a one-week training window advances a week per fold,
    # followed each time by a DAILY-long test set.
    n_windows = (n_obs - WEEK - DAILY) // WEEK + 1
    window_starts = (k * WEEK for k in range(n_windows))
    return [
        (
            list(range(start, start + WEEK)),
            list(range(start + WEEK, start + WEEK + DAILY)),
        )
        for start in window_starts
    ]


def run_hwes_cv(y, folds, results_path, mode, trend, seasonal, damped, periods):
    """
    Run HWES on each fold, evaluate performance and save the metrics.

    Parameters:
    -----------
    y : pd.Series - full series; its index entries must expose `.date()`
        for the progress message
    folds : list - (train_idx, test_idx) positional index splits
    results_path : Path - output CSV path (parent directory is created if missing)
    mode : str - windowing mode, used only in the progress message
    trend, seasonal, damped : HWES components passed to `ExponentialSmoothing`
    periods : int - seasonal period; ignored when `seasonal` is falsy

    Raises:
    -------
    ValueError - if no fold produced a result (no folds were supplied or
        every fold failed), so there is nothing to aggregate or save

    Saves:
    -------
    A CSV of per-fold metrics, indexed by 1-based fold number.
    """
    all_results = []

    for fold_no, (train_idx, test_idx) in enumerate(folds, start=1):
        y_train = y.iloc[train_idx]
        y_test = y.iloc[test_idx]

        try:
            # Fit HWES model
            model = ExponentialSmoothing(
                y_train,
                trend=trend,
                seasonal=seasonal,
                seasonal_periods=periods if seasonal else None,
                damped_trend=damped
            )
            fit = model.fit(optimized=True, use_brute=True)

            # Forecast exactly as many steps as the test fold holds so the
            # prediction and the ground truth always line up one-to-one.
            y_pred = fit.forecast(len(y_test))

            # Evaluate
            result = eval_metrics(y_pred, y_test).T
            result["fold"] = fold_no
            all_results.append(result)

            # Print progress
            print(f"{mode.capitalize()} Fold {fold_no} — {y.index[train_idx[0]].date()} to {y.index[train_idx[-1]].date()}")

        except Exception as e:
            # Deliberate best-effort: one bad fold must not abort the whole
            # back-test, so report it and carry on with the next fold.
            print(f"Fold {fold_no} failed: {e}")

    # pd.concat([]) would fail with an unhelpful message; explain the real cause
    # and stop before creating directories or files.
    if not all_results:
        raise ValueError(
            f"No fold results to save for mode '{mode}': "
            "no folds were generated or every fold failed."
        )

    # Aggregate all fold results
    df = pd.concat(all_results).set_index("fold")

    # Ensure parent directory exists
    os.makedirs(results_path.parent, exist_ok=True)

    # Save results
    df.to_csv(results_path)
    print(f"Results saved to {results_path}")


@click.command()
@click.option('--parquet-path', type=str, required=True, help="Path to raw data")
@click.option('--mode', type=click.Choice(['reverse', 'expanding', 'sliding']), required=True, help="Windowing strategy")
def main(parquet_path, mode):
    """
    CLI entry point for HWES forecasting experiment.

    Parameters:
    -----------
    parquet_path : str - path to the raw Parquet file (`--parquet-path`)
    mode : str - windowing strategy, one of ['reverse', 'expanding', 'sliding']

    Saves:
    -------
    A CSV of per-fold metrics under `results/tables/hwes/`; the file name
    depends on `mode`.
    """
    # Load the fee column, drop the final 96 observations, cast to float and
    # reindex onto a regular 15-minute grid.
    # NOTE(review): 96 equals DAILY; presumably the last day is held out
    # elsewhere — confirm before changing.
    y = preprocess_raw_parquet(parquet_path)['recommended_fee_fastestFee'][:-96].astype(float).asfreq("15min")

    # Load best HWES parameters (first row) from the prior search results
    cv_result_path = RESULTS_DIR / "tables" / "hwes" / "hwes_cv_results.csv"
    hyperparam_matrix = pd.read_csv(cv_result_path)

    def _none_if_missing(value):
        # read_csv turns an empty cell into NaN. NaN is truthy and is not a
        # valid `trend`/`seasonal` value, so map it back to None ("component
        # disabled") before handing it to the model.
        return None if pd.isna(value) else value

    best_trend = _none_if_missing(hyperparam_matrix.loc[0, 'trend'])
    best_seasonal = _none_if_missing(hyperparam_matrix.loc[0, 'seasonal'])
    raw_damped = hyperparam_matrix.loc[0, 'damped']
    # A missing damping flag is treated as "not damped".
    best_damped = False if pd.isna(raw_damped) else raw_damped

    # Generate CV folds
    folds = get_folds(y, mode)

    # Choose filename based on mode
    filename_map = {
        "reverse": "expanding_window_reverse_weekly_predictions.csv",
        "expanding": "expanding_window_weekly_predictions.csv",
        "sliding": "sliding_window_weekly_predictions.csv"
    }
    results_path = RESULTS_DIR / "tables" / "hwes" / filename_map[mode]

    # Run cross-validation and save results (seasonal period = one day)
    run_hwes_cv(y, folds, results_path, mode, best_trend, best_seasonal, best_damped, DAILY)


# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()