From 27e0a511a4bb91b024fd61e8630f5aa1a1db9d51 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 9 Oct 2024 11:07:12 -0500 Subject: [PATCH 1/8] fix attribute that caused error --- jobs/kpi-forecasting/kpi_forecasting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index e340a567..12544b3e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -71,7 +71,7 @@ def get_predict_dates(self, observed_df): or self._default_end_date() ) return pd.DataFrame( - {"submission_date": pd.date_range(start_date, end_date).date} + {"submission_date": pd.date_range(self.start_date, self.end_date).date} ) def fit(self, observed_df): From 85037e3a6bb88b6d4674b25a2b1dce874c780d90 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 9 Oct 2024 11:30:08 -0500 Subject: [PATCH 2/8] add integration test to ensure code runs end-to-end --- .circleci/config.yml | 5 +++++ jobs/kpi-forecasting/kpi_forecasting.py | 9 ++++----- jobs/kpi-forecasting/kpi_forecasting/inputs.py | 7 +++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5a7c497c..477fbd4c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -258,6 +258,11 @@ jobs: - run: name: Test Code command: docker run app:build pytest --ruff --ruff-format + - run: + name: integration test for kpi + command: | + docker un app:build python ./kpi_forecasting.py -c ./kpi_forecasting/configs/dau_mobile.yaml --no-write + docker un app:build python ./kpi_forecasting.py -c ./kpi_forecasting/configs/dau_desktop.yaml --no-write build-job-mozaggregator2bq: diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index 12544b3e..2cd0292e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -1,7 +1,6 @@ import pandas as pd from datetime import datetime, timezone, timedelta import json -import pickle from kpi_forecasting.inputs import CLI, load_yaml from kpi_forecasting.models.prophet_forecast import ( @@ -160,6 +159,8 @@ def _default_end_date(self) -> str: def main() -> None: # Load the config config_path = CLI().args.config + will_write = CLI().args.write + print(will_write) pipeline = KPIPipeline(config_path) @@ -169,10 +170,8 @@ def main() -> None: summarized = pipeline.predict_and_summarize( fit_model, predict_dates.copy(), observed_df ) - pipeline.write_results(fit_model, summarized, predict_dates.copy()) - - with open("main_model.pkl", "wb") as f: - pickle.dump(fit_model, f) + if will_write: + pipeline.write_results(fit_model, summarized, predict_dates.copy()) if __name__ == "__main__": diff --git a/jobs/kpi-forecasting/kpi_forecasting/inputs.py b/jobs/kpi-forecasting/kpi_forecasting/inputs.py index 14da5545..4271e47e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/inputs.py +++ b/jobs/kpi-forecasting/kpi_forecasting/inputs.py @@ -16,6 +16,13 @@ def __post_init__(self) -> None: self.parser.add_argument( "-c", "--config", type=str, help="Path to configuration yaml file" ) + self.parser.add_argument( + "--write", + type=bool, + help="If true, write results", + default=True, + action=argparse.BooleanOptionalAction, + ) self.args = self.parser.parse_args() From 24f6b34d41de696a945623260c57f96c10ee392b Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 9 Oct 2024 11:33:33 -0500 Subject: [PATCH 3/8] fix typo --- .circleci/config.yml | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 477fbd4c..67152e2a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -261,8 +261,8 @@ jobs: - run: name: integration test for kpi command: | - docker un app:build python ./kpi_forecasting.py -c ./kpi_forecasting/configs/dau_mobile.yaml --no-write - docker un app:build python ./kpi_forecasting.py -c ./kpi_forecasting/configs/dau_desktop.yaml --no-write + docker run app:build python ./kpi_forecasting.py -c ./kpi_forecasting/configs/dau_mobile.yaml --no-write + docker run app:build python ./kpi_forecasting.py -c ./kpi_forecasting/configs/dau_desktop.yaml --no-write build-job-mozaggregator2bq: From 685e92acc249749e58712fbfb82f00502bc4ca61 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 9 Oct 2024 11:40:37 -0500 Subject: [PATCH 4/8] revert CI --- .circleci/config.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 67152e2a..5a7c497c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -258,11 +258,6 @@ jobs: - run: name: Test Code command: docker run app:build pytest --ruff --ruff-format - - run: - name: integration test for kpi - command: | - docker run app:build python ./kpi_forecasting.py -c ./kpi_forecasting/configs/dau_mobile.yaml --no-write - docker run app:build python ./kpi_forecasting.py -c ./kpi_forecasting/configs/dau_desktop.yaml --no-write build-job-mozaggregator2bq: From 184675e3910fd7e91f7ee5382b4ec5138eddc55f Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 9 Oct 2024 11:42:41 -0500 Subject: [PATCH 5/8] updated README --- jobs/kpi-forecasting/README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index 2f4c51e4..cffc3299 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -66,8 +66,7 @@ install_name_tool -add_rpath /PATH/TO/CONDA/envs/kpi-forecasting-dev/lib/cmdstan ### Running locally A metric can be forecasted by using a command line argument that passes the relevant YAML file to the `kpi_forecasting.py` script. -[Here are approaches for accessing a Docker container's terminal](https://docs.docker.com/desktop/use-desktop/container/#integrated-terminal). - +[Here are approaches for accessing a Docker container's terminal](https://docs.docker.com/desktop/use-desktop/container/#integrated-terminal). The `--no-write` argument can also be passed to essentially run a test and ensure the pipeline runs end-to-end. For example, the following command forecasts Desktop DAU numbers: ```sh @@ -182,3 +181,8 @@ The forecast objects in this repo implement an interface similar to `sklearn` or The `BaseEnsembleForecast` makes it possible to fit multiple models over the data, where different subsets of the data have different models applied to them. These subsets are referred to as "segments" in the code. Only one kind of model is supported, and different instances of this model are fit over the different segments. The type of model is set by the `model_class` argument, and should be a class that implements the same interface as `BaseForecast`. The `fit` and `predict` methods in `BaseEnsembleForecast` determine which segment each row of incoming data belongs to and uses the `fit` and `predict` methods of the model class on the segment. This can be seen in the `FunnelForecast` object, which uses the `BaseEnsembleForecast` with `ProphetAutotunerForecast` as the model_class. 
+## Testing +Before merging, run the pipeline with the `--no-write` flag to ensure it runs end-to-end, IE: + +`python ./kpi_forecasting.py --no-write -c ./kpi_forecasting/configs/dau_mobile.yaml` + From 34fb8e0722a6b345cf97018b8db4ff7ceab5de05 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 9 Oct 2024 11:45:41 -0500 Subject: [PATCH 6/8] remove print --- jobs/kpi-forecasting/kpi_forecasting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index 2cd0292e..a5ee32ce 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -160,7 +160,6 @@ def main() -> None: # Load the config config_path = CLI().args.config will_write = CLI().args.write - print(will_write) pipeline = KPIPipeline(config_path) From 69d120ab0bc12451ce101f69762db1e3ccead77c Mon Sep 17 00:00:00 2001 From: Brad Ochocki Szasz Date: Wed, 9 Oct 2024 09:19:04 -0700 Subject: [PATCH 7/8] Revert "Implement SKLearn interface (#272)" (#284) This reverts commit b5740d8ecaa9b9d1e84499c015f31b4858851320. (cherry picked from commit 73e76dfd6489c4dcc11b9e99375fd32289a83722) --- jobs/kpi-forecasting/README.md | 84 +- jobs/kpi-forecasting/kpi_forecasting.py | 37 +- .../kpi_forecasting/configs/dau_desktop.yaml | 12 +- .../kpi_forecasting/configs/dau_mobile.yaml | 11 +- .../configs/search_forecasting_ad_clicks.yaml | 70 +- ...search_forecasting_daily_active_users.yaml | 64 +- .../search_forecasting_search_count.yaml | 62 +- .../kpi_forecasting/models/base_forecast.py | 370 ++- .../kpi_forecasting/models/funnel_forecast.py | 976 ++++--- .../models/prophet_forecast.py | 941 +++---- .../kpi_forecasting/pandas_extras.py | 33 +- .../tests/test_base_forecast.py | 651 ++--- .../tests/test_funnel_forecast.py | 2344 +++++++++-------- .../tests/test_pandas_extras.py | 158 -- .../tests/test_prophet_forecast.py | 1794 +++---------- 15 files changed, 3144 insertions(+), 4463 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index cffc3299..9ef09687 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -1,11 +1,6 @@ # KPI and other Metric Forecasting -This job forecasts [Metric Hub](https://mozilla.acryl.io/glossaryNode/urn:li:glossaryNode:Metric%20Hub/Contents?is_lineage_mode=false) metrics based on YAML configs defined in `.kpi-forecasting/configs`. The output destinations in BigQuery for each config can be found in the `write_results` section. Note that different configs can write to the same table. - -Currently the forecasts are all done by Prophet. There are two classes used: - - `models/prophet_forecast.py/ProphetForecast` Fits a single prophet model on the entire dataset, configured as specified in the config file - - `models/funnel_forecast.py/FunnelForecast` Fits multiple models based on what segment they fall into. Segments are defined in the `metric_hub.segments` in which columns in the data are specified to used for segmentation. The data is partitioned into subsets based on all the different combinations of values the specified columns can take. A subset of the parameters can be used to specify parameters for partitions with specific values on those parameters. For funnel forecast, the `parameters` section of the config is a list, each element of which specifies configuration to be applied to partitions where the columns and values within those columns have the values of the keys and values of the `parameters.segement` fields respectively. 
The segmentation functionality is defined in `models/base_forecast.py/BaseEnsembleForecast`. Additionally, funnel forecast has automatic hyperparameter tuning which is implemented by `models/funnel_forecast.py/ProphetAutotunerForecast`. - +This job forecasts [Metric Hub](https://mozilla.acryl.io/glossaryNode/urn:li:glossaryNode:Metric%20Hub/Contents?is_lineage_mode=false) metrics based on YAML configs defined in `.kpi-forecasting/configs`. # Usage @@ -43,11 +38,16 @@ Note that if the code changes, `docker compose build` needs to be re-run for `do ## Local Python ### Setup -You can also run the code outside of a Docker container. The code below shows to create a new environment +You can also run the code outside of a Docker container. The code below creates a new Conda environment called `kpi-forecasting-dev`. +It assumes you have Conda installed. If you'd like to run the code in a Jupyter notebook, it is handy to install Jupyter in your `base` environment. +The `ipykernel` commands below will ensure that the `kpi-forecasting-dev` environment is made available to Jupyter. + ```sh -pyenv virtualenv 3.9.17 -pyenv activate +conda create --name kpi-forecasting-dev python=3.10 pip ipykernel +conda activate kpi-forecasting-dev +ipython kernel install --name kpi-forecasting-dev --user pip install -r requirements.txt +conda deactivate ``` If you're running on an M1 Mac, there are [currently some additional steps](https://github.com/facebook/prophet/issues/2250#issuecomment-1317709209) that you'll need to take to get Prophet running. From within your python environment, run the following (making sure to update the path appropriately): @@ -107,47 +107,49 @@ metric_hub: # this configures the observed data fed to the model which is obtai partner: "partner" where: "partner = 'Google'" # filter to apply to the metric hub pull -forecast_model: # this section configures the model - forecast_start: NULL +forecast_model: # this section configures the model + model_type: "funnel" + # type of model object to use, current options are "funnel" for FunnelForecast and "prophet" for ProphetForecast + start_date: NULL # starting date for the predicted data (unless predict_historical_dates is set), # if unset, value depends on predict_historical_dates. - forecast_end: NULL + end_date: NULL # final date for the predicted data + use_all_us_holidays: False + For prophet-based models, when true, call `model.add_country_holidays(country_name="US")` on model predict_historical_dates: True # if predict_historical_dates is True, set to first date of the observed data # if predict_historical_dates is False, defaults to the day after the last day in the observed data + number_of_simulations: 1000 + # for prophet-based models,number of simulations to run parameters: # this section can be a map or a list. # If it's a map, these parameters are used for all models # (recall multiple models are train if there is a metric_hub.segments) # If it's a list, it will set different parameters # for different subsets of the parition specified in `metric_hub.segments`. 
- - segment: - # specifies which subset of the partitions this applies to - # key is a column specified in metric_hub.segments - # value is a value that column can take to which the configuration is applied + - segment: + # specifies which subset of the partitions this applies to + # key is a column specified in metric_hub.segments + # value is a value that column can take to which the configuration is applied device: desktop - start_date: "2018-01-01" - # start date specific to a segment, superceeds - # forecast_start_date - parameters: - holidays: ["easter", "covid_sip11"] - # holidays specified in `configs.model_inputs.holidays` to use. - regressors: ["post_esr_migration", "in_covid"] - # regressors specified in `configs.model_inputs.regressors` - use_all_us_holidays: False - grid_parameters: - # sets grid for hyperparameter tuning - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - # sets parameters for prophet cross-validation used in FunnelForecast - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + start_date: "2018-01-01" # only applies to FunnelForecast, allows one to set start date for each sub-model + end_date: NULL # only applies to FunnelForecast, allows one to set end date for each sub-model + holidays: ["easter", "covid_sip11"] # holidays specified in `configs.model_inputs.holidays` to use. + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] # regressors specified in `configs.model_inputs.regressors` + grid_parameters: + # sets grid for hyperparameter tuning + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] # parameter of prior distribution controlling how much the trend fluctuates at changepoints + changepoint_range: [0.8, 0.9] # the proportion of the time series over which the changepoints are distributed + n_changepoints: [25, 50] # number of trend changepoints, equally spaced over the time series + weekly_seasonality: True # if weekly seasonality is included in the model + yearly_seasonality: True # if yearly seasonality is included in the model + cv_settings: + # sets parameters for prophet cross-validation used in FunnelForecast + initial: "1296 days" # the initial training period, used to train the first iteration of the model for CV + period: "30 days" # spacing between cutoff dates, the sliding window over which each round of cross validation is performed + horizon: "30 days" # forecast horizon used to make predictions and calculate model fit metrics for optimization + parallel: "processes" # how parallelization is performed by Prophet, or None if no paralellization is used ... summarize: @@ -172,9 +174,13 @@ write_results: - `./kpi_forecasting/models` contains the forecasting models. This repo was designed to make it simple to add new forecasting models in the future. In general, a model needs to inherit -the `models.base_forecast.BaseForecast` class and to implement the `fit` and `predict` methods. +the `models.base_forecast.BaseForecast` class and to implement the `_fit` and `_predict` methods. Output from the `_fit` method will automatically be validated by `BaseForecast._validate_forecast_df`. + +One caveat is that, in order for aggregations over time periods to work (e.g. monthly forecasts), the `_predict` method must generate a number +of simulated timeseries. This enables the measurement of variation across a range of possible outcomes. This number is set by `BaseForecast.number_of_simulations`. 
When testing locally, be sure to modify any config files to use non-production `project` and `dataset` values that you have write access to; otherwise the `write_output` step will fail. +<<<<<<< HEAD ## Interface The forecast objects in this repo implement an interface similar to `sklearn` or `darts`. Every forecast method should have a `fit` method for fitting the forecast and `predict` method for making predictions. The signature of these functions can be seen in `models.base_forecast.BaseForecast`. @@ -186,3 +192,5 @@ Before merging, run the pipeline with the `--no-write` flag to ensure it runs en `python ./kpi_forecasting.py --no-write -c ./kpi_forecasting/configs/dau_mobile.yaml` +======= +>>>>>>> 73e76df (Revert "Implement SKLearn interface (#272)" (#284)) diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index a5ee32ce..645f714e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -1,19 +1,13 @@ +<<<<<<< HEAD import pandas as pd from datetime import datetime, timezone, timedelta import json +======= +>>>>>>> 73e76df (Revert "Implement SKLearn interface (#272)" (#284)) from kpi_forecasting.inputs import CLI, load_yaml -from kpi_forecasting.models.prophet_forecast import ( - ProphetForecast, - summarize as prophet_summarize, - write_results as prophet_write_results, - summarize_legacy as prophet_summarize_legacy, -) -from kpi_forecasting.models.funnel_forecast import ( - FunnelForecast, - summarize as funnel_summarize, - write_results as funnel_write_results, -) +from kpi_forecasting.models.prophet_forecast import ProphetForecast +from kpi_forecasting.models.funnel_forecast import FunnelForecast from kpi_forecasting.metric_hub import MetricHub @@ -24,6 +18,7 @@ } +<<<<<<< HEAD class KPIPipeline: def __init__(self, config_path): self.config_data = load_yaml(filepath=config_path) @@ -160,9 +155,18 @@ def main() -> None: # Load the config config_path = CLI().args.config will_write = CLI().args.write +======= +def main() -> None: + # Load the config + config = load_yaml(filepath=CLI().args.config) + model_type = config["forecast_model"]["model_type"] +>>>>>>> 73e76df (Revert "Implement SKLearn interface (#272)" (#284)) - pipeline = KPIPipeline(config_path) + if model_type in MODELS: + metric_hub = MetricHub(**config["metric_hub"]) + model = MODELS[model_type](metric_hub=metric_hub, **config["forecast_model"]) +<<<<<<< HEAD observed_df = pipeline.get_historical_data() fit_model = pipeline.fit(observed_df=observed_df) predict_dates = pipeline.get_predict_dates(observed_df) @@ -171,6 +175,15 @@ def main() -> None: ) if will_write: pipeline.write_results(fit_model, summarized, predict_dates.copy()) +======= + model.fit() + model.predict() + model.summarize(**config["summarize"]) + model.write_results(**config["write_results"]) + + else: + raise ValueError(f"Don't know how to forecast using {model_type}.") +>>>>>>> 73e76df (Revert "Implement SKLearn interface (#272)" (#284)) if __name__ == "__main__": diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml index 3476302c..0b8966f2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml @@ -1,5 +1,4 @@ --- -model_type: prophet metric_hub: app_name: "firefox_desktop" slug: "daily_active_users_v2" @@ -8,16 +7,17 @@ metric_hub: end_date: NULL forecast_model: - forecast_start: NULL - 
forecast_end: NULL + model_type: "prophet" + start_date: NULL + end_date: NULL + use_all_us_holidays: False predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.00825 changepoint_prior_scale: 0.15983 weekly_seasonality: True yearly_seasonality: True - use_all_us_holidays: False - summarize: periods: ["day", "month"] @@ -30,5 +30,3 @@ write_results: dataset: "telemetry_derived" dataset_legacy: "telemetry_derived" table: "kpi_forecasts_v0" - forecast_table_legacy: "kpi_automated_forecast_v1" - confidences_table_legacy: "kpi_automated_forecast_confidences_v1" diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml index 5ebd1686..c9288408 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml @@ -1,5 +1,4 @@ --- -model_type: prophet metric_hub: app_name: "multi_product" slug: "mobile_daily_active_users_v1" @@ -8,15 +7,17 @@ metric_hub: end_date: NULL forecast_model: - forecast_start: NULL - forecast_end: NULL + model_type: "prophet" + start_date: NULL + end_date: NULL + use_all_us_holidays: True predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.01 changepoint_prior_scale: 0.01 weekly_seasonality: True yearly_seasonality: True - use_all_us_holidays: True summarize: periods: ["day", "month"] @@ -29,5 +30,3 @@ write_results: dataset: "telemetry_derived" dataset_legacy: "telemetry_derived" table: "kpi_forecasts_v0" - forecast_table_legacy: "kpi_automated_forecast_v1" - confidences_table_legacy: "kpi_automated_forecast_confidences_v1" diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml index a08efd49..7a01aa15 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml @@ -1,5 +1,4 @@ --- -model_type: funnel metric_hub: app_name: "multi_product" slug: "search_forecasting_ad_clicks" @@ -14,47 +13,48 @@ metric_hub: where: "partner = 'Google'" forecast_model: - forecast_start: NULL - forecast_end: NULL + model_type: "funnel" + start_date: NULL + end_date: NULL + use_all_us_holidays: False predict_historical_dates: True + number_of_simulations: 1000 parameters: - - segment: + - segment: device: desktop start_date: "2018-01-01" - parameters: - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] - use_all_us_holidays: False - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - n_changepoints: [25, 50] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + n_changepoints: [25, 50] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" - segment: device: mobile start_date: "2022-01-01" - parameters: - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - 
use_all_us_holidays: False - grid_parameters: - changepoint_prior_scale: [.01, .1, .15, .2] - changepoint_range: [0.8, 0.9] - n_changepoints: [30] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [.01, .1, .15, .2] + changepoint_range: [0.8, 0.9, 1] + n_changepoints: [30] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml index e87472c2..dfb7bb49 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml @@ -1,5 +1,4 @@ --- -model_type: funnel metric_hub: app_name: "multi_product" slug: "search_forecasting_daily_active_users" @@ -14,44 +13,45 @@ metric_hub: where: "partner = 'Google'" forecast_model: - forecast_start: NULL - forecast_end: NULL + model_type: "funnel" + start_date: NULL + end_date: NULL + use_all_us_holidays: False predict_historical_dates: True + number_of_simulations: 1000 parameters: - segment: device: desktop start_date: "2018-01-01" - parameters: - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - use_all_us_holidays: False - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" - segment: device: mobile start_date: "2021-01-01" - parameters: - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - use_all_us_holidays: False - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] @@ -62,4 +62,4 @@ write_results: project: "moz-fx-data-shared-prod" dataset: "search_derived" table: "search_funnel_forecasts_v1" - components_table: "search_forecast_model_components_v1" \ No newline at end of file + components_table: "search_forecast_model_components_v1" diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml 
b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml index b1213874..17431247 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml @@ -1,5 +1,4 @@ --- -model_type: funnel metric_hub: app_name: "multi_product" slug: "search_forecasting_search_count" @@ -14,44 +13,45 @@ metric_hub: where: "partner = 'Google'" forecast_model: - forecast_start: NULL - forecast_end: NULL + model_type: "funnel" + start_date: NULL + end_date: NULL + use_all_us_holidays: False predict_historical_dates: True + number_of_simulations: 1000 parameters: - segment: device: desktop start_date: "2018-01-01" - parameters: - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - use_all_us_holidays: False - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" - segment: device: mobile start_date: "2020-01-01" - parameters: - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - use_all_us_holidays: False - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index a76db106..896051f8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -1,30 +1,88 @@ import json +import numpy as np import pandas as pd import abc -from dataclasses import dataclass -from typing import List -import logging -logger = logging.getLogger("cmdstanpy") -logger.addHandler(logging.NullHandler()) -logger.propagate = False -logger.setLevel(logging.CRITICAL) +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from kpi_forecasting.metric_hub import MetricHub +from typing import Dict, List @dataclass class BaseForecast(abc.ABC): """ - Abstract Base class for forecast objects + A base class for fitting, forecasting, and summarizing forecasts. This class + should not be invoked directly; it should be inherited by a child class. The + child class needs to implement `_fit` and `_forecast` methods in order to work. + + Args: + model_type (str): The name of the forecasting model that's being used. + parameters (Dict): Parameters that should be passed to the forecasting model. 
+ use_all_us_holidays (bool): Whether or not the forecasting model should use holidays. + The base model does not apply holiday logic; that logic needs to be built + in the child class. + start_date (str): A 'YYYY-MM-DD' formatted-string that specifies the first + date that should be forecsted. + end_date (str): A 'YYYY-MM-DD' formatted-string that specifies the last + date the metric should be queried. + metric_hub (MetricHub): A MetricHub object that provides details about the + metric to be forecasted. + predict_historical_dates (bool): If True, forecast starts at the first + date in the observed data. If False, it uses the value of start_date it set + and the first day after the observed data ends otherwise """ - @abc.abstractmethod - def _set_seed(self) -> None: - """Set random seed to ensure that fits and predictions are reproducible.""" - return NotImplementedError + model_type: str + parameters: Dict + use_all_us_holidays: bool + start_date: str + end_date: str + metric_hub: MetricHub + predict_historical_dates: bool = False + + def _get_observed_data(self): + if self.metric_hub: + # the columns in this dataframe + # are "value" for the metric, submission_date + # and any segments where the column name + # is the name of the segment + self.observed_df = self.metric_hub.fetch() + + def __post_init__(self) -> None: + # fetch observed observed data + self.collected_at = datetime.now(timezone.utc).replace(tzinfo=None) + self._get_observed_data() + + # raise an error is predict_historical_dates is True and start_date is set + if self.start_date and self.predict_historical_dates: + raise ValueError( + "forecast start_date set while predict_historical_dates is True" + ) + # use default start/end dates if the user doesn't specify them + self.start_date = pd.to_datetime(self.start_date or self._default_start_date) + self.end_date = pd.to_datetime(self.end_date or self._default_end_date) + self.dates_to_predict = pd.DataFrame( + {"submission_date": pd.date_range(self.start_date, self.end_date).date} + ) + + # initialize unset attributes + self.model = None + self.forecast_df = None + self.summary_df = None + + # metadata + self.metadata_params = json.dumps( + { + "model_type": self.model_type.lower(), + "model_params": self.parameters, + "use_all_us_holidays": self.use_all_us_holidays, + } + ) @abc.abstractmethod - def fit(self, observed_df: pd.DataFrame) -> object: + def _fit(self, observed_df: pd.DataFrame) -> None: """Fit a forecasting model using `observed_df.` This will typically be the data that was generated using Metric Hub in `__post_init__`. @@ -32,13 +90,11 @@ def fit(self, observed_df: pd.DataFrame) -> object: Args: observed_df (pd.DataFrame): observed data used to fit the model - - Returns: self """ raise NotImplementedError @abc.abstractmethod - def predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: """Forecast using `self.model` on dates in `dates_to_predict`. This method should return a dataframe that will be validated by `_validate_forecast_df`. @@ -59,206 +115,116 @@ def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: forecast_df (pd.DataFrame): dataframe produced by `_predict`""" raise NotImplementedError - -@dataclass -class BaseEnsembleForecast: - """ - A base class for forecasts that partition the data using the segments parameter - and fit a different model to each segment. 
The type of model used is the same for - all segments and is set with the model_class attribute - - Args: - parameters (Dict): Parameters that should be passed to the forecasting model. - model_class: Class to use to construct an ensemble - segments: segments from the metric hub data pull - """ - - parameters: List - model_class: object = BaseForecast - segments: dict = None - - def __post_init__(self) -> None: - # metadata - self.model_type = self.model_class.__class__.__name__.lower().replace( - "Forecast", "" - ) - self.metadata_params = json.dumps( - { - "model_type": self.model_type, - "model_params": self.parameters, - } - ) - - def _set_segment_models(self, observed_df: pd.DataFrame) -> None: - """Creates an element in the segment_models attribute for each segment specified in the - metric_hub.segments section of the config. It is populated from the list of - parameters in the forecast_model.parameters section of the configuration file. - The segements section of each element of the list specifies which values within which - segments the parameters are associated with. + @abc.abstractmethod + def _summarize( + self, + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + """Calculate summary metrics for `forecast_df` over a given period, and + add metadata. Args: - observed_df (pd.DataFrame): dataframe containing observed data used to model - must contain columns specified in the keys of the segments section of the config + forecast_df (pd.DataFrame): forecast dataframe created by `predict` + observed_df (pd.DataFrame): observed data used to generate prediction + period (str): aggregation period up to which metrics are aggregated + numpy_aggregations (List[str]): List of numpy aggregation names + percentiles (List[int]): List of percentiles to aggregate up to + + Returns: + pd.DataFrame: dataframe containing metrics listed in numpy_aggregations + and percentiles """ + raise NotImplementedError - # Construct a DataFrame containing all combination of segment x - ## in the observed_df - combination_df = observed_df[self.segments].drop_duplicates() - - # Construct dictionaries from those combinations - # this will be used to check that the config actually partitions the data - segment_combinations = combination_df.to_dict("records") - - # get subset of segment that is used in partitioning - split_dims = None - for partition in self.parameters: - partition_dim = set(partition["segment"].keys()) - if split_dims and partition_dim != split_dims: - raise ValueError( - "Segment keys are not the same across different elements of parameters in the config file" - ) - elif split_dims is None: - split_dims = partition_dim - else: - # this is case where split_dim is set and matches paritition_dim - continue - if not split_dims <= set(combination_df.keys()): - missing_dims = split_dims - set(combination_df.keys()) - missing_dims_str = ",".join(missing_dims) - raise ValueError( - f"Segment keys missing from metric hub segments: {missing_dims_str}" - ) + @property + def _default_start_date(self) -> str: + """The first day after the last date in the observed dataset.""" + if self.predict_historical_dates: + return self.observed_df["submission_date"].min() + else: + return self.observed_df["submission_date"].max() + timedelta(days=1) + + @property + def _default_end_date(self) -> str: + """78 weeks (18 months) ahead of the current UTC date.""" + return ( + datetime.now(timezone.utc).replace(tzinfo=None) + 
timedelta(weeks=78) + ).date() - # For each segment combinination, get the model parameters from the config - ## file. Parse the holidays and regressors specified in the config file. - segment_models = [] - for segment in segment_combinations: - # find the correct configuration - for partition in self.parameters: - partition_segment = partition["segment"] - selected_partition = None - # get subset of segment that is used to partition - subset_segment = { - key: val for key, val in segment.items() if key in split_dims - } - if partition_segment == subset_segment: - selected_partition = partition.copy() - break - if selected_partition is None: - raise ValueError("Partition not Found") - selected_partition["segment"] = segment - - if "start_date" in selected_partition: - start_date = pd.to_datetime(selected_partition["start_date"]).date() - else: - start_date = None - - # Create a FunnelSegmentModelSettings object for each segment combination - segment_models.append( - { - "model": self.model_class(**selected_partition["parameters"]), - "segment": segment, - "start_date": start_date, - } - ) - self.segment_models = segment_models + def _set_seed(self) -> None: + """Set random seed to ensure that fits and predictions are reproducible.""" + np.random.seed(42) - def filter_data_to_segment( - self, df: pd.DataFrame, segment: dict, start_date: str + def fit(self) -> None: + """Fit a model using historic metric data provided by `metric_hub`.""" + print(f"Fitting {self.model_type} model.", flush=True) + self._set_seed() + self.trained_at = datetime.now(timezone.utc).replace(tzinfo=None) + self._fit(self.observed_df) + + def predict(self) -> None: + """Generate a forecast from `start_date` to `end_date`. + Result is set to `self.forecast_df`""" + print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) + self._set_seed() + self.predicted_at = datetime.now(timezone.utc).replace(tzinfo=None) + self.forecast_df = self._predict(self.dates_to_predict) + self._validate_forecast_df(self.forecast_df) + + def summarize( + self, + periods: List[str] = ["day", "month"], + numpy_aggregations: List[str] = ["mean"], + percentiles: List[int] = [10, 50, 90], ) -> pd.DataFrame: - """function to filter data to the segment set in segment - and in time to only dates on or after start_date - - Args: - df (pd.DataFrame): dataframe to filter - segment (dict): dictionary where keys are columns and values - are the value that column takes for that segment - start_date (str): filter df so that the earliest date is start_date - - Returns: - pd.DataFrame: filtered dataframe """ - column_matches_segment = df[list(segment)] == pd.Series(segment) - row_in_segment = column_matches_segment.all(axis=1) - filter_array = row_in_segment - if start_date: - row_after_start = df["submission_date"] >= start_date - filter_array &= row_after_start - return df.loc[filter_array] - - def get_filtered_observed_data(self, observed_df: pd.DataFrame) -> pd.DataFrame: - """Returns the observed dataframe with time filters applied - to each segments data + Calculate summary metrics for `forecast_df` and add metadata. + The dataframe returned here will be reported in Big Query when + `write_results` is called. Args: - observed_df (pd.DataFrame): full observed dataframe + periods (List[str]): A list of the time periods that the data should be aggregated and + summarized by. 
For example ["day", "month"] + numpy_aggregations (List[str]): A list of numpy methods (represented as strings) that can + be applied to summarize numeric values in a numpy dataframe. For example, ["mean"]. + percentiles (List[int]): A list of integers representing the percentiles that should be reported + in the summary. For example [50] would calculate the 50th percentile (i.e. the median). Returns: - pd.DataFrame: filtered observed dataframe + pd.DataFrame: metric dataframe for all metrics and aggregations """ - observed_df_list = [] - for segment_model in self.segment_models: - observed_subset = self.filter_data_to_segment( - observed_df, segment_model["segment"], segment_model["start_date"] - ) - observed_df_list.append(observed_subset) - return pd.concat(observed_df_list) + summary_df = pd.concat( + [ + self._summarize( + self.forecast_df, + self.observed_df, + i, + numpy_aggregations, + percentiles, + ) + for i in periods + ] + ) - def fit(self, observed_df: pd.DataFrame) -> None: - """Fit models across all segments for the data in observed_df + # add Metric Hub metadata columns + summary_df["metric_alias"] = self.metric_hub.alias.lower() + summary_df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + summary_df["metric_hub_slug"] = self.metric_hub.slug.lower() + summary_df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + summary_df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + summary_df["metric_collected_at"] = self.collected_at - Args: - observed_df (pd.DataFrame): data used to fit - """ - print(f"Fitting {self.model_type} model.", flush=True) - # create list of models depending on whether there are segments or not - self._set_segment_models(observed_df) - for segment_model in self.segment_models: - print(segment_model["segment"]) - model = segment_model["model"] - model._set_seed() - observed_subset = self.filter_data_to_segment( - observed_df, segment_model["segment"], segment_model["start_date"] - ) - model.fit(observed_subset) - return self - - def predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: - """Generates a prediction for each segment for the dates in dates_to_predict + # add forecast model metadata columns + summary_df["forecast_start_date"] = self.start_date + summary_df["forecast_end_date"] = self.end_date + summary_df["forecast_trained_at"] = self.trained_at + summary_df["forecast_predicted_at"] = self.predicted_at + summary_df["forecast_parameters"] = self.metadata_params - Args: - dates_to_predict (pd.DataFrame): dataframe with a single column, - submission_date that is a string in `%Y-%m-%d` format + self.summary_df = summary_df - Returns: - pd.DataFrame: prediction across all segments - """ - start_date = dates_to_predict["submission_date"].iloc[0] - end_date = dates_to_predict["submission_date"].iloc[-1] - - print(f"Forecasting from {start_date} to {end_date}.", flush=True) - for segment_model in self.segment_models: - config_start_date = segment_model["start_date"] - - if config_start_date and config_start_date > start_date: - dates_to_predict_segment = ( - dates_to_predict[ - dates_to_predict["submission_date"] >= config_start_date - ] - .reset_index(drop=True) - .copy() - ) - else: - dates_to_predict_segment = dates_to_predict.copy() - - model = segment_model["model"] - model._set_seed() - predict_df = model.predict(dates_to_predict_segment) - - # add segments on as columns - for column, value in segment_model["segment"].items(): - predict_df[column] = value - predict_df["forecast_parameters"] = 
json.dumps(model._get_parameters()) - segment_model["forecast"] = predict_df - self.forecast_list = [x["forecast"] for x in self.segment_models] - return pd.concat(self.forecast_list) + return self.summary_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index a9c9998b..3c06863c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -1,48 +1,370 @@ from dataclasses import dataclass, field +from datetime import datetime import itertools -from typing import List import json +from typing import Dict, List, Union from google.cloud import bigquery from google.cloud.bigquery.enums import SqlTypeNames as bq_types import numpy as np import pandas as pd from pandas.api import types as pd_types +import prophet from prophet.diagnostics import cross_validation -from kpi_forecasting.models.prophet_forecast import ( - ProphetForecast, - aggregate_forecast_observed, +from kpi_forecasting.configs.model_inputs import ( + ProphetHoliday, + ProphetRegressor, + holiday_collection, + regressor_collection, ) -from kpi_forecasting.models.base_forecast import BaseEnsembleForecast +from kpi_forecasting.models.prophet_forecast import ProphetForecast @dataclass -class ProphetAutotunerForecast(ProphetForecast): - grid_parameters: dict = field(default_factory=dict) - cv_settings: dict = field(default_factory=dict) +class SegmentModelSettings: + """ + Holds the configuration and results for each segment + in a funnel forecasting model. + """ + + segment: Dict[str, str] + start_date: str + end_date: str + grid_parameters: Dict[str, Union[List[float], float]] + cv_settings: Dict[str, str] + holidays: list = field(default_factory=list[ProphetHoliday]) + regressors: list = field(default_factory=list[ProphetRegressor]) + + # Hold results as models are trained and forecasts made + segment_model: prophet.Prophet = None + trained_parameters: dict = field(default_factory=dict[str, str]) + forecast_df: pd.DataFrame = None + components_df: pd.DataFrame = None + + +@dataclass +class FunnelForecast(ProphetForecast): + """ + FunnelForecast class for generating and managing forecast models. The class handles + cases where forecasts for a combination of dimensions are required for a metric. + + Inherits from BaseForecast and provides methods for initializing forecast + parameters, building models, generating forecasts, summarizing results, + and writing results to BigQuery. + """ + + def __post_init__(self) -> None: + """ + Post-initialization method to set up necessary attributes and configurations. + + This method sets up the dates to predict, constructs segment combinations, + initializes models for each segment, and prepares attributes for storing results. + """ + super().__post_init__() + + if self.metric_hub is None: + # this is used to avoid the code below for testing purposes + return + + self._set_segment_models(self.observed_df, self.metric_hub.segments.keys()) + + # initialize unset attributes + self.components_df = None + + def _set_segment_models( + self, observed_df: pd.DataFrame, segment_column_list: list + ) -> None: + """Creates a SegmentSettings object for each segment specified in the + metric_hub.segments section of the config. It is populated from the list of + parameters in the forecast_model.parameters section of the configuration file. 
+ The segements section of each element of the list specifies which values within which + segments the parameters are associated with. + + Args: + observed_df (pd.DataFrame): dataframe containing observed data used to model + must contain columns specified in the keys of the segments section of the config + segment_column_list (list): list of columns of observed_df to use to determine segments + """ + # Construct a DataFrame containing all combination of segment values + ## in the observed_df + combination_df = observed_df[segment_column_list].drop_duplicates() + + # Construct dictionaries from those combinations + # this will be used to check that the config actually partitions the data + segment_combinations = combination_df.to_dict("records") + + # get subset of segment that is used in partitioning + split_dims = None + for partition in self.parameters: + partition_dim = set(partition["segment"].keys()) + if split_dims and partition_dim != split_dims: + raise ValueError( + "Segment keys are not the same across different elements of parameters in the config file" + ) + elif split_dims is None: + split_dims = partition_dim + else: + # this is case where split_dim is set and matches paritition_dim + continue + if not split_dims <= set(combination_df.keys()): + missing_dims = split_dims - set(combination_df.keys()) + missing_dims_str = ",".join(missing_dims) + raise ValueError( + f"Segment keys missing from metric hub segments: {missing_dims_str}" + ) + + # For each segment combinination, get the model parameters from the config + ## file. Parse the holidays and regressors specified in the config file. + segment_models = [] + for segment in segment_combinations: + # find the correct configuration + for partition in self.parameters: + partition_segment = partition["segment"] + # get subset of segment that is used to partition + subset_segment = { + key: val for key, val in segment.items() if key in split_dims + } + if partition_segment == subset_segment: + # parition is set to the desired value + # break out of loop + break + holiday_list = [] + regressor_list = [] + + if "holidays" in partition: + holiday_list = [holiday_collection[h] for h in partition["holidays"]] + if "regressors" in partition: + regressor_list = [ + regressor_collection[r] for r in partition["regressors"] + ] + + # Create a SegmentModelSettings object for each segment combination + segment_models.append( + SegmentModelSettings( + segment=segment, + start_date=partition["start_date"], + end_date=self.end_date, + holidays=[ProphetHoliday(**h) for h in holiday_list], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=dict(partition["grid_parameters"]), + cv_settings=dict(partition["cv_settings"]), + ) + ) + self.segment_models = segment_models + + @property + def column_names_map(self) -> Dict[str, str]: + """ + Map column names from the dataset to the names required by Prophet. + + Returns: + Dict[str, str]: Mapping of column names. + """ + return {"submission_date": "ds", "value": "y"} + + def _fill_regressor_dates(self, regressor: ProphetRegressor) -> ProphetRegressor: + """ + Fill missing start and end dates for a regressor. 
A ProphetRegressor can be created + without a 'start_date' or 'end_date' being supplied, so this checks for either date attr + being missing and fills in with the appropriate date: if 'start_date' is missing, it assumes + that the regressor starts at the beginning of the observed data; if 'end_date' is missing, + it assumes that the regressor should be filled until the end of the forecast period. + + Args: + regressor (ProphetRegressor): The regressor to fill dates for. - def _get_crossvalidation_metric(self, m: ProphetForecast) -> float: + Returns: + ProphetRegressor: The regressor with filled dates. + """ + + for date in ["start_date", "end_date"]: + if getattr(regressor, date) is None: + setattr(regressor, date, getattr(self, date)) + elif isinstance(getattr(regressor, date), str): + setattr(regressor, date, pd.to_datetime(getattr(regressor, date))) + + if regressor.end_date < regressor.start_date: + raise Exception( + f"Regressor {regressor.name} start date comes after end date" + ) + return regressor + + def _build_model( + self, + segment_settings: SegmentModelSettings, + parameters: Dict[str, Union[float, str, bool]], + ) -> prophet.Prophet: + """ + Build a Prophet model from parameters. + + Args: + segment_settings (SegmentModelSettings): The settings for the segment. + parameters (Dict[str, Union[float, str, bool]]): The parameters for the model. + + Returns: + prophet.Prophet: The Prophet model. + """ + if segment_settings.holidays: + parameters["holidays"] = pd.concat( + [ + pd.DataFrame( + { + "holiday": h.name, + "ds": pd.to_datetime(h.ds), + "lower_window": h.lower_window, + "upper_window": h.upper_window, + } + ) + for h in segment_settings.holidays + ], + ignore_index=True, + ) + + m = prophet.Prophet( + **parameters, + uncertainty_samples=self.number_of_simulations, + mcmc_samples=0, + ) + for regressor in segment_settings.regressors: + m.add_regressor( + regressor.name, + prior_scale=regressor.prior_scale, + mode=regressor.mode, + ) + + return m + + def _build_train_dataframe( + self, + observed_df, + segment_settings: SegmentModelSettings, + add_logistic_growth_cols: bool = False, + ) -> pd.DataFrame: + """ + Build the model dataframe for training + + Args: + observed_df: dataframe of observed data + segment_settings (SegmentModelSettings): The settings for the segment. + add_logistic_growth_cols (bool, optional): Whether to add logistic growth columns. Defaults to False. + + Returns: + pd.DataFrame: The dataframe for the model. 
+ """ + + # find indices in observed_df for rows that exactly match segment dict + segment_historical_indices = ( + observed_df[list(segment_settings.segment)] + == pd.Series(segment_settings.segment) + ).all(axis=1) + df = ( + observed_df.loc[ + (segment_historical_indices) + & ( # filter observed_df if segment start date > metric_hub start date + observed_df["submission_date"] + >= datetime.strptime(segment_settings.start_date, "%Y-%m-%d").date() + ) + ] + .rename(columns=self.column_names_map) + .copy() + ) + # define limits for logistic growth + if add_logistic_growth_cols: + df["floor"] = df["y"].min() * 0.5 + df["cap"] = df["y"].max() * 1.5 + + if segment_settings.regressors: + df = self._add_regressors(df, segment_settings.regressors) + return df + + def _build_predict_dataframe( + self, + dates_to_predict: pd.DataFrame, + segment_settings: SegmentModelSettings, + add_logistic_growth_cols: bool = False, + ) -> pd.DataFrame: + """creates dataframe used for prediction + + Args: + dates_to_predict (pd.DataFrame): dataframe of dates to predict + segment_settings (SegmentModelSettings): settings related to the segment + add_logistic_growth_cols (bool): Whether to add logistic growth columns. Defaults to False. + + + Returns: + pd.DataFrame: dataframe to use used in prediction + """ + # predict dataframe only needs dates to predict, logistic growth limits, and regressors + df = dates_to_predict.rename(columns=self.column_names_map).copy() + if add_logistic_growth_cols: + df["floor"] = segment_settings.trained_parameters["floor"] + df["cap"] = segment_settings.trained_parameters["cap"] + + if segment_settings.regressors: + df = self._add_regressors(df, segment_settings.regressors) + + return df + + def _fit(self, observed_df: pd.DataFrame) -> None: + """ + Fit and save a Prophet model for each segment combination. + + Args: + observed_df (pd.DataFrame): dataframe of observations. 
Expected to have columns + specified in the segments section of the config, + submission_date column with unique dates corresponding to each observation and + y column containing values of observations + """ + for segment_settings in self.segment_models: + parameters = self._auto_tuning(observed_df, segment_settings) + + # Initialize model; build model dataframe + add_log_growth_cols = ( + "growth" in parameters.keys() and parameters["growth"] == "logistic" + ) + test_dat = self._build_train_dataframe( + observed_df, segment_settings, add_log_growth_cols + ) + model = self._build_model(segment_settings, parameters) + + model.fit(test_dat) + if add_log_growth_cols: + # all values in these colunns are the same + parameters["floor"] = test_dat["floor"].values[0] + parameters["cap"] = test_dat["cap"].values[0] + + if "holidays" in parameters.keys(): + parameters["holidays"] = ( + parameters["holidays"]["holiday"].unique().tolist() + ) + segment_settings.trained_parameters = parameters + segment_settings.segment_model = model + + def _get_crossvalidation_metric( + self, m: prophet.Prophet, cv_settings: dict + ) -> float: """function for calculated the metric used for crossvalidation Args: - m (ProphetForecast): Prophet model for crossvalidation + m (prophet.Prophet): Prophet model for crossvalidation cv_settings (dict): settings set by segment in the config file Returns: - float: Metric which should always be positive and where smaller values - indicate better models + float: Metric where closer to zero means a better model """ - df_cv = cross_validation(m.model, **self.cv_settings) + df_cv = cross_validation(m, **cv_settings) df_bias = df_cv.groupby("cutoff")[["yhat", "y"]].sum().reset_index() df_bias["pcnt_bias"] = df_bias["yhat"] / df_bias["y"] - 1 # Prophet splits the historical data when doing cross validation using # cutoffs. The `.tail(3)` limits the periods we consider for the best # parameters to the 3 most recent cutoff periods. - return np.abs(df_bias.tail(3)["pcnt_bias"].mean()) + return df_bias.tail(3)["pcnt_bias"].mean() - def _auto_tuning(self, observed_df) -> ProphetForecast: + def _auto_tuning( + self, observed_df, segment_settings: SegmentModelSettings + ) -> Dict[str, float]: """ Perform automatic tuning of model parameters. @@ -52,90 +374,92 @@ def _auto_tuning(self, observed_df) -> ProphetForecast: specified in the segments section of the config, submission_date column with unique dates corresponding to each observation and y column containing values of observations + segment_settings (SegmentModelSettings): The settings for the segment. + Returns: - ProphetForecast: ProphetForecast that produced the best crossvalidation metric. + Dict[str, float]: The tuned parameters. 
""" + add_log_growth_cols = ( + "growth" in segment_settings.grid_parameters.keys() + and segment_settings.grid_parameters["growth"] == "logistic" + ) - for k, v in self.grid_parameters.items(): + for k, v in segment_settings.grid_parameters.items(): if not isinstance(v, list): - self.grid_parameters[k] = [v] + segment_settings.grid_parameters[k] = [v] - auto_param_grid = [ - dict(zip(self.grid_parameters.keys(), v)) - for v in itertools.product(*self.grid_parameters.values()) + param_grid = [ + dict(zip(segment_settings.grid_parameters.keys(), v)) + for v in itertools.product(*segment_settings.grid_parameters.values()) ] - set_params = self._get_parameters() - for param in self.grid_parameters: - set_params.pop(param) - - auto_param_grid = [dict(**el, **set_params) for el in auto_param_grid] - - bias = np.inf - best_model = None - best_params = None - for params in auto_param_grid: - m = ProphetForecast(**params) - m.fit(observed_df) - crossval_metric = self._get_crossvalidation_metric(m) - if crossval_metric < bias: - best_model = m - bias = crossval_metric - best_params = params - - # set the parameters of the current object - # to those of the optimized ProphetForecast object - for attr_name, best_value in best_params.items(): - setattr(self, attr_name, best_value) - if best_model.growth == "logistic": - # case where logistic growth is being used - # need to set some parameters used to make training and - # predict dfs - self.growth = "logistic" - self.logistic_growth_cap = best_model.logistic_growth_cap - self.logistic_growth_floor = best_model.logistic_growth_floor - if best_model.regressors is not None: - self.regressors = best_model.regressors - - return best_model.model - - def fit(self, observed_df: pd.DataFrame) -> object: - """Select the best fit model and set it to the model attribute + test_dat = self._build_train_dataframe( + observed_df, segment_settings, add_log_growth_cols + ) + bias = [] + + for params in param_grid: + m = self._build_model(segment_settings, params) + m.fit(test_dat) + + crossval_metric = self._get_crossvalidation_metric( + m, segment_settings.cv_settings + ) + bias.append(crossval_metric) + + min_abs_bias_index = np.argmin(np.abs(bias)) + + return param_grid[min_abs_bias_index] + + def _add_regressors(self, df: pd.DataFrame, regressors: List[ProphetRegressor]): + """ + Add regressor columns to the dataframe for training or prediction. Args: - observed_df (pd.DataFrame): observed data used to fit + df (pd.DataFrame): The input dataframe. + regressors (List[ProphetRegressor]): The list of regressors to add. + + Returns: + pd.DataFrame: The dataframe with regressors added. 
""" - # model returned by _auto_tuning is already fit - # don't need to set seed since it happens in the - # ProphetForecast object created in the auto_tuning - self.model = self._auto_tuning(observed_df) - train_dataframe = self._build_train_dataframe(observed_df) - self.history = train_dataframe - return self + for regressor in regressors: + regressor = self._fill_regressor_dates(regressor) + # finds rows where date is in regressor date ranges and sets that regressor + ## value to 0, else 1 + df[regressor.name] = ( + ~( + (df["ds"] >= pd.to_datetime(regressor.start_date).date()) + & (df["ds"] <= pd.to_datetime(regressor.end_date).date()) + ) + ).astype(int) + return df - def predict( - self, - dates_to_predict_raw: pd.DataFrame, + def _predict( + self, dates_to_predict_raw: pd.DataFrame, segment_settings: SegmentModelSettings ) -> pd.DataFrame: """ Generate forecast samples for a segment. Args: - dates_to_predict (pd.DataFrame): dataframe with a single column, - submission_date that is a string in `%Y-%m-%d` format + dates_to_predict (pd.DataFrame): dataframe of dates to predict + segment_settings (SegmentModelSettings): The settings for the segment. + Returns: pd.DataFrame: The forecasted values. """ + add_log_growth_cols = ( + "growth" in segment_settings.trained_parameters.keys() + and segment_settings.trained_parameters["growth"] == "logistic" + ) # add regressors, logistic growth limits (if applicable) to predict dataframe - self._set_seed() - dates_to_predict = self._build_predict_dataframe(dates_to_predict_raw) - self.predict_input = dates_to_predict + dates_to_predict = self._build_predict_dataframe( + dates_to_predict_raw, segment_settings, add_log_growth_cols + ) # draws samples from Prophet posterior distribution, to provide percentile predictions - samples = self.model.predictive_samples(dates_to_predict) + samples = segment_settings.segment_model.predictive_samples(dates_to_predict) df = pd.DataFrame(samples["yhat"]) - df["submission_date"] = dates_to_predict_raw["submission_date"].values - self._validate_forecast_df(df) + df["submission_date"] = dates_to_predict_raw component_cols = [ "ds", @@ -152,22 +476,23 @@ def predict( ] # use 'predict' method to return components from the Prophet model - components_df = self.model.predict(dates_to_predict)[component_cols] + components_df = segment_settings.segment_model.predict(dates_to_predict)[ + component_cols + ] # join observed data to components df, which allows for calc of intra-sample # error rates and how components resulted in those predictions. The `fillna` # call will fill the missing y values for forecasted dates, where only yhat # is available. - history_df = self.history[["ds", "y"]].copy() - history_df["ds"] = pd.to_datetime(history_df["ds"]) components_df = components_df.merge( - history_df, + segment_settings.segment_model.history[["ds", "y"]], on="ds", how="left", ).fillna(0) components_df.rename(columns={"ds": "submission_date"}, inplace=True) - self.components_df = components_df.copy() + segment_settings.components_df = components_df.copy() + return df def _validate_forecast_df(self, df: pd.DataFrame) -> None: @@ -193,239 +518,302 @@ def _validate_forecast_df(self, df: pd.DataFrame) -> None: f" but column {i} has type {df[i].dtypes}." ) + def _percentile_name_map(self, percentiles: List[int]) -> Dict[str, str]: + """ + Map percentiles to their corresponding names for the BQ table. 
-@dataclass -class FunnelForecast(BaseEnsembleForecast): - """ - Holds the configuration and results for each segment - in a funnel forecasting model. - """ + Args: + percentiles (List[int]): The list of percentiles. + + Returns: + Dict[str, str]: The mapping of percentile names. + """ + + percentiles.sort() + return { + f"p{percentiles[0]}": "value_low", + f"p{percentiles[1]}": "value_mid", + f"p{percentiles[2]}": "value_high", + "mean": "value", + } + + def _combine_forecast_observed( + self, + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List, + percentiles, + segment: dict, + ) -> pd.DataFrame: + """Calculate aggregates over the forecast and observed data + and concatenate the two dataframes + Args: + forecast_df (pd.DataFrame): forecast dataframe + observed_df (pd.DataFrame): observed dataframe + period (str): period to aggregate up to, must be in (day, month, year) + numpy_aggregations (List): List of aggregation functions to apply across samples from the + posterior-predictive distribution. Must take + in a numpy array and return a single value + percentiles: 3-element list of percentiles to calculate across samples from the posterior-predictive distribution + segment (dict): dictionary that lists columns and values corresponding to the segment + keys are the column name used to segment and values are the values + of that column corresponding to the current segment + + Returns: + pd.DataFrame: combined dataframe containing aggregated values from observed and forecast + """ + # filter the forecast data to just the data in the future + last_historic_date = observed_df["submission_date"].max() + forecast_df = forecast_df.loc[ + forecast_df["submission_date"] > last_historic_date + ] - model_class: object = ProphetAutotunerForecast + forecast_summarized, observed_summarized = self._aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles + ) - def __post_init__(self, *args, **kwargs): - super(FunnelForecast, self).__post_init__() - if not self.model_class == ProphetAutotunerForecast: - raise ValueError("model_class set when ProphetForecast is expected") + # add datasource-specific metadata columns + forecast_summarized["source"] = "forecast" + observed_summarized["source"] = "historical" - def _get_parameters(self): - parameter_dict = {} - for el in self.parameters: - parameter_dict[str(el["segment"])] = json.dumps(el) - return parameter_dict + # add segment columns to forecast table + for dim, value in segment.items(): + forecast_summarized[dim] = value + # rename forecast percentile to low, middle, high + # rename mean to value + forecast_summarized = forecast_summarized.rename( + columns=self._percentile_name_map(percentiles) + ) -def combine_forecast_observed( - forecast_summarized: pd.DataFrame, - observed_summarized: pd.DataFrame, -) -> pd.DataFrame: - """Combines the observed and forecast data as part of summarization - Args: - forecast_summarized (pd.DataFrame): forecast dataframe. 
This dataframe should include the segments as columns - as well as a forecast_parameters column with the forecast parameters - observed_summarized (pd.DataFrame): observed dataframe + # create a single dataframe that contains observed and forecasted data + df = pd.concat([observed_summarized, forecast_summarized]) + return df - Returns: - pd.DataFrame: combined dataframe containing aggregated values from observed and forecast - """ - # add datasource-specific metadata columns - forecast_summarized["source"] = "forecast" - observed_summarized["source"] = "historical" - - # create a single dataframe that contains observed and forecasted data - df = pd.concat([observed_summarized, forecast_summarized]) - return df - - -def summarize_with_parameters( - forecast_df: pd.DataFrame, - observed_df: pd.DataFrame, - period: str, - numpy_aggregations: List, - percentiles, - segment_cols: List[str], -) -> pd.DataFrame: - """Calculate aggregates over the forecast and observed data - and concatenate the two dataframes for a single set of parameters - Args: - forecast_df (pd.DataFrame): forecast dataframe. This dataframe should include the segments as columns - as well as a forecast_parameters column with the forecast parameters - observed_df (pd.DataFrame): observed dataframe - period (str): period to aggregate up to, must be in (day, month, year) - numpy_aggregations (List): List of aggregation functions to apply across samples from the - posterior-predictive distribution. Must take - in a numpy array and return a single value - percentiles: 3-element list of percentiles to calculate across samples from the posterior-predictive distribution - segment (dict): dictionary that lists columns and values corresponding to the segment - keys are the column name used to segment and values are the values - of that column corresponding to the current segment - - Returns: - pd.DataFrame: combined dataframe containing aggregated values from observed and forecast - """ - # filter the forecast data to just the data in the future - # note that if start_date is set, it is the applied to the start of observed_df - # and that it therefore doesn't need to be applied here - last_historic_date = observed_df["submission_date"].max() - forecast_df = forecast_df.loc[forecast_df["submission_date"] > last_historic_date] - - forecast_summarized, observed_summarized = aggregate_forecast_observed( - forecast_df, - observed_df, - period, - numpy_aggregations, - percentiles, - additional_aggregation_columns=segment_cols, - ) - percentile_name_map = { - f"p{percentiles[0]}": "value_low", - f"p{percentiles[1]}": "value_mid", - f"p{percentiles[2]}": "value_high", - "mean": "value", - } - - # rename forecast percentile to low, middle, high - # rename mean to value - forecast_summarized = forecast_summarized.rename(columns=percentile_name_map) - - df = combine_forecast_observed(forecast_summarized, observed_summarized) - - df["aggregation_period"] = period.lower() - - return df - - -def summarize( - forecast_df: pd.DataFrame, - observed_df: pd.DataFrame, - periods: List[str] = ["day", "month"], - numpy_aggregations: List[str] = ["mean"], - percentiles: List[int] = [10, 50, 90], - segment_cols: List[str] = [], -) -> None: - """ - Summarize the forecast results over specified periods. - - Args: - forecast_df (pd.DataFrame): forecast dataframe - observed_df (pd.DataFrame): observed data - periods (List[str], optional): The periods for summarization. Defaults to ["day", "month"]. 
- segment_cols (List of str): list of columns used for segmentation - numpy_aggregations (List[str], optional): The numpy aggregation functions. Defaults to ["mean"]. - percentiles (List[int], optional): The percentiles for summarization. Defaults to [10, 50, 90]. - """ - if len(percentiles) != 3: - raise ValueError( - """ - Can only pass a list of length 3 as percentiles, for lower, mid, and upper values. - """ + def _summarize( + self, + segment_settings: SegmentModelSettings, + period: str, + numpy_aggregations: List[str], + percentiles: List[int] = [10, 50, 90], + ) -> pd.DataFrame: + """ + Calculate summary metrics on a specific segment + for `forecast_df` over a given period, and add metadata. + + Args: + segment_settings (SegmentModelSettings): The settings for the segment. + period (str): The period for aggregation. + numpy_aggregations (List[str]): List of numpy aggregation functions. + percentiles (List[int]): List of percentiles. + + Returns: + pd.DataFrame: The summarized dataframe. + """ + if len(percentiles) != 3: + raise ValueError( + """ + Can only pass a list of length 3 as percentiles, for lower, mid, and upper values. + """ + ) + + # the start date for this segment's historical data, in cases where the full time series + ## of historical data is not used for model training + segment_observed_start_date = datetime.strptime( + segment_settings.start_date, "%Y-%m-%d" + ).date() + + # find indices in observed_df for rows that exactly match segment dict + segment_historical_indices = ( + self.observed_df[list(segment_settings.segment)] + == pd.Series(segment_settings.segment) + ).all(axis=1) + + segment_observed_df = self.observed_df.loc[ + (segment_historical_indices) + & (self.observed_df["submission_date"] >= segment_observed_start_date) + ].copy() + + df = self._combine_forecast_observed( + segment_settings.forecast_df, + segment_observed_df, + period, + numpy_aggregations, + percentiles, + segment_settings.segment, ) - summary_df = pd.concat( - [ - summarize_with_parameters( - forecast_df, - observed_df, - i, - numpy_aggregations, - percentiles, - segment_cols, + df["forecast_parameters"] = json.dumps(segment_settings.trained_parameters) + + # add summary metadata columns + df["aggregation_period"] = period.lower() + return df + + def predict(self) -> None: + """Generate a forecast from `start_date` to `end_date`.""" + print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) + self._set_seed() + self.predicted_at = datetime.utcnow() + + for segment_settings in self.segment_models: + forecast_df = self._predict(self.dates_to_predict, segment_settings) + self._validate_forecast_df(forecast_df) + + segment_settings.forecast_df = forecast_df + + def summarize( + self, + periods: List[str] = ["day", "month"], + numpy_aggregations: List[str] = ["mean"], + percentiles: List[int] = [10, 50, 90], + ) -> None: + """ + Summarize the forecast results over specified periods. + + Args: + periods (List[str], optional): The periods for summarization. Defaults to ["day", "month"]. + numpy_aggregations (List[str], optional): The numpy aggregation functions. Defaults to ["mean"]. + percentiles (List[int], optional): The percentiles for summarization. Defaults to [10, 50, 90]. 
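# --- Illustrative sketch (not part of the diff): the summarization path
# requires exactly three percentiles and renames the resulting sample
# aggregates to the BQ column names via _percentile_name_map. A standalone
# version of that mapping, using the default [10, 50, 90]:
percentiles = [10, 50, 90]
percentiles.sort()
name_map = {
    f"p{percentiles[0]}": "value_low",
    f"p{percentiles[1]}": "value_mid",
    f"p{percentiles[2]}": "value_high",
    "mean": "value",
}
print(name_map)
# {'p10': 'value_low', 'p50': 'value_mid', 'p90': 'value_high', 'mean': 'value'}
# --------------------------------------------------------------------------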
+ """ + summary_df_list = [] + components_df_list = [] + for segment in self.segment_models: + summary_df = pd.concat( + [ + self._summarize( + segment, + i, + numpy_aggregations, + percentiles, + ) + for i in periods + ] ) - for i in periods - ] - ) - - return summary_df - - -def write_results( - summary_df, - components_df, - segment_cols, - project: str, - dataset: str, - table: str, - write_disposition: str = "WRITE_APPEND", - components_table: str = "", - components_dataset: str = "", -) -> None: - """ - Write `self.summary_df` to Big Query. - - Args: - project (str): The Big Query project that the data should be written to. - dataset (str): The Big Query dataset that the data should be written to. - table (str): The Big Query table that the data should be written to. - write_disposition (str, optional): In the event that the destination table exists, - should the table be overwritten ("WRITE_TRUNCATE") or appended to ("WRITE_APPEND")? Defaults to "WRITE_APPEND". - components_table (str, optional): The Big Query table for model components. Defaults to "". - components_dataset (str, optional): The Big Query dataset for model components. Defaults to "". - """ - print( - f"Writing results to `{project}.{dataset}.{table}`.", - flush=True, - ) - client = bigquery.Client(project=project) - schema = [ - bigquery.SchemaField("submission_date", bq_types.DATE), - *[bigquery.SchemaField(k, bq_types.STRING) for k in segment_cols], - bigquery.SchemaField("aggregation_period", bq_types.STRING), - bigquery.SchemaField("source", bq_types.STRING), - bigquery.SchemaField("value", bq_types.FLOAT), - bigquery.SchemaField("value_low", bq_types.FLOAT), - bigquery.SchemaField("value_mid", bq_types.FLOAT), - bigquery.SchemaField("value_high", bq_types.FLOAT), - bigquery.SchemaField("metric_alias", bq_types.STRING), - bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), - bigquery.SchemaField("metric_hub_slug", bq_types.STRING), - bigquery.SchemaField("metric_start_date", bq_types.DATE), - bigquery.SchemaField("metric_end_date", bq_types.DATE), - bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_start_date", bq_types.DATE), - bigquery.SchemaField("forecast_end_date", bq_types.DATE), - bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ] - job = client.load_table_from_dataframe( - dataframe=summary_df, - destination=f"{project}.{dataset}.{table}", - job_config=bigquery.LoadJobConfig( - schema=schema, - autodetect=False, - write_disposition=write_disposition, - ), - ) - # Wait for the job to complete. 
- job.result() - - if components_table: - numeric_cols = list(components_df.select_dtypes(include=float).columns) - string_cols = list(components_df.select_dtypes(include=object).columns) + for dim, dim_value in segment.segment.items(): + segment.components_df[dim] = dim_value + summary_df_list.append(summary_df.copy(deep=True)) + components_df_list.append(segment.components_df) + del summary_df - schema = [ - bigquery.SchemaField("submission_date", bq_types.DATE), - bigquery.SchemaField("metric_slug", bq_types.STRING), - bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), - ] - schema += [bigquery.SchemaField(col, bq_types.STRING) for col in string_cols] - schema += [bigquery.SchemaField(col, bq_types.FLOAT) for col in numeric_cols] + df = pd.concat(summary_df_list, ignore_index=True) + + # add Metric Hub metadata columns + df["metric_alias"] = self.metric_hub.alias.lower() + df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + df["metric_hub_slug"] = self.metric_hub.slug.lower() + df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + df["metric_collected_at"] = self.collected_at + + # add forecast model metadata columns + df["forecast_start_date"] = self.start_date + df["forecast_end_date"] = self.end_date + df["forecast_trained_at"] = self.trained_at + df["forecast_predicted_at"] = self.predicted_at - if not components_dataset: - components_dataset = dataset + self.summary_df = df + + self.components_df = pd.concat(components_df_list, ignore_index=True) + + def write_results( + self, + project: str, + dataset: str, + table: str, + write_disposition: str = "WRITE_APPEND", + components_table: str = "", + components_dataset: str = "", + ) -> None: + """ + Write `self.summary_df` to Big Query. + + Args: + project (str): The Big Query project that the data should be written to. + dataset (str): The Big Query dataset that the data should be written to. + table (str): The Big Query table that the data should be written to. + write_disposition (str, optional): In the event that the destination table exists, + should the table be overwritten ("WRITE_TRUNCATE") or appended to ("WRITE_APPEND")? Defaults to "WRITE_APPEND". + components_table (str, optional): The Big Query table for model components. Defaults to "". + components_dataset (str, optional): The Big Query dataset for model components. Defaults to "". 
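# --- Illustrative sketch (not part of the diff): the components write above
# derives its BigQuery schema from dataframe dtypes, with float columns
# becoming FLOAT fields and object (string) columns becoming STRING fields on
# top of a fixed set of metadata fields. The toy dataframe and its column
# names are hypothetical.
import pandas as pd
from google.cloud import bigquery
from google.cloud.bigquery.enums import SqlTypeNames as bq_types

components_df = pd.DataFrame({"trend": [1.0, 2.0], "device": ["desktop", "mobile"]})

numeric_cols = list(components_df.select_dtypes(include=float).columns)
string_cols = list(components_df.select_dtypes(include=object).columns)

schema = [
    bigquery.SchemaField("submission_date", bq_types.DATE),
    bigquery.SchemaField("metric_slug", bq_types.STRING),
    bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP),
]
schema += [bigquery.SchemaField(col, bq_types.STRING) for col in string_cols]
schema += [bigquery.SchemaField(col, bq_types.FLOAT) for col in numeric_cols]
# `schema` is then passed to bigquery.LoadJobConfig(schema=..., autodetect=False)
# --------------------------------------------------------------------------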
+ """ print( - f"Writing model components to `{project}.{components_dataset}.{components_table}`.", + f"Writing results to `{project}.{dataset}.{table}`.", flush=True, ) - + client = bigquery.Client(project=project) + schema = [ + bigquery.SchemaField("submission_date", bq_types.DATE), + *[ + bigquery.SchemaField(k, bq_types.STRING) + for k in self.metric_hub.segments.keys() + ], + bigquery.SchemaField("aggregation_period", bq_types.STRING), + bigquery.SchemaField("source", bq_types.STRING), + bigquery.SchemaField("value", bq_types.FLOAT), + bigquery.SchemaField("value_low", bq_types.FLOAT), + bigquery.SchemaField("value_mid", bq_types.FLOAT), + bigquery.SchemaField("value_high", bq_types.FLOAT), + bigquery.SchemaField("metric_alias", bq_types.STRING), + bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), + bigquery.SchemaField("metric_hub_slug", bq_types.STRING), + bigquery.SchemaField("metric_start_date", bq_types.DATE), + bigquery.SchemaField("metric_end_date", bq_types.DATE), + bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_start_date", bq_types.DATE), + bigquery.SchemaField("forecast_end_date", bq_types.DATE), + bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ] job = client.load_table_from_dataframe( - dataframe=components_df, - destination=f"{project}.{components_dataset}.{components_table}", + dataframe=self.summary_df, + destination=f"{project}.{dataset}.{table}", job_config=bigquery.LoadJobConfig( schema=schema, autodetect=False, write_disposition=write_disposition, ), ) - + # Wait for the job to complete. job.result() + + if components_table: + numeric_cols = list(self.components_df.select_dtypes(include=float).columns) + string_cols = list(self.components_df.select_dtypes(include=object).columns) + self.components_df["metric_slug"] = self.metric_hub.slug + self.components_df["forecast_trained_at"] = self.trained_at + + schema = [ + bigquery.SchemaField("submission_date", bq_types.DATE), + bigquery.SchemaField("metric_slug", bq_types.STRING), + bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), + ] + schema += [ + bigquery.SchemaField(col, bq_types.STRING) for col in string_cols + ] + schema += [ + bigquery.SchemaField(col, bq_types.FLOAT) for col in numeric_cols + ] + + if not components_dataset: + components_dataset = dataset + print( + f"Writing model components to `{project}.{components_dataset}.{components_table}`.", + flush=True, + ) + + job = client.load_table_from_dataframe( + dataframe=self.components_df, + destination=f"{project}.{components_dataset}.{components_table}", + job_config=bigquery.LoadJobConfig( + schema=schema, + autodetect=False, + write_disposition=write_disposition, + schema_update_options=[ + bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION + ], + ), + ) + + job.result() diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 26e1cd2c..82a07fc4 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -3,339 +3,65 @@ from pandas.api import types as pd_types import prophet import numpy as np -from dataclasses import dataclass, field from typing import Dict, List from datetime import datetime, timezone +from dataclasses import dataclass from 
kpi_forecasting.models.base_forecast import BaseForecast from kpi_forecasting import pandas_extras as pdx from google.cloud import bigquery from google.cloud.bigquery.enums import SqlTypeNames as bq_types -from kpi_forecasting.configs.model_inputs import ( - ProphetHoliday, - ProphetRegressor, - holiday_collection, - regressor_collection, -) - @dataclass class ProphetForecast(BaseForecast): - """ - Holds the configuration and results for each segment - in a funnel forecasting model. - - Args: - holidays (list): list of ProphetHoliday objects used - to specify holidays in the Propohet model. Used to create - the dataframe passed to prophet under the holidays key - regressors (list): list of ProphetRegressor objects, - used to set regressors in the Prophet object and - create them in the data - use_all_us_holidays (bool): When True, `model.add_country_holidays(country_name="US")` - is called on the prophet model - growth (str): Used in Prophet object initialization - 'linear', 'logistic' or 'flat' to specify a linear, logistic or - flat trend. - changepoints (list): Used in Prophet object initialization - List of dates at which to include potential changepoints. If - not specified, potential changepoints are selected automatically. - n_changepoints (int): Used in Prophet object initialization - Number of potential changepoints to include. Not used - if input `changepoints` is supplied. If `changepoints` is not supplied, - then n_changepoints potential changepoints are selected uniformly from - the first `changepoint_range` proportion of the history. - changepoint_range (float): Used in Prophet object initialization - Proportion of history in which trend changepoints will - be estimated. Defaults to 0.8 for the first 80%. Not used if - `changepoints` is specified. - yearly_seasonality: Used in Prophet object initialization - Fit yearly seasonality. - Can be 'auto', True, False, or a number of Fourier terms to generate. - weekly_seasonality : Used in Prophet object initialization - Fit weekly seasonality. - Can be 'auto', True, False, or a number of Fourier terms to generate. - daily_seasonality: Used in Prophet object initialization - Fit daily seasonality. - Can be 'auto', True, False, or a number of Fourier terms to generate. - seasonality_mode: Used in Prophet object initialization - 'additive' (default) or 'multiplicative'. - seasonality_prior_scale: Used in Prophet object initialization - Parameter modulating the strength of the - seasonality model. Larger values allow the model to fit larger seasonal - fluctuations, smaller values dampen the seasonality. Can be specified - for individual seasonalities using add_seasonality. - holidays_prior_scale: Used in Prophet object initialization - Parameter modulating the strength of the holiday - components model, unless overridden in the holidays input. - changepoint_prior_scale: Used in Prophet object initialization - Parameter modulating the flexibility of the - automatic changepoint selection. Large values will allow many - changepoints, small values will allow few changepoints. - mcmc_samples (int): Used in Prophet object initialization - If greater than 0, will do full Bayesian inference - with the specified number of MCMC samples. If 0, will do MAP - estimation. - interval_width (float): Used in Prophet object initialization - width of the uncertainty intervals provided - for the forecast. If mcmc_samples=0, this will be only the uncertainty - in the trend using the MAP estimate of the extrapolated generative - model. 
If mcmc.samples>0, this will be integrated over all model - parameters, which will include uncertainty in seasonality. - uncertainty_samples: Used in Prophet object initialization - Number of simulated draws used to estimate - uncertainty intervals. Settings this value to 0 or False will disable - uncertainty estimation and speed up the calculation. - stan_backend (str): Used in Prophet object initialization - str as defined in StanBackendEnum default: None - will try to - iterate over all available backends and find the working one - holidays_mode (str): Used in Prophet object initialization - 'additive' or 'multiplicative'. Defaults to seasonality_mode. - """ - - holidays: list = field(default_factory=list[ProphetHoliday]) - regressors: list = field(default_factory=list[ProphetRegressor]) - use_all_us_holidays: bool = False - - # these are the arguments used to initialize the Prophet object - growth: str = "linear" - changepoints: list = None - n_changepoints: int = 25 - changepoint_range: float = 0.8 - yearly_seasonality: str = "auto" - weekly_seasonality: str = "auto" - daily_seasonality: str = "auto" - holidays: pd.DataFrame = None - seasonality_mode: str = "additive" - seasonality_prior_scale: float = 10.0 - holidays_prior_scale: float = 10.0 - changepoint_prior_scale: float = 0.05 - mcmc_samples: int = 0 - interval_width: float = 0.80 - uncertainty_samples: int = 1000 - stan_backend: str = None - scaling: str = "absmax" - holidays_mode: str = None - floor: float = None - cap: float = None - - def __post_init__(self): - holiday_list = [] - regressor_list = [] - - if self.holidays == []: - self.holidays = None - self.holidays_raw = None - elif not self.holidays: - self.holidays_raw = None - elif self.holidays: - self.holidays_raw = self.holidays - holiday_list = [ - ProphetHoliday(**holiday_collection[h]) for h in self.holidays - ] - holiday_df = pd.concat( - [ - pd.DataFrame( - { - "holiday": h.name, - "ds": pd.to_datetime(h.ds), - "lower_window": h.lower_window, - "upper_window": h.upper_window, - } - ) - for h in holiday_list - ], - ignore_index=True, - ) - self.holidays = holiday_df - if self.regressors: - self.regressors_raw = self.regressors - regressor_list = [ - ProphetRegressor(**regressor_collection[r]) for r in self.regressors - ] - self.regressors = regressor_list - else: - self.regressors_raw = None + """Forecast object specifically for prophet forecast models - self.model = self._build_model() - self.logistic_growth_cap = self.cap - self.logistic_growth_floor = self.floor + Additional attributes: + number_of_simulations (int): The number of simulated timeseries that the forecast + should generate. Since many forecast models are probablistic, this enables the + measurement of variation across a range of possible outcomes. + """ - def _build_model(self) -> prophet.Prophet: - """ - Build a Prophet model from parameters using attributes set on initialization + number_of_simulations: int = 1000 - Returns: - prophet.Prophet: The Prophet model. 
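# --- Illustrative sketch (not part of the diff): the refactored _build_model
# simply unpacks the configured parameter dict into prophet.Prophet, pinning
# uncertainty_samples to number_of_simulations and mcmc_samples to 0. A
# minimal standalone version, with a hypothetical parameter dict:
import prophet

parameter_dict = {"growth": "linear", "changepoint_prior_scale": 0.05}
number_of_simulations = 1000

model = prophet.Prophet(
    **parameter_dict,
    uncertainty_samples=number_of_simulations,
    mcmc_samples=0,
)
# mirrors use_all_us_holidays=True in the config
model.add_country_holidays(country_name="US")
# --------------------------------------------------------------------------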
- """ + @property + def column_names_map(self) -> Dict[str, str]: + return {"submission_date": "ds", "value": "y"} + def _build_model(self, parameter_dict): model = prophet.Prophet( - growth=self.growth, - changepoints=self.changepoints, - n_changepoints=self.n_changepoints, - changepoint_range=self.changepoint_range, - yearly_seasonality=self.yearly_seasonality, - weekly_seasonality=self.weekly_seasonality, - daily_seasonality=self.daily_seasonality, - holidays=self.holidays, - seasonality_mode=self.seasonality_mode, - seasonality_prior_scale=self.seasonality_prior_scale, - holidays_prior_scale=self.holidays_prior_scale, - changepoint_prior_scale=self.changepoint_prior_scale, - mcmc_samples=self.mcmc_samples, - interval_width=self.interval_width, - uncertainty_samples=self.uncertainty_samples, - stan_backend=self.stan_backend, - scaling=self.scaling, - holidays_mode=self.holidays_mode, + **parameter_dict, + uncertainty_samples=self.number_of_simulations, + mcmc_samples=0, ) - for regressor in self.regressors: - model.add_regressor( - regressor.name, - prior_scale=regressor.prior_scale, - mode=regressor.mode, - ) - if self.use_all_us_holidays: model.add_country_holidays(country_name="US") return model - def _get_parameters(self) -> Dict: - """Return parameters used to create a new, identical ProphetForecast object""" - - # holidays and regressors get modified so use the - # raw version so these values could be used to create a new - # ProphetForecast without throwing an error - return { - "growth": self.growth, - "changepoints": self.changepoints, - "n_changepoints": self.n_changepoints, - "changepoint_range": self.changepoint_range, - "yearly_seasonality": self.yearly_seasonality, - "weekly_seasonality": self.weekly_seasonality, - "daily_seasonality": self.daily_seasonality, - "holidays": self.holidays_raw, - "seasonality_mode": self.seasonality_mode, - "seasonality_prior_scale": self.seasonality_prior_scale, - "holidays_prior_scale": self.holidays_prior_scale, - "changepoint_prior_scale": self.changepoint_prior_scale, - "mcmc_samples": self.mcmc_samples, - "interval_width": self.interval_width, - "uncertainty_samples": self.uncertainty_samples, - "stan_backend": self.stan_backend, - "scaling": self.scaling, - "holidays_mode": self.holidays_mode, - "cap": self.logistic_growth_cap, - "floor": self.logistic_growth_floor, - "regressors": self.regressors_raw, - } - - @property - def column_names_map(self) -> Dict[str, str]: - return {"submission_date": "ds", "value": "y"} - - def _add_regressors(self, df: pd.DataFrame, regressors: List[ProphetRegressor]): - """ - Add regressor columns to the dataframe for training or prediction. - - Args: - df (pd.DataFrame): The input dataframe. - regressors (List[ProphetRegressor]): The list of regressors to add. - - Returns: - pd.DataFrame: The dataframe with regressors added. 
- """ - for regressor in regressors: - regressor_time_filter = [True] * len(df) - if regressor.start_date: - regressor_time_filter &= ( - df["ds"] >= pd.to_datetime(regressor.start_date).date() - ) - if regressor.end_date: - regressor_time_filter &= ( - df["ds"] <= pd.to_datetime(regressor.end_date).date() - ) - # finds rows where date is in regressor date ranges and sets that regressor - ## value to 0, else 1 - df[regressor.name] = (~(regressor_time_filter)).astype(int) - return df - - def _set_seed(self) -> None: - """Set random seed to ensure that fits and predictions are reproducible.""" - np.random.seed(42) - - def _build_train_dataframe(self, observed_df) -> pd.DataFrame: - """ - Build the model dataframe for training - - Args: - observed_df: dataframe of observed data - - Returns: - pd.DataFrame: The dataframe for the model. - """ - - # define limits for logistic growth - observed_df = observed_df.rename(columns=self.column_names_map) - - if self.growth == "logistic": - self.logistic_growth_floor = observed_df["y"].min() * 0.5 - observed_df["floor"] = self.logistic_growth_floor - self.logistic_growth_cap = observed_df["y"].max() * 1.5 - observed_df["cap"] = self.logistic_growth_cap - - if self.regressors: - observed_df = self._add_regressors(observed_df, self.regressors) - - return observed_df - - def _build_predict_dataframe(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: - """creates dataframe used for prediction - - Args: - dates_to_predict (pd.DataFrame): dataframe of dates to predict - - Returns: - pd.DataFrame: dataframe to use used in prediction - """ - # predict dataframe only needs dates to predict, logistic growth limits, and regressors - df = dates_to_predict.rename(columns=self.column_names_map).copy() - if self.growth == "logistic": - df["floor"] = self.logistic_growth_floor - df["cap"] = self.logistic_growth_cap - - if self.regressors: - df = self._add_regressors(df, self.regressors) + def _fit(self, observed_df) -> None: + self.model = self._build_model(self.parameters) - return df - - def fit(self, observed_df) -> None: # Modify observed data to have column names that Prophet expects, and fit # the model - self._set_seed() - train_dataframe = self._build_train_dataframe(observed_df) - self.model.fit(train_dataframe) - return self + self.model.fit(observed_df.rename(columns=self.column_names_map)) - def predict(self, dates_to_predict) -> pd.DataFrame: + def _predict(self, dates_to_predict) -> pd.DataFrame: # generate the forecast samples - self._set_seed() samples = self.model.predictive_samples( dates_to_predict.rename(columns=self.column_names_map) ) df = pd.DataFrame(samples["yhat"]) df["submission_date"] = dates_to_predict - self._validate_forecast_df(df, dates_to_predict) return df - def _validate_forecast_df(self, df, dates_to_predict) -> None: + def _validate_forecast_df(self, df) -> None: """Validate that `self.forecast_df` has been generated correctly.""" columns = df.columns - expected_shape = (len(dates_to_predict), 1 + self.uncertainty_samples) + expected_shape = (len(self.dates_to_predict), 1 + self.number_of_simulations) numeric_columns = df.drop(columns="submission_date").columns if "submission_date" not in columns: @@ -346,41 +72,44 @@ def _validate_forecast_df(self, df, dates_to_predict) -> None: f"Expected forecast_df to have shape {expected_shape}, but it has shape {df.shape}." 
) - if not df["submission_date"].equals(dates_to_predict["submission_date"]): + if not df["submission_date"].equals(self.dates_to_predict["submission_date"]): raise ValueError( "forecast_df['submission_date'] does not match dates_to_predict['submission_date']." ) for i in numeric_columns: - if not pd_types.is_numeric_dtype(df[i]): + if not pd_types.is_numeric_dtype(self.forecast_df[i]): raise ValueError( "All forecast_df columns except 'submission_date' must be numeric," f" but column {i} has type {df[i].dtypes}." ) - def _predict_legacy( - self, dates_to_predict, metric_hub_alias, parameters - ) -> pd.DataFrame: + def _predict_legacy(self) -> pd.DataFrame: """ Recreate the legacy format used in `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_v1`. """ # TODO: This method should be removed once the forecasting data model is updated: # https://mozilla-hub.atlassian.net/browse/DS-2676 - df = self.model.predict(self._build_predict_dataframe(dates_to_predict)) + + df = self.model.predict( + self.dates_to_predict.rename(columns=self.column_names_map) + ) # set legacy column values - if "dau" in metric_hub_alias.lower(): + if "dau" in self.metric_hub.alias.lower(): df["metric"] = "DAU" else: - df["metric"] = metric_hub_alias + df["metric"] = self.metric_hub.alias df["forecast_date"] = str( datetime.now(timezone.utc).replace(tzinfo=None).date() ) - df["forecast_parameters"] = str(json.dumps(parameters)) + df["forecast_parameters"] = str( + json.dumps({**self.parameters, "holidays": self.use_all_us_holidays}) + ) - alias = metric_hub_alias.lower() + alias = self.metric_hub.alias.lower() if ("desktop" in alias) and ("mobile" in alias): raise ValueError( @@ -436,363 +165,281 @@ def _predict_legacy( return df[columns] + def _aggregate_forecast_observed( + self, + forecast_df, + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ): + # build a list of all functions that we'll summarize the data by + aggregations = [getattr(np, i) for i in numpy_aggregations] + aggregations.extend([pdx.percentile(i) for i in percentiles]) + + # aggregate metric to the correct date period (day, month, year) + observed_summarized = pdx.aggregate_to_period(observed_df, period) + forecast_agg = pdx.aggregate_to_period(forecast_df, period).sort_values( + "submission_date" + ) -def aggregate_forecast_observed( - forecast_df: pd.DataFrame, - observed_df: pd.DataFrame, - period: str, - numpy_aggregations: List[str], - percentiles: List[int], - additional_aggregation_columns: List[str] = [], -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Aggregate samples produced by prophet to aggregates specified in - numpy_aggregations and percentiles, and aggregate in time up to the period - specified in period. Aggregation will include any columns passed to - additional_aggergation_columns - - Args: - forecast_df (pd.DataFrame): raw output of the predict function from ProphetForecast or - one of it's child classes - observed_df (_type_): raw input of the fit function from ProphetForecast or - one of it's child classes - period (str): period to aggregate to in time. Must be 'day', 'month' or 'year' - numpy_aggregations (List[str]): aggregates from numpy to use. Can be the name of any - numpy function that outputs a single value - percentiles (List[int]): list of number for which the percentile should be generated - additional_aggregation_columns (List[str], optional): - additional columns to use in the aggregation. Defaults to []. 
- - Returns: - forecast_summarized (pd.DataFrame): summarized forecast data - observed_summarized (pd.DataFrame): summarized observed data - """ - # build a list of all functions that we'll summarize the data by - aggregations = [getattr(np, i) for i in numpy_aggregations] - aggregations.extend([pdx.percentile(i) for i in percentiles]) + # find periods of overlap between observed and forecasted data + # merge preserves key order so overlap will be sorted by submission_date + overlap = forecast_agg.merge( + observed_summarized, + on="submission_date", + how="left", + ).fillna(0) + + forecast_summarized = ( + forecast_agg.set_index("submission_date") + # Add observed data samples to any overlapping forecasted period. This + # ensures that any forecast made partway through a period accounts for + # previously observed data within the period. For example, when a monthly + # forecast is generated in the middle of the month. + .add(overlap[["value"]].values) + # calculate summary values, aggregating by submission_date, + .agg(aggregations, axis=1) + .reset_index() + ) - # aggregate metric to the correct date period (day, month, year) - observed_summarized = pdx.aggregate_to_period( - observed_df, - period, - additional_aggregation_columns=additional_aggregation_columns, - ) - forecast_agg = pdx.aggregate_to_period( + return forecast_summarized, observed_summarized + + def _combine_forecast_observed( + self, forecast_df, - period, - additional_aggregation_columns=additional_aggregation_columns, - ).sort_values("submission_date") - - aggregation_columns = ["submission_date"] + additional_aggregation_columns - - # find periods of overlap between observed and forecasted data - # merge preserves key order so overlap will be sorted by submission_date - overlap = forecast_agg.merge( - observed_summarized[aggregation_columns + ["value"]], - on=aggregation_columns, - how="left", - ).fillna(0) - - # separate out numeric columns, which will be the samples - # from non-numeric - - forecast_agg_no_aggregation_cols = forecast_agg[ - [el for el in forecast_agg.columns if el not in aggregation_columns] - ] - forecast_agg_string = forecast_agg_no_aggregation_cols.select_dtypes( - include=["datetime64", object] - ) - - # assuming that the numeric columns are exactly those created by - # predictive_samples - forecast_agg_numeric = forecast_agg_no_aggregation_cols.select_dtypes( - include=["float", "int"] - ) - - # put aggergation columns back into x_numeric so groupby works - forecast_agg_numeric = forecast_agg[ - list(forecast_agg_numeric.columns) + aggregation_columns - ] - forecast_agg_string = forecast_agg[ - list(forecast_agg_string.columns) + aggregation_columns - ] - - forecast_summarized = ( - forecast_agg_numeric.set_index(aggregation_columns) - # Add observed data samples to any overlapping forecasted period. This - # ensures that any forecast made partway through a period accounts for - # previously observed data within the period. For example, when a monthly - # forecast is generated in the middle of the month. 
- .add(overlap[["value"]].values) - # calculate summary values, aggregating by submission_date, - .agg(aggregations, axis=1) - .reset_index() - ) - - # add string columns back in - forecast_summarized = forecast_summarized.merge( - forecast_agg_string, on=aggregation_columns - ) - - forecast_summarized["aggregation_period"] = period.lower() - observed_summarized["aggregation_period"] = period.lower() - - return forecast_summarized, observed_summarized - - -def combine_forecast_observed( - forecast_summarized: pd.DataFrame, observed_summarized: pd.DataFrame -) -> pd.DataFrame: - """combines summarized forecast and observed data - - Args: - forecast_summarized (pd.DataFrame): summarized forecast data - observed_summarized (pd.DataFrame): summarized observed data - - Returns: - pd.DataFrame: combined data - """ - # remove aggregation period because it messes everything up with the melt - forecast_summarized = forecast_summarized.drop(columns=["aggregation_period"]) - observed_summarized = observed_summarized.drop(columns=["aggregation_period"]) + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ): + forecast_summarized, observed_summarized = self._aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles + ) - # remaining column of metric values get the column name 'value' - forecast_summarized = forecast_summarized.melt( - id_vars="submission_date", var_name="measure" - ) - observed_summarized["measure"] = "observed" + # remaining column of metric values get the column name 'value' + forecast_summarized = forecast_summarized.melt( + id_vars="submission_date", var_name="measure" + ) + observed_summarized["measure"] = "observed" - # add datasource-specific metadata columns - forecast_summarized["source"] = "forecast" - observed_summarized["source"] = "historical" + # add datasource-specific metadata columns + forecast_summarized["source"] = "forecast" + observed_summarized["source"] = "historical" - df = pd.concat([forecast_summarized, observed_summarized]) + df = pd.concat([forecast_summarized, observed_summarized]) - return df + return df + def _summarize( + self, + forecast_df, + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + """ + Calculate summary metrics for `self.forecast_df` over a given period, and + add metadata. + """ -def summarize( - forecast_df, - observed_df, - periods: List[str], - numpy_aggregations: List[str], - percentiles: List[int], - forecast_parameters: dict, -) -> pd.DataFrame: - """ - Calculate summary metrics for `self.forecast_df` over a given period, and - add metadata. - - Args: - forecast_df (pd.DataFrame): raw output of the predict function from ProphetForecast or - one of it's child classes - observed_df (_type_): raw input of the fit function from ProphetForecast or - one of it's child classes - period (str): period to aggregate to in time. Must be 'day', 'month' or 'year' - numpy_aggregations (List[str]): aggregates from numpy to use. Can be the name of any - numpy function that outputs a single value - percentiles (List[int]): list of number for which the percentile should be generated - additional_aggregation_columns (List[str], optional): - additional columns to use in the aggregation. Defaults to []. 
- """ - df_list = [] - for period in periods: - forecast_summarized, observed_summarized = aggregate_forecast_observed( + df = self._combine_forecast_observed( forecast_df, observed_df, period, numpy_aggregations, percentiles ) + # add summary metadata columns + df["aggregation_period"] = period.lower() - df = combine_forecast_observed(forecast_summarized, observed_summarized) - - # it got removed in combine_forecast_observed so put it back - df["aggregation_period"] = period - df["forecast_parameters"] = forecast_parameters - df_list.append(df) + return df - return pd.concat(df_list) + def _summarize_legacy(self) -> pd.DataFrame: + """ + Converts a `self.summary_df` to the legacy format used in + `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` + """ + # TODO: This method should be removed once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + df = self.summary_df.copy(deep=True) -def summarize_legacy(summary_df) -> pd.DataFrame: - """ - Converts a `self.summary_df` to the legacy format used in - `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` - """ - # TODO: This method should be removed once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - - # rename columns to legacy values - df = summary_df.rename( - columns={ - "forecast_end_date": "asofdate", - "submission_date": "date", - "metric_alias": "target", - "aggregation_period": "unit", - } - ) - df["forecast_date"] = df["forecast_predicted_at"].dt.date - df["type"] = df["source"].replace("historical", "actual") - df = df.replace( - { - "measure": { - "observed": "value", - "p05": "yhat_p5", - "p10": "yhat_p10", - "p20": "yhat_p20", - "p30": "yhat_p30", - "p40": "yhat_p40", - "p50": "yhat_p50", - "p60": "yhat_p60", - "p70": "yhat_p70", - "p80": "yhat_p80", - "p90": "yhat_p90", - "p95": "yhat_p95", + # rename columns to legacy values + df.rename( + columns={ + "forecast_end_date": "asofdate", + "submission_date": "date", + "metric_alias": "target", + "aggregation_period": "unit", }, - "target": { - "desktop_dau": "desktop", - "mobile_dau": "mobile", - }, - } - ) - - # pivot the df from "long" to "wide" format - index_columns = [ - "asofdate", - "date", - "target", - "unit", - "forecast_parameters", - "forecast_date", - ] - df = ( - df[index_columns + ["measure", "value"]] - .pivot( - index=index_columns, - columns="measure", - values="value", + inplace=True, + ) + df["forecast_date"] = df["forecast_predicted_at"].dt.date + df["type"] = df["source"].replace("historical", "actual") + df = df.replace( + { + "measure": { + "observed": "value", + "p05": "yhat_p5", + "p10": "yhat_p10", + "p20": "yhat_p20", + "p30": "yhat_p30", + "p40": "yhat_p40", + "p50": "yhat_p50", + "p60": "yhat_p60", + "p70": "yhat_p70", + "p80": "yhat_p80", + "p90": "yhat_p90", + "p95": "yhat_p95", + }, + "target": { + "desktop_dau": "desktop", + "mobile_dau": "mobile", + }, + } ) - .reset_index() - ) - - # pivot sets the "name" attribute of the columns for some reason. It's - # None by default, so we just reset that here. - df.columns.name = None - - # When there's an overlap in the observed and forecasted period -- for - # example, when a monthly forecast is generated mid-month -- the legacy - # format only records the forecasted value, not the observed value. To - # account for this, we'll just find the max of the "mean" (forecasted) and - # "value" (observed) data. 
In all non-overlapping observed periods, the - # forecasted value will be NULL. In all non-overlapping forecasted periods, - # the observed value will be NULL. In overlapping periods, the forecasted - # value will always be larger because it is the sum of the observed and forecasted - # values. Below is a query that demonstrates the legacy behavior: - # - # SELECT * - # FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` - # WHERE asofdate = "2023-12-31" - # AND target = "mobile" - # AND unit = "month" - # AND forecast_date = "2022-06-04" - # AND date BETWEEN "2022-05-01" AND "2022-06-01" - # ORDER BY date - df["value"] = df[["mean", "value"]].max(axis=1) - df.drop(columns=["mean"], inplace=True) - - # non-numeric columns are represented in the legacy bq schema as strings - string_cols = [ - "asofdate", - "date", - "target", - "unit", - "forecast_parameters", - "forecast_date", - ] - df[string_cols] = df[string_cols].astype(str) - - return df - - -def write_results( - summary_df: pd.DataFrame, - summary_df_legacy: pd.DataFrame, - forecast_df_legacy: pd.DataFrame, - project: str, - dataset: str, - table: str, - project_legacy: str, - dataset_legacy: str, - forecast_table_legacy: str, - confidences_table_legacy: str, - write_disposition: str = "WRITE_APPEND", -) -> None: - """ - Write `self.summary_df` to Big Query. - - Args: - project (str): The Big Query project that the data should be written to. - dataset (str): The Big Query dataset that the data should be written to. - table (str): The Big Query table that the data should be written to. - write_disposition (str): In the event that the destination table exists, - should the table be overwritten ("WRITE_TRUNCATE") or appended to - ("WRITE_APPEND")? - """ - print(f"Writing results to `{project}.{dataset}.{table}`.", flush=True) - client = bigquery.Client(project=project) - schema = [ - bigquery.SchemaField("submission_date", bq_types.DATE), - bigquery.SchemaField("aggregation_period", bq_types.STRING), - bigquery.SchemaField("source", bq_types.STRING), - bigquery.SchemaField("measure", bq_types.STRING), - bigquery.SchemaField("value", bq_types.FLOAT), - bigquery.SchemaField("metric_alias", bq_types.STRING), - bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), - bigquery.SchemaField("metric_hub_slug", bq_types.STRING), - bigquery.SchemaField("metric_start_date", bq_types.DATE), - bigquery.SchemaField("metric_end_date", bq_types.DATE), - bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_start_date", bq_types.DATE), - bigquery.SchemaField("forecast_end_date", bq_types.DATE), - bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ] - job = client.load_table_from_dataframe( - dataframe=summary_df, - destination=f"{project}.{dataset}.{table}", - job_config=bigquery.LoadJobConfig( - schema=schema, - autodetect=False, - write_disposition=write_disposition, - ), - ) - # Wait for the job to complete. 
- job.result() - - # TODO: remove the below jobs once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - - job = client.load_table_from_dataframe( - dataframe=forecast_df_legacy, - destination=f"{project_legacy}.{dataset_legacy}.{forecast_table_legacy}", - job_config=bigquery.LoadJobConfig( - write_disposition=write_disposition, - schema=[ - bigquery.SchemaField("ds", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_date", bq_types.STRING), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ], - ), - ) - job.result() - - job = client.load_table_from_dataframe( - dataframe=summary_df_legacy, - destination=f"{project_legacy}.{dataset_legacy}.{confidences_table_legacy}", - job_config=bigquery.LoadJobConfig( - write_disposition=write_disposition, - schema=[ - bigquery.SchemaField("asofdate", bq_types.STRING), - bigquery.SchemaField("date", bq_types.STRING), - ], - ), - ) - job.result() + # pivot the df from "long" to "wide" format + index_columns = [ + "asofdate", + "date", + "target", + "unit", + "forecast_parameters", + "forecast_date", + ] + df = ( + df[index_columns + ["measure", "value"]] + .pivot( + index=index_columns, + columns="measure", + values="value", + ) + .reset_index() + ) + + # pivot sets the "name" attribute of the columns for some reason. It's + # None by default, so we just reset that here. + df.columns.name = None + + # When there's an overlap in the observed and forecasted period -- for + # example, when a monthly forecast is generated mid-month -- the legacy + # format only records the forecasted value, not the observed value. To + # account for this, we'll just find the max of the "mean" (forecasted) and + # "value" (observed) data. In all non-overlapping observed periods, the + # forecasted value will be NULL. In all non-overlapping forecasted periods, + # the observed value will be NULL. In overlapping periods, the forecasted + # value will always be larger because it is the sum of the observed and forecasted + # values. Below is a query that demonstrates the legacy behavior: + # + # SELECT * + # FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` + # WHERE asofdate = "2023-12-31" + # AND target = "mobile" + # AND unit = "month" + # AND forecast_date = "2022-06-04" + # AND date BETWEEN "2022-05-01" AND "2022-06-01" + # ORDER BY date + df["value"] = df[["mean", "value"]].max(axis=1) + df.drop(columns=["mean"], inplace=True) + + # non-numeric columns are represented in the legacy bq schema as strings + string_cols = [ + "asofdate", + "date", + "target", + "unit", + "forecast_parameters", + "forecast_date", + ] + df[string_cols] = df[string_cols].astype(str) + + return df + + def write_results( + self, + project: str, + dataset: str, + table: str, + project_legacy: str, + dataset_legacy: str, + write_disposition: str = "WRITE_APPEND", + forecast_table_legacy: str = "kpi_automated_forecast_v1", + confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1", + ) -> None: + """ + Write `self.summary_df` to Big Query. + + Args: + project (str): The Big Query project that the data should be written to. + dataset (str): The Big Query dataset that the data should be written to. + table (str): The Big Query table that the data should be written to. + write_disposition (str): In the event that the destination table exists, + should the table be overwritten ("WRITE_TRUNCATE") or appended to + ("WRITE_APPEND")? 
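# --- Illustrative sketch (not part of the diff): write_disposition controls
# what happens when the destination table already exists, with "WRITE_APPEND"
# adding rows and "WRITE_TRUNCATE" replacing the table. The project, dataset,
# table, and dataframe below are placeholders.
import pandas as pd
from google.cloud import bigquery

client = bigquery.Client(project="my-gcp-project")  # hypothetical project
summary_df = pd.DataFrame({"value": [1.0, 2.0]})    # placeholder data

job = client.load_table_from_dataframe(
    dataframe=summary_df,
    destination="my-gcp-project.my_dataset.my_table",  # hypothetical table
    job_config=bigquery.LoadJobConfig(write_disposition="WRITE_APPEND"),
)
job.result()  # block until the load finishes
# --------------------------------------------------------------------------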
+ """ + # get legacy tables + # TODO: remove this once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + self.forecast_df_legacy = self._predict_legacy() + self.summary_df_legacy = self._summarize_legacy() + + print(f"Writing results to `{project}.{dataset}.{table}`.", flush=True) + client = bigquery.Client(project=project) + schema = [ + bigquery.SchemaField("submission_date", bq_types.DATE), + bigquery.SchemaField("aggregation_period", bq_types.STRING), + bigquery.SchemaField("source", bq_types.STRING), + bigquery.SchemaField("measure", bq_types.STRING), + bigquery.SchemaField("value", bq_types.FLOAT), + bigquery.SchemaField("metric_alias", bq_types.STRING), + bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), + bigquery.SchemaField("metric_hub_slug", bq_types.STRING), + bigquery.SchemaField("metric_start_date", bq_types.DATE), + bigquery.SchemaField("metric_end_date", bq_types.DATE), + bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_start_date", bq_types.DATE), + bigquery.SchemaField("forecast_end_date", bq_types.DATE), + bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ] + job = client.load_table_from_dataframe( + dataframe=self.summary_df, + destination=f"{project}.{dataset}.{table}", + job_config=bigquery.LoadJobConfig( + schema=schema, + autodetect=False, + write_disposition=write_disposition, + ), + ) + # Wait for the job to complete. + job.result() + + # TODO: remove the below jobs once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + + job = client.load_table_from_dataframe( + dataframe=self.forecast_df_legacy, + destination=f"{project_legacy}.{dataset_legacy}.{forecast_table_legacy}", + job_config=bigquery.LoadJobConfig( + write_disposition=write_disposition, + schema=[ + bigquery.SchemaField("ds", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_date", bq_types.STRING), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ], + ), + ) + job.result() + + job = client.load_table_from_dataframe( + dataframe=self.summary_df_legacy, + destination=f"{project_legacy}.{dataset_legacy}.{confidences_table_legacy}", + job_config=bigquery.LoadJobConfig( + write_disposition=write_disposition, + schema=[ + bigquery.SchemaField("asofdate", bq_types.STRING), + bigquery.SchemaField("date", bq_types.STRING), + ], + ), + ) + job.result() diff --git a/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py b/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py index 8352242f..8ae622bf 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py +++ b/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py @@ -17,23 +17,8 @@ def aggregate_to_period( period: str, aggregation: callable = np.sum, date_col: str = "submission_date", - additional_aggregation_columns: list = [], ) -> pd.DataFrame: - """aggregates a dataframe to a period within any additional columns specified - - Args: - df (pd.DataFrame): dataframe to aggregate to. Must have a date column - with the name specified in the date_col argument - period (str): period to aggregate the datat to - aggregation (callable, optional): function to use to aggergate. Defaults to np.sum. - date_col (str, optional): column in the dataframe that contains the date - information used in aggregation. Defaults to "submission_date". 
- additional_aggregation_columns (list, optional): Additional columns - within which the date aggregation should occur. Defaults to []. - - Returns: - pd.DataFrame: _description_ - """ + """Floor dates to the correct period and aggregate.""" if period.lower() not in ["day", "month", "year"]: raise ValueError( f"Don't know how to floor dates by {period}. Please use 'day', 'month', or 'year'." @@ -42,15 +27,9 @@ def aggregate_to_period( x = df.copy(deep=True) x[date_col] = pd.to_datetime(x[date_col]).dt.to_period(period[0]).dt.to_timestamp() - aggregation_cols = [date_col] + additional_aggregation_columns # treat numeric and string types separately - x_no_aggregation_cols = x[[el for el in x.columns if el not in aggregation_cols]] - x_string = x_no_aggregation_cols.select_dtypes(include=["datetime64", object]) - x_numeric = x_no_aggregation_cols.select_dtypes(include=["float", "int"]) - - # put aggergation columns back into x_numeric so groupby works - x_numeric = x[list(x_numeric.columns) + aggregation_cols] - x_string = x[list(x_string.columns) + aggregation_cols] + x_string = x.select_dtypes(include=["datetime64", object]) + x_numeric = x.select_dtypes(include=["float", "int", "datetime64"]) if set(x_string.columns) | set(x_numeric.columns) != set(x.columns): missing_columns = set(x.columns) - ( @@ -61,7 +40,7 @@ def aggregate_to_period( f"Columns do not have string or numeric type: {missing_columns_str}" ) - x_numeric_agg = x_numeric.groupby(aggregation_cols).agg(aggregation).reset_index() + x_numeric_agg = x_numeric.groupby(date_col).agg(aggregation).reset_index() # all values of x_string should be the same because it is just the dimensions x_string_agg = x_string.drop_duplicates().reset_index(drop=True) @@ -72,5 +51,7 @@ def aggregate_to_period( ) # unique preseves order so we should be fine to concat - output_df = x_numeric_agg.merge(x_string_agg, on=aggregation_cols) + output_df = pd.concat( + [x_numeric_agg, x_string_agg.drop(columns=[date_col])], axis=1 + ) return output_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 32c0cd1a..bfea0e5a 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,14 +1,15 @@ -from datetime import date +from typing import List +import collections +from datetime import date, datetime from dateutil.relativedelta import relativedelta -from dataclasses import dataclass - import pytest import pandas as pd import numpy as np +from datetime import timedelta, timezone -from kpi_forecasting.models.base_forecast import BaseForecast, BaseEnsembleForecast +from kpi_forecasting.models.base_forecast import BaseForecast # Arbitrarily choose some date to use for the tests TEST_DATE = date(2024, 1, 1) @@ -17,7 +18,6 @@ TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") -TEST_OBSERVED_START = date(2023, 1, 1) class BadClass(BaseForecast): @@ -27,522 +27,233 @@ class BadClass(BaseForecast): @pytest.fixture() def good_class(): class GoodModel: - def __init__(self, id, factor): - self.id = id + def __init__(self): self.is_fit = False - self.factor = factor def fit(self, observed_data): - self.is_fit = min(observed_data["submission_date"]) - - def predict(self, forecast_data): - forecast_data = forecast_data.copy() - start_at = 2 - 
len(forecast_data) - forecast_data["value"] = np.array([1, 2])[start_at:] * self.factor - return forecast_data + self.is_fit = max(observed_data["submission_date"]) - @dataclass class GoodClass(BaseForecast): - id: str = None - seed_set: bool = False - factor: int = 1 - # overwrite _get_observed_data - def _set_seed(self): - self.seed_set = True - return - - def fit(self, observed_df: pd.DataFrame) -> None: + def _get_observed_data(self): + self.observed_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE + - relativedelta(years=1), # just an arbitrary date in the past + ] + } + ) + + def _fit(self, observed_df: np.array) -> None: # takes array as input to simplify tests - self.model = GoodModel(self.id, self.factor) + self.model = GoodModel() self.model.fit(observed_df) - def predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + def _predict(self, dates_to_predict: np.array) -> pd.DataFrame: # takes array as input to simplify tests - return self.model.predict(dates_to_predict) + return dates_to_predict * 2 - def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: + def _validate_forecast_df(self, forecast_df: np.array) -> None: # takes array as input to simplify tests # check that all are even after _predict runs assert np.all(forecast_df % 2 == 0) - def _get_parameters(self): - return {"id": self.id, "factor": self.factor} + def _summarize( + self, + forecast_df: np.array, + observed_df: np.array, + period: str, + numpy_aggregations: List[str], + percentiles: List[str], + ) -> pd.DataFrame: + # input types changes to simplify test + np_func = getattr(np, numpy_aggregations[0]) + agg_val = np_func(forecast_df + observed_df) + return pd.DataFrame( + [{"number": agg_val, "period": period, "percentiles": percentiles[0]}] + ) return GoodClass -def test_forecast_not_implemented(): +def test_not_implemented(): with pytest.raises( TypeError, - match="Can't instantiate abstract class BadClass with abstract methods _set_seed, _validate_forecast_df, fit, predict", + match="Can't instantiate abstract class BadClass with abstract methods _fit, _predict, _summarize, _validate_forecast_df", ): _ = BadClass() -def test_fit(good_class): - """test the fit method, and implicitly the set_segment_models method""" - A1_start_date = "2018-01-01" - A2_start_date = "2020-02-02" - parameter_list = [ - {"segment": {"a": "A1"}, "parameters": {"id": "This is A1"}}, - {"segment": {"a": "A2"}, "parameters": {"id": "This is A2"}}, - ] - - EnsembleObject = BaseEnsembleForecast( - model_class=good_class, parameters=parameter_list, segments=["a", "b"] +def test_post_init(good_class): + start_date = TEST_DATE_STR + end_date = TEST_PREDICT_END_STR + good_class = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=start_date, + end_date=end_date, + metric_hub=None, ) - - observed_data = pd.DataFrame( + dates_to_predict_expected = pd.DataFrame( { - "a": ["A1", "A1", "A2", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2", "B2"], - "submission_date": [ - A1_start_date, - A1_start_date, - A2_start_date, - A2_start_date, - A2_start_date, - ], + "submission_date": pd.date_range( + pd.to_datetime(start_date), pd.to_datetime(end_date) + ).date } ) + assert good_class.dates_to_predict.equals(dates_to_predict_expected) - EnsembleObject.fit(observed_data) - - segment_models = EnsembleObject.segment_models - - # put the segments and the start date in the same dictionary to make - # comparison easier - # the important things to check is that all possible 
combinations - # of segments are present and that each has the parameters set properly - # start_date is a stand-in for these parameters and - # is determined by the value of a as specified in parameter_dict - check_segment_models = [ - dict(**el["segment"], **{"id": el["model"].id}) for el in segment_models - ] - - expected = [ - {"a": "A1", "b": "B1", "id": "This is A1"}, - {"a": "A1", "b": "B2", "id": "This is A1"}, - {"a": "A2", "b": "B1", "id": "This is A2"}, - {"a": "A2", "b": "B2", "id": "This is A2"}, - ] - - # can't make a set of dicts for comparison - # so sort the lists and compare each element - compare_sorted = zip( - sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), - sorted(expected, key=lambda x: (x["a"], x["b"])), - ) - - for checkval, expectedval in compare_sorted: - assert checkval == expectedval - - # test that the seed was set for all models during fitting - assert all([el["model"].seed_set for el in segment_models]) - - # test that the fit was applied properly to all models - # to do this check the is_fit attribute, which will equal - # A1_start_date for A1 segments and A2_start_date for A2 segments - - for segment in segment_models: - if segment["segment"]["a"] == "A1": - assert segment["model"].model.is_fit == A1_start_date - else: - assert segment["model"].model.is_fit == A2_start_date - - -def test_fit_multiple(good_class): - """test the fit method - with segments on multiple columns. - Implicitly testing set_segment_models with multiple - segments as well""" - # set arbitrary dates - # they're only used to make sure segments are set correctly - A1B1_start_date = "2018-01-01" - A1B2_start_date = "2019-01-01" - A2B1_start_date = "2020-02-02" - A2B2_start_date = "2021-02-02" - parameter_list = [ - { - "segment": {"a": "A1", "b": "B1"}, - "parameters": {"id": "This is A1B1"}, - }, - { - "segment": {"a": "A1", "b": "B2"}, - "parameters": {"id": "This is A1B2"}, - }, - { - "segment": {"a": "A2", "b": "B1"}, - "parameters": {"id": "This is A2B1"}, - }, - { - "segment": {"a": "A2", "b": "B2"}, - "parameters": {"id": "This is A2B2"}, - }, - ] - - EnsembleObject = BaseEnsembleForecast( - model_class=good_class, parameters=parameter_list, segments=["a", "b"] - ) - - observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2", "B2"], - "submission_date": [ - A1B1_start_date, - A1B2_start_date, - A2B1_start_date, - A2B2_start_date, - A2B2_start_date, - ], - } - ) - EnsembleObject.fit(observed_data) - - segment_models = EnsembleObject.segment_models - - # put the segments and the start date in the same dictionary to make - # comparison easier - # the important things to check is that all possible combinations - # of segments are present and that each has the parameters set properly - # start_date is a stand-in for these parameters and - # is determined by the value of a as specified in parameter_dict - check_segment_models = [ - dict(**el["segment"], **{"id": el["model"].id}) for el in segment_models - ] - expected = [ - {"a": "A1", "b": "B1", "id": "This is A1B1"}, - {"a": "A1", "b": "B2", "id": "This is A1B2"}, - {"a": "A2", "b": "B1", "id": "This is A2B1"}, - {"a": "A2", "b": "B2", "id": "This is A2B2"}, - ] - - # can't make a set of dicts for comparison - # so sort the lists and compare each element - compare_sorted = zip( - sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), - sorted(expected, key=lambda x: (x["a"], x["b"])), +def test_post_init_exception(good_class): + start_date = TEST_DATE_STR + end_date 
= TEST_PREDICT_END_STR + with pytest.raises( + ValueError, + match="forecast start_date set while predict_historical_dates is True", + ): + _ = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=start_date, + end_date=end_date, + metric_hub=None, + predict_historical_dates=True, + ) + + +def test_post_init_default_dates(good_class): + # check default start and end time + good_class = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date="", + end_date="", + metric_hub=None, ) - - for checkval, expectedval in compare_sorted: - assert checkval == expectedval - - # test that the seed was set for all models during fitting - assert all([el["model"].seed_set for el in segment_models]) - - # test that the fit was applied properly to all models - # to do this check the is_fit attribute, which will equal - # A1_start_date for A1 segments and A2_start_date for A2 segments - - for segment in segment_models: - if segment["segment"]["a"] == "A1" and segment["segment"]["b"] == "B1": - assert segment["model"].model.is_fit == A1B1_start_date - elif segment["segment"]["a"] == "A1" and segment["segment"]["b"] == "B2": - assert segment["model"].model.is_fit == A1B2_start_date - elif segment["segment"]["a"] == "A2" and segment["segment"]["b"] == "B1": - assert segment["model"].model.is_fit == A2B1_start_date - else: - assert segment["model"].model.is_fit == A2B2_start_date - - -def test_fit_multiple_with_start(good_class): - """test the fit method - with segments on multiple columns. - Implicitly testing set_segment_models with multiple - segments as well""" - parameter_list = [ - { - "segment": {"a": "A1", "b": "B1"}, - "parameters": {"id": "This is A1B1"}, - }, - { - "segment": {"a": "A1", "b": "B2"}, - "parameters": {"id": "This is A1B2"}, - "start_date": TEST_DATE_NEXT_DAY_STR, - }, - { - "segment": {"a": "A2", "b": "B1"}, - "parameters": {"id": "This is A2B1"}, - }, - { - "segment": {"a": "A2", "b": "B2"}, - "parameters": {"id": "This is A2B2"}, - "start_date": TEST_DATE_NEXT_DAY_STR, - }, - ] - - EnsembleObject = BaseEnsembleForecast( - model_class=good_class, parameters=parameter_list, segments=["a", "b"] + # this is the max date of the self.observed_data['submission_date'] plus one day + # from the object definition + start_date = TEST_DATE_NEXT_DAY + end_date = ( + datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) + ).date() + dates_to_predict_expected = pd.DataFrame( + {"submission_date": pd.date_range(start_date, end_date).date} ) - - # every segment has two days, TEST_DATE and TEST_DATE_NEXT_DAY - observed_data = pd.DataFrame( - [ - {"a": "A1", "b": "B1", "submission_date": TEST_DATE}, - {"a": "A1", "b": "B1", "submission_date": TEST_DATE_NEXT_DAY}, - {"a": "A1", "b": "B2", "submission_date": TEST_DATE}, - {"a": "A1", "b": "B2", "submission_date": TEST_DATE_NEXT_DAY}, - {"a": "A2", "b": "B1", "submission_date": TEST_DATE}, - {"a": "A2", "b": "B1", "submission_date": TEST_DATE_NEXT_DAY}, - {"a": "A2", "b": "B2", "submission_date": TEST_DATE}, - {"a": "A2", "b": "B2", "submission_date": TEST_DATE_NEXT_DAY}, - ] + assert good_class.dates_to_predict.equals(dates_to_predict_expected) + + +def test_post_init_default_dates_historical(good_class): + # check default start and end time + good_class = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date="", + end_date="", + metric_hub=None, + predict_historical_dates=True, ) - - EnsembleObject.fit(observed_data) - -
segment_models = EnsembleObject.segment_models - - # put the segments and the start date in the same dictionary to make - # comparison easier - # the important things to check is that all possible combinations - # of segments are present and that each has the parameters set properly - # start_date is a stand-in for these parameters and - # is determined by the value of a as specified in parameter_dict - check_segment_models = [ - dict(**el["segment"], **{"id": el["model"].id}) for el in segment_models - ] - expected = [ - {"a": "A1", "b": "B1", "id": "This is A1B1"}, - {"a": "A1", "b": "B2", "id": "This is A1B2"}, - {"a": "A2", "b": "B1", "id": "This is A2B1"}, - {"a": "A2", "b": "B2", "id": "This is A2B2"}, - ] - - # can't make a set of dicts for comparison - # so sort the lists and compare each element - compare_sorted = zip( - sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), - sorted(expected, key=lambda x: (x["a"], x["b"])), + # this is the min date of the observed data + start_date = TEST_DATE - relativedelta(years=1) + end_date = ( + datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) + ).date() + dates_to_predict_expected = pd.DataFrame( + {"submission_date": pd.date_range(start_date, end_date).date} ) + assert good_class.dates_to_predict.equals(dates_to_predict_expected) - for checkval, expectedval in compare_sorted: - assert checkval == expectedval - - # test that the seed was set for all models during fitting - assert all([el["model"].seed_set for el in segment_models]) - - # test that the fit was applied properly to the time-filtered data - # to do this check the is_fit attribute, which will equal - # the earliest date. For B1 it is TEST_DATE - # B2 has start_date set to TEST_DATE_NEXT_DAY, so it will have that value - - for segment in segment_models: - if segment["segment"]["b"] == "B1": - assert segment["model"].model.is_fit == TEST_DATE - else: - assert segment["model"].model.is_fit == TEST_DATE_NEXT_DAY - - -def test_set_segment_models_exception(mocker): - """test the exception for segment_models where - and exception is raised if a model_setting_split_dim - is specified that isn't in the data""" - A1_start_date = "2018-01-01" - A2_start_date = "2020-02-02" - parameter_list = [ - {"segment": {"c": "A1"}, "parameters": {"id": "This is A1"}}, - {"segment": {"c": "A2"}, "parameters": {"id": "This is A2"}}, - ] - EnsembleObject = BaseEnsembleForecast( - model_class=good_class, parameters=parameter_list, segments=["a", "b"] - ) - observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2", "B2"], - "submission_date": [ - A1_start_date, - A1_start_date, - A2_start_date, - A2_start_date, - A2_start_date, - ], - } +def test_fit(good_class): + good_class = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, + metric_hub=None, ) + good_class.fit() + assert good_class.model - with pytest.raises( - ValueError, - match="Segment keys missing from metric hub segments: c", - ): - EnsembleObject.fit(observed_data) - + # model sets is_fit to the largest day in the observed data + assert good_class.model.is_fit == TEST_DATE -def test_predict(good_class): - """test the predict""" - parameter_list = [ - { - "segment": {"a": "A1", "b": "B1"}, - "parameters": {"id": "This is A1B1", "factor": 4}, - }, - { - "segment": {"a": "A1", "b": "B2"}, - "parameters": {"id": "This is A1B2", "factor": 6}, - }, - { - "segment": {"a": "A2", "b": "B1"}, 
- "parameters": {"id": "This is A2B1", "factor": 8}, - }, - { - "segment": {"a": "A2", "b": "B2"}, - "parameters": {"id": "This is A2B2", "factor": 10}, - }, - ] - EnsembleObject = BaseEnsembleForecast( - model_class=good_class, parameters=parameter_list, segments=["a", "b"] +def test_predict_and_validate(good_class): + good_class = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, + metric_hub=None, ) - - # submission date doesn't matter here - observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2", "B2"], - "submission_date": [ - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - ], - } + # overwrite date range set in __post_init__ + good_class.dates_to_predict = np.arange(10) + good_class.predict() + assert np.all(good_class.forecast_df == good_class.dates_to_predict * 2) + + +def test_summarize(good_class): + good_class = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, + metric_hub=None, + ) + good_class.forecast_df = np.array([1, 2]) + good_class.observed_df = np.array([3, 4]) + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], ) - EnsembleObject.fit(observed_data) + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) - # pass submission_date as a float for the purpose of testing - # this is fine because no time filtering happens in the predict of - # BaseEnsembleForecast or the dummy class and model - predict_df = pd.DataFrame({"submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY]}) - output_df = EnsembleObject.predict(predict_df) + # add it here rather than in __init__ so it doesn't try to load data + good_class.metric_hub = dummy_metric_hub + good_class.trained_at = "" + good_class.predicted_at = "" - expected_df = pd.DataFrame( + number_val = 10 + output = good_class.summarize( + periods=["a", "b", "c"], numpy_aggregations=["sum"], percentiles=["percentiles"] + ) + expected_output = pd.DataFrame( [ - {"a": "A1", "b": "B1", "value": 1 * 4, "submission_date": TEST_DATE}, - { - "a": "A1", - "b": "B1", - "value": 2 * 4, - "submission_date": TEST_DATE_NEXT_DAY, - }, - {"a": "A1", "b": "B2", "value": 1 * 6, "submission_date": TEST_DATE}, - { - "a": "A1", - "b": "B2", - "value": 2 * 6, - "submission_date": TEST_DATE_NEXT_DAY, - }, - {"a": "A2", "b": "B1", "value": 1 * 8, "submission_date": TEST_DATE}, - { - "a": "A2", - "b": "B1", - "value": 2 * 8, - "submission_date": TEST_DATE_NEXT_DAY, - }, - {"a": "A2", "b": "B2", "value": 1 * 10, "submission_date": TEST_DATE}, - { - "a": "A2", - "b": "B2", - "value": 2 * 10, - "submission_date": TEST_DATE_NEXT_DAY, - }, + {"number": number_val, "period": el, "percentiles": "percentiles"} + for el in ["a", "b", "c"] ] ) + # not going to check all the metadata columns + # in assert_frame_equal. 
Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + "forecast_parameters", + } + assert set(expected_output.columns) | metadata_columns == set(output.columns) pd.testing.assert_frame_equal( - output_df[["a", "b", "value", "submission_date"]].reset_index(drop=True), - expected_df, - ) - - -def test_predict_with_start(good_class): - """test the predict""" - # set B2 parameters to filter out TEST_DATE - parameter_list = [ - { - "segment": {"a": "A1", "b": "B1"}, - "parameters": {"id": "This is A1B1", "factor": 4}, - }, - { - "segment": {"a": "A1", "b": "B2"}, - "parameters": { - "id": "This is A1B2", - "factor": 6, - }, - "start_date": TEST_DATE_NEXT_DAY_STR, - }, - { - "segment": {"a": "A2", "b": "B1"}, - "parameters": {"id": "This is A2B1", "factor": 8}, - }, - { - "segment": {"a": "A2", "b": "B2"}, - "parameters": {"id": "This is A2B2", "factor": 10}, - "start_date": TEST_DATE_NEXT_DAY_STR, - }, - ] - - EnsembleObject = BaseEnsembleForecast( - model_class=good_class, parameters=parameter_list, segments=["a", "b"] - ) - - observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2", "B2"], - "submission_date": [ - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - ], - } - ) - - EnsembleObject.fit(observed_data) - - # pass submission_date as a float for the purpose of testing - # this is fine because no time filtering happens in the predict of - # BaseEnsembleForecast or the dummy class and model - predict_df = pd.DataFrame({"submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY]}) - output_df = EnsembleObject.predict(predict_df) - - expected_df = pd.DataFrame( - [ - {"a": "A1", "b": "B1", "value": 1 * 4, "submission_date": TEST_DATE}, - { - "a": "A1", - "b": "B1", - "value": 2 * 4, - "submission_date": TEST_DATE_NEXT_DAY, - }, - { - "a": "A1", - "b": "B2", - "value": 2 * 6, - "submission_date": TEST_DATE_NEXT_DAY, - }, - {"a": "A2", "b": "B1", "value": 1 * 8, "submission_date": TEST_DATE}, - { - "a": "A2", - "b": "B1", - "value": 2 * 8, - "submission_date": TEST_DATE_NEXT_DAY, - }, - { - "a": "A2", - "b": "B2", - "value": 2 * 10, - "submission_date": TEST_DATE_NEXT_DAY, - }, - ] + output[expected_output.columns].reset_index(drop=True), expected_output ) pd.testing.assert_frame_equal( - output_df[["a", "b", "value", "submission_date"]].reset_index(drop=True), - expected_df, + good_class.summary_df[expected_output.columns].reset_index(drop=True), + expected_output, ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index cbe2a42e..6e43e409 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -1,22 +1,16 @@ """tests for the funnel forecast module""" -from datetime import date +import collections +from datetime import date, datetime from dateutil.relativedelta import relativedelta import pandas as pd import pytest import numpy as np -import json -from kpi_forecasting.models.funnel_forecast import ( - ProphetAutotunerForecast, - FunnelForecast, - combine_forecast_observed, - summarize_with_parameters, - summarize, -) -from kpi_forecasting.models.prophet_forecast import 
ProphetForecast +from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday +from kpi_forecasting.models.funnel_forecast import SegmentModelSettings, FunnelForecast # Arbitrarily choose some date to use for the tests TEST_DATE = date(2024, 1, 1) @@ -27,16 +21,101 @@ TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") +@pytest.fixture() +def forecast(): + """This mocks a generic forecast object""" + # 2024-01-01 is arbitarily chosen as a future date + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR + + forecast = FunnelForecast( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + return forecast + + +@pytest.fixture() +def segment_info_fit_tests(): + """This fixture creates segment info dictionaries + that mimic the content of the config file and are used + in the functions that test fit methods""" + + # 2024-01-01 is arbitarily chosen as a future date + A1_start_date = TEST_DATE_STR + A2_start_date = TEST_DATE_NEXT_DAY_STR + + segment_info_dict = { + "A1": { + "start_date": A1_start_date, + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "min_param_value": 10, + }, + "A2": { + "start_date": A2_start_date, + "grid_parameters": {"param1": [-1, -2], "param2": [3, 4]}, + "min_param_value": -3, # closest to zero + }, + } + return segment_info_dict + + +@pytest.fixture() +def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): + """This method creates a forecast object from the segment dict + created in the segment_info_fit_tests fixture. It also + mocks some of the object methods to enable easier testing""" + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": segment_info_fit_tests["A1"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], + "cv_settings": {}, + }, + { + "segment": {"a": "A2"}, + "start_date": segment_info_fit_tests["A2"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], + "cv_settings": {}, + }, + ] + + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_DATE_NEXT_DAY_STR + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_list, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + mocker.patch.object(forecast, "_build_model", mock_build_model) + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + ) + + return forecast + + class MockModel: """Used in place of prophet.Prophet for testing purposes""" - def __init__(self, seasonality_prior_scale=0, holidays_prior_scale=0, growth=None): - # arbitrarily choose a few parameters from ProphetForecast to use - self.seasonality_prior_scale = seasonality_prior_scale - self.holidays_prior_scale = holidays_prior_scale - self.value = seasonality_prior_scale * holidays_prior_scale + def __init__(self, param1=0, param2=0): + self.value = param1 * param2 self.history = None - self.growth = growth def fit(self, df, *args, **kwargs): self.history = df @@ -69,124 +148,40 @@ def predictive_samples(self, dates_to_predict): return {"yhat": {0: output}} -def mock_build_model(self): +def mock_build_model(segment_settings, parameters): """mocks the FunnelForecast build_model method""" return MockModel( - 
seasonality_prior_scale=self.seasonality_prior_scale, - holidays_prior_scale=self.holidays_prior_scale, - growth=self.growth, + **parameters, ) -def mock_get_crossvalidation_metric(self, m, *args, **kwargs): +def mock_get_crossvalidation_metric(m, *args, **kwargs): """mocks the FunnelForecast get_crossvalidation_metric method, meant to be used with MockModel""" - return m.model.value # value atrribute in MockModel - - -def test_combine_forecast_observed(): - """tests the _combine_forecast_observed method""" - - forecast_df = pd.DataFrame( - [ - { - "submission_date": TEST_DATE, - "a": "A1", - "forecast_parameters": "blah", - "value": 0, - "value_low": 0, - "value_mid": 0, - "value_high": 0, - }, - { - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A1", - "forecast_parameters": "blah", - "value": 0, - "value_low": 0, - "value_mid": 0, - "value_high": 0, - }, - ] - ) + return m.value # value atrribute in MockModel - observed_df = pd.DataFrame( - [ - { - "submission_date": TEST_DATE - relativedelta(days=2), - "value": 5, - "a": "A1", - }, - { - "submission_date": TEST_DATE - relativedelta(days=1), - "value": 6, - "a": "A1", - }, - ] - ) - output_df = combine_forecast_observed( - forecast_df, - observed_df, - ) +def mock_aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles +): + """Mocks the aggregate_forecast_observed function defined in ProphetForecast + and inherited in FunnelForecast. + This function is tested extensively in test_prophet_forecast + so we can make dummy outputs for tests related to it""" - expected_df = pd.DataFrame( - [ - { - "submission_date": TEST_DATE, - "a": "A1", - "forecast_parameters": "blah", - "value": 0, - "value_low": 0.0, - "value_mid": 0.0, - "value_high": 0.0, - "source": "forecast", - }, - { - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A1", - "forecast_parameters": "blah", - "value": 0, - "value_low": 0.0, - "value_mid": 0.0, - "value_high": 0.0, - "source": "forecast", - }, - { - "submission_date": TEST_DATE - relativedelta(days=2), - "a": "A1", - "forecast_parameters": np.nan, - "value": 5, - "value_low": np.nan, - "value_mid": np.nan, - "value_high": np.nan, - "source": "historical", - }, - { - "submission_date": TEST_DATE - relativedelta(days=1), - "a": "A1", - "forecast_parameters": np.nan, - "value": 6, - "value_low": np.nan, - "value_mid": np.nan, - "value_high": np.nan, - "source": "historical", - }, - ] - ) + # add dummy columns where aggregated metrics woudl go + percentile_columns = [f"p{el}" for el in percentiles] + output_forecast_df = forecast_df.copy() + output_forecast_df[numpy_aggregations + percentile_columns] = 0 + return output_forecast_df, observed_df.copy() - assert set(expected_df.columns) == set(output_df.columns) - pd.testing.assert_frame_equal( - expected_df.sort_values(["source", "submission_date"]).reset_index(drop=True), - output_df[expected_df.columns] - .sort_values(["source", "submission_date"]) - .reset_index(drop=True), +def test_combine_forecast_observed(mocker, forecast): + """tests the _combine_forecast_observed method""" + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed ) - -def test_summarize_with_parameters_no_overlap(): - """testing summarize_with_parameters""" forecast_df = pd.DataFrame( { "submission_date": [ @@ -196,611 +191,396 @@ def test_summarize_with_parameters_no_overlap(): } ) - test_date_samples_A1 = np.arange(1000) - test_date_samples_A2 = np.arange(1000) * 10 - test_next_date_samples_A1 = np.arange(1000) * 2 
- test_next_date_samples_A2 = np.arange(1000) * 20 - forecast_df = pd.DataFrame( - [ - { # this element will be filtered out because it occurs before the observed_data ends - **{ - "submission_date": TEST_DATE - relativedelta(days=2), - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: 0 for i in range(1000)}, - }, - { - **{ - "submission_date": TEST_DATE, - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_date_samples_A1)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_next_date_samples_A1)}, - }, - { - **{ - "submission_date": TEST_DATE, - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(test_date_samples_A2)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(test_next_date_samples_A2)}, - }, - ] - ) - - # rows with negative values are those expected to be removed - # by filters in summarize observed_df = pd.DataFrame( { "submission_date": [ TEST_DATE - relativedelta(days=2), TEST_DATE - relativedelta(days=1), - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), ], - "a": ["A1", "A1", "A2", "A2"], - "value": [20, 30, 40, 50], + "a": ["A1", "A1"], + "value": [5, 6], } ) numpy_aggregations = ["mean"] percentiles = [10, 50, 90] - output_df = summarize_with_parameters( + + output_df = forecast._combine_forecast_observed( forecast_df=forecast_df, observed_df=observed_df, - period="day", + period="period", numpy_aggregations=numpy_aggregations, percentiles=percentiles, - segment_cols=["a"], - ) - observed_expected_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), - ], - "a": ["A1", "A1", "A2", "A2"], - "value": [20, 30, 40, 50], - "value_low": [np.nan, np.nan, np.nan, np.nan], - "value_mid": [np.nan, np.nan, np.nan, np.nan], - "value_high": [np.nan, np.nan, np.nan, np.nan], - "source": ["historical", "historical", "historical", "historical"], - } + segment={"a": "A1"}, ) - forecast_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": TEST_DATE, - "a": "A1", - "forecast_parameters": "A1", - "value": np.mean(test_date_samples_A1), - "value_low": np.percentile(test_date_samples_A1, 10), - "value_mid": np.percentile(test_date_samples_A1, 50), - "value_high": np.percentile(test_date_samples_A1, 90), - "source": "forecast", - }, - { - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A1", - "forecast_parameters": "A1", - "value": np.mean(test_next_date_samples_A1), - "value_low": np.percentile(test_next_date_samples_A1, 10), - "value_mid": np.percentile(test_next_date_samples_A1, 50), - "value_high": np.percentile(test_next_date_samples_A1, 90), - "source": "forecast", - }, - { - "submission_date": TEST_DATE, - "a": "A2", - "forecast_parameters": "A2", - "value": np.mean(test_date_samples_A2), - "value_low": np.percentile(test_date_samples_A2, 10), - "value_mid": np.percentile(test_date_samples_A2, 50), - "value_high": np.percentile(test_date_samples_A2, 90), - "source": "forecast", - }, - { - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A2", - "forecast_parameters": "A2", - "value": np.mean(test_next_date_samples_A2), - "value_low": np.percentile(test_next_date_samples_A2, 10), - "value_mid": np.percentile(test_next_date_samples_A2, 50), - "value_high": 
np.percentile(test_next_date_samples_A2, 90), - "source": "forecast", - }, - ] - ) + # mean was renamed to value, percentiles to high, medium, low + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 + forecast_df["a"] = "A1" # this column is already present in observed - # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_summarized_expected_df]) - expected["aggregation_period"] = "day" - expected["submission_date"] = pd.to_datetime(expected["submission_date"]) + forecast_df["source"] = "forecast" + observed_df["source"] = "historical" + # concat in same order to make our lives easier + expected = pd.concat([observed_df, forecast_df]) assert set(expected.columns) == set(output_df.columns) + pd.testing.assert_frame_equal(output_df, expected[output_df.columns]) + + # should not be any nulls outside the metric column + non_metric_columns = [ + el + for el in output_df.columns + if el not in ["value", "value_low", "value_mid", "value_high"] + ] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) - pd.testing.assert_frame_equal( - expected.sort_values(["source", "a", "submission_date"]).reset_index(drop=True), - output_df[expected.columns] - .sort_values(["source", "a", "submission_date"]) - .reset_index(drop=True), - ) +def test_under_summarize(mocker, forecast): + """testing _summarize""" + # 2024-01-01 is chosen as an arbitrary date to center the tests around -def test_summarize_with_parameters_month_overlap(): - """testing summarize_with_parameters""" - test_date_samples_A1 = np.arange(1000) - test_date_samples_A2 = np.arange(1000) * 10 - test_next_date_samples_A1 = np.arange(1000) * 2 - test_next_date_samples_A2 = np.arange(1000) * 20 - # add a week to all the dates so they're in the same month as the observed - # but occur after so they won't get filtered out + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here forecast_df = pd.DataFrame( - [ - { # this element will be filtered out because it occurs before the observed_data ends - **{ - "submission_date": TEST_DATE - relativedelta(days=2), - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: 0 for i in range(1000)}, - }, - { - **{ - "submission_date": TEST_DATE + relativedelta(days=7), - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_date_samples_A1)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_next_date_samples_A1)}, - }, - { - **{ - "submission_date": TEST_DATE + relativedelta(days=7), - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(test_date_samples_A2)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(test_next_date_samples_A2)}, - }, - ] + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } ) # rows with negative values are those expected to be removed # by filters in summarize observed_df = pd.DataFrame( { - "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], - "a": ["A1", "A2"], - "value": [20, 30], + "submission_date": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + ], + "a": ["A1", "A1", "A1", "A2", "A2"], + 
"value": [10, 20, 30, 40, 50], } ) + SegmentSettings = collections.namedtuple( + "SegmentSettings", + ["start_date", "forecast_df", "segment", "trained_parameters"], + ) + dummy_segment_settings = SegmentSettings( + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), + forecast_df=forecast_df.copy(), + segment={"a": "A1"}, + trained_parameters={"trained_parameters": "yes"}, + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + forecast.observed_df = observed_df + numpy_aggregations = ["mean"] percentiles = [10, 50, 90] - output_df = summarize_with_parameters( - forecast_df=forecast_df, - observed_df=observed_df, - period="month", + output_df = forecast._summarize( + segment_settings=dummy_segment_settings, + period="period", numpy_aggregations=numpy_aggregations, percentiles=percentiles, - segment_cols=["a"], ) observed_expected_df = pd.DataFrame( { - "submission_date": [TEST_DATE, TEST_DATE], - "a": ["A1", "A2"], + "submission_date": [ + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + ], + "a": ["A1", "A1"], "value": [20, 30], - "value_low": [np.nan, np.nan], - "value_mid": [np.nan, np.nan], - "value_high": [np.nan, np.nan], - "source": ["historical", "historical"], } ) - forecast_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": TEST_DATE, - "a": "A1", - "forecast_parameters": "A1", - "value": np.mean(test_date_samples_A1 + test_next_date_samples_A1 + 20), - "value_low": np.percentile( - test_date_samples_A1 + test_next_date_samples_A1 + 20, 10 - ), - "value_mid": np.percentile( - test_date_samples_A1 + test_next_date_samples_A1 + 20, 50 - ), - "value_high": np.percentile( - test_date_samples_A1 + test_next_date_samples_A1 + 20, 90 - ), - "source": "forecast", - }, - { - "submission_date": TEST_DATE, - "a": "A2", - "forecast_parameters": "A2", - "value": np.mean(test_date_samples_A2 + test_next_date_samples_A2 + 30), - "value_low": np.percentile( - test_date_samples_A2 + test_next_date_samples_A2 + 30, 10 - ), - "value_mid": np.percentile( - test_date_samples_A2 + test_next_date_samples_A2 + 30, 50 - ), - "value_high": np.percentile( - test_date_samples_A2 + test_next_date_samples_A2 + 30, 90 - ), - "source": "forecast", - }, - ] - ) + # percentile numeric values changed to names + # mean gets mapped to value + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 + + forecast_df["a"] = "A1" # this column is already present in observed + + forecast_df["source"] = "forecast" + observed_expected_df["source"] = "historical" # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_summarized_expected_df]) - expected["aggregation_period"] = "month" - expected["submission_date"] = pd.to_datetime(expected["submission_date"]) + expected = pd.concat([observed_expected_df, forecast_df]) + expected["forecast_parameters"] = '{"trained_parameters": "yes"}' + expected["aggregation_period"] = "period" assert set(expected.columns) == set(output_df.columns) - + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + expected[numeric_cols] = expected[numeric_cols].astype(float) + output_df[numeric_cols] = output_df[numeric_cols].astype(float) pd.testing.assert_frame_equal( - expected.sort_values(["source", "a", "submission_date"]).reset_index(drop=True), - output_df[expected.columns] - .sort_values(["source", "a", "submission_date"]) - 
.reset_index(drop=True), + output_df.reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), ) + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) -def test_summarize(): + +def test_summarize(mocker, forecast): """testing summarize""" # create dummy metric hub object to when meta data from # it is added we don't get an error - test_date_samples_A1 = np.arange(1000) - test_date_samples_A2 = np.arange(1000) * 10 - test_next_date_samples_A1 = np.arange(1000) * 2 - test_next_date_samples_A2 = np.arange(1000) * 20 - # add a week to all the dates so they're in the same month as the observed - # but occur after so they won't get filtered out - forecast_df = pd.DataFrame( - [ - { # this element will be filtered out because it occurs before the observed_data ends - **{ - "submission_date": TEST_DATE - relativedelta(days=2), - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: 0 for i in range(1000)}, - }, - { - **{ - "submission_date": TEST_DATE + relativedelta(days=7), - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_date_samples_A1)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_next_date_samples_A1)}, - }, - { - **{ - "submission_date": TEST_DATE + relativedelta(days=7), - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(test_date_samples_A2)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(test_next_date_samples_A2)}, - }, - ] + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], ) - # rows with negative values are those expected to be removed - # by filters in summarize - observed_df = pd.DataFrame( - { - "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], - "a": ["A1", "A2"], - "value": [20, 30], - } - ) + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - output_df = summarize( - forecast_df=forecast_df, - observed_df=observed_df, - periods=["day", "month"], - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - segment_cols=["a"], - ) - observed_month_expected_df = pd.DataFrame( + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( { - "submission_date": [TEST_DATE, TEST_DATE], - "a": ["A1", "A2"], - "value": [20, 30], - "value_low": [np.nan, np.nan], - "value_mid": [np.nan, np.nan], - "value_high": [np.nan, np.nan], - "source": ["historical", "historical"], - "aggregation_period": "month", + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], } ) - observed_day_expected_df = pd.DataFrame( + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( { - "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], - "a": ["A1", "A2"], - "value": [20, 30], - "value_low": [np.nan, np.nan], - "value_mid": [np.nan, np.nan], - "value_high": [np.nan, np.nan], - "source": ["historical", "historical"], - "aggregation_period": "day", + "submission_date": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE - 
relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + ], + "a": ["A1", "A1", "A1", "A2", "A2"], + "value": [10, 20, 30, 40, 50], } ) - forecast_month_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": TEST_DATE, - "a": "A1", - "forecast_parameters": "A1", - "value": np.mean(test_date_samples_A1 + test_next_date_samples_A1 + 20), - "value_low": np.percentile( - test_date_samples_A1 + test_next_date_samples_A1 + 20, 10 - ), - "value_mid": np.percentile( - test_date_samples_A1 + test_next_date_samples_A1 + 20, 50 - ), - "value_high": np.percentile( - test_date_samples_A1 + test_next_date_samples_A1 + 20, 90 - ), - "source": "forecast", - "aggregation_period": "month", - }, - { - "submission_date": TEST_DATE, - "a": "A2", - "forecast_parameters": "A2", - "value": np.mean(test_date_samples_A2 + test_next_date_samples_A2 + 30), - "value_low": np.percentile( - test_date_samples_A2 + test_next_date_samples_A2 + 30, 10 - ), - "value_mid": np.percentile( - test_date_samples_A2 + test_next_date_samples_A2 + 30, 50 - ), - "value_high": np.percentile( - test_date_samples_A2 + test_next_date_samples_A2 + 30, 90 - ), - "source": "forecast", - "aggregation_period": "month", - }, - ] + SegmentSettings = collections.namedtuple( + "SegmentSettings", + ["start_date", "forecast_df", "segment", "trained_parameters", "components_df"], ) - forecast_day_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": TEST_DATE + relativedelta(days=7), - "a": "A1", - "forecast_parameters": "A1", - "value": np.mean(test_date_samples_A1), - "value_low": np.percentile(test_date_samples_A1, 10), - "value_mid": np.percentile(test_date_samples_A1, 50), - "value_high": np.percentile(test_date_samples_A1, 90), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), - "a": "A1", - "forecast_parameters": "A1", - "value": np.mean(test_next_date_samples_A1), - "value_low": np.percentile(test_next_date_samples_A1, 10), - "value_mid": np.percentile(test_next_date_samples_A1, 50), - "value_high": np.percentile(test_next_date_samples_A1, 90), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": TEST_DATE + relativedelta(days=7), - "a": "A2", - "forecast_parameters": "A2", - "value": np.mean(test_date_samples_A2), - "value_low": np.percentile(test_date_samples_A2, 10), - "value_mid": np.percentile(test_date_samples_A2, 50), - "value_high": np.percentile(test_date_samples_A2, 90), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), - "a": "A2", - "forecast_parameters": "A2", - "value": np.mean(test_next_date_samples_A2), - "value_low": np.percentile(test_next_date_samples_A2, 10), - "value_mid": np.percentile(test_next_date_samples_A2, 50), - "value_high": np.percentile(test_next_date_samples_A2, 90), - "source": "forecast", - "aggregation_period": "day", - }, - ] + # for the components_df the contents aren't important here + # we're only testing that it is concatenated properly + # with the segment data added + dummy_segment_settings_A1 = SegmentSettings( + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), + forecast_df=forecast_df.copy(), + segment={"a": "A1"}, + trained_parameters={"trained_parameters": "yes"}, + components_df=pd.DataFrame({"testcol": [1]}), ) - # concat in same order to make our lives easier - expected = pd.concat( 
- [ - forecast_day_summarized_expected_df, - forecast_month_summarized_expected_df, - observed_day_expected_df, - observed_month_expected_df, - ] + dummy_segment_settings_A2 = SegmentSettings( + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), + forecast_df=forecast_df.copy(), + segment={"a": "A2"}, + trained_parameters={"trained_parameters": "yes"}, + components_df=pd.DataFrame({"testcol": [2]}), ) - expected["submission_date"] = pd.to_datetime(expected["submission_date"]) - assert set(expected.columns) == set(output_df.columns) + segment_models = [dummy_segment_settings_A1, dummy_segment_settings_A2] - pd.testing.assert_frame_equal( - expected.sort_values( - ["source", "a", "submission_date", "aggregation_period"] - ).reset_index(drop=True), - output_df[expected.columns] - .sort_values(["source", "a", "submission_date", "aggregation_period"]) - .reset_index(drop=True), + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed ) + forecast.observed_df = observed_df + forecast.segment_models = segment_models + forecast.metric_hub = dummy_metric_hub -def test_auto_tuning(mocker): - """test the auto_tuning function""" + # timestamp attributes created by fit and predict + # must be added manuall + forecast.collected_at = "" + forecast.trained_at = "" + forecast.predicted_at = "" - mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) - # mock_get_crossvalidation_metric will choose the parameters that - # have the lowest absolute product - mocker.patch.object( - ProphetAutotunerForecast, - "_get_crossvalidation_metric", - mock_get_crossvalidation_metric, - ) - forecast = ProphetAutotunerForecast( - growth="testval", - grid_parameters={ - "seasonality_prior_scale": [1, 2], - "holidays_prior_scale": [20, 10], - }, + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast.summarize( + periods=["period"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, ) - observed_df = pd.DataFrame( + output_df = forecast.summary_df + + # time filter removes first element of observed_df + observed_expected_df = pd.DataFrame( { - "a": ["A1", "A1"], - "b": ["B1", "B2"], "submission_date": [ - TEST_DATE, - TEST_DATE, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], + "a": ["A1", "A1", "A2", "A2"], + "value": [20, 30, 40, 50], } ) - best_model = forecast._auto_tuning(observed_df) + # doubled because there are two segments in the observed data + forecast_df = pd.concat([forecast_df, forecast_df]) - # in the mocked class the two params get multiplied and the lowest combo gets select - assert best_model.seasonality_prior_scale == 1 - assert best_model.holidays_prior_scale == 10 + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 + forecast_df["source"] = "forecast" - # make sure growth got written to new class - assert best_model.growth == "testval" + # segment data column is already present in observed + # needs to be added manually for forecast + forecast_df["a"] = [ + "A1", + "A1", + "A2", + "A2", + ] + + observed_expected_df["source"] = "historical" - # check to make sure it's fit + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_df]) + expected["forecast_parameters"] = '{"trained_parameters": "yes"}' + expected["aggregation_period"] = "period" + + # not going to check all the metadata columns + # in assert_frame_equal. 
Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + } + assert set(expected.columns) | metadata_columns == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + expected[numeric_cols] = expected[numeric_cols].astype(float) + output_df[numeric_cols] = output_df[numeric_cols].astype(float) pd.testing.assert_frame_equal( - best_model.history, forecast._build_train_dataframe(observed_df) + output_df.sort_values(["a", "submission_date"])[expected.columns].reset_index( + drop=True + ), + expected.sort_values(["a", "submission_date"]).reset_index(drop=True), ) + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) -def test_autotuner_predict(mocker): + # check components + # only checking that concatenation happened properly + # with segment data added + output_components = forecast.components_df + expected_components = pd.DataFrame({"testcol": [1, 2], "a": ["A1", "A2"]}) + pd.testing.assert_frame_equal(expected_components, output_components) + + +def test_under_predict(mocker): """testing _predict""" - mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) - # mock_get_crossvalidation_metric will choose the parameters that - # have the lowest absolute product - mocker.patch.object( - ProphetAutotunerForecast, - "_get_crossvalidation_metric", - mock_get_crossvalidation_metric, + # set segment models + + A1_start_date = TEST_DATE_STR + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + } + ] + + predict_start_date = TEST_DATE_NEXT_DAY_STR + predict_end_date = TEST_PREDICT_END_STR + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_list, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, ) - forecast = ProphetAutotunerForecast( - growth="testval", - grid_parameters={ - "seasonality_prior_scale": [1, 2], - "holidays_prior_scale": [20, 10], - }, + # this ensures forecast is using MockModel + mocker.patch.object(forecast, "_build_model", mock_build_model) + # the optimization is just using the value attribute of MockModel, + # which is the product of the parameteres passed. 
The crossvalidation + # will choose the parameters where the absolute value of the product is smallest + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric ) observed_df = pd.DataFrame( { "a": ["A1", "A1"], "b": ["B1", "B2"], - "submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] - ), - "y": [1, 2], + "y": [0, 1], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], } ) - forecast.fit(observed_df) + segment_list = ["a"] + + # manually set segment_models attribute here instead of in __post_init__ + # which is bypassed to avoid a metric hub call + forecast._set_segment_models( + observed_df=observed_df, segment_column_list=segment_list + ) + # check that we only have one element here + assert len(forecast.segment_models) == 1 + # because of the check above we can use the first element + # and know that's all the segments present + segment_settings = forecast.segment_models[0] dates_to_predict = pd.DataFrame( { - "submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] - ) + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] } ) - - out = forecast.predict(dates_to_predict).reset_index(drop=True) + forecast.observed_df = observed_df + forecast.fit() + out = forecast._predict(dates_to_predict, segment_settings).reset_index(drop=True) # in MockModel, the predictive_samples method sets the output to # np.arange(len(dates_to_predict)) * self.value for one column called 0 # this helps ensure the forecast_df in segment_models is set properly - model_value = forecast.model.value + model_value = forecast.segment_models[0].segment_model.value expected = pd.DataFrame( { 0: [0, model_value], - "submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] - ), + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], } ) @@ -808,9 +588,6 @@ def test_autotuner_predict(mocker): # check the components expected_components = observed_df[["submission_date", "y"]].copy() - expected_components["submission_date"] = pd.to_datetime( - expected_components["submission_date"] - ) expected_components[ [ "yhat", @@ -826,66 +603,291 @@ def test_autotuner_predict(mocker): ] ] = 0 - components_df = forecast.components_df + components_df = forecast.segment_models[0].components_df assert set(expected_components.columns) == set(components_df.columns) pd.testing.assert_frame_equal( components_df, expected_components[components_df.columns] ) -def test_funnelforecast_fit(mocker): - """test the fit method, and implicitly the set_segment_models method""" - # arbitrarily choose growth as a parameter - # to set in order to check the test +def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the predict method. 
This is similar to test_under_predict + but multiple segments are acted upon""" + + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2"], + "y": [-1, 1, -1, 1], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests.observed_df = observed_data + funnel_forecast_for_fit_tests.fit() + funnel_forecast_for_fit_tests.predict() + + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + model_value = segment_info_fit_tests[key]["min_param_value"] + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected_raw = pd.DataFrame( + { + 0: [0, model_value], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + # filter in predict happens against object start_date not + # segment start_date + expected_time_filter = ( + expected_raw["submission_date"] + >= pd.to_datetime(funnel_forecast_for_fit_tests.start_date).date() + ) + expected = expected_raw[expected_time_filter].reset_index(drop=True) + + forecast_df = segment.forecast_df + pd.testing.assert_frame_equal(forecast_df, expected) + + # check the components + expected_components = expected_raw[["submission_date"]].copy() + expected_components[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 + + # because of time filtereing of training data, if the history has one + # element, y will but [0, 1]. 
The first element is turned into a NULL + # and then becomes a 0 because of fillna(0) + # if it has two it will have both elements and be [-1,1] + + if len(segment.segment_model.history) == 2: + expected_components["y"] = [-1, 1] + else: + expected_components["y"] = [0, 1] + + components_df = segment.components_df + + # there is weird stuff going on with the types but it shouldn't matter + # so coerce the type + expected_components["y"] = expected_components["y"].astype( + components_df["y"].dtype + ) + assert set(expected_components.columns) == set(components_df.columns) + pd.testing.assert_frame_equal( + components_df, + expected_components[components_df.columns], + check_column_type=False, + ) + + +def test_auto_tuning(forecast, mocker): + """test the auto_tuning function""" + + # set one segment with two sets of grid parameters + segment_settings = SegmentModelSettings( + segment={"a": "A1"}, + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, + holidays=[], + regressors=[], + grid_parameters={"param1": [1, 2], "param2": [20, 10]}, + cv_settings={}, + ) + + mocker.patch.object(forecast, "_build_model", mock_build_model) + + # mock_get_crossvalidation_metric will choose the parameters that + # have the lowest absolute product + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + ) + + observed_df = pd.DataFrame( + { + "a": ["A1", "A1"], + "b": ["B1", "B2"], + "submission_date": [ + TEST_DATE, + TEST_DATE, + ], + } + ) + + forecast.segment_models = [segment_settings] + + best_params = forecast._auto_tuning(observed_df, segment_settings) + + # in the mocked class the two params get multiplied and the lowest combo gets select + assert best_params == {"param1": 1, "param2": 10} + + +def test_under_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the _fit method""" + + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2"], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests._fit(observed_data) + + # _fit iterates though all the segments in segment_modles + # iterate through them and check based on the value in + # segment_info_fit_tests defined in the fixture of the same name + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + assert segment.start_date == segment_info_fit_tests[key]["start_date"] + assert segment.grid_parameters == segment_info_fit_tests[key]["grid_parameters"] + segment_model = segment.segment_model + assert segment_model.value == segment_info_fit_tests[key]["min_param_value"] + + # the history attribute is used in the components output so check it is set properly + expected_training = observed_data[ + (observed_data["a"] == key) + & ( + observed_data["submission_date"] + >= pd.to_datetime(segment_info_fit_tests[key]["start_date"]).date() + ) + ].rename(columns={"submission_date": "ds"}) + + pd.testing.assert_frame_equal(segment_model.history, expected_training) + + +def test_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the fit function. It is inherited from BaseForecast + and calls _fit with the proper object attributes. 
Test looks very + similar to that for _fit""" + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2"], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests.observed_df = observed_data + funnel_forecast_for_fit_tests.fit() + + # _fit is called by fit and iterates though all the segments in segment_modles + # iterate through them and check based on the value in + # segment_info_fit_tests defined in the fixture of the same name + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + assert segment.start_date == segment_info_fit_tests[key]["start_date"] + assert segment.grid_parameters == segment_info_fit_tests[key]["grid_parameters"] + segment_model = segment.segment_model + assert segment_model.value == segment_info_fit_tests[key]["min_param_value"] + + # check history attribute + expected_training = observed_data[ + (observed_data["a"] == key) + & ( + observed_data["submission_date"] + >= pd.to_datetime(segment_info_fit_tests[key]["start_date"]).date() + ) + ].rename(columns={"submission_date": "ds"}) + pd.testing.assert_frame_equal(segment_model.history, expected_training) + + +def test_set_segment_models(): + """test the set_segment_models method""" + A1_start_date = "2018-01-01" + A2_start_date = "2020-02-02" parameter_list = [ { "segment": {"a": "A1"}, - "parameters": { - "growth": "logistic", - "grid_parameters": { - "seasonality_prior_scale": [1, 2], - "holidays_prior_scale": [20, 10], - }, - }, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, { "segment": {"a": "A2"}, - "parameters": { - "growth": "A2", - "grid_parameters": { - "seasonality_prior_scale": [3, 4], - "holidays_prior_scale": [40, 30], - }, - }, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, ] - mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) - mocker.patch.object( - ProphetAutotunerForecast, - "_get_crossvalidation_metric", - mock_get_crossvalidation_metric, + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_list, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, ) - ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2", "B2"], - "submission_date": [ - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - ], - "value": [1, 2, 3, 4, 5], - } + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} ) - ensemble_object.fit(observed_data) + segment_list = ["a", "b"] - segment_models = ensemble_object.segment_models + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) # put the segments and the start date in the same dictionary to make # comparison easier @@ -894,18 +896,14 @@ def test_funnelforecast_fit(mocker): # start_date is a stand-in for these parameters and # is determined by the value of a as specified in parameter_dict 
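# --- Illustrative sketch, not part of the patch ---
# The expected list asserted against below can be derived by hand:
# _set_segment_models is expected to create one segment per unique (a, b)
# combination present in the observed data, with each combination inheriting the
# start_date configured for its "a" value in parameter_list.
# `expected_segment_list` is a hypothetical helper written only for this example;
# the production code may construct its segments differently.
def expected_segment_list(observed_df, start_dates_by_a):
    """Return one dict per unique (a, b) pair, carrying the start_date for its "a" value."""
    combos = observed_df[["a", "b"]].drop_duplicates().to_dict("records")
    return [{**combo, "start_date": start_dates_by_a[combo["a"]]} for combo in combos]

# For example, expected_segment_list(observed_data, {"A1": A1_start_date, "A2": A2_start_date})
# reproduces the four-element `expected` list used in the comparison below.
# --- end sketch ---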
check_segment_models = [ - dict( - **el["segment"], - **{"value": el["model"].model.value, "growth": el["model"].growth}, - ) - for el in segment_models + dict(**el.segment, **{"start_date": el.start_date}) + for el in forecast.segment_models ] - expected = [ - {"a": "A1", "b": "B1", "growth": "logistic", "value": 10}, - {"a": "A1", "b": "B2", "growth": "logistic", "value": 10}, - {"a": "A2", "b": "B1", "growth": "A2", "value": 90}, - {"a": "A2", "b": "B2", "growth": "A2", "value": 90}, + {"a": "A1", "b": "B1", "start_date": A1_start_date}, + {"a": "A1", "b": "B2", "start_date": A1_start_date}, + {"a": "A2", "b": "B1", "start_date": A2_start_date}, + {"a": "A2", "b": "B2", "start_date": A2_start_date}, ] # can't make a set of dicts for comparison @@ -918,106 +916,76 @@ def test_funnelforecast_fit(mocker): for checkval, expectedval in compare_sorted: assert checkval == expectedval - # test that the seed was set for all models during fitting - assert all([el["model"]._set_seed for el in segment_models]) - # test that the fit was applied properly to all models - # to do this check the is_fit attribute, which will equal - # A1_start_date for A1 segments and A2_start_date for A2 segments - - # check that it fit by making sure model.history is not null - for segment in segment_models: - subset = observed_data[ - (observed_data["a"] == segment["segment"]["a"]) - & (observed_data["b"] == segment["segment"]["b"]) - ] - subset = subset.rename(columns={"submission_date": "ds", "value": "y"}) - if segment["segment"]["a"] == "A1": - if segment["segment"]["b"] == "B1": - floor = 0.5 * 1 - cap = 1.5 * 1 - else: - floor = 0.5 * 2 - cap = 1.5 * 2 - subset["floor"] = floor - subset["cap"] = cap - pd.testing.assert_frame_equal(subset, segment["model"].model.history) - - -def test_funnelforecast_fit_multiple(mocker): +def test_set_segment_models_multiple(): """test the set_segment_models method with segments on multiple columns""" - # arbitrarily choose growth as a parameter - # to set in order to check the test + # set arbitrary dates + # they're only used to make sure segments are set correctly + A1B1_start_date = "2018-01-01" + A1B2_start_date = "2019-01-01" + A2B1_start_date = "2020-02-02" + A2B2_start_date = "2021-02-02" parameter_list = [ { "segment": {"a": "A1", "b": "B1"}, - "parameters": { - "growth": "logistic", - "grid_parameters": { - "seasonality_prior_scale": [1, 2], - "holidays_prior_scale": [20, 10], - }, - }, + "start_date": A1B1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, { - "segment": {"a": "A2", "b": "B1"}, - "parameters": { - "growth": "A2B1", - "grid_parameters": { - "seasonality_prior_scale": [3, 4], - "holidays_prior_scale": [40, 30], - }, - }, + "segment": {"a": "A1", "b": "B2"}, + "start_date": A1B2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, { - "segment": {"a": "A1", "b": "B2"}, - "parameters": { - "growth": "logistic", - "grid_parameters": { - "seasonality_prior_scale": [10, 20], - "holidays_prior_scale": [200, 100], - }, - }, + "segment": {"a": "A2", "b": "B1"}, + "start_date": A2B1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, { "segment": {"a": "A2", "b": "B2"}, - "parameters": { - "growth": "A2B2", - "grid_parameters": { - "seasonality_prior_scale": [30, 40], - "holidays_prior_scale": [400, 300], - }, - }, + "start_date": A2B2_start_date, + "end_date": 
None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, ] - mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) - mocker.patch.object( - ProphetAutotunerForecast, - "_get_crossvalidation_metric", - mock_get_crossvalidation_metric, + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_list, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, ) - ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2", "B2"], - "submission_date": [ - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - ], - "value": [1, 2, 3, 4, 5], - } + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} ) - ensemble_object.fit(observed_data) + segment_list = ["a", "b"] - segment_models = ensemble_object.segment_models + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) # put the segments and the start date in the same dictionary to make # comparison easier @@ -1026,18 +994,14 @@ def test_funnelforecast_fit_multiple(mocker): # start_date is a stand-in for these parameters and # is determined by the value of a as specified in parameter_dict check_segment_models = [ - dict( - **el["segment"], - **{"value": el["model"].model.value, "growth": el["model"].growth}, - ) - for el in segment_models + dict(**el.segment, **{"start_date": el.start_date}) + for el in forecast.segment_models ] - expected = [ - {"a": "A1", "b": "B1", "growth": "logistic", "value": 10}, - {"a": "A1", "b": "B2", "growth": "logistic", "value": 1000}, - {"a": "A2", "b": "B1", "growth": "A2B1", "value": 90}, - {"a": "A2", "b": "B2", "growth": "A2B2", "value": 9000}, + {"a": "A1", "b": "B1", "start_date": A1B1_start_date}, + {"a": "A1", "b": "B2", "start_date": A1B2_start_date}, + {"a": "A2", "b": "B1", "start_date": A2B1_start_date}, + {"a": "A2", "b": "B2", "start_date": A2B2_start_date}, ] # can't make a set of dicts for comparison @@ -1050,376 +1014,625 @@ def test_funnelforecast_fit_multiple(mocker): for checkval, expectedval in compare_sorted: assert checkval == expectedval - # test that the seed was set for all models during fitting - assert all([el["model"]._set_seed for el in segment_models]) - - # test that the fit was applied properly to all models - # to do this check the is_fit attribute, which will equal - # A1_start_date for A1 segments and A2_start_date for A2 segments - - # check that it fit by making sure model.history is not null - for segment in segment_models: - subset = observed_data[ - (observed_data["a"] == segment["segment"]["a"]) - & (observed_data["b"] == segment["segment"]["b"]) - ] - subset = subset.rename(columns={"submission_date": "ds", "value": "y"}) - if segment["segment"]["a"] == "A1": - if segment["segment"]["b"] == "B1": - floor = 0.5 * 1 - cap = 1.5 * 1 - else: - floor = 0.5 * 2 - cap = 1.5 * 2 - subset["floor"] = floor - subset["cap"] = cap - pd.testing.assert_frame_equal(subset, segment["model"].model.history) - - -def test_funnel_predict(mocker): - """test the predict method. 
This is similar to test_under_predict - but multiple segments are acted upon""" - # arbitrarily choose growth as a parameter - # to set in order to check the test +def test_set_segment_models_exception(): + """test the exception for segment_models where + and exception is raised if a model_setting_split_dim + is specified that isn't in the data""" + A1_start_date = "2018-01-01" + A2_start_date = "2020-02-02" parameter_list = [ { - "segment": {"a": "A1"}, - "parameters": { - "growth": "logistic", - "grid_parameters": { - "seasonality_prior_scale": [1, 2], - "holidays_prior_scale": [20, 10], - }, - }, + "segment": {"c": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, { - "segment": {"a": "A2"}, - "parameters": { - "growth": "A2", - "grid_parameters": { - "seasonality_prior_scale": [3, 4], - "holidays_prior_scale": [40, 30], - }, - }, + "segment": {"c": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, ] - mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) - mocker.patch.object( - ProphetAutotunerForecast, - "_get_crossvalidation_metric", - mock_get_crossvalidation_metric, + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_list, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, ) - ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) observed_data = pd.DataFrame( + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + ) + + segment_list = ["a", "b"] + + with pytest.raises( + ValueError, + match="Segment keys missing from metric hub segments: c", + ): + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + + +def test_fill_regressor_dates(forecast): + """test _fill_regressor_dates + the name in the regressor info indicates which case is being tested + Dates are chosen arbitrarily""" + # get the set start and end dates for the forecast fixture + # as datetime objects + default_start_datetime = datetime(TEST_DATE.year, TEST_DATE.month, TEST_DATE.day) + default_end_datetime = datetime( + TEST_PREDICT_END.year, TEST_PREDICT_END.month, TEST_PREDICT_END.day + ) + + # set the start date with an arbitrary date + regressor_info = { + "name": "only_start", + "description": "only has a start", + "start_date": "2020-08-15", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2020-08-15") + + # this is the end dat for the forecast fixture + assert regressor.end_date == default_end_datetime + + # set the end date with an arbitrary date + regressor_info = { + "name": "only_end", + "description": "only has a end", + "end_date": "2125-08-15", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + # the start date for the forecast fixture is TEST_DATE + assert regressor.start_date == default_start_datetime + assert regressor.end_date == pd.to_datetime("2125-08-15") + + # set both the start and end dates to arbitrary dates + regressor_info = { + "name": "both", + "description": "only has a start", + "start_date": "2020-08-15", + "end_date": "2020-09-15", + } + regressor = ProphetRegressor(**regressor_info) + 
forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2020-08-15") + assert regressor.end_date == pd.to_datetime("2020-09-15") + + # use the defaults for both + regressor_info = { + "name": "neither", + "description": "nothin to see here", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == default_start_datetime + assert regressor.end_date == default_end_datetime + + # use arbitrary out of order dates to set + regressor_info = { + "name": "out_of_order", + "description": "best better break", + "start_date": "2020-08-15", + "end_date": "2000-09-15", + } + regressor = ProphetRegressor(**regressor_info) + with pytest.raises( + Exception, + match="Regressor out_of_order start date comes after end date", + ): + forecast._fill_regressor_dates(regressor) + + +def test_add_regressors(forecast): + """test add regressors + test case for each element of regressor_list_raw is indicated in name""" + + # choose arbitrary dates for dates + # name indicates the relationship of the window + # to the timeframe of the data as defined in the ds + # column of df below + regressor_list_raw = [ { - "a": ["A1", "A1", "A2", "A2", "A2"] * 2, - "b": ["B1", "B2", "B1", "B2", "B2"] * 2, - "submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE, - TEST_DATE, - TEST_DATE, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - ] - ), - "value": [1, 2, 3, 4, 5] * 2, + "name": "all_in", + "description": "it's all in", + "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all out", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second half", + "start_date": "2124-01-03", + "end_date": "2124-02-06", + }, + { + "name": "just_middle", + "description": "just the middle two", + "start_date": "2124-01-02", + "end_date": "2124-01-03", + }, + ] + + regressor_list = [ProphetRegressor(**r) for r in regressor_list_raw] + + df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], } ) - ensemble_object.fit(observed_data) + output_df = forecast._add_regressors(df, regressors=regressor_list) - dates_to_predict = pd.DataFrame( + expected_df = pd.DataFrame( { - "submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] - ) + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + "all_in": [0, 0, 0, 0], + "all_out": [1, 1, 1, 1], + "just_end": [1, 1, 0, 0], + "just_middle": [1, 0, 0, 1], } ) - out = ensemble_object.predict(dates_to_predict).reset_index(drop=True) + assert set(output_df.columns) == set(expected_df.columns) + pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) - for segment in ensemble_object.segment_models: - # in MockModel, the predictive_samples method sets the output to - # np.arange(len(dates_to_predict)) * self.value for one column called 0 - # this helps ensure the forecast_df in segment_models is set properly - out_subset = out[ - (out["a"] == segment["segment"]["a"]) - & (out["b"] == segment["segment"]["b"]) - ] - model_value = segment["model"].model.value - expected = pd.DataFrame( - { - 0: [0, model_value], - 
"submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] - ), - "a": [segment["segment"]["a"], segment["segment"]["a"]], - "b": [segment["segment"]["b"], segment["segment"]["b"]], - "forecast_parameters": [json.dumps(segment["model"]._get_parameters())] - * 2, - } - ) - pd.testing.assert_frame_equal( - out_subset.reset_index(drop=True), expected.reset_index(drop=True) - ) +def test_build_train_dataframe_no_regressors(forecast): + """test _build_train_dataframe with no regressors""" + regressor_list = [] - # check the components - expected_components = ( - observed_data.loc[ - (observed_data["a"] == segment["segment"]["a"]) - & (observed_data["b"] == segment["segment"]["b"]), - ["submission_date", "value"], - ] - .rename(columns={"value": "y"}) - .copy() - ) - expected_components[ - [ - "yhat", - "trend", - "trend_upper", - "trend_lower", - "weekly", - "weekly_upper", - "weekly_lower", - "yearly", - "yearly_upper", - "yearly_lower", - ] - ] = 0 + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) - components_df = segment["model"].components_df - assert set(expected_components.columns) == set(components_df.columns) - pd.testing.assert_frame_equal( - components_df.reset_index(drop=True), - expected_components[components_df.columns].reset_index(drop=True), - ) + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + "submission_date": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE + relativedelta(months=1), + TEST_DATE_NEXT_DAY + relativedelta(months=1), + ], + } + ) + + output_train_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings + ) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + # test again but with add_logistic_growth_cols set to true + output_train_wlog_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings, add_logistic_growth_cols=True + ) + expected_train_wlog_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) -def test_funnel_predict_growth(mocker): - """test the predict method when growth is set in the - grid parameters. 
Extra attributes need to be updated with this one""" + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) - # arbitrarily choose growth as a parameter - # to set in order to check the test - parameter_list = [ + +def test_build_train_dataframe(forecast): + """test _build_train_dataframe and include regressors""" + regressor_list = [ { - "segment": {"a": "A1"}, - "parameters": { - "grid_parameters": { - "seasonality_prior_scale": [1, 2], - "holidays_prior_scale": [20, 10], - "growth": "logistic", - }, - }, + "name": "all_in", + "description": "it's all in", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { - "segment": {"a": "A2"}, - "parameters": { - "growth": "A2", - "grid_parameters": { - "seasonality_prior_scale": [3, 4], - "holidays_prior_scale": [40, 30], - }, - }, + "name": "all_out", + "description": "it's all in", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] - mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) - mocker.patch.object( - ProphetAutotunerForecast, - "_get_crossvalidation_metric", - mock_get_crossvalidation_metric, + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date=TEST_DATE_STR, + end_date=(TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, ) - ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) - observed_data = pd.DataFrame( + observed_df = pd.DataFrame( { - "a": ["A1", "A1", "A2", "A2", "A2"] * 2, - "b": ["B1", "B2", "B1", "B2", "B2"] * 2, - "submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE, - TEST_DATE, - TEST_DATE, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - TEST_DATE_NEXT_DAY, - ] - ), - "value": [1, 2, 3, 4, 5] * 2, + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + "submission_date": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE + relativedelta(months=1), + TEST_DATE_NEXT_DAY + relativedelta(months=1), + ], } ) + output_train_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings + ) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "all_in": [0, 0], + "all_out": [ + 1, + 1, + ], + "just_end": [1, 0], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) - 
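# --- Illustrative sketch, not part of the patch ---
# The regressor columns expected in the _build_train_dataframe tests above
# (all_in, all_out, just_end) follow an indicator convention: 0 when the date
# falls inside the regressor's [start_date, end_date] window and 1 when it falls
# outside. This is inferred from the expected test values; `regressor_indicator`
# is a hypothetical helper written only to make that convention explicit.
import pandas as pd

def regressor_indicator(dates: pd.Series, start_date, end_date) -> pd.Series:
    """Return 0 for dates inside the window and 1 for dates outside it."""
    dates = pd.to_datetime(dates)
    inside = (dates >= pd.to_datetime(start_date)) & (dates <= pd.to_datetime(end_date))
    return (~inside).astype(int)
# --- end sketch ---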
ensemble_object.fit(observed_data) - - dates_to_predict = pd.DataFrame( + output_train_wlog_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings, add_logistic_growth_cols=True + ) + expected_train_wlog_df = pd.DataFrame( { - "submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] - ) + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], } ) - out = ensemble_object.predict(dates_to_predict).reset_index(drop=True) + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) - for segment in ensemble_object.segment_models: - # in MockModel, the predictive_samples method sets the output to - # np.arange(len(dates_to_predict)) * self.value for one column called 0 - # this helps ensure the forecast_df in segment_models is set properly - out_subset = out[ - (out["a"] == segment["segment"]["a"]) - & (out["b"] == segment["segment"]["b"]) - ] - model_value = segment["model"].model.value - expected = pd.DataFrame( - { - 0: [0, model_value], - "submission_date": pd.to_datetime( - [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] - ), - "a": [segment["segment"]["a"], segment["segment"]["a"]], - "b": [segment["segment"]["b"], segment["segment"]["b"]], - "forecast_parameters": [json.dumps(segment["model"]._get_parameters())] - * 2, - } - ) - pd.testing.assert_frame_equal( - out_subset.reset_index(drop=True), expected.reset_index(drop=True) - ) +def test_build_predict_dataframe_no_regressors(forecast): + """test _build_predict with no regressors""" + regressor_list = [] - # check that the growth attributes were set - if segment["segment"]["a"] == "A1": - if segment["segment"]["b"] == "B1": - assert segment["model"].logistic_growth_floor == 0.5 - assert segment["model"].logistic_growth_cap == 1.5 - elif segment["segment"]["b"] == "B2": - assert segment["model"].logistic_growth_floor == 1.0 - assert segment["model"].logistic_growth_cap == 3.0 + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) - # check the components - expected_components = ( - observed_data.loc[ - (observed_data["a"] == segment["segment"]["a"]) - & (observed_data["b"] == segment["segment"]["b"]), - ["submission_date", "value"], - ] - .rename(columns={"value": "y"}) - .copy() - ) - expected_components[ - [ - "yhat", - "trend", - "trend_upper", - "trend_lower", - "weekly", - "weekly_upper", - "weekly_lower", - "yearly", - "yearly_upper", - "yearly_lower", - ] - ] = 0 + # manually set trained_parameters, normally this would happen during training + segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} - components_df = segment["model"].components_df - assert set(expected_components.columns) == set(components_df.columns) 
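# --- Illustrative sketch, not part of the patch ---
# The floor/cap values asserted in the logistic-growth checks above are
# consistent with deriving the bounds from the training values as
# floor = 0.5 * min(y) and cap = 1.5 * max(y). That relationship is inferred
# from the expected test values only; `logistic_bounds` is a hypothetical
# helper, not the implementation under test.
def logistic_bounds(y_values):
    """Return (floor, cap) following the 0.5*min / 1.5*max pattern seen in these tests."""
    return 0.5 * min(y_values), 1.5 * max(y_values)

# e.g. logistic_bounds([3, 4]) == (1.5, 6.0), matching the floor/cap columns
# expected in the _build_train_dataframe tests.
# --- end sketch ---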
- pd.testing.assert_frame_equal( - components_df.reset_index(drop=True), - expected_components[components_df.columns].reset_index(drop=True), - ) + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + output_predict_df = forecast._build_predict_dataframe( + dates_to_predict, segment_settings=segment_settings + ) + expected_predict_df = pd.DataFrame( + { + "ds": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + pd.testing.assert_frame_equal( + output_predict_df.reset_index(drop=True), expected_predict_df + ) -def test_set_segment_models_exception(mocker): - """test the exception for segment_models where - and exception is raised if a model_setting_split_dim - is specified that isn't in the data""" - # arbitrarily choose growth as a parameter - # to set in order to check the test - parameter_list = [ + # test against but with add_logistic_growth_cols set to true + output_predict_wlog_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, + add_logistic_growth_cols=True, + ) + expected_predict_wlog_df = pd.DataFrame( { - "segment": {"c": "A1"}, - "parameters": { - "growth": "logistic", - "grid_parameters": { - "seasonality_prior_scale": [1, 2], - "holidays_prior_scale": [20, 10], - }, - }, + "ds": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], + "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], + } + ) + + assert set(output_predict_wlog_df.columns) == set(expected_predict_wlog_df.columns) + pd.testing.assert_frame_equal( + output_predict_wlog_df.reset_index(drop=True), + expected_predict_wlog_df[output_predict_wlog_df.columns], + ) + + +def test_build_predict_dataframe(forecast): + """test _build_predict_dataframe including regressors""" + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { - "segment": {"c": "A2"}, - "parameters": { - "growth": "A2", - "grid_parameters": { - "seasonality_prior_scale": [3, 4], - "holidays_prior_scale": [40, 30], - }, - }, + "name": "all_out", + "description": "it's all in", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] - mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) - mocker.patch.object( - ProphetAutotunerForecast, - "_get_crossvalidation_metric", - mock_get_crossvalidation_metric, + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = 
SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, ) - ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) - observed_data = pd.DataFrame( + # set training_parameters, which is usually done in the fit method + segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} + + dates_to_predict = pd.DataFrame( { - "a": ["A1", "A1", "A2", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2", "B2"], - "submission_date": [ - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - TEST_DATE_STR, - ], - "value": [1, 2, 3, 4, 5], + "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], } ) - with pytest.raises( - ValueError, - match="Segment keys missing from metric hub segments: c", - ): - ensemble_object.fit(observed_df=observed_data) + output_train_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, + ) + expected_train_df = pd.DataFrame( + { + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + # test again but with add_logistic_growth_cols set to true + output_train_wlog_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, + add_logistic_growth_cols=True, + ) + expected_train_wlog_df = pd.DataFrame( + { + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + "floor": [-1.0, -1.0], + "cap": [10.0, 10.0], + } + ) -def test_build_model(): + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_model(forecast): """test build_model just runs the function and ensures no error is raised""" - regressor_list = ["post_esr_migration", "in_covid", "ad_click_bug"] + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), + }, + ] # use holidays from holiday config file holiday_list = { @@ -1468,16 +1681,29 @@ def test_build_model(): "horizon": "30 days", "parallel": "processes", } - forecast = ProphetAutotunerForecast( - holidays=holiday_list.keys(), - regressors=regressor_list, + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, + holidays=[ProphetHoliday(**h) for h in holiday_list.values()], + regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, cv_settings=cv_settings, ) - _ = forecast._build_model() + model = forecast._build_model( + segment_settings=segment_settings, + parameters={ + "changepoint_prior_scale": 
0.01, + "changepoint_range": 0.8, + "n_changepoints": 30, + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + }, + ) - holiday_df = forecast.holidays + holiday_df = model.holidays expected_holidays = pd.concat( [ pd.DataFrame( diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py index 842740e6..c512e0c9 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py @@ -52,61 +52,6 @@ def test_only_numeric(): pd.testing.assert_frame_equal(month_output, expected_month) -def test_only_numeric_with_additional(): - df = pd.DataFrame( - { - "submission_date": [ - "2020-01-01", - "2020-01-01", - "2020-01-02", - "2020-01-02", - "2020-01-02", - ], - "additional_col": ["A", "B", "A", "A", "B"], - "ints": [1, 2, 3, 4, 5], - "floats": [10.0, 20.0, 30.0, 40.0, 50.0], - } - ) - - day_output = aggregate_to_period( - df, "day", additional_aggregation_columns=["additional_col"] - ) - - expected_day = pd.DataFrame( - { - "submission_date": [ - pd.to_datetime("2020-01-01"), - pd.to_datetime("2020-01-01"), - pd.to_datetime("2020-01-02"), - pd.to_datetime("2020-01-02"), - ], - "additional_col": ["A", "B", "A", "B"], - "ints": [1, 2, 7, 5], - "floats": [10.0, 20.0, 70.0, 50.0], - } - ) - - pd.testing.assert_frame_equal(day_output, expected_day) - - month_output = aggregate_to_period( - df, "month", additional_aggregation_columns=["additional_col"] - ) - - expected_month = pd.DataFrame( - { - "submission_date": [ - pd.to_datetime("2020-01-01"), - pd.to_datetime("2020-01-01"), - ], - "additional_col": ["A", "B"], - "ints": [8, 7], - "floats": [80.0, 70.0], - } - ) - - pd.testing.assert_frame_equal(month_output, expected_month) - - def test_with_string_and_numeric(): df = pd.DataFrame( { @@ -158,55 +103,6 @@ def test_with_string_and_numeric(): pd.testing.assert_frame_equal(month_output, expected_month) -def test_with_string_and_numeri_with_additional(): - df = pd.DataFrame( - { - "submission_date": [ - "2020-01-01", - "2020-01-01", - "2020-01-02", - "2020-01-02", - "2020-01-02", - ], - "ints": [1, 2, 3, 4, 5], - "floats": [10.0, 20.0, 30.0, 40.0, 50.0], - "string": ["A01", "B01", "A02", "A02", "B02"], - "additional_col": ["A", "B", "A", "A", "B"], - } - ) - - day_output = aggregate_to_period( - df, "day", additional_aggregation_columns=["additional_col"] - ) - - expected_day = pd.DataFrame( - { - "submission_date": [ - pd.to_datetime("2020-01-01"), - pd.to_datetime("2020-01-01"), - pd.to_datetime("2020-01-02"), - pd.to_datetime("2020-01-02"), - ], - "additional_col": ["A", "B", "A", "B"], - "ints": [1, 2, 7, 5], - "floats": [10.0, 20.0, 70.0, 50.0], - "string": ["A01", "B01", "A02", "B02"], - } - ) - - pd.testing.assert_frame_equal(day_output, expected_day) - - # strings no longer have the same value within an aggregation category - # so error is expected - with pytest.raises( - ValueError, - match="String and Numeric dataframes have different length, likely due to strings not being unique up to aggregation", - ): - _ = aggregate_to_period( - df, "month", additional_aggregation_columns=["additional_col"] - ) - - def test_only_string(): df = pd.DataFrame( { @@ -252,60 +148,6 @@ def test_only_string(): pd.testing.assert_frame_equal(month_output, expected_month) -def test_only_string_with_additional(): - df = pd.DataFrame( - { - "submission_date": [ - "2020-01-01", - "2020-01-01", - "2020-01-02", - 
"2020-02-01", - "2020-02-02", - ], - "string": ["jan", "jan", "jan", "feb", "feb"], - "additional_col": ["jan", "jan", "jan", "feb", "feb"], - } - ) - - day_output = aggregate_to_period( - df, "day", additional_aggregation_columns=["additional_col"] - ) - - expected_day = pd.DataFrame( - { - "submission_date": [ - pd.to_datetime("2020-01-01"), - pd.to_datetime("2020-01-02"), - pd.to_datetime("2020-02-01"), - pd.to_datetime("2020-02-02"), - ], - "string": ["jan", "jan", "feb", "feb"], - "additional_col": ["jan", "jan", "feb", "feb"], - } - ) - - assert set(day_output.columns) == set(expected_day.columns) - pd.testing.assert_frame_equal(day_output, expected_day[day_output.columns]) - - month_output = aggregate_to_period( - df, "month", additional_aggregation_columns=["additional_col"] - ) - - expected_month = pd.DataFrame( - { - "submission_date": [ - pd.to_datetime("2020-01-01"), - pd.to_datetime("2020-02-01"), - ], - "string": ["jan", "feb"], - "additional_col": ["jan", "feb"], - } - ) - - assert set(month_output.columns) == set(expected_month.columns) - pd.testing.assert_frame_equal(month_output, expected_month[month_output.columns]) - - def test_non_unique_string_exception(): df = pd.DataFrame( { diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 928b7ba3..adc9c4ba 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -4,31 +4,53 @@ import pandas as pd import numpy as np import pytest +import collections -from kpi_forecasting.models.prophet_forecast import ( - ProphetForecast, - combine_forecast_observed, - aggregate_forecast_observed, - summarize, -) -from kpi_forecasting.configs.model_inputs import ProphetRegressor - +from kpi_forecasting.models.prophet_forecast import ProphetForecast # Arbitrarily choose some date to use for the tests TEST_DATE = date(2024, 1, 1) TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") -TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY = date(2024, 1, 1) TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") -TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) -TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") + + +@pytest.fixture +def forecast(): + A1_start_date = TEST_DATE_STR + parameter_dict = { + "model_setting_split_dim": "a", + "segment_settings": { + "A1": { + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + }, + }, + } + + predict_start_date = TEST_DATE_NEXT_DAY_STR + # arbitarily set it a couple months in the future + predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") + return ProphetForecast( + model_type="test", + parameters=parameter_dict, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) class MockModel: """Used in place of prophet.Prophet for testing purposes""" - def __init__(self, **kwargs): - self.value = 2 + def __init__(self, param1=0, param2=0, **kwargs): + self.value = param1 * param2 self.history = None def fit(self, df, *args, **kwargs): @@ -62,66 +84,52 @@ def predictive_samples(self, dates_to_predict): return {"yhat": {0: output}} -def mock_build_model(self): +def mock_build_model(parameters): """mocks the FunnelForecast build_model method""" - return MockModel(holidays=self.holidays, 
regressors=self.regressors) - + return MockModel( + **parameters, + ) -@pytest.fixture -def forecast(mocker): - parameter_dict = {"uncertainty_samples": 1} - mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) +def mock_aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles +): + """Mocks the aggregate_forecast_observed function defined in ProphetForecast + and inherited in FunnelForecast. + This function is tested extensively in test_prophet_forecast + so we can make dummy outputs for tests related to it""" - # arbitarily set it a couple months in the future - return ProphetForecast(**parameter_dict) + # add dummy columns where aggregated metrics woudl go + percentile_columns = [f"p{el}" for el in percentiles] + output_forecast_df = forecast_df.copy() + output_forecast_df[numpy_aggregations + percentile_columns] = 0 + return output_forecast_df, observed_df.copy() -def test_predict(forecast): - """testing _predict""" +def test_under_fit(forecast, mocker): + """test the _fit method""" - observed_df = pd.DataFrame( + observed_data = pd.DataFrame( { - "y": [0, 1], "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, - ], - } - ) - - dates_to_predict = pd.DataFrame( - { - "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, - ] + ], } ) + mocker.patch.object(forecast, "_build_model", mock_build_model) - forecast.fit(observed_df) - - # to make sure the validation works set the number of simulations - out = forecast.predict(dates_to_predict).reset_index(drop=True) + forecast._fit(observed_data) - # in MockModel, the predictive_samples method sets the output to - # np.arange(len(dates_to_predict)) * self.value for one column called 0 - # this helps ensure the forecast_df in segment_models is set properly - # self.value is 2 - expected = pd.DataFrame( - { - 0: [0, 2], - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } + # checking that history is set in the mocked Model ensures fit was called on it + pd.testing.assert_frame_equal( + observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history ) - pd.testing.assert_frame_equal(out, expected) - -def test_fit(forecast): +def test_fit(forecast, mocker): """test the fit function. It is inherited from BaseForecast and calls _fit with the proper object attributes. 
Test looks very similar to that for _fit""" @@ -135,972 +143,398 @@ def test_fit(forecast): ], } ) + mocker.patch.object(forecast, "_build_model", mock_build_model) - forecast.fit(observed_data) + forecast.observed_df = observed_data + forecast.fit() # checking that history is set in the mocked Model ensures fit was called on it pd.testing.assert_frame_equal( observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history ) + assert forecast.trained_at is not None -def test_aggregate_forecast_to_day(): - """tests the aggregate_forecast_observed method in the case - where the observed and forecasted have no overlap and the aggregation - happens at the day level""" - test_date_samples = np.arange(1000) - test_next_date_samples = np.arange(1000) * 2 - forecast_df = pd.DataFrame( - [ - { - **{"submission_date": TEST_DATE}, - **{i: el for i, el in enumerate(test_date_samples)}, - }, - { - **{"submission_date": TEST_DATE_NEXT_DAY}, - **{i: el for i, el in enumerate(test_next_date_samples)}, - }, - ] - ) - # rows with negative values are those expected to be removed - # by filters in summarize - # arbitrarily subtract 1 month so there's not overlap - observed_df = pd.DataFrame( +def test_combine_forecast_observed(mocker, forecast): + """tests the _combine_forecast_observed method""" + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( { "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], - "value": [10, 20], } ) - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - forecast_summarized_output, observed_summarized_output = ( - aggregate_forecast_observed( - forecast_df, - observed_df, - period="day", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - ) - ) - observed_summarized_expected_df = pd.DataFrame( + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime(TEST_DATE - relativedelta(months=1)), - pd.to_datetime(TEST_DATE_NEXT_DAY - relativedelta(months=1)), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], - "aggregation_period": ["day", "day"], } ) - forecast_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": pd.to_datetime(TEST_DATE), - "mean": np.mean(test_date_samples), - "p10": np.percentile(test_date_samples, 10), - "p50": np.percentile(test_date_samples, 50), - "p90": np.percentile(test_date_samples, 90), - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), - "mean": np.mean(test_next_date_samples), - "p10": np.percentile(test_next_date_samples, 10), - "p50": np.percentile(test_next_date_samples, 50), - "p90": np.percentile(test_next_date_samples, 90), - "aggregation_period": "day", - }, - ] - ) - - pd.testing.assert_frame_equal( - forecast_summarized_output, forecast_summarized_expected_df - ) - - pd.testing.assert_frame_equal( - observed_summarized_output, observed_summarized_expected_df + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed ) - -def test_aggregate_forecast_to_month(): - """tests the aggregate_forecast_observed method in the case - where the observed and forecasted have no overlap and the aggregation - happens at the day level""" - test_date_samples = np.arange(1000) - test_next_date_samples = np.arange(1000) * 2 - 
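# --- Illustrative sketch, not part of the patch ---
# The monthly expectations built below sum the per-day simulation samples
# element-wise within the month and then take the mean/percentiles of the summed
# samples. `summarize_month` is a hypothetical helper written only to spell that
# out; it is not the function under test.
import numpy as np

def summarize_month(daily_samples, percentiles=(10, 50, 90)):
    """Sum each simulation across the days of the month, then aggregate."""
    monthly_samples = np.sum(daily_samples, axis=0)  # element-wise sum over days
    summary = {"mean": np.mean(monthly_samples)}
    summary.update({f"p{p}": np.percentile(monthly_samples, p) for p in percentiles})
    return summary

# e.g. summarize_month([np.arange(1000), np.arange(1000) * 2]) matches the
# mean/p10/p50/p90 values expected for TEST_DATE's month in this test.
# --- end sketch ---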
forecast_df = pd.DataFrame( - [ - { - **{"submission_date": TEST_DATE, "forecast_parameters": "test_month"}, - **{i: el for i, el in enumerate(test_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "forecast_parameters": "test_month", - }, - **{i: el for i, el in enumerate(test_next_date_samples)}, - }, - ] + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._combine_forecast_observed( + forecast_df, + observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, ) - - # rows with negative values are those expected to be removed - # by filters in summarize - # arbitrarily subtract 1 month so there's not overlap - observed_df = pd.DataFrame( + observed_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], } ) - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - forecast_summarized_output, observed_summarized_output = ( - aggregate_forecast_observed( - forecast_df, - observed_df, - period="month", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - ) - ) - - # TEST_DATE should be the first of the month - observed_summarized_expected_df = pd.DataFrame( + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime(TEST_DATE - relativedelta(months=1)), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], - "value": [30], - "aggregation_period": ["month"], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, } ) - forecast_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": pd.to_datetime(TEST_DATE), - "mean": np.mean(test_date_samples + test_next_date_samples), - "p10": np.percentile(test_date_samples + test_next_date_samples, 10), - "p50": np.percentile(test_date_samples + test_next_date_samples, 50), - "p90": np.percentile(test_date_samples + test_next_date_samples, 90), - "forecast_parameters": "test_month", - "aggregation_period": "month", - }, - ] + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] ) - + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] pd.testing.assert_frame_equal( - forecast_summarized_output, forecast_summarized_expected_df + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), ) - pd.testing.assert_frame_equal( - observed_summarized_output, observed_summarized_expected_df - ) + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) -def test_aggregate_forecast_to_month_extra_agg_col(): - """tests the aggregate_forecast_observed method in the case - where the observed and forecasted have no overlap and the aggregation - happens at the day level""" - test_date_samples = np.arange(1000) - test_next_date_samples = 
np.arange(1000) * 2 +def test_under_summarize(mocker, forecast): + """testing _summarize""" + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here forecast_df = pd.DataFrame( - [ - { - **{ - "submission_date": TEST_DATE, - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_next_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE, - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(2 * test_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(2 * test_next_date_samples)}, - }, - ] + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } ) # rows with negative values are those expected to be removed # by filters in summarize - # arbitrarily subtract 1 month so there's not overlap observed_df = pd.DataFrame( { "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], - "a": ["A1", "A1"], } ) + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + numpy_aggregations = ["mean"] percentiles = [10, 50, 90] - forecast_summarized_output, observed_summarized_output = ( - aggregate_forecast_observed( - forecast_df, - observed_df, - period="month", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - additional_aggregation_columns=["a"], - ) - ) - - # TEST_DATE should be the first of the month - observed_summarized_expected_df = pd.DataFrame( + output_df = forecast._summarize( + forecast_df, + observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime(TEST_DATE - relativedelta(months=1)), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], - "value": [30], - "a": ["A1"], - "aggregation_period": "month", + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], } ) - forecast_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": pd.to_datetime(TEST_DATE), - "mean": np.mean(test_date_samples + test_next_date_samples), - "p10": np.percentile(test_date_samples + test_next_date_samples, 10), - "p50": np.percentile(test_date_samples + test_next_date_samples, 50), - "p90": np.percentile(test_date_samples + test_next_date_samples, 90), - "a": "A1", - "forecast_parameters": "A1", - "aggregation_period": "month", - }, - { - "submission_date": pd.to_datetime(TEST_DATE), - "mean": 2 * np.mean(test_date_samples + test_next_date_samples), - "p10": 2 - * np.percentile(test_date_samples + test_next_date_samples, 10), - "p50": 2 - * np.percentile(test_date_samples + test_next_date_samples, 50), - "p90": 2 - * np.percentile(test_date_samples + test_next_date_samples, 90), - "a": "A2", - "forecast_parameters": "A2", - "aggregation_period": "month", - }, - ] + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", 
"p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } ) - assert set(forecast_summarized_output.columns) == set( - forecast_summarized_output.columns - ) - pd.testing.assert_frame_equal( - forecast_summarized_output[forecast_summarized_expected_df.columns], - forecast_summarized_expected_df, + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] ) + expected["aggregation_period"] = "period" - assert set(observed_summarized_output.columns) == set( - observed_summarized_expected_df.columns - ) + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] pd.testing.assert_frame_equal( - observed_summarized_output[observed_summarized_expected_df.columns], - observed_summarized_expected_df, + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), ) + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) -def test_aggregate_forecast_observed_overlap_to_day(): - """tests the aggregate_forecast_observed method in the case - where the observed and forecasted overlap and the aggregation - happens at the day level""" - test_date_samples = np.arange(1000) - test_next_date_samples = np.arange(1000) * 2 - forecast_df = pd.DataFrame( - [ - { - **{"submission_date": TEST_DATE}, - **{i: el for i, el in enumerate(test_date_samples)}, - }, - { - **{"submission_date": TEST_DATE_NEXT_DAY}, - **{i: el for i, el in enumerate(test_next_date_samples)}, - }, - ] + +def test_summarize(mocker, forecast): + """testing summarize""" + # create dummy metric hub object to when meta data from + # it is added we don't get an error + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], ) - # rows with negative values are those expected to be removed - # by filters in summarize - observed_df = pd.DataFrame( + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( { "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, ], - "value": [10, 20], } ) - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - forecast_summarized_output, observed_summarized_output = ( - aggregate_forecast_observed( - forecast_df, - observed_df, - period="day", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - ) - ) - observed_summarized_expected_df = pd.DataFrame( + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime(TEST_DATE), - pd.to_datetime(TEST_DATE_NEXT_DAY), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], - "aggregation_period": ["day", "day"], } ) - # add values from observed because of overlap - forecast_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": pd.to_datetime(TEST_DATE), - "mean": np.mean(test_date_samples + 10), - "p10": np.percentile(test_date_samples + 10, 10), - "p50": np.percentile(test_date_samples + 10, 50), - "p90": np.percentile(test_date_samples + 10, 90), - "aggregation_period": 
"day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), - "mean": np.mean(test_next_date_samples + 20), - "p10": np.percentile(test_next_date_samples + 20, 10), - "p50": np.percentile(test_next_date_samples + 20, 50), - "p90": np.percentile(test_next_date_samples + 20, 90), - "aggregation_period": "day", - }, - ] + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed ) - pd.testing.assert_frame_equal( - forecast_summarized_output, forecast_summarized_expected_df - ) + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] - pd.testing.assert_frame_equal( - observed_summarized_output, observed_summarized_expected_df - ) + forecast.observed_df = observed_df + forecast.forecast_df = forecast_df + forecast.metric_hub = dummy_metric_hub + # timestamp attributes created by fit and predict + # must be added manuall + forecast.collected_at = "" + forecast.trained_at = "" + forecast.predicted_at = "" + forecast.metadata_params = "" -def test_aggregate_forecast_observed_overlap_to_day_with_additional(): - """tests the aggregate_forecast_observed method in the case - where the observed and forecasted overlap and the aggregation - happens at the day level""" - test_date_samples = np.arange(1000) - test_next_date_samples = np.arange(1000) * 2 - forecast_df = pd.DataFrame( - [ - { - **{ - "submission_date": TEST_DATE, - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A1", - "forecast_parameters": "A1", - }, - **{i: el for i, el in enumerate(test_next_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE, - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(2 * test_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "a": "A2", - "forecast_parameters": "A2", - }, - **{i: el for i, el in enumerate(2 * test_next_date_samples)}, - }, - ] + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast.summarize( + periods=["period1", "period2"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, ) - # rows with negative values are those expected to be removed - # by filters in summarize - observed_df = pd.DataFrame( + output_df = forecast.summary_df + + observed_expected_df = pd.DataFrame( { "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, ], "value": [10, 20], - "a": ["A1", "A2"], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], } ) - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - forecast_summarized_output, observed_summarized_output = ( - aggregate_forecast_observed( - forecast_df, - observed_df, - period="day", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - additional_aggregation_columns=["a"], - ) - ) - observed_summarized_expected_df = pd.DataFrame( + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime(TEST_DATE), - pd.to_datetime(TEST_DATE_NEXT_DAY), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], - "value": [10, 20], - "a": ["A1", "A2"], - "aggregation_period": ["day", "day"], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, } ) - # add values from observed because of overlap - forecast_summarized_expected_df = pd.DataFrame( - 
[ - { - "submission_date": pd.to_datetime(TEST_DATE), - "a": "A1", - "forecast_parameters": "A1", - "mean": np.mean(test_date_samples + 10), - "p10": np.percentile(test_date_samples + 10, 10), - "p50": np.percentile(test_date_samples + 10, 50), - "p90": np.percentile(test_date_samples + 10, 90), - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), - "a": "A1", - "forecast_parameters": "A1", - "mean": np.mean(test_next_date_samples), - "p10": np.percentile(test_next_date_samples, 10), - "p50": np.percentile(test_next_date_samples, 50), - "p90": np.percentile(test_next_date_samples, 90), - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE), - "a": "A2", - "forecast_parameters": "A2", - "mean": np.mean(2 * test_date_samples), - "p10": np.percentile(2 * test_date_samples, 10), - "p50": np.percentile(2 * test_date_samples, 50), - "p90": np.percentile(2 * test_date_samples, 90), - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), - "a": "A2", - "forecast_parameters": "A2", - "mean": np.mean(2 * test_next_date_samples + 20), - "p10": np.percentile(2 * test_next_date_samples + 20, 10), - "p50": np.percentile(2 * test_next_date_samples + 20, 50), - "p90": np.percentile(2 * test_next_date_samples + 20, 90), - "aggregation_period": "day", - }, - ] - ) - - assert set(forecast_summarized_expected_df.columns) == set( - forecast_summarized_output.columns + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] ) + expected1 = expected.copy() + expected2 = expected.copy() + expected1["aggregation_period"] = "period1" + expected2["aggregation_period"] = "period2" + + expected = pd.concat([expected1, expected2]) + + # not going to check all the metadata columns + # in assert_frame_equal. 
Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + "forecast_parameters", + } + assert set(expected.columns) | metadata_columns == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] pd.testing.assert_frame_equal( - forecast_summarized_output[forecast_summarized_expected_df.columns] - .sort_values(["submission_date", "a"]) - .reset_index(drop=True), - forecast_summarized_expected_df.sort_values( - ["submission_date", "a"] + output_df.sort_values(["submission_date", "aggregation_period", "measure"])[ + expected.columns + ].reset_index(drop=True), + expected.sort_values( + ["submission_date", "aggregation_period", "measure"] ).reset_index(drop=True), ) - assert set(observed_summarized_expected_df.columns) == set( - observed_summarized_output.columns - ) - pd.testing.assert_frame_equal( - observed_summarized_output[observed_summarized_expected_df.columns] - .sort_values(["submission_date", "a"]) - .reset_index(drop=True), - observed_summarized_expected_df.sort_values( - ["submission_date", "a"] - ).reset_index(drop=True), - ) + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) -def test_aggregate_forecast_observed_overlap_to_month(): - """tests the aggregate_forecast_observed method in the case - where the observed and forecasted overlap and the aggregation - happens at the day level""" - test_date_samples = np.arange(1000) - test_next_date_samples = np.arange(1000) * 2 - forecast_df = pd.DataFrame( - [ - { - **{"submission_date": TEST_DATE}, - **{i: el for i, el in enumerate(test_date_samples)}, - }, - { - **{"submission_date": TEST_DATE_NEXT_DAY}, - **{i: el for i, el in enumerate(test_next_date_samples)}, - }, - ] - ) +def test_under_predict(mocker, forecast): + """testing _predict""" + # this ensures forecast is using MockModel + mocker.patch.object(forecast, "_build_model", mock_build_model) - # rows with negative values are those expected to be removed - # by filters in summarize observed_df = pd.DataFrame( { + "y": [0, 1], "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, ], - "value": [10, 20], } ) - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - forecast_summarized_output, observed_summarized_output = ( - aggregate_forecast_observed( - forecast_df, - observed_df, - period="month", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - ) - ) - observed_summarized_expected_df = pd.DataFrame( + dates_to_predict = pd.DataFrame( { "submission_date": [ - pd.to_datetime(TEST_DATE), - ], - "value": [30], - "aggregation_period": ["month"], + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] } ) + forecast.observed_df = observed_df + forecast.parameters = {"param1": 1, "param2": 2} + forecast.fit() + out = forecast._predict(dates_to_predict).reset_index(drop=True) - # add values from observed because of overlap - forecast_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": pd.to_datetime(TEST_DATE), - "mean": np.mean(test_date_samples + test_next_date_samples + 30), - "p10": np.percentile( - test_date_samples + test_next_date_samples + 30, 10 - ), - "p50": np.percentile( - 
test_date_samples + test_next_date_samples + 30, 50 - ), - "p90": np.percentile( - test_date_samples + test_next_date_samples + 30, 90 - ), - "aggregation_period": "month", - }, - ] - ) - - pd.testing.assert_frame_equal( - forecast_summarized_output, forecast_summarized_expected_df - ) - - pd.testing.assert_frame_equal( - observed_summarized_output, observed_summarized_expected_df - ) - - -def test_aggregate_forecast_observed_overlap_to_month_with_additional(): - """tests the aggregate_forecast_observed method in the case - where the observed and forecasted overlap and the aggregation - happens at the day level""" - test_date_samples = np.arange(1000) - test_next_date_samples = np.arange(1000) * 2 - forecast_df = pd.DataFrame( - [ - { - **{ - "submission_date": TEST_DATE, - "forecast_parameters": "A1", - "a": "A1", - }, - **{i: el for i, el in enumerate(test_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "forecast_parameters": "A1", - "a": "A1", - }, - **{i: el for i, el in enumerate(test_next_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE, - "forecast_parameters": "A2", - "a": "A2", - }, - **{i: el for i, el in enumerate(2 * test_date_samples)}, - }, - { - **{ - "submission_date": TEST_DATE_NEXT_DAY, - "forecast_parameters": "A2", - "a": "A2", - }, - **{i: el for i, el in enumerate(2 * test_next_date_samples)}, - }, - ] - ) - - # rows with negative values are those expected to be removed - # by filters in summarize - observed_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "value": [10, 20], - "a": ["A1", "A2"], - } - ) - - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - forecast_summarized_output, observed_summarized_output = ( - aggregate_forecast_observed( - forecast_df, - observed_df, - period="month", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - additional_aggregation_columns=["a"], - ) - ) - observed_summarized_expected_df = pd.DataFrame( - { - "submission_date": [ - pd.to_datetime(TEST_DATE), - pd.to_datetime(TEST_DATE), - ], - "value": [10, 20], - "a": ["A1", "A2"], - "aggregation_period": ["month", "month"], - } - ) - - # add values from observed because of overlap - forecast_summarized_expected_df = pd.DataFrame( - [ - { - "submission_date": pd.to_datetime(TEST_DATE), - "forecast_parameters": "A1", - "a": "A1", - "mean": np.mean(test_date_samples + test_next_date_samples + 10), - "p10": np.percentile( - test_date_samples + test_next_date_samples + 10, 10 - ), - "p50": np.percentile( - test_date_samples + test_next_date_samples + 10, 50 - ), - "p90": np.percentile( - test_date_samples + test_next_date_samples + 10, 90 - ), - "aggregation_period": "month", - }, - { - "submission_date": pd.to_datetime(TEST_DATE), - "forecast_parameters": "A2", - "a": "A2", - "mean": np.mean( - 2 * test_date_samples + 2 * test_next_date_samples + 20 - ), - "p10": np.percentile( - 2 * test_date_samples + 2 * test_next_date_samples + 20, 10 - ), - "p50": np.percentile( - 2 * test_date_samples + 2 * test_next_date_samples + 20, 50 - ), - "p90": np.percentile( - 2 * test_date_samples + 2 * test_next_date_samples + 20, 90 - ), - "aggregation_period": "month", - }, - ] - ) - - assert set(forecast_summarized_expected_df.columns) == set( - forecast_summarized_output.columns - ) - pd.testing.assert_frame_equal( - forecast_summarized_output[forecast_summarized_expected_df.columns] - .sort_values(["submission_date", "a"]) - .reset_index(drop=True), - 
forecast_summarized_expected_df.sort_values( - ["submission_date", "a"] - ).reset_index(drop=True), - ) - - assert set(observed_summarized_expected_df.columns) == set( - observed_summarized_output.columns - ) - pd.testing.assert_frame_equal( - observed_summarized_output[observed_summarized_expected_df.columns] - .sort_values(["submission_date", "a"]) - .reset_index(drop=True), - observed_summarized_expected_df.sort_values( - ["submission_date", "a"] - ).reset_index(drop=True), - ) - - -def test_combine_forecast_observed(): - """tests the combine_forecast_observed method""" - forecast_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "mean": [0, 0], - "p10": [0, 0], - "p50": [0, 0], - "p90": [0, 0], - "aggregation_period": ["I get removed"] * 2, - } - ) - - # rows with negative values are those expected to be removed - # by filters in summarize - observed_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "value": [10, 20], - "aggregation_period": ["I get removed"] * 2, - } - ) - - output_df = combine_forecast_observed(forecast_df, observed_df) - observed_expected_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "value": [10, 20], - "measure": ["observed", "observed"], - "source": ["historical", "historical"], - } - ) - - # 4x2 columns, 4 metrics (mean, p10, p50, p90) - forecast_expected_df = pd.DataFrame( + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected = pd.DataFrame( { + 0: [0, 2], "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, ], - "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], - "value": [0] * 8, - "source": ["forecast"] * 8, } ) - # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( - ["submission_date", "measure"] - ) - assert set(expected.columns) == set(output_df.columns) - - pd.testing.assert_frame_equal( - output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), - expected[output_df.columns].reset_index(drop=True), - ) + pd.testing.assert_frame_equal(out, expected) - assert not pd.isna(output_df).any(axis=None) + # test predict while we're here + forecast.dates_to_predict = dates_to_predict + forecast.number_of_simulations = 1 # so that _validate doesn't break + forecast.predict() -def test_summarize(): - """testing _summarize""" - test_date_samples = np.arange(1000) - test_next_date_samples = np.arange(1000) * 2 - forecast_df = pd.DataFrame( - [ - { - **{"submission_date": TEST_DATE}, - **{i: el for i, el in enumerate(test_date_samples)}, - }, - { - **{"submission_date": TEST_DATE_NEXT_DAY}, - **{i: el for i, el in enumerate(test_next_date_samples)}, - }, - ] - ) + out = forecast.forecast_df - # rows with negative values are those expected to be removed - # by filters in summarize - observed_df = pd.DataFrame( + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected = pd.DataFrame( { + 0: [0, 2], "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, ], - "value": [10, 20], } ) - numpy_aggregations = 
["mean"] - percentiles = [10, 50, 90] - output_df = summarize( - forecast_df, - observed_df, - periods=["day"], - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - forecast_parameters="", - ) - - observed_expected_df = pd.DataFrame( - { - "submission_date": [ - pd.to_datetime(TEST_DATE), - pd.to_datetime(TEST_DATE_NEXT_DAY), - ], - "value": [10, 20], - "measure": ["observed", "observed"], - "source": ["historical", "historical"], - "aggregation_period": ["day", "day"], - } - ) - - # add values from observed because of overlap - forecast_expected_df = pd.DataFrame( - [ - { - "submission_date": pd.to_datetime(TEST_DATE), - "measure": "mean", - "value": np.mean(test_date_samples + 10), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), - "measure": "mean", - "value": np.mean(test_next_date_samples + 20), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE), - "measure": "p10", - "value": np.percentile(test_date_samples + 10, 10), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), - "measure": "p10", - "value": np.percentile(test_next_date_samples + 20, 10), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE), - "measure": "p50", - "value": np.percentile(test_date_samples + 10, 50), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), - "measure": "p50", - "value": np.percentile(test_next_date_samples + 20, 50), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE), - "measure": "p90", - "value": np.percentile(test_date_samples + 10, 90), - "source": "forecast", - "aggregation_period": "day", - }, - { - "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), - "measure": "p90", - "value": np.percentile(test_next_date_samples + 20, 90), - "source": "forecast", - "aggregation_period": "day", - }, - ] - ) - - # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( - ["submission_date", "measure"] - ) - expected["aggregation_period"] = "day" - expected["forecast_parameters"] = "" - - assert set(expected.columns) == set(output_df.columns) - pd.testing.assert_frame_equal( - output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), - expected[output_df.columns].reset_index(drop=True), - ) - - assert not pd.isna(output_df).any(axis=None) + pd.testing.assert_frame_equal(out, expected) + assert forecast.predicted_at is not None def test_summarize_non_overlapping_day(): @@ -1112,12 +546,18 @@ def test_summarize_non_overlapping_day(): ) predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") + forecast = ProphetForecast( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) observed_submission_dates = pd.date_range( pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) ).date - predict_submission_dates = pd.date_range( - pd.to_datetime(predict_start_date), pd.to_datetime(predict_end_date) - ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values observed_df = pd.DataFrame( { @@ -1144,8 +584,8 @@ def test_summarize_non_overlapping_day(): 
dict(**{"submission_date": predict_submission_dates}, **forecast_data) ) - output_df = summarize( - forecast_df, observed_df, ["day"], ["mean", "median"], [50], "" + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "day", ["mean", "median"], [50] ) expected_observed_df = observed_df.copy() @@ -1190,9 +630,6 @@ def test_summarize_non_overlapping_day(): [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] ) - expected_df["aggregation_period"] = "day" - expected_df["forecast_parameters"] = "" - assert set(expected_df.columns) == set(output_df.columns) columns = expected_df.columns expected_df_compare = ( @@ -1224,12 +661,21 @@ def test_summarize_non_overlapping_month(): predict_start_date = "2124-04-01" predict_end_date = "2124-05-31" + print(observed_start_date, observed_end_date) + print(predict_start_date, predict_end_date) + + forecast = ProphetForecast( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) observed_submission_dates = pd.date_range( pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) ).date - predict_submission_dates = pd.date_range( - pd.to_datetime(predict_start_date), pd.to_datetime(predict_end_date) - ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values observed_df = pd.DataFrame( { @@ -1251,8 +697,8 @@ def test_summarize_non_overlapping_month(): dict(**{"submission_date": predict_submission_dates}, **forecast_data) ) - output_df = summarize( - forecast_df, observed_df, ["month"], ["mean", "median"], [50], "" + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "month", ["mean", "median"], [50] ) expected_observed_dates = sorted( @@ -1313,9 +759,6 @@ def test_summarize_non_overlapping_month(): [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] ) - expected_df["aggregation_period"] = "month" - expected_df["forecast_parameters"] = "" - assert set(expected_df.columns) == set(output_df.columns) columns = expected_df.columns expected_df_compare = ( @@ -1340,12 +783,19 @@ def test_summarize_overlapping_day(): predict_start_date = TEST_DATE_STR predict_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") + forecast = ProphetForecast( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) observed_submission_dates = pd.date_range( pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) ).date - predict_submission_dates = pd.date_range( - pd.to_datetime(predict_start_date), pd.to_datetime(predict_end_date) - ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + observed_df = pd.DataFrame( { "submission_date": observed_submission_dates, @@ -1371,8 +821,8 @@ def test_summarize_overlapping_day(): dict(**{"submission_date": predict_submission_dates}, **forecast_data) ) - output_df = summarize( - forecast_df, observed_df, ["day"], ["mean", "median"], [50], "" + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "day", ["mean", "median"], [50] ) expected_observed_df = observed_df.copy() @@ -1419,9 +869,6 @@ def test_summarize_overlapping_day(): [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] ) - expected_df["aggregation_period"] = "day" - expected_df["forecast_parameters"] = "" - assert set(expected_df.columns) == 
set(output_df.columns) columns = expected_df.columns expected_df_compare = ( @@ -1450,12 +897,19 @@ def test_summarize_overlapping_month(): predict_start_date = "2124-01-01" predict_end_date = "2124-02-28" + forecast = ProphetForecast( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) observed_submission_dates = pd.date_range( pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) ).date - predict_submission_dates = pd.date_range( - pd.to_datetime(predict_start_date), pd.to_datetime(predict_end_date) - ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + observed_df = pd.DataFrame( { "submission_date": observed_submission_dates, @@ -1481,8 +935,8 @@ def test_summarize_overlapping_month(): dict(**{"submission_date": predict_submission_dates}, **forecast_data) ) - output_df = summarize( - forecast_df, observed_df, ["month"], ["mean", "median"], [50], "" + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "month", ["mean", "median"], [50] ) expected_observed_dates = sorted( @@ -1549,9 +1003,6 @@ def test_summarize_overlapping_month(): [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] ) - expected_df["aggregation_period"] = "month" - expected_df["forecast_parameters"] = "" - assert set(expected_df.columns) == set(output_df.columns) columns = expected_df.columns expected_df_compare = ( @@ -1567,552 +1018,3 @@ def test_summarize_overlapping_month(): pd.testing.assert_frame_equal( expected_df_compare, output_df_compare, check_exact=False ) - - -def test_summarize_overlapping_month_and_day(): - # choose arbitrary year for the start and end dates - # the first date of two different months is chosen - # this is a simple way to check that the aggregation - # for multiple periods is working - observed_start_date = "2124-01-01" - observed_end_date = "2124-02-01" - - observed_submission_dates = [ - pd.to_datetime(observed_start_date), - pd.to_datetime(observed_end_date), - ] - predict_submission_dates = [ - pd.to_datetime(observed_start_date), - pd.to_datetime(observed_end_date), - ] - observed_df = pd.DataFrame( - { - "submission_date": observed_submission_dates, - "value": [1] * len(observed_submission_dates), - } - ) - - # there are the samples generated - # the mean and median are the aggregates used - test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) - test_mean = np.mean(test_samples) - test_median = np.median(test_samples) - - # mean and median scale with a factor - # so a factor is multiplied on to make sure the aggregation is working - # across rows properly - forecast_array = np.stack( - [test_samples] * len(predict_submission_dates), - axis=0, - ) - forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} - forecast_df = pd.DataFrame( - dict(**{"submission_date": predict_submission_dates}, **forecast_data) - ) - - output_df = summarize( - forecast_df, observed_df, ["month", "day"], ["mean", "median"], [50], "" - ) - - expected_observed_dates = sorted( - pd.to_datetime(observed_df["submission_date"].values) - .to_period("m") - .to_timestamp() - .unique() - ) - expected_observed_df = pd.DataFrame( - { - "submission_date": expected_observed_dates, - "source": ["historical", "historical"], - "measure": ["observed", "observed"], - "value": [1, 1], - } - ) - - forecast_observed_dates = sorted( - pd.to_datetime(forecast_df["submission_date"].values) - .to_period("m") - 
.to_timestamp() - .unique() - ) - - forecast_mean_df = pd.DataFrame( - { - "submission_date": forecast_observed_dates, - "source": ["forecast", "forecast"], - "measure": ["mean", "mean"], - "value": [ - test_mean + 1, - test_mean + 1, - ], - } - ) - - forecast_median_df = pd.DataFrame( - { - "submission_date": forecast_observed_dates, - "source": ["forecast", "forecast"], - "measure": ["median", "median"], - "value": [ - test_median + 1, - test_median + 1, - ], - } - ) - - forecast_p50_df = pd.DataFrame( - { - "submission_date": forecast_observed_dates, - "source": ["forecast", "forecast"], - "measure": ["p50", "p50"], - "value": [ - test_median + 1, - test_median + 1, - ], - } - ) - - expected_df = pd.concat( - [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] - ) - - expected_df["aggregation_period"] = "month" - - # day will have the same values because there are two days - # the first day of two different months - # only thing that changes is the aggregation_period - # and 01-31 becomes 01-01 - expected_df_day = expected_df.copy() - expected_df_day["aggregation_period"] = "day" - - expected_df = pd.concat([expected_df_day, expected_df]) - expected_df["forecast_parameters"] = "" - - assert set(expected_df.columns) == set(output_df.columns) - columns = expected_df.columns - expected_df_compare = ( - expected_df[columns] - .sort_values(["submission_date", "source", "measure", "aggregation_period"]) - .reset_index(drop=True) - ) - output_df_compare = ( - output_df[columns] - .sort_values(["submission_date", "source", "measure", "aggregation_period"]) - .reset_index(drop=True) - ) - pd.testing.assert_frame_equal( - expected_df_compare, output_df_compare, check_exact=False - ) - - -def test_add_regressors(forecast): - """test add regressors - test case for each element of regressor_list_raw is indicated in name""" - - # choose arbitrary dates for dates - # name indicates the relationship of the window - # to the timeframe of the data as defined in the ds - # column of df below - regressor_list_raw = [ - { - "name": "all_in", - "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", - }, - { - "name": "all_out", - "description": "it's all out", - "start_date": "2124-02-01", - "end_date": "2124-02-06", - }, - { - "name": "just_end", - "description": "just the second half", - "start_date": "2124-01-03", - "end_date": "2124-02-06", - }, - { - "name": "just_middle", - "description": "just the middle two", - "start_date": "2124-01-02", - "end_date": "2124-01-03", - }, - ] - - regressor_list = [ProphetRegressor(**r) for r in regressor_list_raw] - - df = pd.DataFrame( - { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-03").date(), - pd.to_datetime("2124-01-04").date(), - ], - } - ) - - output_df = forecast._add_regressors(df, regressors=regressor_list) - - expected_df = pd.DataFrame( - { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-03").date(), - pd.to_datetime("2124-01-04").date(), - ], - "all_in": [0, 0, 0, 0], - "all_out": [1, 1, 1, 1], - "just_end": [1, 1, 0, 0], - "just_middle": [1, 0, 0, 1], - } - ) - - assert set(output_df.columns) == set(expected_df.columns) - pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) - - -def test_add_regressors_partial(forecast): - """test add regressors when some fields aren't set - test case for each element of regressor_list_raw is indicated in 
name""" - - # choose arbitrary dates for dates - # name indicates the relationship of the window - # to the timeframe of the data as defined in the ds - # column of df below - regressor_list_raw = [ - { - "name": "just_end", - "description": "just the second half", - "start_date": "2124-01-03", - }, - { - "name": "just_start", - "description": "just the beginning", - "end_date": "2124-01-03", - }, - ] - - regressor_list = [ProphetRegressor(**r) for r in regressor_list_raw] - - df = pd.DataFrame( - { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-03").date(), - pd.to_datetime("2124-01-04").date(), - ], - } - ) - - output_df = forecast._add_regressors(df, regressors=regressor_list) - - expected_df = pd.DataFrame( - { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-03").date(), - pd.to_datetime("2124-01-04").date(), - ], - "just_end": [1, 1, 0, 0], - "just_start": [0, 0, 0, 1], - } - ) - - assert set(output_df.columns) == set(expected_df.columns) - pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) - - -def test_build_train_dataframe_no_regressors(forecast): - """test _build_train_dataframe with no regressors""" - # only the growth and regressors attributes matter for train_dataframe - # so they can be manually set here - regressor_list = [] - forecast.regressors = regressor_list - - observed_df = pd.DataFrame( - { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - - output_train_df = forecast._build_train_dataframe(observed_df) - expected_train_df = pd.DataFrame( - { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "ds": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - pd.testing.assert_frame_equal( - output_train_df.reset_index(drop=True), expected_train_df - ) - - # test again but with add_logistic_growth_cols set to true - forecast.growth = "logistic" - output_train_wlog_df = forecast._build_train_dataframe(observed_df) - expected_train_wlog_df = pd.DataFrame( - { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "ds": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "floor": [1.5, 1.5], - "cap": [6.0, 6.0], - } - ) - - assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) - pd.testing.assert_frame_equal( - output_train_wlog_df.reset_index(drop=True), - expected_train_wlog_df[output_train_wlog_df.columns], - ) - - -def test_build_train_dataframe(forecast): - """test _build_train_dataframe and include regressors""" - regressor_list = [ - { - "name": "all_in", - "description": "it's all in", - "start_date": TEST_DATE_STR, - "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), - }, - { - "name": "all_out", - "description": "it's all in", - "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), - }, - { - "name": "just_end", - "description": "just the second one", - "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), - }, - ] - # only the growth and regressors attributes matter for train_dataframe - # so they can be manually set here - forecast.regressors = [ProphetRegressor(**r) for r in regressor_list] - - observed_df = pd.DataFrame( - { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "submission_date": [ - TEST_DATE, - 
TEST_DATE_NEXT_DAY, - ], - } - ) - - output_train_df = forecast._build_train_dataframe(observed_df) - expected_train_df = pd.DataFrame( - { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "ds": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "all_in": [0, 0], - "all_out": [ - 1, - 1, - ], - "just_end": [1, 0], - } - ) - pd.testing.assert_frame_equal( - output_train_df.reset_index(drop=True), expected_train_df - ) - - # now with logistic growth set - forecast.growth = "logistic" - output_train_wlog_df = forecast._build_train_dataframe(observed_df) - expected_train_wlog_df = pd.DataFrame( - { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "ds": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], - "floor": [1.5, 1.5], - "cap": [6.0, 6.0], - } - ) - - assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) - pd.testing.assert_frame_equal( - output_train_wlog_df.reset_index(drop=True), - expected_train_wlog_df[output_train_wlog_df.columns], - ) - - -def test_build_predict_dataframe_no_regressors(forecast): - """test _build_predict with no regressors""" - # only the growth and regressors attributes matter for train_dataframe - # so they can be manually set here - regressor_list = [] - forecast.regressors = regressor_list - - # manually set trained_parameters, normally this would happen during training - forecast.logistic_growth_floor = -1.0 - forecast.logistic_growth_cap = 10.0 - - dates_to_predict = pd.DataFrame( - { - "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - - output_predict_df = forecast._build_predict_dataframe(dates_to_predict) - expected_predict_df = pd.DataFrame( - { - "ds": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - pd.testing.assert_frame_equal( - output_predict_df.reset_index(drop=True), expected_predict_df - ) - - # test against but with add_logistic_growth_cols set to true - forecast.growth = "logistic" - output_predict_wlog_df = forecast._build_predict_dataframe(dates_to_predict) - expected_predict_wlog_df = pd.DataFrame( - { - "ds": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], - "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], - } - ) - - assert set(output_predict_wlog_df.columns) == set(expected_predict_wlog_df.columns) - pd.testing.assert_frame_equal( - output_predict_wlog_df.reset_index(drop=True), - expected_predict_wlog_df[output_predict_wlog_df.columns], - ) - - -def test_build_predict_dataframe(forecast): - """test _build_predict_dataframe including regressors""" - regressor_list = [ - { - "name": "all_in", - "description": "it's all in", - "start_date": TEST_DATE_STR, - "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), - }, - { - "name": "all_out", - "description": "it's all in", - "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), - }, - { - "name": "just_end", - "description": "just the second one", - "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, 
days=6)).strftime( - "%Y-%m-%d" - ), - }, - ] - - # only the growth and regressors attributes matter for train_dataframe - # so they can be manually set here - forecast.regressors = [ProphetRegressor(**r) for r in regressor_list] - - # manually set trained_parameters, normally this would happen during training - forecast.logistic_growth_floor = -1.0 - forecast.logistic_growth_cap = 10.0 - - dates_to_predict = pd.DataFrame( - { - "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], - } - ) - - output_train_df = forecast._build_predict_dataframe(dates_to_predict) - expected_train_df = pd.DataFrame( - { - "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], - } - ) - pd.testing.assert_frame_equal( - output_train_df.reset_index(drop=True), expected_train_df - ) - - # test again but with add_logistic_growth_cols set to true - forecast.growth = "logistic" - output_train_wlog_df = forecast._build_predict_dataframe(dates_to_predict) - expected_train_wlog_df = pd.DataFrame( - { - "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], - "floor": [-1.0, -1.0], - "cap": [10.0, 10.0], - } - ) - - assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) - pd.testing.assert_frame_equal( - output_train_wlog_df.reset_index(drop=True), - expected_train_wlog_df[output_train_wlog_df.columns], - ) From 617c2da8d70494a58bd3427f172a08f6b7f8f337 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 9 Oct 2024 11:56:11 -0500 Subject: [PATCH 8/8] Revert "Revert "Implement SKLearn interface (#272)" (#284)" This reverts commit 69d120ab0bc12451ce101f69762db1e3ccead77c. --- jobs/kpi-forecasting/README.md | 84 +- jobs/kpi-forecasting/kpi_forecasting.py | 37 +- .../kpi_forecasting/configs/dau_desktop.yaml | 12 +- .../kpi_forecasting/configs/dau_mobile.yaml | 11 +- .../configs/search_forecasting_ad_clicks.yaml | 70 +- ...search_forecasting_daily_active_users.yaml | 64 +- .../search_forecasting_search_count.yaml | 62 +- .../kpi_forecasting/models/base_forecast.py | 370 +-- .../kpi_forecasting/models/funnel_forecast.py | 976 +++---- .../models/prophet_forecast.py | 941 ++++--- .../kpi_forecasting/pandas_extras.py | 33 +- .../tests/test_base_forecast.py | 651 +++-- .../tests/test_funnel_forecast.py | 2344 ++++++++--------- .../tests/test_pandas_extras.py | 158 ++ .../tests/test_prophet_forecast.py | 1794 ++++++++++--- 15 files changed, 4463 insertions(+), 3144 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index 9ef09687..cffc3299 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -1,6 +1,11 @@ # KPI and other Metric Forecasting -This job forecasts [Metric Hub](https://mozilla.acryl.io/glossaryNode/urn:li:glossaryNode:Metric%20Hub/Contents?is_lineage_mode=false) metrics based on YAML configs defined in `.kpi-forecasting/configs`. +This job forecasts [Metric Hub](https://mozilla.acryl.io/glossaryNode/urn:li:glossaryNode:Metric%20Hub/Contents?is_lineage_mode=false) metrics based on YAML configs defined in `.kpi-forecasting/configs`. The output destinations in BigQuery for each config can be found in the `write_results` section. Note that different configs can write to the same table. + +Currently the forecasts are all done by Prophet. 
There are two classes used: + - `models/prophet_forecast.py/ProphetForecast` Fits a single prophet model on the entire dataset, configured as specified in the config file + - `models/funnel_forecast.py/FunnelForecast` Fits multiple models based on what segment they fall into. Segments are defined in the `metric_hub.segments` in which columns in the data are specified to used for segmentation. The data is partitioned into subsets based on all the different combinations of values the specified columns can take. A subset of the parameters can be used to specify parameters for partitions with specific values on those parameters. For funnel forecast, the `parameters` section of the config is a list, each element of which specifies configuration to be applied to partitions where the columns and values within those columns have the values of the keys and values of the `parameters.segement` fields respectively. The segmentation functionality is defined in `models/base_forecast.py/BaseEnsembleForecast`. Additionally, funnel forecast has automatic hyperparameter tuning which is implemented by `models/funnel_forecast.py/ProphetAutotunerForecast`. + # Usage @@ -38,16 +43,11 @@ Note that if the code changes, `docker compose build` needs to be re-run for `do ## Local Python ### Setup -You can also run the code outside of a Docker container. The code below creates a new Conda environment called `kpi-forecasting-dev`. -It assumes you have Conda installed. If you'd like to run the code in a Jupyter notebook, it is handy to install Jupyter in your `base` environment. -The `ipykernel` commands below will ensure that the `kpi-forecasting-dev` environment is made available to Jupyter. - +You can also run the code outside of a Docker container. The code below shows to create a new environment ```sh -conda create --name kpi-forecasting-dev python=3.10 pip ipykernel -conda activate kpi-forecasting-dev -ipython kernel install --name kpi-forecasting-dev --user +pyenv virtualenv 3.9.17 +pyenv activate pip install -r requirements.txt -conda deactivate ``` If you're running on an M1 Mac, there are [currently some additional steps](https://github.com/facebook/prophet/issues/2250#issuecomment-1317709209) that you'll need to take to get Prophet running. From within your python environment, run the following (making sure to update the path appropriately): @@ -107,49 +107,47 @@ metric_hub: # this configures the observed data fed to the model which is obtai partner: "partner" where: "partner = 'Google'" # filter to apply to the metric hub pull -forecast_model: # this section configures the model - model_type: "funnel" - # type of model object to use, current options are "funnel" for FunnelForecast and "prophet" for ProphetForecast - start_date: NULL +forecast_model: # this section configures the model + forecast_start: NULL # starting date for the predicted data (unless predict_historical_dates is set), # if unset, value depends on predict_historical_dates. - end_date: NULL + forecast_end: NULL # final date for the predicted data - use_all_us_holidays: False - For prophet-based models, when true, call `model.add_country_holidays(country_name="US")` on model predict_historical_dates: True # if predict_historical_dates is True, set to first date of the observed data # if predict_historical_dates is False, defaults to the day after the last day in the observed data - number_of_simulations: 1000 - # for prophet-based models,number of simulations to run parameters: # this section can be a map or a list. 
# If it's a map, these parameters are used for all models # (recall multiple models are train if there is a metric_hub.segments) # If it's a list, it will set different parameters # for different subsets of the parition specified in `metric_hub.segments`. - - segment: - # specifies which subset of the partitions this applies to - # key is a column specified in metric_hub.segments - # value is a value that column can take to which the configuration is applied + - segment: + # specifies which subset of the partitions this applies to + # key is a column specified in metric_hub.segments + # value is a value that column can take to which the configuration is applied device: desktop - start_date: "2018-01-01" # only applies to FunnelForecast, allows one to set start date for each sub-model - end_date: NULL # only applies to FunnelForecast, allows one to set end date for each sub-model - holidays: ["easter", "covid_sip11"] # holidays specified in `configs.model_inputs.holidays` to use. - regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] # regressors specified in `configs.model_inputs.regressors` - grid_parameters: - # sets grid for hyperparameter tuning - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] # parameter of prior distribution controlling how much the trend fluctuates at changepoints - changepoint_range: [0.8, 0.9] # the proportion of the time series over which the changepoints are distributed - n_changepoints: [25, 50] # number of trend changepoints, equally spaced over the time series - weekly_seasonality: True # if weekly seasonality is included in the model - yearly_seasonality: True # if yearly seasonality is included in the model - cv_settings: - # sets parameters for prophet cross-validation used in FunnelForecast - initial: "1296 days" # the initial training period, used to train the first iteration of the model for CV - period: "30 days" # spacing between cutoff dates, the sliding window over which each round of cross validation is performed - horizon: "30 days" # forecast horizon used to make predictions and calculate model fit metrics for optimization - parallel: "processes" # how parallelization is performed by Prophet, or None if no paralellization is used + start_date: "2018-01-01" + # start date specific to a segment, superceeds + # forecast_start_date + parameters: + holidays: ["easter", "covid_sip11"] + # holidays specified in `configs.model_inputs.holidays` to use. + regressors: ["post_esr_migration", "in_covid"] + # regressors specified in `configs.model_inputs.regressors` + use_all_us_holidays: False + grid_parameters: + # sets grid for hyperparameter tuning + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + # sets parameters for prophet cross-validation used in FunnelForecast + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" ... summarize: @@ -174,13 +172,9 @@ write_results: - `./kpi_forecasting/models` contains the forecasting models. This repo was designed to make it simple to add new forecasting models in the future. In general, a model needs to inherit -the `models.base_forecast.BaseForecast` class and to implement the `_fit` and `_predict` methods. Output from the `_fit` method will automatically be validated by `BaseForecast._validate_forecast_df`. - -One caveat is that, in order for aggregations over time periods to work (e.g. 
monthly forecasts), the `_predict` method must generate a number -of simulated timeseries. This enables the measurement of variation across a range of possible outcomes. This number is set by `BaseForecast.number_of_simulations`. +the `models.base_forecast.BaseForecast` class and to implement the `fit` and `predict` methods. When testing locally, be sure to modify any config files to use non-production `project` and `dataset` values that you have write access to; otherwise the `write_output` step will fail. -<<<<<<< HEAD ## Interface The forecast objects in this repo implement an interface similar to `sklearn` or `darts`. Every forecast method should have a `fit` method for fitting the forecast and `predict` method for making predictions. The signature of these functions can be seen in `models.base_forecast.BaseForecast`. @@ -192,5 +186,3 @@ Before merging, run the pipeline with the `--no-write` flag to ensure it runs en `python ./kpi_forecasting.py --no-write -c ./kpi_forecasting/configs/dau_mobile.yaml` -======= ->>>>>>> 73e76df (Revert "Implement SKLearn interface (#272)" (#284)) diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index 645f714e..a5ee32ce 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -1,13 +1,19 @@ -<<<<<<< HEAD import pandas as pd from datetime import datetime, timezone, timedelta import json -======= ->>>>>>> 73e76df (Revert "Implement SKLearn interface (#272)" (#284)) from kpi_forecasting.inputs import CLI, load_yaml -from kpi_forecasting.models.prophet_forecast import ProphetForecast -from kpi_forecasting.models.funnel_forecast import FunnelForecast +from kpi_forecasting.models.prophet_forecast import ( + ProphetForecast, + summarize as prophet_summarize, + write_results as prophet_write_results, + summarize_legacy as prophet_summarize_legacy, +) +from kpi_forecasting.models.funnel_forecast import ( + FunnelForecast, + summarize as funnel_summarize, + write_results as funnel_write_results, +) from kpi_forecasting.metric_hub import MetricHub @@ -18,7 +24,6 @@ } -<<<<<<< HEAD class KPIPipeline: def __init__(self, config_path): self.config_data = load_yaml(filepath=config_path) @@ -155,18 +160,9 @@ def main() -> None: # Load the config config_path = CLI().args.config will_write = CLI().args.write -======= -def main() -> None: - # Load the config - config = load_yaml(filepath=CLI().args.config) - model_type = config["forecast_model"]["model_type"] ->>>>>>> 73e76df (Revert "Implement SKLearn interface (#272)" (#284)) - if model_type in MODELS: - metric_hub = MetricHub(**config["metric_hub"]) - model = MODELS[model_type](metric_hub=metric_hub, **config["forecast_model"]) + pipeline = KPIPipeline(config_path) -<<<<<<< HEAD observed_df = pipeline.get_historical_data() fit_model = pipeline.fit(observed_df=observed_df) predict_dates = pipeline.get_predict_dates(observed_df) @@ -175,15 +171,6 @@ def main() -> None: ) if will_write: pipeline.write_results(fit_model, summarized, predict_dates.copy()) -======= - model.fit() - model.predict() - model.summarize(**config["summarize"]) - model.write_results(**config["write_results"]) - - else: - raise ValueError(f"Don't know how to forecast using {model_type}.") ->>>>>>> 73e76df (Revert "Implement SKLearn interface (#272)" (#284)) if __name__ == "__main__": diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml index 0b8966f2..3476302c 100644 --- 
a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml @@ -1,4 +1,5 @@ --- +model_type: prophet metric_hub: app_name: "firefox_desktop" slug: "daily_active_users_v2" @@ -7,17 +8,16 @@ metric_hub: end_date: NULL forecast_model: - model_type: "prophet" - start_date: NULL - end_date: NULL - use_all_us_holidays: False + forecast_start: NULL + forecast_end: NULL predict_historical_dates: False - number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.00825 changepoint_prior_scale: 0.15983 weekly_seasonality: True yearly_seasonality: True + use_all_us_holidays: False + summarize: periods: ["day", "month"] @@ -30,3 +30,5 @@ write_results: dataset: "telemetry_derived" dataset_legacy: "telemetry_derived" table: "kpi_forecasts_v0" + forecast_table_legacy: "kpi_automated_forecast_v1" + confidences_table_legacy: "kpi_automated_forecast_confidences_v1" diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml index c9288408..5ebd1686 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml @@ -1,4 +1,5 @@ --- +model_type: prophet metric_hub: app_name: "multi_product" slug: "mobile_daily_active_users_v1" @@ -7,17 +8,15 @@ metric_hub: end_date: NULL forecast_model: - model_type: "prophet" - start_date: NULL - end_date: NULL - use_all_us_holidays: True + forecast_start: NULL + forecast_end: NULL predict_historical_dates: False - number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.01 changepoint_prior_scale: 0.01 weekly_seasonality: True yearly_seasonality: True + use_all_us_holidays: True summarize: periods: ["day", "month"] @@ -30,3 +29,5 @@ write_results: dataset: "telemetry_derived" dataset_legacy: "telemetry_derived" table: "kpi_forecasts_v0" + forecast_table_legacy: "kpi_automated_forecast_v1" + confidences_table_legacy: "kpi_automated_forecast_confidences_v1" diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml index 7a01aa15..a08efd49 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml @@ -1,4 +1,5 @@ --- +model_type: funnel metric_hub: app_name: "multi_product" slug: "search_forecasting_ad_clicks" @@ -13,48 +14,47 @@ metric_hub: where: "partner = 'Google'" forecast_model: - model_type: "funnel" - start_date: NULL - end_date: NULL - use_all_us_holidays: False + forecast_start: NULL + forecast_end: NULL predict_historical_dates: True - number_of_simulations: 1000 parameters: - - segment: + - segment: device: desktop start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - n_changepoints: [25, 50] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + parameters: + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] + use_all_us_holidays: False + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + 
n_changepoints: [25, 50] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" - segment: device: mobile start_date: "2022-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [.01, .1, .15, .2] - changepoint_range: [0.8, 0.9, 1] - n_changepoints: [30] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + parameters: + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + use_all_us_holidays: False + grid_parameters: + changepoint_prior_scale: [.01, .1, .15, .2] + changepoint_range: [0.8, 0.9] + n_changepoints: [30] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml index dfb7bb49..e87472c2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml @@ -1,4 +1,5 @@ --- +model_type: funnel metric_hub: app_name: "multi_product" slug: "search_forecasting_daily_active_users" @@ -13,45 +14,44 @@ metric_hub: where: "partner = 'Google'" forecast_model: - model_type: "funnel" - start_date: NULL - end_date: NULL - use_all_us_holidays: False + forecast_start: NULL + forecast_end: NULL predict_historical_dates: True - number_of_simulations: 1000 parameters: - segment: device: desktop start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + parameters: + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + use_all_us_holidays: False + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" - segment: device: mobile start_date: "2021-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + parameters: + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + use_all_us_holidays: False + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] @@ -62,4 +62,4 @@ write_results: project: "moz-fx-data-shared-prod" dataset: "search_derived" table: 
"search_funnel_forecasts_v1" - components_table: "search_forecast_model_components_v1" + components_table: "search_forecast_model_components_v1" \ No newline at end of file diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml index 17431247..b1213874 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml @@ -1,4 +1,5 @@ --- +model_type: funnel metric_hub: app_name: "multi_product" slug: "search_forecasting_search_count" @@ -13,45 +14,44 @@ metric_hub: where: "partner = 'Google'" forecast_model: - model_type: "funnel" - start_date: NULL - end_date: NULL - use_all_us_holidays: False + forecast_start: NULL + forecast_end: NULL predict_historical_dates: True - number_of_simulations: 1000 parameters: - segment: device: desktop start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + parameters: + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + use_all_us_holidays: False + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" - segment: device: mobile start_date: "2020-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + parameters: + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + use_all_us_holidays: False + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 896051f8..a76db106 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -1,88 +1,30 @@ import json -import numpy as np import pandas as pd import abc +from dataclasses import dataclass +from typing import List +import logging -from dataclasses import dataclass -from datetime import datetime, timedelta, timezone -from kpi_forecasting.metric_hub import MetricHub -from typing import Dict, List +logger = logging.getLogger("cmdstanpy") +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.CRITICAL) @dataclass class BaseForecast(abc.ABC): """ - A base class for fitting, forecasting, and summarizing forecasts. This class - should not be invoked directly; it should be inherited by a child class. 
The - child class needs to implement `_fit` and `_forecast` methods in order to work. - - Args: - model_type (str): The name of the forecasting model that's being used. - parameters (Dict): Parameters that should be passed to the forecasting model. - use_all_us_holidays (bool): Whether or not the forecasting model should use holidays. - The base model does not apply holiday logic; that logic needs to be built - in the child class. - start_date (str): A 'YYYY-MM-DD' formatted-string that specifies the first - date that should be forecsted. - end_date (str): A 'YYYY-MM-DD' formatted-string that specifies the last - date the metric should be queried. - metric_hub (MetricHub): A MetricHub object that provides details about the - metric to be forecasted. - predict_historical_dates (bool): If True, forecast starts at the first - date in the observed data. If False, it uses the value of start_date it set - and the first day after the observed data ends otherwise + Abstract Base class for forecast objects """ - model_type: str - parameters: Dict - use_all_us_holidays: bool - start_date: str - end_date: str - metric_hub: MetricHub - predict_historical_dates: bool = False - - def _get_observed_data(self): - if self.metric_hub: - # the columns in this dataframe - # are "value" for the metric, submission_date - # and any segments where the column name - # is the name of the segment - self.observed_df = self.metric_hub.fetch() - - def __post_init__(self) -> None: - # fetch observed observed data - self.collected_at = datetime.now(timezone.utc).replace(tzinfo=None) - self._get_observed_data() - - # raise an error is predict_historical_dates is True and start_date is set - if self.start_date and self.predict_historical_dates: - raise ValueError( - "forecast start_date set while predict_historical_dates is True" - ) - # use default start/end dates if the user doesn't specify them - self.start_date = pd.to_datetime(self.start_date or self._default_start_date) - self.end_date = pd.to_datetime(self.end_date or self._default_end_date) - self.dates_to_predict = pd.DataFrame( - {"submission_date": pd.date_range(self.start_date, self.end_date).date} - ) - - # initialize unset attributes - self.model = None - self.forecast_df = None - self.summary_df = None - - # metadata - self.metadata_params = json.dumps( - { - "model_type": self.model_type.lower(), - "model_params": self.parameters, - "use_all_us_holidays": self.use_all_us_holidays, - } - ) + @abc.abstractmethod + def _set_seed(self) -> None: + """Set random seed to ensure that fits and predictions are reproducible.""" + return NotImplementedError @abc.abstractmethod - def _fit(self, observed_df: pd.DataFrame) -> None: + def fit(self, observed_df: pd.DataFrame) -> object: """Fit a forecasting model using `observed_df.` This will typically be the data that was generated using Metric Hub in `__post_init__`. @@ -90,11 +32,13 @@ def _fit(self, observed_df: pd.DataFrame) -> None: Args: observed_df (pd.DataFrame): observed data used to fit the model + + Returns: self """ raise NotImplementedError @abc.abstractmethod - def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + def predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: """Forecast using `self.model` on dates in `dates_to_predict`. This method should return a dataframe that will be validated by `_validate_forecast_df`. 
@@ -115,116 +59,206 @@ def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: forecast_df (pd.DataFrame): dataframe produced by `_predict`""" raise NotImplementedError - @abc.abstractmethod - def _summarize( - self, - forecast_df: pd.DataFrame, - observed_df: pd.DataFrame, - period: str, - numpy_aggregations: List[str], - percentiles: List[int], - ) -> pd.DataFrame: - """Calculate summary metrics for `forecast_df` over a given period, and - add metadata. - Args: - forecast_df (pd.DataFrame): forecast dataframe created by `predict` - observed_df (pd.DataFrame): observed data used to generate prediction - period (str): aggregation period up to which metrics are aggregated - numpy_aggregations (List[str]): List of numpy aggregation names - percentiles (List[int]): List of percentiles to aggregate up to +@dataclass +class BaseEnsembleForecast: + """ + A base class for forecasts that partition the data using the segments parameter + and fit a different model to each segment. The type of model used is the same for + all segments and is set with the model_class attribute - Returns: - pd.DataFrame: dataframe containing metrics listed in numpy_aggregations - and percentiles + Args: + parameters (Dict): Parameters that should be passed to the forecasting model. + model_class: Class to use to construct an ensemble + segments: segments from the metric hub data pull + """ + + parameters: List + model_class: object = BaseForecast + segments: dict = None + + def __post_init__(self) -> None: + # metadata + self.model_type = self.model_class.__class__.__name__.lower().replace( + "Forecast", "" + ) + self.metadata_params = json.dumps( + { + "model_type": self.model_type, + "model_params": self.parameters, + } + ) + + def _set_segment_models(self, observed_df: pd.DataFrame) -> None: + """Creates an element in the segment_models attribute for each segment specified in the + metric_hub.segments section of the config. It is populated from the list of + parameters in the forecast_model.parameters section of the configuration file. + The segements section of each element of the list specifies which values within which + segments the parameters are associated with. 
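As a rough illustration of the partition matching done by `_set_segment_models`, the snippet below walks a hypothetical `parameters` list (shaped like the `forecast_model.parameters` section of a funnel config) and selects the entry whose `segment` keys match a given segment combination. The concrete dictionaries are made-up examples, not values from the real configs.

```python
# Hypothetical config: one entry per partition, keyed by the "device" column.
parameters = [
    {"segment": {"device": "desktop"}, "start_date": "2018-01-01"},
    {"segment": {"device": "mobile"}, "start_date": "2022-01-01"},
]
split_dims = {"device"}  # the segment keys used for partitioning

# One combination observed in the data; extra segment columns are ignored
# when matching because only split_dims participate in the comparison.
segment = {"device": "desktop", "country": "US"}

selected_partition = None
for partition in parameters:
    subset_segment = {key: val for key, val in segment.items() if key in split_dims}
    if partition["segment"] == subset_segment:
        selected_partition = partition.copy()
        break

if selected_partition is None:
    raise ValueError("Partition not Found")

print(selected_partition)
```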
+ + Args: + observed_df (pd.DataFrame): dataframe containing observed data used to model + must contain columns specified in the keys of the segments section of the config """ - raise NotImplementedError - @property - def _default_start_date(self) -> str: - """The first day after the last date in the observed dataset.""" - if self.predict_historical_dates: - return self.observed_df["submission_date"].min() - else: - return self.observed_df["submission_date"].max() + timedelta(days=1) - - @property - def _default_end_date(self) -> str: - """78 weeks (18 months) ahead of the current UTC date.""" - return ( - datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) - ).date() + # Construct a DataFrame containing all combination of segment x + ## in the observed_df + combination_df = observed_df[self.segments].drop_duplicates() + + # Construct dictionaries from those combinations + # this will be used to check that the config actually partitions the data + segment_combinations = combination_df.to_dict("records") + + # get subset of segment that is used in partitioning + split_dims = None + for partition in self.parameters: + partition_dim = set(partition["segment"].keys()) + if split_dims and partition_dim != split_dims: + raise ValueError( + "Segment keys are not the same across different elements of parameters in the config file" + ) + elif split_dims is None: + split_dims = partition_dim + else: + # this is case where split_dim is set and matches paritition_dim + continue + if not split_dims <= set(combination_df.keys()): + missing_dims = split_dims - set(combination_df.keys()) + missing_dims_str = ",".join(missing_dims) + raise ValueError( + f"Segment keys missing from metric hub segments: {missing_dims_str}" + ) - def _set_seed(self) -> None: - """Set random seed to ensure that fits and predictions are reproducible.""" - np.random.seed(42) + # For each segment combinination, get the model parameters from the config + ## file. Parse the holidays and regressors specified in the config file. + segment_models = [] + for segment in segment_combinations: + # find the correct configuration + for partition in self.parameters: + partition_segment = partition["segment"] + selected_partition = None + # get subset of segment that is used to partition + subset_segment = { + key: val for key, val in segment.items() if key in split_dims + } + if partition_segment == subset_segment: + selected_partition = partition.copy() + break + if selected_partition is None: + raise ValueError("Partition not Found") + selected_partition["segment"] = segment + + if "start_date" in selected_partition: + start_date = pd.to_datetime(selected_partition["start_date"]).date() + else: + start_date = None + + # Create a FunnelSegmentModelSettings object for each segment combination + segment_models.append( + { + "model": self.model_class(**selected_partition["parameters"]), + "segment": segment, + "start_date": start_date, + } + ) + self.segment_models = segment_models - def fit(self) -> None: - """Fit a model using historic metric data provided by `metric_hub`.""" - print(f"Fitting {self.model_type} model.", flush=True) - self._set_seed() - self.trained_at = datetime.now(timezone.utc).replace(tzinfo=None) - self._fit(self.observed_df) - - def predict(self) -> None: - """Generate a forecast from `start_date` to `end_date`. 
- Result is set to `self.forecast_df`""" - print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) - self._set_seed() - self.predicted_at = datetime.now(timezone.utc).replace(tzinfo=None) - self.forecast_df = self._predict(self.dates_to_predict) - self._validate_forecast_df(self.forecast_df) - - def summarize( - self, - periods: List[str] = ["day", "month"], - numpy_aggregations: List[str] = ["mean"], - percentiles: List[int] = [10, 50, 90], + def filter_data_to_segment( + self, df: pd.DataFrame, segment: dict, start_date: str ) -> pd.DataFrame: + """function to filter data to the segment set in segment + and in time to only dates on or after start_date + + Args: + df (pd.DataFrame): dataframe to filter + segment (dict): dictionary where keys are columns and values + are the value that column takes for that segment + start_date (str): filter df so that the earliest date is start_date + + Returns: + pd.DataFrame: filtered dataframe """ - Calculate summary metrics for `forecast_df` and add metadata. - The dataframe returned here will be reported in Big Query when - `write_results` is called. + column_matches_segment = df[list(segment)] == pd.Series(segment) + row_in_segment = column_matches_segment.all(axis=1) + filter_array = row_in_segment + if start_date: + row_after_start = df["submission_date"] >= start_date + filter_array &= row_after_start + return df.loc[filter_array] + + def get_filtered_observed_data(self, observed_df: pd.DataFrame) -> pd.DataFrame: + """Returns the observed dataframe with time filters applied + to each segments data Args: - periods (List[str]): A list of the time periods that the data should be aggregated and - summarized by. For example ["day", "month"] - numpy_aggregations (List[str]): A list of numpy methods (represented as strings) that can - be applied to summarize numeric values in a numpy dataframe. For example, ["mean"]. - percentiles (List[int]): A list of integers representing the percentiles that should be reported - in the summary. For example [50] would calculate the 50th percentile (i.e. the median). 
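The row-matching idiom used by `filter_data_to_segment` can be tried in isolation. The sketch below uses a small made-up dataframe to show how comparing the segment columns against a `pd.Series` built from the segment dict selects only the matching rows, with an optional start-date filter applied on top.

```python
import pandas as pd

# Small made-up dataframe with a "device" segment column.
df = pd.DataFrame(
    {
        "submission_date": pd.to_datetime(
            ["2024-01-01", "2024-01-02", "2024-01-01", "2024-01-02"]
        ).date,
        "device": ["desktop", "desktop", "mobile", "mobile"],
        "value": [100, 110, 40, 42],
    }
)

segment = {"device": "desktop"}
start_date = pd.to_datetime("2024-01-02").date()

# Comparing the segment columns against a Series keyed by the same names gives a
# boolean frame; all(axis=1) keeps rows that match every key/value in the segment.
column_matches_segment = df[list(segment)] == pd.Series(segment)
row_in_segment = column_matches_segment.all(axis=1)

# Optionally also drop rows before the segment-specific start date.
row_after_start = df["submission_date"] >= start_date
print(df.loc[row_in_segment & row_after_start])
```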
+ observed_df (pd.DataFrame): full observed dataframe Returns: - pd.DataFrame: metric dataframe for all metrics and aggregations + pd.DataFrame: filtered observed dataframe """ - summary_df = pd.concat( - [ - self._summarize( - self.forecast_df, - self.observed_df, - i, - numpy_aggregations, - percentiles, - ) - for i in periods - ] - ) + observed_df_list = [] + for segment_model in self.segment_models: + observed_subset = self.filter_data_to_segment( + observed_df, segment_model["segment"], segment_model["start_date"] + ) + observed_df_list.append(observed_subset) + return pd.concat(observed_df_list) - # add Metric Hub metadata columns - summary_df["metric_alias"] = self.metric_hub.alias.lower() - summary_df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - summary_df["metric_hub_slug"] = self.metric_hub.slug.lower() - summary_df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - summary_df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - summary_df["metric_collected_at"] = self.collected_at + def fit(self, observed_df: pd.DataFrame) -> None: + """Fit models across all segments for the data in observed_df - # add forecast model metadata columns - summary_df["forecast_start_date"] = self.start_date - summary_df["forecast_end_date"] = self.end_date - summary_df["forecast_trained_at"] = self.trained_at - summary_df["forecast_predicted_at"] = self.predicted_at - summary_df["forecast_parameters"] = self.metadata_params + Args: + observed_df (pd.DataFrame): data used to fit + """ + print(f"Fitting {self.model_type} model.", flush=True) + # create list of models depending on whether there are segments or not + self._set_segment_models(observed_df) + for segment_model in self.segment_models: + print(segment_model["segment"]) + model = segment_model["model"] + model._set_seed() + observed_subset = self.filter_data_to_segment( + observed_df, segment_model["segment"], segment_model["start_date"] + ) + model.fit(observed_subset) + return self + + def predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + """Generates a prediction for each segment for the dates in dates_to_predict - self.summary_df = summary_df + Args: + dates_to_predict (pd.DataFrame): dataframe with a single column, + submission_date that is a string in `%Y-%m-%d` format - return self.summary_df + Returns: + pd.DataFrame: prediction across all segments + """ + start_date = dates_to_predict["submission_date"].iloc[0] + end_date = dates_to_predict["submission_date"].iloc[-1] + + print(f"Forecasting from {start_date} to {end_date}.", flush=True) + for segment_model in self.segment_models: + config_start_date = segment_model["start_date"] + + if config_start_date and config_start_date > start_date: + dates_to_predict_segment = ( + dates_to_predict[ + dates_to_predict["submission_date"] >= config_start_date + ] + .reset_index(drop=True) + .copy() + ) + else: + dates_to_predict_segment = dates_to_predict.copy() + + model = segment_model["model"] + model._set_seed() + predict_df = model.predict(dates_to_predict_segment) + + # add segments on as columns + for column, value in segment_model["segment"].items(): + predict_df[column] = value + predict_df["forecast_parameters"] = json.dumps(model._get_parameters()) + segment_model["forecast"] = predict_df + self.forecast_list = [x["forecast"] for x in self.segment_models] + return pd.concat(self.forecast_list) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py 
b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 3c06863c..a9c9998b 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -1,370 +1,48 @@ from dataclasses import dataclass, field -from datetime import datetime import itertools +from typing import List import json -from typing import Dict, List, Union from google.cloud import bigquery from google.cloud.bigquery.enums import SqlTypeNames as bq_types import numpy as np import pandas as pd from pandas.api import types as pd_types -import prophet from prophet.diagnostics import cross_validation -from kpi_forecasting.configs.model_inputs import ( - ProphetHoliday, - ProphetRegressor, - holiday_collection, - regressor_collection, +from kpi_forecasting.models.prophet_forecast import ( + ProphetForecast, + aggregate_forecast_observed, ) -from kpi_forecasting.models.prophet_forecast import ProphetForecast +from kpi_forecasting.models.base_forecast import BaseEnsembleForecast @dataclass -class SegmentModelSettings: - """ - Holds the configuration and results for each segment - in a funnel forecasting model. - """ - - segment: Dict[str, str] - start_date: str - end_date: str - grid_parameters: Dict[str, Union[List[float], float]] - cv_settings: Dict[str, str] - holidays: list = field(default_factory=list[ProphetHoliday]) - regressors: list = field(default_factory=list[ProphetRegressor]) - - # Hold results as models are trained and forecasts made - segment_model: prophet.Prophet = None - trained_parameters: dict = field(default_factory=dict[str, str]) - forecast_df: pd.DataFrame = None - components_df: pd.DataFrame = None - - -@dataclass -class FunnelForecast(ProphetForecast): - """ - FunnelForecast class for generating and managing forecast models. The class handles - cases where forecasts for a combination of dimensions are required for a metric. - - Inherits from BaseForecast and provides methods for initializing forecast - parameters, building models, generating forecasts, summarizing results, - and writing results to BigQuery. - """ - - def __post_init__(self) -> None: - """ - Post-initialization method to set up necessary attributes and configurations. - - This method sets up the dates to predict, constructs segment combinations, - initializes models for each segment, and prepares attributes for storing results. - """ - super().__post_init__() - - if self.metric_hub is None: - # this is used to avoid the code below for testing purposes - return - - self._set_segment_models(self.observed_df, self.metric_hub.segments.keys()) - - # initialize unset attributes - self.components_df = None - - def _set_segment_models( - self, observed_df: pd.DataFrame, segment_column_list: list - ) -> None: - """Creates a SegmentSettings object for each segment specified in the - metric_hub.segments section of the config. It is populated from the list of - parameters in the forecast_model.parameters section of the configuration file. - The segements section of each element of the list specifies which values within which - segments the parameters are associated with. 
- - Args: - observed_df (pd.DataFrame): dataframe containing observed data used to model - must contain columns specified in the keys of the segments section of the config - segment_column_list (list): list of columns of observed_df to use to determine segments - """ - # Construct a DataFrame containing all combination of segment values - ## in the observed_df - combination_df = observed_df[segment_column_list].drop_duplicates() - - # Construct dictionaries from those combinations - # this will be used to check that the config actually partitions the data - segment_combinations = combination_df.to_dict("records") - - # get subset of segment that is used in partitioning - split_dims = None - for partition in self.parameters: - partition_dim = set(partition["segment"].keys()) - if split_dims and partition_dim != split_dims: - raise ValueError( - "Segment keys are not the same across different elements of parameters in the config file" - ) - elif split_dims is None: - split_dims = partition_dim - else: - # this is case where split_dim is set and matches paritition_dim - continue - if not split_dims <= set(combination_df.keys()): - missing_dims = split_dims - set(combination_df.keys()) - missing_dims_str = ",".join(missing_dims) - raise ValueError( - f"Segment keys missing from metric hub segments: {missing_dims_str}" - ) - - # For each segment combinination, get the model parameters from the config - ## file. Parse the holidays and regressors specified in the config file. - segment_models = [] - for segment in segment_combinations: - # find the correct configuration - for partition in self.parameters: - partition_segment = partition["segment"] - # get subset of segment that is used to partition - subset_segment = { - key: val for key, val in segment.items() if key in split_dims - } - if partition_segment == subset_segment: - # parition is set to the desired value - # break out of loop - break - holiday_list = [] - regressor_list = [] - - if "holidays" in partition: - holiday_list = [holiday_collection[h] for h in partition["holidays"]] - if "regressors" in partition: - regressor_list = [ - regressor_collection[r] for r in partition["regressors"] - ] - - # Create a SegmentModelSettings object for each segment combination - segment_models.append( - SegmentModelSettings( - segment=segment, - start_date=partition["start_date"], - end_date=self.end_date, - holidays=[ProphetHoliday(**h) for h in holiday_list], - regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=dict(partition["grid_parameters"]), - cv_settings=dict(partition["cv_settings"]), - ) - ) - self.segment_models = segment_models - - @property - def column_names_map(self) -> Dict[str, str]: - """ - Map column names from the dataset to the names required by Prophet. - - Returns: - Dict[str, str]: Mapping of column names. - """ - return {"submission_date": "ds", "value": "y"} - - def _fill_regressor_dates(self, regressor: ProphetRegressor) -> ProphetRegressor: - """ - Fill missing start and end dates for a regressor. A ProphetRegressor can be created - without a 'start_date' or 'end_date' being supplied, so this checks for either date attr - being missing and fills in with the appropriate date: if 'start_date' is missing, it assumes - that the regressor starts at the beginning of the observed data; if 'end_date' is missing, - it assumes that the regressor should be filled until the end of the forecast period. - - Args: - regressor (ProphetRegressor): The regressor to fill dates for. 
+class ProphetAutotunerForecast(ProphetForecast): + grid_parameters: dict = field(default_factory=dict) + cv_settings: dict = field(default_factory=dict) - Returns: - ProphetRegressor: The regressor with filled dates. - """ - - for date in ["start_date", "end_date"]: - if getattr(regressor, date) is None: - setattr(regressor, date, getattr(self, date)) - elif isinstance(getattr(regressor, date), str): - setattr(regressor, date, pd.to_datetime(getattr(regressor, date))) - - if regressor.end_date < regressor.start_date: - raise Exception( - f"Regressor {regressor.name} start date comes after end date" - ) - return regressor - - def _build_model( - self, - segment_settings: SegmentModelSettings, - parameters: Dict[str, Union[float, str, bool]], - ) -> prophet.Prophet: - """ - Build a Prophet model from parameters. - - Args: - segment_settings (SegmentModelSettings): The settings for the segment. - parameters (Dict[str, Union[float, str, bool]]): The parameters for the model. - - Returns: - prophet.Prophet: The Prophet model. - """ - if segment_settings.holidays: - parameters["holidays"] = pd.concat( - [ - pd.DataFrame( - { - "holiday": h.name, - "ds": pd.to_datetime(h.ds), - "lower_window": h.lower_window, - "upper_window": h.upper_window, - } - ) - for h in segment_settings.holidays - ], - ignore_index=True, - ) - - m = prophet.Prophet( - **parameters, - uncertainty_samples=self.number_of_simulations, - mcmc_samples=0, - ) - for regressor in segment_settings.regressors: - m.add_regressor( - regressor.name, - prior_scale=regressor.prior_scale, - mode=regressor.mode, - ) - - return m - - def _build_train_dataframe( - self, - observed_df, - segment_settings: SegmentModelSettings, - add_logistic_growth_cols: bool = False, - ) -> pd.DataFrame: - """ - Build the model dataframe for training - - Args: - observed_df: dataframe of observed data - segment_settings (SegmentModelSettings): The settings for the segment. - add_logistic_growth_cols (bool, optional): Whether to add logistic growth columns. Defaults to False. - - Returns: - pd.DataFrame: The dataframe for the model. - """ - - # find indices in observed_df for rows that exactly match segment dict - segment_historical_indices = ( - observed_df[list(segment_settings.segment)] - == pd.Series(segment_settings.segment) - ).all(axis=1) - df = ( - observed_df.loc[ - (segment_historical_indices) - & ( # filter observed_df if segment start date > metric_hub start date - observed_df["submission_date"] - >= datetime.strptime(segment_settings.start_date, "%Y-%m-%d").date() - ) - ] - .rename(columns=self.column_names_map) - .copy() - ) - # define limits for logistic growth - if add_logistic_growth_cols: - df["floor"] = df["y"].min() * 0.5 - df["cap"] = df["y"].max() * 1.5 - - if segment_settings.regressors: - df = self._add_regressors(df, segment_settings.regressors) - return df - - def _build_predict_dataframe( - self, - dates_to_predict: pd.DataFrame, - segment_settings: SegmentModelSettings, - add_logistic_growth_cols: bool = False, - ) -> pd.DataFrame: - """creates dataframe used for prediction - - Args: - dates_to_predict (pd.DataFrame): dataframe of dates to predict - segment_settings (SegmentModelSettings): settings related to the segment - add_logistic_growth_cols (bool): Whether to add logistic growth columns. Defaults to False. 
- - - Returns: - pd.DataFrame: dataframe to use used in prediction - """ - # predict dataframe only needs dates to predict, logistic growth limits, and regressors - df = dates_to_predict.rename(columns=self.column_names_map).copy() - if add_logistic_growth_cols: - df["floor"] = segment_settings.trained_parameters["floor"] - df["cap"] = segment_settings.trained_parameters["cap"] - - if segment_settings.regressors: - df = self._add_regressors(df, segment_settings.regressors) - - return df - - def _fit(self, observed_df: pd.DataFrame) -> None: - """ - Fit and save a Prophet model for each segment combination. - - Args: - observed_df (pd.DataFrame): dataframe of observations. Expected to have columns - specified in the segments section of the config, - submission_date column with unique dates corresponding to each observation and - y column containing values of observations - """ - for segment_settings in self.segment_models: - parameters = self._auto_tuning(observed_df, segment_settings) - - # Initialize model; build model dataframe - add_log_growth_cols = ( - "growth" in parameters.keys() and parameters["growth"] == "logistic" - ) - test_dat = self._build_train_dataframe( - observed_df, segment_settings, add_log_growth_cols - ) - model = self._build_model(segment_settings, parameters) - - model.fit(test_dat) - if add_log_growth_cols: - # all values in these colunns are the same - parameters["floor"] = test_dat["floor"].values[0] - parameters["cap"] = test_dat["cap"].values[0] - - if "holidays" in parameters.keys(): - parameters["holidays"] = ( - parameters["holidays"]["holiday"].unique().tolist() - ) - segment_settings.trained_parameters = parameters - segment_settings.segment_model = model - - def _get_crossvalidation_metric( - self, m: prophet.Prophet, cv_settings: dict - ) -> float: + def _get_crossvalidation_metric(self, m: ProphetForecast) -> float: """function for calculated the metric used for crossvalidation Args: - m (prophet.Prophet): Prophet model for crossvalidation + m (ProphetForecast): Prophet model for crossvalidation cv_settings (dict): settings set by segment in the config file Returns: - float: Metric where closer to zero means a better model + float: Metric which should always be positive and where smaller values + indicate better models """ - df_cv = cross_validation(m, **cv_settings) + df_cv = cross_validation(m.model, **self.cv_settings) df_bias = df_cv.groupby("cutoff")[["yhat", "y"]].sum().reset_index() df_bias["pcnt_bias"] = df_bias["yhat"] / df_bias["y"] - 1 # Prophet splits the historical data when doing cross validation using # cutoffs. The `.tail(3)` limits the periods we consider for the best # parameters to the 3 most recent cutoff periods. - return df_bias.tail(3)["pcnt_bias"].mean() + return np.abs(df_bias.tail(3)["pcnt_bias"].mean()) - def _auto_tuning( - self, observed_df, segment_settings: SegmentModelSettings - ) -> Dict[str, float]: + def _auto_tuning(self, observed_df) -> ProphetForecast: """ Perform automatic tuning of model parameters. @@ -374,92 +52,90 @@ def _auto_tuning( specified in the segments section of the config, submission_date column with unique dates corresponding to each observation and y column containing values of observations - segment_settings (SegmentModelSettings): The settings for the segment. - Returns: - Dict[str, float]: The tuned parameters. + ProphetForecast: ProphetForecast that produced the best crossvalidation metric. 
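The cross-validation metric is an absolute percent bias computed over the most recent cutoffs. The sketch below reproduces that arithmetic on a toy stand-in for the dataframe returned by `prophet.diagnostics.cross_validation`; the numbers are invented for illustration.

```python
import numpy as np
import pandas as pd

# Toy stand-in for the cross-validation output: one row per (cutoff, ds)
# with actuals `y` and predictions `yhat`.
df_cv = pd.DataFrame(
    {
        "cutoff": pd.to_datetime(
            ["2024-01-01"] * 2 + ["2024-02-01"] * 2 + ["2024-03-01"] * 2 + ["2024-04-01"] * 2
        ),
        "yhat": [105, 98, 90, 92, 110, 111, 120, 118],
        "y": [100, 100, 100, 100, 100, 100, 100, 100],
    }
)

# Percent bias per cutoff: (sum of predictions / sum of actuals) - 1.
df_bias = df_cv.groupby("cutoff")[["yhat", "y"]].sum().reset_index()
df_bias["pcnt_bias"] = df_bias["yhat"] / df_bias["y"] - 1

# Average the three most recent cutoffs and take the absolute value, so that
# smaller is always better when comparing candidate parameter sets.
metric = np.abs(df_bias.tail(3)["pcnt_bias"].mean())
print(metric)
```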
""" - add_log_growth_cols = ( - "growth" in segment_settings.grid_parameters.keys() - and segment_settings.grid_parameters["growth"] == "logistic" - ) - for k, v in segment_settings.grid_parameters.items(): + for k, v in self.grid_parameters.items(): if not isinstance(v, list): - segment_settings.grid_parameters[k] = [v] + self.grid_parameters[k] = [v] - param_grid = [ - dict(zip(segment_settings.grid_parameters.keys(), v)) - for v in itertools.product(*segment_settings.grid_parameters.values()) + auto_param_grid = [ + dict(zip(self.grid_parameters.keys(), v)) + for v in itertools.product(*self.grid_parameters.values()) ] - test_dat = self._build_train_dataframe( - observed_df, segment_settings, add_log_growth_cols - ) - bias = [] - - for params in param_grid: - m = self._build_model(segment_settings, params) - m.fit(test_dat) - - crossval_metric = self._get_crossvalidation_metric( - m, segment_settings.cv_settings - ) - bias.append(crossval_metric) - - min_abs_bias_index = np.argmin(np.abs(bias)) - - return param_grid[min_abs_bias_index] - - def _add_regressors(self, df: pd.DataFrame, regressors: List[ProphetRegressor]): - """ - Add regressor columns to the dataframe for training or prediction. + set_params = self._get_parameters() + for param in self.grid_parameters: + set_params.pop(param) + + auto_param_grid = [dict(**el, **set_params) for el in auto_param_grid] + + bias = np.inf + best_model = None + best_params = None + for params in auto_param_grid: + m = ProphetForecast(**params) + m.fit(observed_df) + crossval_metric = self._get_crossvalidation_metric(m) + if crossval_metric < bias: + best_model = m + bias = crossval_metric + best_params = params + + # set the parameters of the current object + # to those of the optimized ProphetForecast object + for attr_name, best_value in best_params.items(): + setattr(self, attr_name, best_value) + if best_model.growth == "logistic": + # case where logistic growth is being used + # need to set some parameters used to make training and + # predict dfs + self.growth = "logistic" + self.logistic_growth_cap = best_model.logistic_growth_cap + self.logistic_growth_floor = best_model.logistic_growth_floor + if best_model.regressors is not None: + self.regressors = best_model.regressors + + return best_model.model + + def fit(self, observed_df: pd.DataFrame) -> object: + """Select the best fit model and set it to the model attribute Args: - df (pd.DataFrame): The input dataframe. - regressors (List[ProphetRegressor]): The list of regressors to add. - - Returns: - pd.DataFrame: The dataframe with regressors added. 
+ observed_df (pd.DataFrame): observed data used to fit """ - for regressor in regressors: - regressor = self._fill_regressor_dates(regressor) - # finds rows where date is in regressor date ranges and sets that regressor - ## value to 0, else 1 - df[regressor.name] = ( - ~( - (df["ds"] >= pd.to_datetime(regressor.start_date).date()) - & (df["ds"] <= pd.to_datetime(regressor.end_date).date()) - ) - ).astype(int) - return df + # model returned by _auto_tuning is already fit + # don't need to set seed since it happens in the + # ProphetForecast object created in the auto_tuning + self.model = self._auto_tuning(observed_df) + train_dataframe = self._build_train_dataframe(observed_df) + self.history = train_dataframe + return self - def _predict( - self, dates_to_predict_raw: pd.DataFrame, segment_settings: SegmentModelSettings + def predict( + self, + dates_to_predict_raw: pd.DataFrame, ) -> pd.DataFrame: """ Generate forecast samples for a segment. Args: - dates_to_predict (pd.DataFrame): dataframe of dates to predict - segment_settings (SegmentModelSettings): The settings for the segment. - + dates_to_predict (pd.DataFrame): dataframe with a single column, + submission_date that is a string in `%Y-%m-%d` format Returns: pd.DataFrame: The forecasted values. """ - add_log_growth_cols = ( - "growth" in segment_settings.trained_parameters.keys() - and segment_settings.trained_parameters["growth"] == "logistic" - ) # add regressors, logistic growth limits (if applicable) to predict dataframe - dates_to_predict = self._build_predict_dataframe( - dates_to_predict_raw, segment_settings, add_log_growth_cols - ) + self._set_seed() + dates_to_predict = self._build_predict_dataframe(dates_to_predict_raw) + self.predict_input = dates_to_predict # draws samples from Prophet posterior distribution, to provide percentile predictions - samples = segment_settings.segment_model.predictive_samples(dates_to_predict) + samples = self.model.predictive_samples(dates_to_predict) df = pd.DataFrame(samples["yhat"]) - df["submission_date"] = dates_to_predict_raw + df["submission_date"] = dates_to_predict_raw["submission_date"].values + self._validate_forecast_df(df) component_cols = [ "ds", @@ -476,23 +152,22 @@ def _predict( ] # use 'predict' method to return components from the Prophet model - components_df = segment_settings.segment_model.predict(dates_to_predict)[ - component_cols - ] + components_df = self.model.predict(dates_to_predict)[component_cols] # join observed data to components df, which allows for calc of intra-sample # error rates and how components resulted in those predictions. The `fillna` # call will fill the missing y values for forecasted dates, where only yhat # is available. + history_df = self.history[["ds", "y"]].copy() + history_df["ds"] = pd.to_datetime(history_df["ds"]) components_df = components_df.merge( - segment_settings.segment_model.history[["ds", "y"]], + history_df, on="ds", how="left", ).fillna(0) components_df.rename(columns={"ds": "submission_date"}, inplace=True) - segment_settings.components_df = components_df.copy() - + self.components_df = components_df.copy() return df def _validate_forecast_df(self, df: pd.DataFrame) -> None: @@ -518,302 +193,239 @@ def _validate_forecast_df(self, df: pd.DataFrame) -> None: f" but column {i} has type {df[i].dtypes}." ) - def _percentile_name_map(self, percentiles: List[int]) -> Dict[str, str]: - """ - Map percentiles to their corresponding names for the BQ table. - - Args: - percentiles (List[int]): The list of percentiles. 
- Returns: - Dict[str, str]: The mapping of percentile names. - """ - - percentiles.sort() - return { - f"p{percentiles[0]}": "value_low", - f"p{percentiles[1]}": "value_mid", - f"p{percentiles[2]}": "value_high", - "mean": "value", - } - - def _combine_forecast_observed( - self, - forecast_df: pd.DataFrame, - observed_df: pd.DataFrame, - period: str, - numpy_aggregations: List, - percentiles, - segment: dict, - ) -> pd.DataFrame: - """Calculate aggregates over the forecast and observed data - and concatenate the two dataframes - Args: - forecast_df (pd.DataFrame): forecast dataframe - observed_df (pd.DataFrame): observed dataframe - period (str): period to aggregate up to, must be in (day, month, year) - numpy_aggregations (List): List of aggregation functions to apply across samples from the - posterior-predictive distribution. Must take - in a numpy array and return a single value - percentiles: 3-element list of percentiles to calculate across samples from the posterior-predictive distribution - segment (dict): dictionary that lists columns and values corresponding to the segment - keys are the column name used to segment and values are the values - of that column corresponding to the current segment - - Returns: - pd.DataFrame: combined dataframe containing aggregated values from observed and forecast - """ - # filter the forecast data to just the data in the future - last_historic_date = observed_df["submission_date"].max() - forecast_df = forecast_df.loc[ - forecast_df["submission_date"] > last_historic_date - ] - - forecast_summarized, observed_summarized = self._aggregate_forecast_observed( - forecast_df, observed_df, period, numpy_aggregations, percentiles - ) - - # add datasource-specific metadata columns - forecast_summarized["source"] = "forecast" - observed_summarized["source"] = "historical" - - # add segment columns to forecast table - for dim, value in segment.items(): - forecast_summarized[dim] = value +@dataclass +class FunnelForecast(BaseEnsembleForecast): + """ + Holds the configuration and results for each segment + in a funnel forecasting model. + """ - # rename forecast percentile to low, middle, high - # rename mean to value - forecast_summarized = forecast_summarized.rename( - columns=self._percentile_name_map(percentiles) - ) + model_class: object = ProphetAutotunerForecast - # create a single dataframe that contains observed and forecasted data - df = pd.concat([observed_summarized, forecast_summarized]) - return df + def __post_init__(self, *args, **kwargs): + super(FunnelForecast, self).__post_init__() + if not self.model_class == ProphetAutotunerForecast: + raise ValueError("model_class set when ProphetForecast is expected") - def _summarize( - self, - segment_settings: SegmentModelSettings, - period: str, - numpy_aggregations: List[str], - percentiles: List[int] = [10, 50, 90], - ) -> pd.DataFrame: - """ - Calculate summary metrics on a specific segment - for `forecast_df` over a given period, and add metadata. + def _get_parameters(self): + parameter_dict = {} + for el in self.parameters: + parameter_dict[str(el["segment"])] = json.dumps(el) + return parameter_dict - Args: - segment_settings (SegmentModelSettings): The settings for the segment. - period (str): The period for aggregation. - numpy_aggregations (List[str]): List of numpy aggregation functions. - percentiles (List[int]): List of percentiles. - Returns: - pd.DataFrame: The summarized dataframe. 
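To show what `combine_forecast_observed` produces, the following sketch labels two small invented summary frames with a `source` column and concatenates them, which is all the combination step does; the column values are placeholders, not real forecast output.

```python
import pandas as pd

# Invented summary frames standing in for the output of aggregate_forecast_observed.
observed_summarized = pd.DataFrame(
    {"submission_date": ["2024-01-01", "2024-01-02"], "value": [100.0, 110.0]}
)
forecast_summarized = pd.DataFrame(
    {
        "submission_date": ["2024-01-03", "2024-01-04"],
        "value": [112.0, 115.0],
        "value_low": [105.0, 107.0],
        "value_high": [120.0, 124.0],
    }
)

# Tag each frame with its datasource before concatenating, so downstream queries
# can tell historical rows from forecasted rows in the combined table.
forecast_summarized["source"] = "forecast"
observed_summarized["source"] = "historical"

combined = pd.concat([observed_summarized, forecast_summarized])
print(combined)
```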
- """ - if len(percentiles) != 3: - raise ValueError( - """ - Can only pass a list of length 3 as percentiles, for lower, mid, and upper values. - """ - ) +def combine_forecast_observed( + forecast_summarized: pd.DataFrame, + observed_summarized: pd.DataFrame, +) -> pd.DataFrame: + """Combines the observed and forecast data as part of summarization + Args: + forecast_summarized (pd.DataFrame): forecast dataframe. This dataframe should include the segments as columns + as well as a forecast_parameters column with the forecast parameters + observed_summarized (pd.DataFrame): observed dataframe - # the start date for this segment's historical data, in cases where the full time series - ## of historical data is not used for model training - segment_observed_start_date = datetime.strptime( - segment_settings.start_date, "%Y-%m-%d" - ).date() - - # find indices in observed_df for rows that exactly match segment dict - segment_historical_indices = ( - self.observed_df[list(segment_settings.segment)] - == pd.Series(segment_settings.segment) - ).all(axis=1) - - segment_observed_df = self.observed_df.loc[ - (segment_historical_indices) - & (self.observed_df["submission_date"] >= segment_observed_start_date) - ].copy() - - df = self._combine_forecast_observed( - segment_settings.forecast_df, - segment_observed_df, - period, - numpy_aggregations, - percentiles, - segment_settings.segment, + Returns: + pd.DataFrame: combined dataframe containing aggregated values from observed and forecast + """ + # add datasource-specific metadata columns + forecast_summarized["source"] = "forecast" + observed_summarized["source"] = "historical" + + # create a single dataframe that contains observed and forecasted data + df = pd.concat([observed_summarized, forecast_summarized]) + return df + + +def summarize_with_parameters( + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List, + percentiles, + segment_cols: List[str], +) -> pd.DataFrame: + """Calculate aggregates over the forecast and observed data + and concatenate the two dataframes for a single set of parameters + Args: + forecast_df (pd.DataFrame): forecast dataframe. This dataframe should include the segments as columns + as well as a forecast_parameters column with the forecast parameters + observed_df (pd.DataFrame): observed dataframe + period (str): period to aggregate up to, must be in (day, month, year) + numpy_aggregations (List): List of aggregation functions to apply across samples from the + posterior-predictive distribution. 
Must take + in a numpy array and return a single value + percentiles: 3-element list of percentiles to calculate across samples from the posterior-predictive distribution + segment (dict): dictionary that lists columns and values corresponding to the segment + keys are the column name used to segment and values are the values + of that column corresponding to the current segment + + Returns: + pd.DataFrame: combined dataframe containing aggregated values from observed and forecast + """ + # filter the forecast data to just the data in the future + # note that if start_date is set, it is the applied to the start of observed_df + # and that it therefore doesn't need to be applied here + last_historic_date = observed_df["submission_date"].max() + forecast_df = forecast_df.loc[forecast_df["submission_date"] > last_historic_date] + + forecast_summarized, observed_summarized = aggregate_forecast_observed( + forecast_df, + observed_df, + period, + numpy_aggregations, + percentiles, + additional_aggregation_columns=segment_cols, + ) + percentile_name_map = { + f"p{percentiles[0]}": "value_low", + f"p{percentiles[1]}": "value_mid", + f"p{percentiles[2]}": "value_high", + "mean": "value", + } + + # rename forecast percentile to low, middle, high + # rename mean to value + forecast_summarized = forecast_summarized.rename(columns=percentile_name_map) + + df = combine_forecast_observed(forecast_summarized, observed_summarized) + + df["aggregation_period"] = period.lower() + + return df + + +def summarize( + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + periods: List[str] = ["day", "month"], + numpy_aggregations: List[str] = ["mean"], + percentiles: List[int] = [10, 50, 90], + segment_cols: List[str] = [], +) -> None: + """ + Summarize the forecast results over specified periods. + + Args: + forecast_df (pd.DataFrame): forecast dataframe + observed_df (pd.DataFrame): observed data + periods (List[str], optional): The periods for summarization. Defaults to ["day", "month"]. + segment_cols (List of str): list of columns used for segmentation + numpy_aggregations (List[str], optional): The numpy aggregation functions. Defaults to ["mean"]. + percentiles (List[int], optional): The percentiles for summarization. Defaults to [10, 50, 90]. + """ + if len(percentiles) != 3: + raise ValueError( + """ + Can only pass a list of length 3 as percentiles, for lower, mid, and upper values. + """ ) - df["forecast_parameters"] = json.dumps(segment_settings.trained_parameters) - - # add summary metadata columns - df["aggregation_period"] = period.lower() - return df - - def predict(self) -> None: - """Generate a forecast from `start_date` to `end_date`.""" - print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) - self._set_seed() - self.predicted_at = datetime.utcnow() - - for segment_settings in self.segment_models: - forecast_df = self._predict(self.dates_to_predict, segment_settings) - self._validate_forecast_df(forecast_df) - - segment_settings.forecast_df = forecast_df - - def summarize( - self, - periods: List[str] = ["day", "month"], - numpy_aggregations: List[str] = ["mean"], - percentiles: List[int] = [10, 50, 90], - ) -> None: - """ - Summarize the forecast results over specified periods. - - Args: - periods (List[str], optional): The periods for summarization. Defaults to ["day", "month"]. - numpy_aggregations (List[str], optional): The numpy aggregation functions. Defaults to ["mean"]. - percentiles (List[int], optional): The percentiles for summarization. 
Defaults to [10, 50, 90]. - """ - summary_df_list = [] - components_df_list = [] - for segment in self.segment_models: - summary_df = pd.concat( - [ - self._summarize( - segment, - i, - numpy_aggregations, - percentiles, - ) - for i in periods - ] + summary_df = pd.concat( + [ + summarize_with_parameters( + forecast_df, + observed_df, + i, + numpy_aggregations, + percentiles, + segment_cols, ) - for dim, dim_value in segment.segment.items(): - segment.components_df[dim] = dim_value - summary_df_list.append(summary_df.copy(deep=True)) - components_df_list.append(segment.components_df) - del summary_df - - df = pd.concat(summary_df_list, ignore_index=True) - - # add Metric Hub metadata columns - df["metric_alias"] = self.metric_hub.alias.lower() - df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - df["metric_hub_slug"] = self.metric_hub.slug.lower() - df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - df["metric_collected_at"] = self.collected_at - - # add forecast model metadata columns - df["forecast_start_date"] = self.start_date - df["forecast_end_date"] = self.end_date - df["forecast_trained_at"] = self.trained_at - df["forecast_predicted_at"] = self.predicted_at - - self.summary_df = df - - self.components_df = pd.concat(components_df_list, ignore_index=True) - - def write_results( - self, - project: str, - dataset: str, - table: str, - write_disposition: str = "WRITE_APPEND", - components_table: str = "", - components_dataset: str = "", - ) -> None: - """ - Write `self.summary_df` to Big Query. + for i in periods + ] + ) + + return summary_df + + +def write_results( + summary_df, + components_df, + segment_cols, + project: str, + dataset: str, + table: str, + write_disposition: str = "WRITE_APPEND", + components_table: str = "", + components_dataset: str = "", +) -> None: + """ + Write `self.summary_df` to Big Query. + + Args: + project (str): The Big Query project that the data should be written to. + dataset (str): The Big Query dataset that the data should be written to. + table (str): The Big Query table that the data should be written to. + write_disposition (str, optional): In the event that the destination table exists, + should the table be overwritten ("WRITE_TRUNCATE") or appended to ("WRITE_APPEND")? Defaults to "WRITE_APPEND". + components_table (str, optional): The Big Query table for model components. Defaults to "". + components_dataset (str, optional): The Big Query dataset for model components. Defaults to "". 
+ """ + print( + f"Writing results to `{project}.{dataset}.{table}`.", + flush=True, + ) + client = bigquery.Client(project=project) + schema = [ + bigquery.SchemaField("submission_date", bq_types.DATE), + *[bigquery.SchemaField(k, bq_types.STRING) for k in segment_cols], + bigquery.SchemaField("aggregation_period", bq_types.STRING), + bigquery.SchemaField("source", bq_types.STRING), + bigquery.SchemaField("value", bq_types.FLOAT), + bigquery.SchemaField("value_low", bq_types.FLOAT), + bigquery.SchemaField("value_mid", bq_types.FLOAT), + bigquery.SchemaField("value_high", bq_types.FLOAT), + bigquery.SchemaField("metric_alias", bq_types.STRING), + bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), + bigquery.SchemaField("metric_hub_slug", bq_types.STRING), + bigquery.SchemaField("metric_start_date", bq_types.DATE), + bigquery.SchemaField("metric_end_date", bq_types.DATE), + bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_start_date", bq_types.DATE), + bigquery.SchemaField("forecast_end_date", bq_types.DATE), + bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ] + job = client.load_table_from_dataframe( + dataframe=summary_df, + destination=f"{project}.{dataset}.{table}", + job_config=bigquery.LoadJobConfig( + schema=schema, + autodetect=False, + write_disposition=write_disposition, + ), + ) + # Wait for the job to complete. + job.result() + + if components_table: + numeric_cols = list(components_df.select_dtypes(include=float).columns) + string_cols = list(components_df.select_dtypes(include=object).columns) - Args: - project (str): The Big Query project that the data should be written to. - dataset (str): The Big Query dataset that the data should be written to. - table (str): The Big Query table that the data should be written to. - write_disposition (str, optional): In the event that the destination table exists, - should the table be overwritten ("WRITE_TRUNCATE") or appended to ("WRITE_APPEND")? Defaults to "WRITE_APPEND". - components_table (str, optional): The Big Query table for model components. Defaults to "". - components_dataset (str, optional): The Big Query dataset for model components. Defaults to "". 
- """ - print( - f"Writing results to `{project}.{dataset}.{table}`.", - flush=True, - ) - client = bigquery.Client(project=project) schema = [ bigquery.SchemaField("submission_date", bq_types.DATE), - *[ - bigquery.SchemaField(k, bq_types.STRING) - for k in self.metric_hub.segments.keys() - ], - bigquery.SchemaField("aggregation_period", bq_types.STRING), - bigquery.SchemaField("source", bq_types.STRING), - bigquery.SchemaField("value", bq_types.FLOAT), - bigquery.SchemaField("value_low", bq_types.FLOAT), - bigquery.SchemaField("value_mid", bq_types.FLOAT), - bigquery.SchemaField("value_high", bq_types.FLOAT), - bigquery.SchemaField("metric_alias", bq_types.STRING), - bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), - bigquery.SchemaField("metric_hub_slug", bq_types.STRING), - bigquery.SchemaField("metric_start_date", bq_types.DATE), - bigquery.SchemaField("metric_end_date", bq_types.DATE), - bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_start_date", bq_types.DATE), - bigquery.SchemaField("forecast_end_date", bq_types.DATE), + bigquery.SchemaField("metric_slug", bq_types.STRING), bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), ] + schema += [bigquery.SchemaField(col, bq_types.STRING) for col in string_cols] + schema += [bigquery.SchemaField(col, bq_types.FLOAT) for col in numeric_cols] + + if not components_dataset: + components_dataset = dataset + print( + f"Writing model components to `{project}.{components_dataset}.{components_table}`.", + flush=True, + ) + job = client.load_table_from_dataframe( - dataframe=self.summary_df, - destination=f"{project}.{dataset}.{table}", + dataframe=components_df, + destination=f"{project}.{components_dataset}.{components_table}", job_config=bigquery.LoadJobConfig( schema=schema, autodetect=False, write_disposition=write_disposition, ), ) - # Wait for the job to complete. 
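[Editor's note on the hunk above] The components schema is no longer hard-coded; it is assembled from the dtypes of `components_df`: the three fixed fields the patch declares, plus one STRING field per object column and one FLOAT field per float column. A minimal sketch of that pattern, using hypothetical component and dimension column names:

```python
import pandas as pd
from google.cloud import bigquery
from google.cloud.bigquery.enums import SqlTypeNames as bq_types

# Hypothetical components frame: "trend" is a float component, "device" a string dimension.
components_df = pd.DataFrame(
    {
        "submission_date": pd.to_datetime(["2024-01-01"]),
        "trend": [100.0],
        "device": ["desktop"],
    }
)

# Split columns by dtype, mirroring the select_dtypes calls in the patch.
numeric_cols = list(components_df.select_dtypes(include=float).columns)
string_cols = list(components_df.select_dtypes(include=object).columns)

# Fixed fields first, then dtype-driven fields for whatever columns the model produced.
schema = [
    bigquery.SchemaField("submission_date", bq_types.DATE),
    bigquery.SchemaField("metric_slug", bq_types.STRING),
    bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP),
]
schema += [bigquery.SchemaField(col, bq_types.STRING) for col in string_cols]
schema += [bigquery.SchemaField(col, bq_types.FLOAT) for col in numeric_cols]
```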
- job.result() - - if components_table: - numeric_cols = list(self.components_df.select_dtypes(include=float).columns) - string_cols = list(self.components_df.select_dtypes(include=object).columns) - self.components_df["metric_slug"] = self.metric_hub.slug - self.components_df["forecast_trained_at"] = self.trained_at - - schema = [ - bigquery.SchemaField("submission_date", bq_types.DATE), - bigquery.SchemaField("metric_slug", bq_types.STRING), - bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), - ] - schema += [ - bigquery.SchemaField(col, bq_types.STRING) for col in string_cols - ] - schema += [ - bigquery.SchemaField(col, bq_types.FLOAT) for col in numeric_cols - ] - - if not components_dataset: - components_dataset = dataset - print( - f"Writing model components to `{project}.{components_dataset}.{components_table}`.", - flush=True, - ) - job = client.load_table_from_dataframe( - dataframe=self.components_df, - destination=f"{project}.{components_dataset}.{components_table}", - job_config=bigquery.LoadJobConfig( - schema=schema, - autodetect=False, - write_disposition=write_disposition, - schema_update_options=[ - bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION - ], - ), - ) - - job.result() + job.result() diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 82a07fc4..26e1cd2c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -3,65 +3,339 @@ from pandas.api import types as pd_types import prophet import numpy as np +from dataclasses import dataclass, field from typing import Dict, List from datetime import datetime, timezone -from dataclasses import dataclass from kpi_forecasting.models.base_forecast import BaseForecast from kpi_forecasting import pandas_extras as pdx from google.cloud import bigquery from google.cloud.bigquery.enums import SqlTypeNames as bq_types +from kpi_forecasting.configs.model_inputs import ( + ProphetHoliday, + ProphetRegressor, + holiday_collection, + regressor_collection, +) + @dataclass class ProphetForecast(BaseForecast): - """Forecast object specifically for prophet forecast models - - Additional attributes: - number_of_simulations (int): The number of simulated timeseries that the forecast - should generate. Since many forecast models are probablistic, this enables the - measurement of variation across a range of possible outcomes. + """ + Holds the configuration and results for each segment + in a funnel forecasting model. + + Args: + holidays (list): list of ProphetHoliday objects used + to specify holidays in the Propohet model. Used to create + the dataframe passed to prophet under the holidays key + regressors (list): list of ProphetRegressor objects, + used to set regressors in the Prophet object and + create them in the data + use_all_us_holidays (bool): When True, `model.add_country_holidays(country_name="US")` + is called on the prophet model + growth (str): Used in Prophet object initialization + 'linear', 'logistic' or 'flat' to specify a linear, logistic or + flat trend. + changepoints (list): Used in Prophet object initialization + List of dates at which to include potential changepoints. If + not specified, potential changepoints are selected automatically. + n_changepoints (int): Used in Prophet object initialization + Number of potential changepoints to include. Not used + if input `changepoints` is supplied. 
If `changepoints` is not supplied, + then n_changepoints potential changepoints are selected uniformly from + the first `changepoint_range` proportion of the history. + changepoint_range (float): Used in Prophet object initialization + Proportion of history in which trend changepoints will + be estimated. Defaults to 0.8 for the first 80%. Not used if + `changepoints` is specified. + yearly_seasonality: Used in Prophet object initialization + Fit yearly seasonality. + Can be 'auto', True, False, or a number of Fourier terms to generate. + weekly_seasonality : Used in Prophet object initialization + Fit weekly seasonality. + Can be 'auto', True, False, or a number of Fourier terms to generate. + daily_seasonality: Used in Prophet object initialization + Fit daily seasonality. + Can be 'auto', True, False, or a number of Fourier terms to generate. + seasonality_mode: Used in Prophet object initialization + 'additive' (default) or 'multiplicative'. + seasonality_prior_scale: Used in Prophet object initialization + Parameter modulating the strength of the + seasonality model. Larger values allow the model to fit larger seasonal + fluctuations, smaller values dampen the seasonality. Can be specified + for individual seasonalities using add_seasonality. + holidays_prior_scale: Used in Prophet object initialization + Parameter modulating the strength of the holiday + components model, unless overridden in the holidays input. + changepoint_prior_scale: Used in Prophet object initialization + Parameter modulating the flexibility of the + automatic changepoint selection. Large values will allow many + changepoints, small values will allow few changepoints. + mcmc_samples (int): Used in Prophet object initialization + If greater than 0, will do full Bayesian inference + with the specified number of MCMC samples. If 0, will do MAP + estimation. + interval_width (float): Used in Prophet object initialization + width of the uncertainty intervals provided + for the forecast. If mcmc_samples=0, this will be only the uncertainty + in the trend using the MAP estimate of the extrapolated generative + model. If mcmc.samples>0, this will be integrated over all model + parameters, which will include uncertainty in seasonality. + uncertainty_samples: Used in Prophet object initialization + Number of simulated draws used to estimate + uncertainty intervals. Settings this value to 0 or False will disable + uncertainty estimation and speed up the calculation. + stan_backend (str): Used in Prophet object initialization + str as defined in StanBackendEnum default: None - will try to + iterate over all available backends and find the working one + holidays_mode (str): Used in Prophet object initialization + 'additive' or 'multiplicative'. Defaults to seasonality_mode. 
""" - number_of_simulations: int = 1000 + holidays: list = field(default_factory=list[ProphetHoliday]) + regressors: list = field(default_factory=list[ProphetRegressor]) + use_all_us_holidays: bool = False + + # these are the arguments used to initialize the Prophet object + growth: str = "linear" + changepoints: list = None + n_changepoints: int = 25 + changepoint_range: float = 0.8 + yearly_seasonality: str = "auto" + weekly_seasonality: str = "auto" + daily_seasonality: str = "auto" + holidays: pd.DataFrame = None + seasonality_mode: str = "additive" + seasonality_prior_scale: float = 10.0 + holidays_prior_scale: float = 10.0 + changepoint_prior_scale: float = 0.05 + mcmc_samples: int = 0 + interval_width: float = 0.80 + uncertainty_samples: int = 1000 + stan_backend: str = None + scaling: str = "absmax" + holidays_mode: str = None + floor: float = None + cap: float = None + + def __post_init__(self): + holiday_list = [] + regressor_list = [] + + if self.holidays == []: + self.holidays = None + self.holidays_raw = None + elif not self.holidays: + self.holidays_raw = None + elif self.holidays: + self.holidays_raw = self.holidays + holiday_list = [ + ProphetHoliday(**holiday_collection[h]) for h in self.holidays + ] + holiday_df = pd.concat( + [ + pd.DataFrame( + { + "holiday": h.name, + "ds": pd.to_datetime(h.ds), + "lower_window": h.lower_window, + "upper_window": h.upper_window, + } + ) + for h in holiday_list + ], + ignore_index=True, + ) + self.holidays = holiday_df + if self.regressors: + self.regressors_raw = self.regressors + regressor_list = [ + ProphetRegressor(**regressor_collection[r]) for r in self.regressors + ] + self.regressors = regressor_list + else: + self.regressors_raw = None - @property - def column_names_map(self) -> Dict[str, str]: - return {"submission_date": "ds", "value": "y"} + self.model = self._build_model() + self.logistic_growth_cap = self.cap + self.logistic_growth_floor = self.floor + + def _build_model(self) -> prophet.Prophet: + """ + Build a Prophet model from parameters using attributes set on initialization + + Returns: + prophet.Prophet: The Prophet model. 
+ """ - def _build_model(self, parameter_dict): model = prophet.Prophet( - **parameter_dict, - uncertainty_samples=self.number_of_simulations, - mcmc_samples=0, + growth=self.growth, + changepoints=self.changepoints, + n_changepoints=self.n_changepoints, + changepoint_range=self.changepoint_range, + yearly_seasonality=self.yearly_seasonality, + weekly_seasonality=self.weekly_seasonality, + daily_seasonality=self.daily_seasonality, + holidays=self.holidays, + seasonality_mode=self.seasonality_mode, + seasonality_prior_scale=self.seasonality_prior_scale, + holidays_prior_scale=self.holidays_prior_scale, + changepoint_prior_scale=self.changepoint_prior_scale, + mcmc_samples=self.mcmc_samples, + interval_width=self.interval_width, + uncertainty_samples=self.uncertainty_samples, + stan_backend=self.stan_backend, + scaling=self.scaling, + holidays_mode=self.holidays_mode, ) + for regressor in self.regressors: + model.add_regressor( + regressor.name, + prior_scale=regressor.prior_scale, + mode=regressor.mode, + ) + if self.use_all_us_holidays: model.add_country_holidays(country_name="US") return model - def _fit(self, observed_df) -> None: - self.model = self._build_model(self.parameters) + def _get_parameters(self) -> Dict: + """Return parameters used to create a new, identical ProphetForecast object""" + + # holidays and regressors get modified so use the + # raw version so these values could be used to create a new + # ProphetForecast without throwing an error + return { + "growth": self.growth, + "changepoints": self.changepoints, + "n_changepoints": self.n_changepoints, + "changepoint_range": self.changepoint_range, + "yearly_seasonality": self.yearly_seasonality, + "weekly_seasonality": self.weekly_seasonality, + "daily_seasonality": self.daily_seasonality, + "holidays": self.holidays_raw, + "seasonality_mode": self.seasonality_mode, + "seasonality_prior_scale": self.seasonality_prior_scale, + "holidays_prior_scale": self.holidays_prior_scale, + "changepoint_prior_scale": self.changepoint_prior_scale, + "mcmc_samples": self.mcmc_samples, + "interval_width": self.interval_width, + "uncertainty_samples": self.uncertainty_samples, + "stan_backend": self.stan_backend, + "scaling": self.scaling, + "holidays_mode": self.holidays_mode, + "cap": self.logistic_growth_cap, + "floor": self.logistic_growth_floor, + "regressors": self.regressors_raw, + } + + @property + def column_names_map(self) -> Dict[str, str]: + return {"submission_date": "ds", "value": "y"} + + def _add_regressors(self, df: pd.DataFrame, regressors: List[ProphetRegressor]): + """ + Add regressor columns to the dataframe for training or prediction. + + Args: + df (pd.DataFrame): The input dataframe. + regressors (List[ProphetRegressor]): The list of regressors to add. + + Returns: + pd.DataFrame: The dataframe with regressors added. 
+ """ + for regressor in regressors: + regressor_time_filter = [True] * len(df) + if regressor.start_date: + regressor_time_filter &= ( + df["ds"] >= pd.to_datetime(regressor.start_date).date() + ) + if regressor.end_date: + regressor_time_filter &= ( + df["ds"] <= pd.to_datetime(regressor.end_date).date() + ) + # finds rows where date is in regressor date ranges and sets that regressor + ## value to 0, else 1 + df[regressor.name] = (~(regressor_time_filter)).astype(int) + return df + + def _set_seed(self) -> None: + """Set random seed to ensure that fits and predictions are reproducible.""" + np.random.seed(42) + + def _build_train_dataframe(self, observed_df) -> pd.DataFrame: + """ + Build the model dataframe for training + + Args: + observed_df: dataframe of observed data + + Returns: + pd.DataFrame: The dataframe for the model. + """ + + # define limits for logistic growth + observed_df = observed_df.rename(columns=self.column_names_map) + + if self.growth == "logistic": + self.logistic_growth_floor = observed_df["y"].min() * 0.5 + observed_df["floor"] = self.logistic_growth_floor + self.logistic_growth_cap = observed_df["y"].max() * 1.5 + observed_df["cap"] = self.logistic_growth_cap + + if self.regressors: + observed_df = self._add_regressors(observed_df, self.regressors) + + return observed_df + + def _build_predict_dataframe(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + """creates dataframe used for prediction + + Args: + dates_to_predict (pd.DataFrame): dataframe of dates to predict + + Returns: + pd.DataFrame: dataframe to use used in prediction + """ + # predict dataframe only needs dates to predict, logistic growth limits, and regressors + df = dates_to_predict.rename(columns=self.column_names_map).copy() + if self.growth == "logistic": + df["floor"] = self.logistic_growth_floor + df["cap"] = self.logistic_growth_cap + + if self.regressors: + df = self._add_regressors(df, self.regressors) + return df + + def fit(self, observed_df) -> None: # Modify observed data to have column names that Prophet expects, and fit # the model - self.model.fit(observed_df.rename(columns=self.column_names_map)) + self._set_seed() + train_dataframe = self._build_train_dataframe(observed_df) + self.model.fit(train_dataframe) + return self - def _predict(self, dates_to_predict) -> pd.DataFrame: + def predict(self, dates_to_predict) -> pd.DataFrame: # generate the forecast samples + self._set_seed() samples = self.model.predictive_samples( dates_to_predict.rename(columns=self.column_names_map) ) df = pd.DataFrame(samples["yhat"]) df["submission_date"] = dates_to_predict + self._validate_forecast_df(df, dates_to_predict) return df - def _validate_forecast_df(self, df) -> None: + def _validate_forecast_df(self, df, dates_to_predict) -> None: """Validate that `self.forecast_df` has been generated correctly.""" columns = df.columns - expected_shape = (len(self.dates_to_predict), 1 + self.number_of_simulations) + expected_shape = (len(dates_to_predict), 1 + self.uncertainty_samples) numeric_columns = df.drop(columns="submission_date").columns if "submission_date" not in columns: @@ -72,44 +346,41 @@ def _validate_forecast_df(self, df) -> None: f"Expected forecast_df to have shape {expected_shape}, but it has shape {df.shape}." ) - if not df["submission_date"].equals(self.dates_to_predict["submission_date"]): + if not df["submission_date"].equals(dates_to_predict["submission_date"]): raise ValueError( "forecast_df['submission_date'] does not match dates_to_predict['submission_date']." 
) for i in numeric_columns: - if not pd_types.is_numeric_dtype(self.forecast_df[i]): + if not pd_types.is_numeric_dtype(df[i]): raise ValueError( "All forecast_df columns except 'submission_date' must be numeric," f" but column {i} has type {df[i].dtypes}." ) - def _predict_legacy(self) -> pd.DataFrame: + def _predict_legacy( + self, dates_to_predict, metric_hub_alias, parameters + ) -> pd.DataFrame: """ Recreate the legacy format used in `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_v1`. """ # TODO: This method should be removed once the forecasting data model is updated: # https://mozilla-hub.atlassian.net/browse/DS-2676 - - df = self.model.predict( - self.dates_to_predict.rename(columns=self.column_names_map) - ) + df = self.model.predict(self._build_predict_dataframe(dates_to_predict)) # set legacy column values - if "dau" in self.metric_hub.alias.lower(): + if "dau" in metric_hub_alias.lower(): df["metric"] = "DAU" else: - df["metric"] = self.metric_hub.alias + df["metric"] = metric_hub_alias df["forecast_date"] = str( datetime.now(timezone.utc).replace(tzinfo=None).date() ) - df["forecast_parameters"] = str( - json.dumps({**self.parameters, "holidays": self.use_all_us_holidays}) - ) + df["forecast_parameters"] = str(json.dumps(parameters)) - alias = self.metric_hub.alias.lower() + alias = metric_hub_alias.lower() if ("desktop" in alias) and ("mobile" in alias): raise ValueError( @@ -165,281 +436,363 @@ def _predict_legacy(self) -> pd.DataFrame: return df[columns] - def _aggregate_forecast_observed( - self, - forecast_df, - observed_df, - period: str, - numpy_aggregations: List[str], - percentiles: List[int], - ): - # build a list of all functions that we'll summarize the data by - aggregations = [getattr(np, i) for i in numpy_aggregations] - aggregations.extend([pdx.percentile(i) for i in percentiles]) - - # aggregate metric to the correct date period (day, month, year) - observed_summarized = pdx.aggregate_to_period(observed_df, period) - forecast_agg = pdx.aggregate_to_period(forecast_df, period).sort_values( - "submission_date" - ) - - # find periods of overlap between observed and forecasted data - # merge preserves key order so overlap will be sorted by submission_date - overlap = forecast_agg.merge( - observed_summarized, - on="submission_date", - how="left", - ).fillna(0) - - forecast_summarized = ( - forecast_agg.set_index("submission_date") - # Add observed data samples to any overlapping forecasted period. This - # ensures that any forecast made partway through a period accounts for - # previously observed data within the period. For example, when a monthly - # forecast is generated in the middle of the month. - .add(overlap[["value"]].values) - # calculate summary values, aggregating by submission_date, - .agg(aggregations, axis=1) - .reset_index() - ) - return forecast_summarized, observed_summarized +def aggregate_forecast_observed( + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + additional_aggregation_columns: List[str] = [], +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Aggregate samples produced by prophet to aggregates specified in + numpy_aggregations and percentiles, and aggregate in time up to the period + specified in period. 
Aggregation will include any columns passed to + additional_aggergation_columns + + Args: + forecast_df (pd.DataFrame): raw output of the predict function from ProphetForecast or + one of it's child classes + observed_df (_type_): raw input of the fit function from ProphetForecast or + one of it's child classes + period (str): period to aggregate to in time. Must be 'day', 'month' or 'year' + numpy_aggregations (List[str]): aggregates from numpy to use. Can be the name of any + numpy function that outputs a single value + percentiles (List[int]): list of number for which the percentile should be generated + additional_aggregation_columns (List[str], optional): + additional columns to use in the aggregation. Defaults to []. + + Returns: + forecast_summarized (pd.DataFrame): summarized forecast data + observed_summarized (pd.DataFrame): summarized observed data + """ + # build a list of all functions that we'll summarize the data by + aggregations = [getattr(np, i) for i in numpy_aggregations] + aggregations.extend([pdx.percentile(i) for i in percentiles]) - def _combine_forecast_observed( - self, - forecast_df, + # aggregate metric to the correct date period (day, month, year) + observed_summarized = pdx.aggregate_to_period( observed_df, - period: str, - numpy_aggregations: List[str], - percentiles: List[int], - ): - forecast_summarized, observed_summarized = self._aggregate_forecast_observed( - forecast_df, observed_df, period, numpy_aggregations, percentiles - ) + period, + additional_aggregation_columns=additional_aggregation_columns, + ) + forecast_agg = pdx.aggregate_to_period( + forecast_df, + period, + additional_aggregation_columns=additional_aggregation_columns, + ).sort_values("submission_date") + + aggregation_columns = ["submission_date"] + additional_aggregation_columns + + # find periods of overlap between observed and forecasted data + # merge preserves key order so overlap will be sorted by submission_date + overlap = forecast_agg.merge( + observed_summarized[aggregation_columns + ["value"]], + on=aggregation_columns, + how="left", + ).fillna(0) + + # separate out numeric columns, which will be the samples + # from non-numeric + + forecast_agg_no_aggregation_cols = forecast_agg[ + [el for el in forecast_agg.columns if el not in aggregation_columns] + ] + forecast_agg_string = forecast_agg_no_aggregation_cols.select_dtypes( + include=["datetime64", object] + ) + + # assuming that the numeric columns are exactly those created by + # predictive_samples + forecast_agg_numeric = forecast_agg_no_aggregation_cols.select_dtypes( + include=["float", "int"] + ) + + # put aggergation columns back into x_numeric so groupby works + forecast_agg_numeric = forecast_agg[ + list(forecast_agg_numeric.columns) + aggregation_columns + ] + forecast_agg_string = forecast_agg[ + list(forecast_agg_string.columns) + aggregation_columns + ] + + forecast_summarized = ( + forecast_agg_numeric.set_index(aggregation_columns) + # Add observed data samples to any overlapping forecasted period. This + # ensures that any forecast made partway through a period accounts for + # previously observed data within the period. For example, when a monthly + # forecast is generated in the middle of the month. 
+ .add(overlap[["value"]].values) + # calculate summary values, aggregating by submission_date, + .agg(aggregations, axis=1) + .reset_index() + ) + + # add string columns back in + forecast_summarized = forecast_summarized.merge( + forecast_agg_string, on=aggregation_columns + ) + + forecast_summarized["aggregation_period"] = period.lower() + observed_summarized["aggregation_period"] = period.lower() + + return forecast_summarized, observed_summarized + + +def combine_forecast_observed( + forecast_summarized: pd.DataFrame, observed_summarized: pd.DataFrame +) -> pd.DataFrame: + """combines summarized forecast and observed data + + Args: + forecast_summarized (pd.DataFrame): summarized forecast data + observed_summarized (pd.DataFrame): summarized observed data + + Returns: + pd.DataFrame: combined data + """ + # remove aggregation period because it messes everything up with the melt + forecast_summarized = forecast_summarized.drop(columns=["aggregation_period"]) + observed_summarized = observed_summarized.drop(columns=["aggregation_period"]) - # remaining column of metric values get the column name 'value' - forecast_summarized = forecast_summarized.melt( - id_vars="submission_date", var_name="measure" - ) - observed_summarized["measure"] = "observed" + # remaining column of metric values get the column name 'value' + forecast_summarized = forecast_summarized.melt( + id_vars="submission_date", var_name="measure" + ) + observed_summarized["measure"] = "observed" - # add datasource-specific metadata columns - forecast_summarized["source"] = "forecast" - observed_summarized["source"] = "historical" + # add datasource-specific metadata columns + forecast_summarized["source"] = "forecast" + observed_summarized["source"] = "historical" - df = pd.concat([forecast_summarized, observed_summarized]) + df = pd.concat([forecast_summarized, observed_summarized]) - return df + return df - def _summarize( - self, - forecast_df, - observed_df, - period: str, - numpy_aggregations: List[str], - percentiles: List[int], - ) -> pd.DataFrame: - """ - Calculate summary metrics for `self.forecast_df` over a given period, and - add metadata. - """ - df = self._combine_forecast_observed( +def summarize( + forecast_df, + observed_df, + periods: List[str], + numpy_aggregations: List[str], + percentiles: List[int], + forecast_parameters: dict, +) -> pd.DataFrame: + """ + Calculate summary metrics for `self.forecast_df` over a given period, and + add metadata. + + Args: + forecast_df (pd.DataFrame): raw output of the predict function from ProphetForecast or + one of it's child classes + observed_df (_type_): raw input of the fit function from ProphetForecast or + one of it's child classes + period (str): period to aggregate to in time. Must be 'day', 'month' or 'year' + numpy_aggregations (List[str]): aggregates from numpy to use. Can be the name of any + numpy function that outputs a single value + percentiles (List[int]): list of number for which the percentile should be generated + additional_aggregation_columns (List[str], optional): + additional columns to use in the aggregation. Defaults to []. 
+ """ + df_list = [] + for period in periods: + forecast_summarized, observed_summarized = aggregate_forecast_observed( forecast_df, observed_df, period, numpy_aggregations, percentiles ) - # add summary metadata columns - df["aggregation_period"] = period.lower() - return df + df = combine_forecast_observed(forecast_summarized, observed_summarized) - def _summarize_legacy(self) -> pd.DataFrame: - """ - Converts a `self.summary_df` to the legacy format used in - `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` - """ - # TODO: This method should be removed once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 + # it got removed in combine_forecast_observed so put it back + df["aggregation_period"] = period + df["forecast_parameters"] = forecast_parameters + df_list.append(df) - df = self.summary_df.copy(deep=True) + return pd.concat(df_list) - # rename columns to legacy values - df.rename( - columns={ - "forecast_end_date": "asofdate", - "submission_date": "date", - "metric_alias": "target", - "aggregation_period": "unit", - }, - inplace=True, - ) - df["forecast_date"] = df["forecast_predicted_at"].dt.date - df["type"] = df["source"].replace("historical", "actual") - df = df.replace( - { - "measure": { - "observed": "value", - "p05": "yhat_p5", - "p10": "yhat_p10", - "p20": "yhat_p20", - "p30": "yhat_p30", - "p40": "yhat_p40", - "p50": "yhat_p50", - "p60": "yhat_p60", - "p70": "yhat_p70", - "p80": "yhat_p80", - "p90": "yhat_p90", - "p95": "yhat_p95", - }, - "target": { - "desktop_dau": "desktop", - "mobile_dau": "mobile", - }, - } - ) - # pivot the df from "long" to "wide" format - index_columns = [ - "asofdate", - "date", - "target", - "unit", - "forecast_parameters", - "forecast_date", - ] - df = ( - df[index_columns + ["measure", "value"]] - .pivot( - index=index_columns, - columns="measure", - values="value", - ) - .reset_index() - ) - - # pivot sets the "name" attribute of the columns for some reason. It's - # None by default, so we just reset that here. - df.columns.name = None - - # When there's an overlap in the observed and forecasted period -- for - # example, when a monthly forecast is generated mid-month -- the legacy - # format only records the forecasted value, not the observed value. To - # account for this, we'll just find the max of the "mean" (forecasted) and - # "value" (observed) data. In all non-overlapping observed periods, the - # forecasted value will be NULL. In all non-overlapping forecasted periods, - # the observed value will be NULL. In overlapping periods, the forecasted - # value will always be larger because it is the sum of the observed and forecasted - # values. 
Below is a query that demonstrates the legacy behavior: - # - # SELECT * - # FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` - # WHERE asofdate = "2023-12-31" - # AND target = "mobile" - # AND unit = "month" - # AND forecast_date = "2022-06-04" - # AND date BETWEEN "2022-05-01" AND "2022-06-01" - # ORDER BY date - df["value"] = df[["mean", "value"]].max(axis=1) - df.drop(columns=["mean"], inplace=True) - - # non-numeric columns are represented in the legacy bq schema as strings - string_cols = [ - "asofdate", - "date", - "target", - "unit", - "forecast_parameters", - "forecast_date", - ] - df[string_cols] = df[string_cols].astype(str) - - return df - - def write_results( - self, - project: str, - dataset: str, - table: str, - project_legacy: str, - dataset_legacy: str, - write_disposition: str = "WRITE_APPEND", - forecast_table_legacy: str = "kpi_automated_forecast_v1", - confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1", - ) -> None: - """ - Write `self.summary_df` to Big Query. - - Args: - project (str): The Big Query project that the data should be written to. - dataset (str): The Big Query dataset that the data should be written to. - table (str): The Big Query table that the data should be written to. - write_disposition (str): In the event that the destination table exists, - should the table be overwritten ("WRITE_TRUNCATE") or appended to - ("WRITE_APPEND")? - """ - # get legacy tables - # TODO: remove this once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - self.forecast_df_legacy = self._predict_legacy() - self.summary_df_legacy = self._summarize_legacy() - - print(f"Writing results to `{project}.{dataset}.{table}`.", flush=True) - client = bigquery.Client(project=project) - schema = [ - bigquery.SchemaField("submission_date", bq_types.DATE), - bigquery.SchemaField("aggregation_period", bq_types.STRING), - bigquery.SchemaField("source", bq_types.STRING), - bigquery.SchemaField("measure", bq_types.STRING), - bigquery.SchemaField("value", bq_types.FLOAT), - bigquery.SchemaField("metric_alias", bq_types.STRING), - bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), - bigquery.SchemaField("metric_hub_slug", bq_types.STRING), - bigquery.SchemaField("metric_start_date", bq_types.DATE), - bigquery.SchemaField("metric_end_date", bq_types.DATE), - bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_start_date", bq_types.DATE), - bigquery.SchemaField("forecast_end_date", bq_types.DATE), - bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ] - job = client.load_table_from_dataframe( - dataframe=self.summary_df, - destination=f"{project}.{dataset}.{table}", - job_config=bigquery.LoadJobConfig( - schema=schema, - autodetect=False, - write_disposition=write_disposition, - ), +def summarize_legacy(summary_df) -> pd.DataFrame: + """ + Converts a `self.summary_df` to the legacy format used in + `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` + """ + # TODO: This method should be removed once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + + # rename columns to legacy values + df = summary_df.rename( + columns={ + "forecast_end_date": "asofdate", + "submission_date": "date", + "metric_alias": "target", + 
"aggregation_period": "unit", + } + ) + df["forecast_date"] = df["forecast_predicted_at"].dt.date + df["type"] = df["source"].replace("historical", "actual") + df = df.replace( + { + "measure": { + "observed": "value", + "p05": "yhat_p5", + "p10": "yhat_p10", + "p20": "yhat_p20", + "p30": "yhat_p30", + "p40": "yhat_p40", + "p50": "yhat_p50", + "p60": "yhat_p60", + "p70": "yhat_p70", + "p80": "yhat_p80", + "p90": "yhat_p90", + "p95": "yhat_p95", + }, + "target": { + "desktop_dau": "desktop", + "mobile_dau": "mobile", + }, + } + ) + + # pivot the df from "long" to "wide" format + index_columns = [ + "asofdate", + "date", + "target", + "unit", + "forecast_parameters", + "forecast_date", + ] + df = ( + df[index_columns + ["measure", "value"]] + .pivot( + index=index_columns, + columns="measure", + values="value", ) - # Wait for the job to complete. - job.result() - - # TODO: remove the below jobs once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 + .reset_index() + ) + + # pivot sets the "name" attribute of the columns for some reason. It's + # None by default, so we just reset that here. + df.columns.name = None + + # When there's an overlap in the observed and forecasted period -- for + # example, when a monthly forecast is generated mid-month -- the legacy + # format only records the forecasted value, not the observed value. To + # account for this, we'll just find the max of the "mean" (forecasted) and + # "value" (observed) data. In all non-overlapping observed periods, the + # forecasted value will be NULL. In all non-overlapping forecasted periods, + # the observed value will be NULL. In overlapping periods, the forecasted + # value will always be larger because it is the sum of the observed and forecasted + # values. Below is a query that demonstrates the legacy behavior: + # + # SELECT * + # FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` + # WHERE asofdate = "2023-12-31" + # AND target = "mobile" + # AND unit = "month" + # AND forecast_date = "2022-06-04" + # AND date BETWEEN "2022-05-01" AND "2022-06-01" + # ORDER BY date + df["value"] = df[["mean", "value"]].max(axis=1) + df.drop(columns=["mean"], inplace=True) + + # non-numeric columns are represented in the legacy bq schema as strings + string_cols = [ + "asofdate", + "date", + "target", + "unit", + "forecast_parameters", + "forecast_date", + ] + df[string_cols] = df[string_cols].astype(str) + + return df + + +def write_results( + summary_df: pd.DataFrame, + summary_df_legacy: pd.DataFrame, + forecast_df_legacy: pd.DataFrame, + project: str, + dataset: str, + table: str, + project_legacy: str, + dataset_legacy: str, + forecast_table_legacy: str, + confidences_table_legacy: str, + write_disposition: str = "WRITE_APPEND", +) -> None: + """ + Write `self.summary_df` to Big Query. + + Args: + project (str): The Big Query project that the data should be written to. + dataset (str): The Big Query dataset that the data should be written to. + table (str): The Big Query table that the data should be written to. + write_disposition (str): In the event that the destination table exists, + should the table be overwritten ("WRITE_TRUNCATE") or appended to + ("WRITE_APPEND")? 
+ """ - job = client.load_table_from_dataframe( - dataframe=self.forecast_df_legacy, - destination=f"{project_legacy}.{dataset_legacy}.{forecast_table_legacy}", - job_config=bigquery.LoadJobConfig( - write_disposition=write_disposition, - schema=[ - bigquery.SchemaField("ds", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_date", bq_types.STRING), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ], - ), - ) - job.result() - - job = client.load_table_from_dataframe( - dataframe=self.summary_df_legacy, - destination=f"{project_legacy}.{dataset_legacy}.{confidences_table_legacy}", - job_config=bigquery.LoadJobConfig( - write_disposition=write_disposition, - schema=[ - bigquery.SchemaField("asofdate", bq_types.STRING), - bigquery.SchemaField("date", bq_types.STRING), - ], - ), - ) - job.result() + print(f"Writing results to `{project}.{dataset}.{table}`.", flush=True) + client = bigquery.Client(project=project) + schema = [ + bigquery.SchemaField("submission_date", bq_types.DATE), + bigquery.SchemaField("aggregation_period", bq_types.STRING), + bigquery.SchemaField("source", bq_types.STRING), + bigquery.SchemaField("measure", bq_types.STRING), + bigquery.SchemaField("value", bq_types.FLOAT), + bigquery.SchemaField("metric_alias", bq_types.STRING), + bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), + bigquery.SchemaField("metric_hub_slug", bq_types.STRING), + bigquery.SchemaField("metric_start_date", bq_types.DATE), + bigquery.SchemaField("metric_end_date", bq_types.DATE), + bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_start_date", bq_types.DATE), + bigquery.SchemaField("forecast_end_date", bq_types.DATE), + bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ] + job = client.load_table_from_dataframe( + dataframe=summary_df, + destination=f"{project}.{dataset}.{table}", + job_config=bigquery.LoadJobConfig( + schema=schema, + autodetect=False, + write_disposition=write_disposition, + ), + ) + # Wait for the job to complete. 
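[Editor's note] As a usage sketch, the refactored module-level writer is now called with the dataframes passed in explicitly rather than read off `self`. The destination names below are placeholders, not values from this repo; only the legacy table names come from the old defaults:

```python
# Assumes summary_df, summary_df_legacy and forecast_df_legacy were produced by
# summarize(), summarize_legacy() and _predict_legacy() defined earlier in the module.
write_results(
    summary_df=summary_df,
    summary_df_legacy=summary_df_legacy,
    forecast_df_legacy=forecast_df_legacy,
    project="my-gcp-project",            # placeholder
    dataset="my_dataset",                # placeholder
    table="kpi_forecasts",               # placeholder
    project_legacy="my-gcp-project",     # placeholder
    dataset_legacy="my_legacy_dataset",  # placeholder
    forecast_table_legacy="kpi_automated_forecast_v1",
    confidences_table_legacy="kpi_automated_forecast_confidences_v1",
    write_disposition="WRITE_APPEND",
)
```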
+ job.result() + + # TODO: remove the below jobs once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + + job = client.load_table_from_dataframe( + dataframe=forecast_df_legacy, + destination=f"{project_legacy}.{dataset_legacy}.{forecast_table_legacy}", + job_config=bigquery.LoadJobConfig( + write_disposition=write_disposition, + schema=[ + bigquery.SchemaField("ds", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_date", bq_types.STRING), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ], + ), + ) + job.result() + + job = client.load_table_from_dataframe( + dataframe=summary_df_legacy, + destination=f"{project_legacy}.{dataset_legacy}.{confidences_table_legacy}", + job_config=bigquery.LoadJobConfig( + write_disposition=write_disposition, + schema=[ + bigquery.SchemaField("asofdate", bq_types.STRING), + bigquery.SchemaField("date", bq_types.STRING), + ], + ), + ) + job.result() diff --git a/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py b/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py index 8ae622bf..8352242f 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py +++ b/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py @@ -17,8 +17,23 @@ def aggregate_to_period( period: str, aggregation: callable = np.sum, date_col: str = "submission_date", + additional_aggregation_columns: list = [], ) -> pd.DataFrame: - """Floor dates to the correct period and aggregate.""" + """aggregates a dataframe to a period within any additional columns specified + + Args: + df (pd.DataFrame): dataframe to aggregate to. Must have a date column + with the name specified in the date_col argument + period (str): period to aggregate the datat to + aggregation (callable, optional): function to use to aggergate. Defaults to np.sum. + date_col (str, optional): column in the dataframe that contains the date + information used in aggregation. Defaults to "submission_date". + additional_aggregation_columns (list, optional): Additional columns + within which the date aggregation should occur. Defaults to []. + + Returns: + pd.DataFrame: _description_ + """ if period.lower() not in ["day", "month", "year"]: raise ValueError( f"Don't know how to floor dates by {period}. Please use 'day', 'month', or 'year'." 
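[Editor's note on the `aggregate_to_period` hunk above] The new `additional_aggregation_columns` argument makes the date flooring happen within each extra dimension rather than across the whole frame. A self-contained illustration of that behaviour in plain pandas (`device` is a hypothetical segment column, not from the repo):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "submission_date": ["2024-01-01", "2024-01-15", "2024-01-02"],
        "device": ["desktop", "desktop", "mobile"],  # hypothetical extra aggregation column
        "value": [1.0, 2.0, 5.0],
    }
)

# Floor dates to the start of the month, as aggregate_to_period does for period="month".
df["submission_date"] = (
    pd.to_datetime(df["submission_date"]).dt.to_period("M").dt.to_timestamp()
)

# Aggregate within submission_date *and* the additional column, mirroring
# groupby(aggregation_cols).agg(aggregation) with the default np.sum.
monthly = df.groupby(["submission_date", "device"], as_index=False)["value"].sum()
print(monthly)
# The two desktop rows collapse to 3.0, the mobile row stays 5.0, all dated 2024-01-01.
```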
@@ -27,9 +42,15 @@ def aggregate_to_period( x = df.copy(deep=True) x[date_col] = pd.to_datetime(x[date_col]).dt.to_period(period[0]).dt.to_timestamp() + aggregation_cols = [date_col] + additional_aggregation_columns # treat numeric and string types separately - x_string = x.select_dtypes(include=["datetime64", object]) - x_numeric = x.select_dtypes(include=["float", "int", "datetime64"]) + x_no_aggregation_cols = x[[el for el in x.columns if el not in aggregation_cols]] + x_string = x_no_aggregation_cols.select_dtypes(include=["datetime64", object]) + x_numeric = x_no_aggregation_cols.select_dtypes(include=["float", "int"]) + + # put aggergation columns back into x_numeric so groupby works + x_numeric = x[list(x_numeric.columns) + aggregation_cols] + x_string = x[list(x_string.columns) + aggregation_cols] if set(x_string.columns) | set(x_numeric.columns) != set(x.columns): missing_columns = set(x.columns) - ( @@ -40,7 +61,7 @@ def aggregate_to_period( f"Columns do not have string or numeric type: {missing_columns_str}" ) - x_numeric_agg = x_numeric.groupby(date_col).agg(aggregation).reset_index() + x_numeric_agg = x_numeric.groupby(aggregation_cols).agg(aggregation).reset_index() # all values of x_string should be the same because it is just the dimensions x_string_agg = x_string.drop_duplicates().reset_index(drop=True) @@ -51,7 +72,5 @@ def aggregate_to_period( ) # unique preseves order so we should be fine to concat - output_df = pd.concat( - [x_numeric_agg, x_string_agg.drop(columns=[date_col])], axis=1 - ) + output_df = x_numeric_agg.merge(x_string_agg, on=aggregation_cols) return output_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index bfea0e5a..32c0cd1a 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,15 +1,14 @@ -from typing import List -import collections -from datetime import date, datetime +from datetime import date from dateutil.relativedelta import relativedelta +from dataclasses import dataclass + import pytest import pandas as pd import numpy as np -from datetime import timedelta, timezone -from kpi_forecasting.models.base_forecast import BaseForecast +from kpi_forecasting.models.base_forecast import BaseForecast, BaseEnsembleForecast # Arbitrarily choose some date to use for the tests TEST_DATE = date(2024, 1, 1) @@ -18,6 +17,7 @@ TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") +TEST_OBSERVED_START = date(2023, 1, 1) class BadClass(BaseForecast): @@ -27,233 +27,522 @@ class BadClass(BaseForecast): @pytest.fixture() def good_class(): class GoodModel: - def __init__(self): + def __init__(self, id, factor): + self.id = id self.is_fit = False + self.factor = factor def fit(self, observed_data): - self.is_fit = max(observed_data["submission_date"]) + self.is_fit = min(observed_data["submission_date"]) + + def predict(self, forecast_data): + forecast_data = forecast_data.copy() + start_at = 2 - len(forecast_data) + forecast_data["value"] = np.array([1, 2])[start_at:] * self.factor + return forecast_data + @dataclass class GoodClass(BaseForecast): + id: str = None + seed_set: bool = False + factor: int = 1 + # overwrite _get_observed_data - def _get_observed_data(self): - self.observed_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - 
TEST_DATE - - relativedelta(years=1), # just an arbitrary date in the past - ] - } - ) - - def _fit(self, observed_df: np.array) -> None: + def _set_seed(self): + self.seed_set = True + return + + def fit(self, observed_df: pd.DataFrame) -> None: # takes array as input to simplify tests - self.model = GoodModel() + self.model = GoodModel(self.id, self.factor) self.model.fit(observed_df) - def _predict(self, dates_to_predict: np.array) -> pd.DataFrame: + def predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: # takes array as input to simplify tests - return dates_to_predict * 2 + return self.model.predict(dates_to_predict) - def _validate_forecast_df(self, forecast_df: np.array) -> None: + def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: # takes array as input to simplify tests # check that all are even after _predict runs assert np.all(forecast_df % 2 == 0) - def _summarize( - self, - forecast_df: np.array, - observed_df: np.array, - period: str, - numpy_aggregations: List[str], - percentiles: List[str], - ) -> pd.DataFrame: - # input types changes to simplify test - np_func = getattr(np, numpy_aggregations[0]) - agg_val = np_func(forecast_df + observed_df) - return pd.DataFrame( - [{"number": agg_val, "period": period, "percentiles": percentiles[0]}] - ) + def _get_parameters(self): + return {"id": self.id, "factor": self.factor} return GoodClass -def test_not_implemented(): +def test_forecast_not_implemented(): with pytest.raises( TypeError, - match="Can't instantiate abstract class BadClass with abstract methods _fit, _predict, _summarize, _validate_forecast_df", + match="Can't instantiate abstract class BadClass with abstract methods _set_seed, _validate_forecast_df, fit, predict", ): _ = BadClass() -def test_post_init(good_class): - start_date = TEST_DATE_STR - end_date = TEST_PREDICT_END_STR - good_class = good_class( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=start_date, - end_date=end_date, - metric_hub=None, +def test_fit(good_class): + """test the fit method, and implicitly the set_segment_models method""" + A1_start_date = "2018-01-01" + A2_start_date = "2020-02-02" + parameter_list = [ + {"segment": {"a": "A1"}, "parameters": {"id": "This is A1"}}, + {"segment": {"a": "A2"}, "parameters": {"id": "This is A2"}}, + ] + + EnsembleObject = BaseEnsembleForecast( + model_class=good_class, parameters=parameter_list, segments=["a", "b"] ) - dates_to_predict_expected = pd.DataFrame( + + observed_data = pd.DataFrame( { - "submission_date": pd.date_range( - pd.to_datetime(start_date), pd.to_datetime(end_date) - ).date + "a": ["A1", "A1", "A2", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2", "B2"], + "submission_date": [ + A1_start_date, + A1_start_date, + A2_start_date, + A2_start_date, + A2_start_date, + ], } ) - assert good_class.dates_to_predict.equals(dates_to_predict_expected) - -def test_post_init_exception(good_class): - start_date = TEST_DATE_STR - end_date = TEST_PREDICT_END_STR - with pytest.raises( - ValueError, - match="forecast start_date set while predict_historical_dates is True", - ): - _ = good_class( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=start_date, - end_date=end_date, - metric_hub=None, - predict_historical_dates=True, - ) - - -def test_post_init_default_dates(good_class): - # check default start and end time - good_class = good_class( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date="", - end_date="", - metric_hub=None, + 
EnsembleObject.fit(observed_data) + + segment_models = EnsembleObject.segment_models + + # put the segments and the start date in the same dictionary to make + # comparison easier + # the important things to check is that all possible combinations + # of segments are present and that each has the parameters set properly + # start_date is a stand-in for these parameters and + # is determined by the value of a as specified in parameter_dict + check_segment_models = [ + dict(**el["segment"], **{"id": el["model"].id}) for el in segment_models + ] + + expected = [ + {"a": "A1", "b": "B1", "id": "This is A1"}, + {"a": "A1", "b": "B2", "id": "This is A1"}, + {"a": "A2", "b": "B1", "id": "This is A2"}, + {"a": "A2", "b": "B2", "id": "This is A2"}, + ] + + # can't make a set of dicts for comparison + # so sort the lists and compare each element + compare_sorted = zip( + sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), + sorted(expected, key=lambda x: (x["a"], x["b"])), ) - # this is the max date of the self.observed_data['submission_date'] plus one day - # from the object definion - start_date = TEST_DATE_NEXT_DAY - end_date = ( - datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) - ).date() - dates_to_predict_expected = pd.DataFrame( - {"submission_date": pd.date_range(start_date, end_date).date} + + for checkval, expectedval in compare_sorted: + assert checkval == expectedval + + # test that the seed was set for all models during fitting + assert all([el["model"].seed_set for el in segment_models]) + + # test that the fit was applied properly to all models + # to do this check the is_fit attribute, which will equal + # A1_start_date for A1 segments and A2_start_date for A2 segments + + for segment in segment_models: + if segment["segment"]["a"] == "A1": + assert segment["model"].model.is_fit == A1_start_date + else: + assert segment["model"].model.is_fit == A2_start_date + + +def test_fit_multiple(good_class): + """test the fit method + with segments on multiple columns. 
+ Implicitly testing set_segment_models with multiple + segments as well""" + # set arbitrary dates + # they're only used to make sure segments are set correctly + A1B1_start_date = "2018-01-01" + A1B2_start_date = "2019-01-01" + A2B1_start_date = "2020-02-02" + A2B2_start_date = "2021-02-02" + parameter_list = [ + { + "segment": {"a": "A1", "b": "B1"}, + "parameters": {"id": "This is A1B1"}, + }, + { + "segment": {"a": "A1", "b": "B2"}, + "parameters": {"id": "This is A1B2"}, + }, + { + "segment": {"a": "A2", "b": "B1"}, + "parameters": {"id": "This is A2B1"}, + }, + { + "segment": {"a": "A2", "b": "B2"}, + "parameters": {"id": "This is A2B2"}, + }, + ] + + EnsembleObject = BaseEnsembleForecast( + model_class=good_class, parameters=parameter_list, segments=["a", "b"] ) - assert good_class.dates_to_predict.equals(dates_to_predict_expected) - - -def test_post_init_default_dates_historical(good_class): - # check default start and end time - good_class = good_class( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date="", - end_date="", - metric_hub=None, - predict_historical_dates=True, + + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2", "B2"], + "submission_date": [ + A1B1_start_date, + A1B2_start_date, + A2B1_start_date, + A2B2_start_date, + A2B2_start_date, + ], + } ) - # this is the min date of the observed data - start_date = TEST_DATE - relativedelta(years=1) - end_date = ( - datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) - ).date() - dates_to_predict_expected = pd.DataFrame( - {"submission_date": pd.date_range(start_date, end_date).date} + + EnsembleObject.fit(observed_data) + + segment_models = EnsembleObject.segment_models + + # put the segments and the start date in the same dictionary to make + # comparison easier + # the important things to check is that all possible combinations + # of segments are present and that each has the parameters set properly + # start_date is a stand-in for these parameters and + # is determined by the value of a as specified in parameter_dict + check_segment_models = [ + dict(**el["segment"], **{"id": el["model"].id}) for el in segment_models + ] + expected = [ + {"a": "A1", "b": "B1", "id": "This is A1B1"}, + {"a": "A1", "b": "B2", "id": "This is A1B2"}, + {"a": "A2", "b": "B1", "id": "This is A2B1"}, + {"a": "A2", "b": "B2", "id": "This is A2B2"}, + ] + + # can't make a set of dicts for comparison + # so sort the lists and compare each element + compare_sorted = zip( + sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), + sorted(expected, key=lambda x: (x["a"], x["b"])), ) - assert good_class.dates_to_predict.equals(dates_to_predict_expected) + for checkval, expectedval in compare_sorted: + assert checkval == expectedval + + # test that the seed was set for all models during fitting + assert all([el["model"].seed_set for el in segment_models]) + + # test that the fit was applied properly to all models + # to do this check the is_fit attribute, which will equal + # A1_start_date for A1 segments and A2_start_date for A2 segments + + for segment in segment_models: + if segment["segment"]["a"] == "A1" and segment["segment"]["b"] == "B1": + assert segment["model"].model.is_fit == A1B1_start_date + elif segment["segment"]["a"] == "A1" and segment["segment"]["b"] == "B2": + assert segment["model"].model.is_fit == A1B2_start_date + elif segment["segment"]["a"] == "A2" and segment["segment"]["b"] == "B1": + assert segment["model"].model.is_fit 
== A2B1_start_date + else: + assert segment["model"].model.is_fit == A2B2_start_date + + +def test_fit_multiple_with_start(good_class): + """test the fit method + with segments on multiple columns. + Implicitly testing set_segment_models with multiple + segments as well""" + parameter_list = [ + { + "segment": {"a": "A1", "b": "B1"}, + "parameters": {"id": "This is A1B1"}, + }, + { + "segment": {"a": "A1", "b": "B2"}, + "parameters": {"id": "This is A1B2"}, + "start_date": TEST_DATE_NEXT_DAY_STR, + }, + { + "segment": {"a": "A2", "b": "B1"}, + "parameters": {"id": "This is A2B1"}, + }, + { + "segment": {"a": "A2", "b": "B2"}, + "parameters": {"id": "This is A2B2"}, + "start_date": TEST_DATE_NEXT_DAY_STR, + }, + ] + + EnsembleObject = BaseEnsembleForecast( + model_class=good_class, parameters=parameter_list, segments=["a", "b"] + ) -def test_fit(good_class): - good_class = good_class( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=TEST_DATE_STR, - end_date=TEST_PREDICT_END_STR, - metric_hub=None, + # every segment has two days, TEST_DATE and TEST_DATE_NEXT_DAY + observed_data = pd.DataFrame( + [ + {"a": "A1", "b": "B1", "submission_date": TEST_DATE}, + {"a": "A1", "b": "B1", "submission_date": TEST_DATE_NEXT_DAY}, + {"a": "A1", "b": "B2", "submission_date": TEST_DATE}, + {"a": "A1", "b": "B2", "submission_date": TEST_DATE_NEXT_DAY}, + {"a": "A2", "b": "B1", "submission_date": TEST_DATE}, + {"a": "A2", "b": "B1", "submission_date": TEST_DATE_NEXT_DAY}, + {"a": "A2", "b": "B2", "submission_date": TEST_DATE}, + {"a": "A2", "b": "B2", "submission_date": TEST_DATE_NEXT_DAY}, + ] ) - good_class.fit() - assert good_class.model - # model sets is_fit to the largest day in the observed data - assert good_class.model.is_fit == TEST_DATE + EnsembleObject.fit(observed_data) + + segment_models = EnsembleObject.segment_models + + # put the segments and the start date in the same dictionary to make + # comparison easier + # the important things to check is that all possible combinations + # of segments are present and that each has the parameters set properly + # start_date is a stand-in for these parameters and + # is determined by the value of a as specified in parameter_dict + check_segment_models = [ + dict(**el["segment"], **{"id": el["model"].id}) for el in segment_models + ] + expected = [ + {"a": "A1", "b": "B1", "id": "This is A1B1"}, + {"a": "A1", "b": "B2", "id": "This is A1B2"}, + {"a": "A2", "b": "B1", "id": "This is A2B1"}, + {"a": "A2", "b": "B2", "id": "This is A2B2"}, + ] + + # can't make a set of dicts for comparison + # so sort the lists and compare each element + compare_sorted = zip( + sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), + sorted(expected, key=lambda x: (x["a"], x["b"])), + ) + for checkval, expectedval in compare_sorted: + assert checkval == expectedval + + # test that the seed was set for all models during fitting + assert all([el["model"].seed_set for el in segment_models]) + + # test that the fit was applied properly to the time-filtered data + # to do this check the is_fit attribute, which will equal + # the earliest date. 
For B1 it is TEST_DATE + # B2 has start_date set to TEST_DATE_NEXT_DAY, so it will have that value + + for segment in segment_models: + if segment["segment"]["b"] == "B1": + assert segment["model"].model.is_fit == TEST_DATE + else: + assert segment["model"].model.is_fit == TEST_DATE_NEXT_DAY + + +def test_set_segment_models_exception(mocker): + """test the exception for segment_models where + and exception is raised if a model_setting_split_dim + is specified that isn't in the data""" + A1_start_date = "2018-01-01" + A2_start_date = "2020-02-02" + parameter_list = [ + {"segment": {"c": "A1"}, "parameters": {"id": "This is A1"}}, + {"segment": {"c": "A2"}, "parameters": {"id": "This is A2"}}, + ] + EnsembleObject = BaseEnsembleForecast( + model_class=good_class, parameters=parameter_list, segments=["a", "b"] + ) -def test_predict_and_validate(good_class): - good_class = good_class( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=TEST_DATE_STR, - end_date=TEST_PREDICT_END_STR, - metric_hub=None, + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2", "B2"], + "submission_date": [ + A1_start_date, + A1_start_date, + A2_start_date, + A2_start_date, + A2_start_date, + ], + } ) - # overwrite date range set in __post_init__ - good_class.dates_to_predict = np.arange(10) - good_class.predict() - assert np.all(good_class.forecast_df == good_class.dates_to_predict * 2) - - -def test_summarize(good_class): - good_class = good_class( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=TEST_DATE_STR, - end_date=TEST_PREDICT_END_STR, - metric_hub=None, + + with pytest.raises( + ValueError, + match="Segment keys missing from metric hub segments: c", + ): + EnsembleObject.fit(observed_data) + + +def test_predict(good_class): + """test the predict""" + parameter_list = [ + { + "segment": {"a": "A1", "b": "B1"}, + "parameters": {"id": "This is A1B1", "factor": 4}, + }, + { + "segment": {"a": "A1", "b": "B2"}, + "parameters": {"id": "This is A1B2", "factor": 6}, + }, + { + "segment": {"a": "A2", "b": "B1"}, + "parameters": {"id": "This is A2B1", "factor": 8}, + }, + { + "segment": {"a": "A2", "b": "B2"}, + "parameters": {"id": "This is A2B2", "factor": 10}, + }, + ] + + EnsembleObject = BaseEnsembleForecast( + model_class=good_class, parameters=parameter_list, segments=["a", "b"] ) - good_class.forecast_df = np.array([1, 2]) - good_class.observed_df = np.array([3, 4]) - MetricHub = collections.namedtuple( - "MetricHub", - ["alias", "app_name", "slug", "min_date", "max_date"], + + # submission date doesn't matter here + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2", "B2"], + "submission_date": [ + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + ], + } ) - dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) + EnsembleObject.fit(observed_data) - # add it here rather than in __init__ so it doesn't try to load data - good_class.metric_hub = dummy_metric_hub - good_class.trained_at = "" - good_class.predicted_at = "" + # pass submission_date as a float for the purpose of testing + # this is fine because no time filtering happens in the predict of + # BaseEnsembleForecast or the dummy class and model + predict_df = pd.DataFrame({"submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY]}) + output_df = EnsembleObject.predict(predict_df) - number_val = 10 - output = 
good_class.summarize( - periods=["a", "b", "c"], numpy_aggregations=["sum"], percentiles=["percentiles"] - ) - expected_output = pd.DataFrame( + expected_df = pd.DataFrame( [ - {"number": number_val, "period": el, "percentiles": "percentiles"} - for el in ["a", "b", "c"] + {"a": "A1", "b": "B1", "value": 1 * 4, "submission_date": TEST_DATE}, + { + "a": "A1", + "b": "B1", + "value": 2 * 4, + "submission_date": TEST_DATE_NEXT_DAY, + }, + {"a": "A1", "b": "B2", "value": 1 * 6, "submission_date": TEST_DATE}, + { + "a": "A1", + "b": "B2", + "value": 2 * 6, + "submission_date": TEST_DATE_NEXT_DAY, + }, + {"a": "A2", "b": "B1", "value": 1 * 8, "submission_date": TEST_DATE}, + { + "a": "A2", + "b": "B1", + "value": 2 * 8, + "submission_date": TEST_DATE_NEXT_DAY, + }, + {"a": "A2", "b": "B2", "value": 1 * 10, "submission_date": TEST_DATE}, + { + "a": "A2", + "b": "B2", + "value": 2 * 10, + "submission_date": TEST_DATE_NEXT_DAY, + }, ] ) - # not going to check all the metadata columns - # in assert_frame_equal. Just make sure they're there - metadata_columns = { - "metric_alias", - "metric_hub_app_name", - "metric_hub_slug", - "metric_start_date", - "metric_end_date", - "metric_collected_at", - "forecast_start_date", - "forecast_end_date", - "forecast_trained_at", - "forecast_predicted_at", - "forecast_parameters", - } - assert set(expected_output.columns) | metadata_columns == set(output.columns) pd.testing.assert_frame_equal( - output[expected_output.columns].reset_index(drop=True), expected_output + output_df[["a", "b", "value", "submission_date"]].reset_index(drop=True), + expected_df, + ) + + +def test_predict_with_start(good_class): + """test the predict""" + # set B2 parameters to filter out TEST_DATE + parameter_list = [ + { + "segment": {"a": "A1", "b": "B1"}, + "parameters": {"id": "This is A1B1", "factor": 4}, + }, + { + "segment": {"a": "A1", "b": "B2"}, + "parameters": { + "id": "This is A1B2", + "factor": 6, + }, + "start_date": TEST_DATE_NEXT_DAY_STR, + }, + { + "segment": {"a": "A2", "b": "B1"}, + "parameters": {"id": "This is A2B1", "factor": 8}, + }, + { + "segment": {"a": "A2", "b": "B2"}, + "parameters": {"id": "This is A2B2", "factor": 10}, + "start_date": TEST_DATE_NEXT_DAY_STR, + }, + ] + + EnsembleObject = BaseEnsembleForecast( + model_class=good_class, parameters=parameter_list, segments=["a", "b"] + ) + + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2", "B2"], + "submission_date": [ + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + ], + } + ) + + EnsembleObject.fit(observed_data) + + # pass submission_date as a float for the purpose of testing + # this is fine because no time filtering happens in the predict of + # BaseEnsembleForecast or the dummy class and model + predict_df = pd.DataFrame({"submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY]}) + output_df = EnsembleObject.predict(predict_df) + + expected_df = pd.DataFrame( + [ + {"a": "A1", "b": "B1", "value": 1 * 4, "submission_date": TEST_DATE}, + { + "a": "A1", + "b": "B1", + "value": 2 * 4, + "submission_date": TEST_DATE_NEXT_DAY, + }, + { + "a": "A1", + "b": "B2", + "value": 2 * 6, + "submission_date": TEST_DATE_NEXT_DAY, + }, + {"a": "A2", "b": "B1", "value": 1 * 8, "submission_date": TEST_DATE}, + { + "a": "A2", + "b": "B1", + "value": 2 * 8, + "submission_date": TEST_DATE_NEXT_DAY, + }, + { + "a": "A2", + "b": "B2", + "value": 2 * 10, + "submission_date": TEST_DATE_NEXT_DAY, + }, + ] ) 
pd.testing.assert_frame_equal( - good_class.summary_df[expected_output.columns].reset_index(drop=True), - expected_output, + output_df[["a", "b", "value", "submission_date"]].reset_index(drop=True), + expected_df, ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index 6e43e409..cbe2a42e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -1,16 +1,22 @@ """tests for the funnel forecast module""" -import collections -from datetime import date, datetime +from datetime import date from dateutil.relativedelta import relativedelta import pandas as pd import pytest import numpy as np +import json -from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday -from kpi_forecasting.models.funnel_forecast import SegmentModelSettings, FunnelForecast +from kpi_forecasting.models.funnel_forecast import ( + ProphetAutotunerForecast, + FunnelForecast, + combine_forecast_observed, + summarize_with_parameters, + summarize, +) +from kpi_forecasting.models.prophet_forecast import ProphetForecast # Arbitrarily choose some date to use for the tests TEST_DATE = date(2024, 1, 1) @@ -21,101 +27,16 @@ TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") -@pytest.fixture() -def forecast(): - """This mocks a generic forecast object""" - # 2024-01-01 is arbitarily chosen as a future date - predict_start_date = TEST_DATE_STR - predict_end_date = TEST_PREDICT_END_STR - - forecast = FunnelForecast( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, - ) - return forecast - - -@pytest.fixture() -def segment_info_fit_tests(): - """This fixture creates segment info dictionaries - that mimic the content of the config file and are used - in the functions that test fit methods""" - - # 2024-01-01 is arbitarily chosen as a future date - A1_start_date = TEST_DATE_STR - A2_start_date = TEST_DATE_NEXT_DAY_STR - - segment_info_dict = { - "A1": { - "start_date": A1_start_date, - "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, - "min_param_value": 10, - }, - "A2": { - "start_date": A2_start_date, - "grid_parameters": {"param1": [-1, -2], "param2": [3, 4]}, - "min_param_value": -3, # closest to zero - }, - } - return segment_info_dict - - -@pytest.fixture() -def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): - """This method creates a forecast object from the segment dict - created in the segment_info_fit_tests fixture. 
It also - mocks some of the object methods to enable easier testing""" - parameter_list = [ - { - "segment": {"a": "A1"}, - "start_date": segment_info_fit_tests["A1"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], - "cv_settings": {}, - }, - { - "segment": {"a": "A2"}, - "start_date": segment_info_fit_tests["A2"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], - "cv_settings": {}, - }, - ] - - predict_start_date = TEST_DATE_STR - predict_end_date = TEST_DATE_NEXT_DAY_STR - - forecast = FunnelForecast( - model_type="test", - parameters=parameter_list, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, - ) - - mocker.patch.object(forecast, "_build_model", mock_build_model) - mocker.patch.object( - forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric - ) - - return forecast - - class MockModel: """Used in place of prophet.Prophet for testing purposes""" - def __init__(self, param1=0, param2=0): - self.value = param1 * param2 + def __init__(self, seasonality_prior_scale=0, holidays_prior_scale=0, growth=None): + # arbitrarily choose a few parameters from ProphetForecast to use + self.seasonality_prior_scale = seasonality_prior_scale + self.holidays_prior_scale = holidays_prior_scale + self.value = seasonality_prior_scale * holidays_prior_scale self.history = None + self.growth = growth def fit(self, df, *args, **kwargs): self.history = df @@ -148,40 +69,124 @@ def predictive_samples(self, dates_to_predict): return {"yhat": {0: output}} -def mock_build_model(segment_settings, parameters): +def mock_build_model(self): """mocks the FunnelForecast build_model method""" return MockModel( - **parameters, + seasonality_prior_scale=self.seasonality_prior_scale, + holidays_prior_scale=self.holidays_prior_scale, + growth=self.growth, ) -def mock_get_crossvalidation_metric(m, *args, **kwargs): +def mock_get_crossvalidation_metric(self, m, *args, **kwargs): """mocks the FunnelForecast get_crossvalidation_metric method, meant to be used with MockModel""" - return m.value # value atrribute in MockModel + return m.model.value # value atrribute in MockModel + + +def test_combine_forecast_observed(): + """tests the _combine_forecast_observed method""" + + forecast_df = pd.DataFrame( + [ + { + "submission_date": TEST_DATE, + "a": "A1", + "forecast_parameters": "blah", + "value": 0, + "value_low": 0, + "value_mid": 0, + "value_high": 0, + }, + { + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A1", + "forecast_parameters": "blah", + "value": 0, + "value_low": 0, + "value_mid": 0, + "value_high": 0, + }, + ] + ) + observed_df = pd.DataFrame( + [ + { + "submission_date": TEST_DATE - relativedelta(days=2), + "value": 5, + "a": "A1", + }, + { + "submission_date": TEST_DATE - relativedelta(days=1), + "value": 6, + "a": "A1", + }, + ] + ) -def mock_aggregate_forecast_observed( - forecast_df, observed_df, period, numpy_aggregations, percentiles -): - """Mocks the aggregate_forecast_observed function defined in ProphetForecast - and inherited in FunnelForecast. 
- This function is tested extensively in test_prophet_forecast - so we can make dummy outputs for tests related to it""" + output_df = combine_forecast_observed( + forecast_df, + observed_df, + ) - # add dummy columns where aggregated metrics woudl go - percentile_columns = [f"p{el}" for el in percentiles] - output_forecast_df = forecast_df.copy() - output_forecast_df[numpy_aggregations + percentile_columns] = 0 - return output_forecast_df, observed_df.copy() + expected_df = pd.DataFrame( + [ + { + "submission_date": TEST_DATE, + "a": "A1", + "forecast_parameters": "blah", + "value": 0, + "value_low": 0.0, + "value_mid": 0.0, + "value_high": 0.0, + "source": "forecast", + }, + { + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A1", + "forecast_parameters": "blah", + "value": 0, + "value_low": 0.0, + "value_mid": 0.0, + "value_high": 0.0, + "source": "forecast", + }, + { + "submission_date": TEST_DATE - relativedelta(days=2), + "a": "A1", + "forecast_parameters": np.nan, + "value": 5, + "value_low": np.nan, + "value_mid": np.nan, + "value_high": np.nan, + "source": "historical", + }, + { + "submission_date": TEST_DATE - relativedelta(days=1), + "a": "A1", + "forecast_parameters": np.nan, + "value": 6, + "value_low": np.nan, + "value_mid": np.nan, + "value_high": np.nan, + "source": "historical", + }, + ] + ) + assert set(expected_df.columns) == set(output_df.columns) -def test_combine_forecast_observed(mocker, forecast): - """tests the _combine_forecast_observed method""" - mocker.patch.object( - forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + pd.testing.assert_frame_equal( + expected_df.sort_values(["source", "submission_date"]).reset_index(drop=True), + output_df[expected_df.columns] + .sort_values(["source", "submission_date"]) + .reset_index(drop=True), ) + +def test_summarize_with_parameters_no_overlap(): + """testing summarize_with_parameters""" forecast_df = pd.DataFrame( { "submission_date": [ @@ -191,396 +196,611 @@ def test_combine_forecast_observed(mocker, forecast): } ) + test_date_samples_A1 = np.arange(1000) + test_date_samples_A2 = np.arange(1000) * 10 + test_next_date_samples_A1 = np.arange(1000) * 2 + test_next_date_samples_A2 = np.arange(1000) * 20 + forecast_df = pd.DataFrame( + [ + { # this element will be filtered out because it occurs before the observed_data ends + **{ + "submission_date": TEST_DATE - relativedelta(days=2), + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: 0 for i in range(1000)}, + }, + { + **{ + "submission_date": TEST_DATE, + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_date_samples_A1)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_next_date_samples_A1)}, + }, + { + **{ + "submission_date": TEST_DATE, + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(test_date_samples_A2)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(test_next_date_samples_A2)}, + }, + ] + ) + + # rows with negative values are those expected to be removed + # by filters in summarize observed_df = pd.DataFrame( { "submission_date": [ TEST_DATE - relativedelta(days=2), TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], - "a": ["A1", "A1"], - "value": [5, 6], + "a": ["A1", "A1", "A2", "A2"], + "value": [20, 30, 40, 50], } ) 
numpy_aggregations = ["mean"] percentiles = [10, 50, 90] - - output_df = forecast._combine_forecast_observed( + output_df = summarize_with_parameters( forecast_df=forecast_df, observed_df=observed_df, - period="period", + period="day", numpy_aggregations=numpy_aggregations, percentiles=percentiles, - segment={"a": "A1"}, + segment_cols=["a"], + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + ], + "a": ["A1", "A1", "A2", "A2"], + "value": [20, 30, 40, 50], + "value_low": [np.nan, np.nan, np.nan, np.nan], + "value_mid": [np.nan, np.nan, np.nan, np.nan], + "value_high": [np.nan, np.nan, np.nan, np.nan], + "source": ["historical", "historical", "historical", "historical"], + } ) - # mean was renamed to value, percentiles to high, medium, low - forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 - forecast_df["a"] = "A1" # this column is already present in observed - - forecast_df["source"] = "forecast" - observed_df["source"] = "historical" + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": TEST_DATE, + "a": "A1", + "forecast_parameters": "A1", + "value": np.mean(test_date_samples_A1), + "value_low": np.percentile(test_date_samples_A1, 10), + "value_mid": np.percentile(test_date_samples_A1, 50), + "value_high": np.percentile(test_date_samples_A1, 90), + "source": "forecast", + }, + { + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A1", + "forecast_parameters": "A1", + "value": np.mean(test_next_date_samples_A1), + "value_low": np.percentile(test_next_date_samples_A1, 10), + "value_mid": np.percentile(test_next_date_samples_A1, 50), + "value_high": np.percentile(test_next_date_samples_A1, 90), + "source": "forecast", + }, + { + "submission_date": TEST_DATE, + "a": "A2", + "forecast_parameters": "A2", + "value": np.mean(test_date_samples_A2), + "value_low": np.percentile(test_date_samples_A2, 10), + "value_mid": np.percentile(test_date_samples_A2, 50), + "value_high": np.percentile(test_date_samples_A2, 90), + "source": "forecast", + }, + { + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A2", + "forecast_parameters": "A2", + "value": np.mean(test_next_date_samples_A2), + "value_low": np.percentile(test_next_date_samples_A2, 10), + "value_mid": np.percentile(test_next_date_samples_A2, 50), + "value_high": np.percentile(test_next_date_samples_A2, 90), + "source": "forecast", + }, + ] + ) # concat in same order to make our lives easier - expected = pd.concat([observed_df, forecast_df]) - assert set(expected.columns) == set(output_df.columns) - pd.testing.assert_frame_equal(output_df, expected[output_df.columns]) + expected = pd.concat([observed_expected_df, forecast_summarized_expected_df]) + expected["aggregation_period"] = "day" + expected["submission_date"] = pd.to_datetime(expected["submission_date"]) - # should not be any nulls outside the metric column - non_metric_columns = [ - el - for el in output_df.columns - if el not in ["value", "value_low", "value_mid", "value_high"] - ] - assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + assert set(expected.columns) == set(output_df.columns) + pd.testing.assert_frame_equal( + expected.sort_values(["source", "a", "submission_date"]).reset_index(drop=True), + output_df[expected.columns] + .sort_values(["source", "a", "submission_date"]) + .reset_index(drop=True), + ) -def test_under_summarize(mocker, forecast): - """testing 
_summarize""" - # 2024-01-01 is chosen as an arbitrary date to center the tests around - # forecast predictions are set with the - # mock_aggregate_forecast_observed function so they - # can be ommited here +def test_summarize_with_parameters_month_overlap(): + """testing summarize_with_parameters""" + test_date_samples_A1 = np.arange(1000) + test_date_samples_A2 = np.arange(1000) * 10 + test_next_date_samples_A1 = np.arange(1000) * 2 + test_next_date_samples_A2 = np.arange(1000) * 20 + # add a week to all the dates so they're in the same month as the observed + # but occur after so they won't get filtered out forecast_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } + [ + { # this element will be filtered out because it occurs before the observed_data ends + **{ + "submission_date": TEST_DATE - relativedelta(days=2), + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: 0 for i in range(1000)}, + }, + { + **{ + "submission_date": TEST_DATE + relativedelta(days=7), + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_date_samples_A1)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_next_date_samples_A1)}, + }, + { + **{ + "submission_date": TEST_DATE + relativedelta(days=7), + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(test_date_samples_A2)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(test_next_date_samples_A2)}, + }, + ] ) # rows with negative values are those expected to be removed # by filters in summarize observed_df = pd.DataFrame( { - "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), - ], - "a": ["A1", "A1", "A1", "A2", "A2"], - "value": [10, 20, 30, 40, 50], + "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], + "a": ["A1", "A2"], + "value": [20, 30], } ) - SegmentSettings = collections.namedtuple( - "SegmentSettings", - ["start_date", "forecast_df", "segment", "trained_parameters"], - ) - dummy_segment_settings = SegmentSettings( - start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), - forecast_df=forecast_df.copy(), - segment={"a": "A1"}, - trained_parameters={"trained_parameters": "yes"}, - ) - - mocker.patch.object( - forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed - ) - - forecast.observed_df = observed_df - numpy_aggregations = ["mean"] percentiles = [10, 50, 90] - output_df = forecast._summarize( - segment_settings=dummy_segment_settings, - period="period", + output_df = summarize_with_parameters( + forecast_df=forecast_df, + observed_df=observed_df, + period="month", numpy_aggregations=numpy_aggregations, percentiles=percentiles, + segment_cols=["a"], ) observed_expected_df = pd.DataFrame( { - "submission_date": [ - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), - ], - "a": ["A1", "A1"], + "submission_date": [TEST_DATE, TEST_DATE], + "a": ["A1", "A2"], "value": [20, 30], + "value_low": [np.nan, np.nan], + "value_mid": [np.nan, np.nan], + "value_high": [np.nan, np.nan], + "source": ["historical", "historical"], } ) - # percentile numeric values changed to names - # mean gets mapped to value - 
forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 - - forecast_df["a"] = "A1" # this column is already present in observed - - forecast_df["source"] = "forecast" - observed_expected_df["source"] = "historical" + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": TEST_DATE, + "a": "A1", + "forecast_parameters": "A1", + "value": np.mean(test_date_samples_A1 + test_next_date_samples_A1 + 20), + "value_low": np.percentile( + test_date_samples_A1 + test_next_date_samples_A1 + 20, 10 + ), + "value_mid": np.percentile( + test_date_samples_A1 + test_next_date_samples_A1 + 20, 50 + ), + "value_high": np.percentile( + test_date_samples_A1 + test_next_date_samples_A1 + 20, 90 + ), + "source": "forecast", + }, + { + "submission_date": TEST_DATE, + "a": "A2", + "forecast_parameters": "A2", + "value": np.mean(test_date_samples_A2 + test_next_date_samples_A2 + 30), + "value_low": np.percentile( + test_date_samples_A2 + test_next_date_samples_A2 + 30, 10 + ), + "value_mid": np.percentile( + test_date_samples_A2 + test_next_date_samples_A2 + 30, 50 + ), + "value_high": np.percentile( + test_date_samples_A2 + test_next_date_samples_A2 + 30, 90 + ), + "source": "forecast", + }, + ] + ) # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_df]) - expected["forecast_parameters"] = '{"trained_parameters": "yes"}' - expected["aggregation_period"] = "period" + expected = pd.concat([observed_expected_df, forecast_summarized_expected_df]) + expected["aggregation_period"] = "month" + expected["submission_date"] = pd.to_datetime(expected["submission_date"]) assert set(expected.columns) == set(output_df.columns) - # force value columns to be floats in both cases to make check easier - numeric_cols = ["value", "value_low", "value_mid", "value_high"] - expected[numeric_cols] = expected[numeric_cols].astype(float) - output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( - output_df.reset_index(drop=True), - expected[output_df.columns].reset_index(drop=True), + expected.sort_values(["source", "a", "submission_date"]).reset_index(drop=True), + output_df[expected.columns] + .sort_values(["source", "a", "submission_date"]) + .reset_index(drop=True), ) - # should not be any nulls outside the metric column - non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] - assert not pd.isna(output_df[non_metric_columns]).any(axis=None) - -def test_summarize(mocker, forecast): +def test_summarize(): """testing summarize""" # create dummy metric hub object to when meta data from # it is added we don't get an error - MetricHub = collections.namedtuple( - "MetricHub", - ["alias", "app_name", "slug", "min_date", "max_date"], - ) - - dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) - - # forecast predictions are set with the - # mock_aggregate_forecast_observed function so they - # can be ommited here + test_date_samples_A1 = np.arange(1000) + test_date_samples_A2 = np.arange(1000) * 10 + test_next_date_samples_A1 = np.arange(1000) * 2 + test_next_date_samples_A2 = np.arange(1000) * 20 + # add a week to all the dates so they're in the same month as the observed + # but occur after so they won't get filtered out forecast_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } + [ + { # this element will be filtered out because it occurs before the observed_data ends + **{ + "submission_date": TEST_DATE - relativedelta(days=2), + 
"a": "A1", + "forecast_parameters": "A1", + }, + **{i: 0 for i in range(1000)}, + }, + { + **{ + "submission_date": TEST_DATE + relativedelta(days=7), + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_date_samples_A1)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_next_date_samples_A1)}, + }, + { + **{ + "submission_date": TEST_DATE + relativedelta(days=7), + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(test_date_samples_A2)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(test_next_date_samples_A2)}, + }, + ] ) # rows with negative values are those expected to be removed # by filters in summarize observed_df = pd.DataFrame( { - "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), - ], - "a": ["A1", "A1", "A1", "A2", "A2"], - "value": [10, 20, 30, 40, 50], + "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], + "a": ["A1", "A2"], + "value": [20, 30], + } + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = summarize( + forecast_df=forecast_df, + observed_df=observed_df, + periods=["day", "month"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + segment_cols=["a"], + ) + observed_month_expected_df = pd.DataFrame( + { + "submission_date": [TEST_DATE, TEST_DATE], + "a": ["A1", "A2"], + "value": [20, 30], + "value_low": [np.nan, np.nan], + "value_mid": [np.nan, np.nan], + "value_high": [np.nan, np.nan], + "source": ["historical", "historical"], + "aggregation_period": "month", + } + ) + observed_day_expected_df = pd.DataFrame( + { + "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], + "a": ["A1", "A2"], + "value": [20, 30], + "value_low": [np.nan, np.nan], + "value_mid": [np.nan, np.nan], + "value_high": [np.nan, np.nan], + "source": ["historical", "historical"], + "aggregation_period": "day", } ) - SegmentSettings = collections.namedtuple( - "SegmentSettings", - ["start_date", "forecast_df", "segment", "trained_parameters", "components_df"], + forecast_month_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": TEST_DATE, + "a": "A1", + "forecast_parameters": "A1", + "value": np.mean(test_date_samples_A1 + test_next_date_samples_A1 + 20), + "value_low": np.percentile( + test_date_samples_A1 + test_next_date_samples_A1 + 20, 10 + ), + "value_mid": np.percentile( + test_date_samples_A1 + test_next_date_samples_A1 + 20, 50 + ), + "value_high": np.percentile( + test_date_samples_A1 + test_next_date_samples_A1 + 20, 90 + ), + "source": "forecast", + "aggregation_period": "month", + }, + { + "submission_date": TEST_DATE, + "a": "A2", + "forecast_parameters": "A2", + "value": np.mean(test_date_samples_A2 + test_next_date_samples_A2 + 30), + "value_low": np.percentile( + test_date_samples_A2 + test_next_date_samples_A2 + 30, 10 + ), + "value_mid": np.percentile( + test_date_samples_A2 + test_next_date_samples_A2 + 30, 50 + ), + "value_high": np.percentile( + test_date_samples_A2 + test_next_date_samples_A2 + 30, 90 + ), + "source": "forecast", + "aggregation_period": "month", + }, + ] ) - # for the components_df the contents aren't important here - # we're only testing that it 
is concatenated properly - # with the segment data added - dummy_segment_settings_A1 = SegmentSettings( - start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), - forecast_df=forecast_df.copy(), - segment={"a": "A1"}, - trained_parameters={"trained_parameters": "yes"}, - components_df=pd.DataFrame({"testcol": [1]}), + forecast_day_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": TEST_DATE + relativedelta(days=7), + "a": "A1", + "forecast_parameters": "A1", + "value": np.mean(test_date_samples_A1), + "value_low": np.percentile(test_date_samples_A1, 10), + "value_mid": np.percentile(test_date_samples_A1, 50), + "value_high": np.percentile(test_date_samples_A1, 90), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), + "a": "A1", + "forecast_parameters": "A1", + "value": np.mean(test_next_date_samples_A1), + "value_low": np.percentile(test_next_date_samples_A1, 10), + "value_mid": np.percentile(test_next_date_samples_A1, 50), + "value_high": np.percentile(test_next_date_samples_A1, 90), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": TEST_DATE + relativedelta(days=7), + "a": "A2", + "forecast_parameters": "A2", + "value": np.mean(test_date_samples_A2), + "value_low": np.percentile(test_date_samples_A2, 10), + "value_mid": np.percentile(test_date_samples_A2, 50), + "value_high": np.percentile(test_date_samples_A2, 90), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": TEST_DATE_NEXT_DAY + relativedelta(days=7), + "a": "A2", + "forecast_parameters": "A2", + "value": np.mean(test_next_date_samples_A2), + "value_low": np.percentile(test_next_date_samples_A2, 10), + "value_mid": np.percentile(test_next_date_samples_A2, 50), + "value_high": np.percentile(test_next_date_samples_A2, 90), + "source": "forecast", + "aggregation_period": "day", + }, + ] ) - dummy_segment_settings_A2 = SegmentSettings( - start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), - forecast_df=forecast_df.copy(), - segment={"a": "A2"}, - trained_parameters={"trained_parameters": "yes"}, - components_df=pd.DataFrame({"testcol": [2]}), + # concat in same order to make our lives easier + expected = pd.concat( + [ + forecast_day_summarized_expected_df, + forecast_month_summarized_expected_df, + observed_day_expected_df, + observed_month_expected_df, + ] ) + expected["submission_date"] = pd.to_datetime(expected["submission_date"]) - segment_models = [dummy_segment_settings_A1, dummy_segment_settings_A2] + assert set(expected.columns) == set(output_df.columns) - mocker.patch.object( - forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + pd.testing.assert_frame_equal( + expected.sort_values( + ["source", "a", "submission_date", "aggregation_period"] + ).reset_index(drop=True), + output_df[expected.columns] + .sort_values(["source", "a", "submission_date", "aggregation_period"]) + .reset_index(drop=True), ) - forecast.observed_df = observed_df - forecast.segment_models = segment_models - forecast.metric_hub = dummy_metric_hub - # timestamp attributes created by fit and predict - # must be added manuall - forecast.collected_at = "" - forecast.trained_at = "" - forecast.predicted_at = "" +def test_auto_tuning(mocker): + """test the auto_tuning function""" - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - forecast.summarize( - periods=["period"], - numpy_aggregations=numpy_aggregations, - 
percentiles=percentiles, + mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) + # mock_get_crossvalidation_metric will choose the parameters that + # have the lowest absolute product + mocker.patch.object( + ProphetAutotunerForecast, + "_get_crossvalidation_metric", + mock_get_crossvalidation_metric, + ) + forecast = ProphetAutotunerForecast( + growth="testval", + grid_parameters={ + "seasonality_prior_scale": [1, 2], + "holidays_prior_scale": [20, 10], + }, ) - output_df = forecast.summary_df - - # time filter removes first element of observed_df - observed_expected_df = pd.DataFrame( + observed_df = pd.DataFrame( { + "a": ["A1", "A1"], + "b": ["B1", "B2"], "submission_date": [ - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), - TEST_DATE - relativedelta(days=2), - TEST_DATE - relativedelta(days=1), + TEST_DATE, + TEST_DATE, ], - "a": ["A1", "A1", "A2", "A2"], - "value": [20, 30, 40, 50], } ) - # doubled because there are two segments in the observed data - forecast_df = pd.concat([forecast_df, forecast_df]) - - forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 - forecast_df["source"] = "forecast" + best_model = forecast._auto_tuning(observed_df) - # segment data column is already present in observed - # needs to be added manually for forecast - forecast_df["a"] = [ - "A1", - "A1", - "A2", - "A2", - ] + # in the mocked class the two params get multiplied and the lowest combo gets select + assert best_model.seasonality_prior_scale == 1 + assert best_model.holidays_prior_scale == 10 - observed_expected_df["source"] = "historical" + # make sure growth got written to new class + assert best_model.growth == "testval" - # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_df]) - expected["forecast_parameters"] = '{"trained_parameters": "yes"}' - expected["aggregation_period"] = "period" - - # not going to check all the metadata columns - # in assert_frame_equal. 
Just make sure they're there - metadata_columns = { - "metric_alias", - "metric_hub_app_name", - "metric_hub_slug", - "metric_start_date", - "metric_end_date", - "metric_collected_at", - "forecast_start_date", - "forecast_end_date", - "forecast_trained_at", - "forecast_predicted_at", - } - assert set(expected.columns) | metadata_columns == set(output_df.columns) - # force value columns to be floats in both cases to make check easier - numeric_cols = ["value", "value_low", "value_mid", "value_high"] - expected[numeric_cols] = expected[numeric_cols].astype(float) - output_df[numeric_cols] = output_df[numeric_cols].astype(float) + # check to make sure it's fit pd.testing.assert_frame_equal( - output_df.sort_values(["a", "submission_date"])[expected.columns].reset_index( - drop=True - ), - expected.sort_values(["a", "submission_date"]).reset_index(drop=True), + best_model.history, forecast._build_train_dataframe(observed_df) ) - # should not be any nulls outside the metric column - non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] - assert not pd.isna(output_df[non_metric_columns]).any(axis=None) - # check components - # only checking that concatenation happened properly - # with segment data added - output_components = forecast.components_df - expected_components = pd.DataFrame({"testcol": [1, 2], "a": ["A1", "A2"]}) - pd.testing.assert_frame_equal(expected_components, output_components) - - -def test_under_predict(mocker): +def test_autotuner_predict(mocker): """testing _predict""" - # set segment models - - A1_start_date = TEST_DATE_STR - parameter_list = [ - { - "segment": {"a": "A1"}, - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, - "cv_settings": {}, - } - ] - - predict_start_date = TEST_DATE_NEXT_DAY_STR - predict_end_date = TEST_PREDICT_END_STR - - forecast = FunnelForecast( - model_type="test", - parameters=parameter_list, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, + mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) + # mock_get_crossvalidation_metric will choose the parameters that + # have the lowest absolute product + mocker.patch.object( + ProphetAutotunerForecast, + "_get_crossvalidation_metric", + mock_get_crossvalidation_metric, ) - # this ensures forecast is using MockModel - mocker.patch.object(forecast, "_build_model", mock_build_model) - # the optimization is just using the value attribute of MockModel, - # which is the product of the parameteres passed. 
The crossvalidation - # will choose the parameters where the absolute value of the product is smallest - mocker.patch.object( - forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + forecast = ProphetAutotunerForecast( + growth="testval", + grid_parameters={ + "seasonality_prior_scale": [1, 2], + "holidays_prior_scale": [20, 10], + }, ) observed_df = pd.DataFrame( { "a": ["A1", "A1"], "b": ["B1", "B2"], - "y": [0, 1], - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] + ), + "y": [1, 2], } ) - segment_list = ["a"] - - # manually set segment_models attribute here instead of in __post_init__ - # which is bypassed to avoid a metric hub call - forecast._set_segment_models( - observed_df=observed_df, segment_column_list=segment_list - ) - # check that we only have one element here - assert len(forecast.segment_models) == 1 - # because of the check above we can use the first element - # and know that's all the segments present - segment_settings = forecast.segment_models[0] + forecast.fit(observed_df) dates_to_predict = pd.DataFrame( { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] + ) } ) - forecast.observed_df = observed_df - forecast.fit() - out = forecast._predict(dates_to_predict, segment_settings).reset_index(drop=True) + + out = forecast.predict(dates_to_predict).reset_index(drop=True) # in MockModel, the predictive_samples method sets the output to # np.arange(len(dates_to_predict)) * self.value for one column called 0 # this helps ensure the forecast_df in segment_models is set properly - model_value = forecast.segment_models[0].segment_model.value + model_value = forecast.model.value expected = pd.DataFrame( { 0: [0, model_value], - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] + ), } ) @@ -588,6 +808,9 @@ def test_under_predict(mocker): # check the components expected_components = observed_df[["submission_date", "y"]].copy() + expected_components["submission_date"] = pd.to_datetime( + expected_components["submission_date"] + ) expected_components[ [ "yhat", @@ -603,291 +826,66 @@ def test_under_predict(mocker): ] ] = 0 - components_df = forecast.segment_models[0].components_df + components_df = forecast.components_df assert set(expected_components.columns) == set(components_df.columns) pd.testing.assert_frame_equal( components_df, expected_components[components_df.columns] ) -def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): - """test the predict method. 
This is similar to test_under_predict - but multiple segments are acted upon""" - - observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2"], - "y": [-1, 1, -1, 1], - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - - segment_list = ["a"] - - funnel_forecast_for_fit_tests._set_segment_models( - observed_df=observed_data, segment_column_list=segment_list - ) - funnel_forecast_for_fit_tests.observed_df = observed_data - funnel_forecast_for_fit_tests.fit() - funnel_forecast_for_fit_tests.predict() - - for segment in funnel_forecast_for_fit_tests.segment_models: - key = segment.segment["a"] - - model_value = segment_info_fit_tests[key]["min_param_value"] - - # in MockModel, the predictive_samples method sets the output to - # np.arange(len(dates_to_predict)) * self.value for one column called 0 - # this helps ensure the forecast_df in segment_models is set properly - expected_raw = pd.DataFrame( - { - 0: [0, model_value], - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - - # filter in predict happens against object start_date not - # segment start_date - expected_time_filter = ( - expected_raw["submission_date"] - >= pd.to_datetime(funnel_forecast_for_fit_tests.start_date).date() - ) - expected = expected_raw[expected_time_filter].reset_index(drop=True) - - forecast_df = segment.forecast_df - pd.testing.assert_frame_equal(forecast_df, expected) - - # check the components - expected_components = expected_raw[["submission_date"]].copy() - expected_components[ - [ - "yhat", - "trend", - "trend_upper", - "trend_lower", - "weekly", - "weekly_upper", - "weekly_lower", - "yearly", - "yearly_upper", - "yearly_lower", - ] - ] = 0 - - # because of time filtereing of training data, if the history has one - # element, y will but [0, 1]. 
The first element is turned into a NULL - # and then becomes a 0 because of fillna(0) - # if it has two it will have both elements and be [-1,1] - - if len(segment.segment_model.history) == 2: - expected_components["y"] = [-1, 1] - else: - expected_components["y"] = [0, 1] - - components_df = segment.components_df - - # there is weird stuff going on with the types but it shouldn't matter - # so coerce the type - expected_components["y"] = expected_components["y"].astype( - components_df["y"].dtype - ) - assert set(expected_components.columns) == set(components_df.columns) - pd.testing.assert_frame_equal( - components_df, - expected_components[components_df.columns], - check_column_type=False, - ) - - -def test_auto_tuning(forecast, mocker): - """test the auto_tuning function""" - - # set one segment with two sets of grid parameters - segment_settings = SegmentModelSettings( - segment={"a": "A1"}, - start_date=TEST_DATE_STR, - end_date=TEST_PREDICT_END_STR, - holidays=[], - regressors=[], - grid_parameters={"param1": [1, 2], "param2": [20, 10]}, - cv_settings={}, - ) - - mocker.patch.object(forecast, "_build_model", mock_build_model) - - # mock_get_crossvalidation_metric will choose the parameters that - # have the lowest absolute product - mocker.patch.object( - forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric - ) - - observed_df = pd.DataFrame( - { - "a": ["A1", "A1"], - "b": ["B1", "B2"], - "submission_date": [ - TEST_DATE, - TEST_DATE, - ], - } - ) - - forecast.segment_models = [segment_settings] - - best_params = forecast._auto_tuning(observed_df, segment_settings) - - # in the mocked class the two params get multiplied and the lowest combo gets select - assert best_params == {"param1": 1, "param2": 10} - - -def test_under_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): - """test the _fit method""" - - observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2"], - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - - segment_list = ["a"] - - funnel_forecast_for_fit_tests._set_segment_models( - observed_df=observed_data, segment_column_list=segment_list - ) - funnel_forecast_for_fit_tests._fit(observed_data) - - # _fit iterates though all the segments in segment_modles - # iterate through them and check based on the value in - # segment_info_fit_tests defined in the fixture of the same name - for segment in funnel_forecast_for_fit_tests.segment_models: - key = segment.segment["a"] - - assert segment.start_date == segment_info_fit_tests[key]["start_date"] - assert segment.grid_parameters == segment_info_fit_tests[key]["grid_parameters"] - segment_model = segment.segment_model - assert segment_model.value == segment_info_fit_tests[key]["min_param_value"] - - # the history attribute is used in the components output so check it is set properly - expected_training = observed_data[ - (observed_data["a"] == key) - & ( - observed_data["submission_date"] - >= pd.to_datetime(segment_info_fit_tests[key]["start_date"]).date() - ) - ].rename(columns={"submission_date": "ds"}) - - pd.testing.assert_frame_equal(segment_model.history, expected_training) - - -def test_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): - """test the fit function. It is inherited from BaseForecast - and calls _fit with the proper object attributes. 
Test looks very - similar to that for _fit""" - observed_data = pd.DataFrame( - { - "a": ["A1", "A1", "A2", "A2"], - "b": ["B1", "B2", "B1", "B2"], - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - - segment_list = ["a"] - - funnel_forecast_for_fit_tests._set_segment_models( - observed_df=observed_data, segment_column_list=segment_list - ) - funnel_forecast_for_fit_tests.observed_df = observed_data - funnel_forecast_for_fit_tests.fit() - - # _fit is called by fit and iterates though all the segments in segment_modles - # iterate through them and check based on the value in - # segment_info_fit_tests defined in the fixture of the same name - for segment in funnel_forecast_for_fit_tests.segment_models: - key = segment.segment["a"] - - assert segment.start_date == segment_info_fit_tests[key]["start_date"] - assert segment.grid_parameters == segment_info_fit_tests[key]["grid_parameters"] - segment_model = segment.segment_model - assert segment_model.value == segment_info_fit_tests[key]["min_param_value"] - - # check history attribute - expected_training = observed_data[ - (observed_data["a"] == key) - & ( - observed_data["submission_date"] - >= pd.to_datetime(segment_info_fit_tests[key]["start_date"]).date() - ) - ].rename(columns={"submission_date": "ds"}) - pd.testing.assert_frame_equal(segment_model.history, expected_training) - - -def test_set_segment_models(): - """test the set_segment_models method""" - A1_start_date = "2018-01-01" - A2_start_date = "2020-02-02" +def test_funnelforecast_fit(mocker): + """test the fit method, and implicitly the set_segment_models method""" + # arbitrarily choose growth as a parameter + # to set in order to check the test parameter_list = [ { "segment": {"a": "A1"}, - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, + "parameters": { + "growth": "logistic", + "grid_parameters": { + "seasonality_prior_scale": [1, 2], + "holidays_prior_scale": [20, 10], + }, + }, }, { "segment": {"a": "A2"}, - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, + "parameters": { + "growth": "A2", + "grid_parameters": { + "seasonality_prior_scale": [3, 4], + "holidays_prior_scale": [40, 30], + }, + }, }, ] - predict_start_date = TEST_DATE_STR - predict_end_date = TEST_PREDICT_END_STR - - forecast = FunnelForecast( - model_type="test", - parameters=parameter_list, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, + mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) + mocker.patch.object( + ProphetAutotunerForecast, + "_get_crossvalidation_metric", + mock_get_crossvalidation_metric, ) + ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) observed_data = pd.DataFrame( - {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + { + "a": ["A1", "A1", "A2", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2", "B2"], + "submission_date": [ + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + ], + "value": [1, 2, 3, 4, 5], + } ) - segment_list = ["a", "b"] + ensemble_object.fit(observed_data) - forecast._set_segment_models( - observed_df=observed_data, segment_column_list=segment_list - ) + segment_models = ensemble_object.segment_models # put the segments and the start date in the same dictionary to make # 
comparison easier @@ -896,14 +894,18 @@ def test_set_segment_models(): # start_date is a stand-in for these parameters and # is determined by the value of a as specified in parameter_dict check_segment_models = [ - dict(**el.segment, **{"start_date": el.start_date}) - for el in forecast.segment_models + dict( + **el["segment"], + **{"value": el["model"].model.value, "growth": el["model"].growth}, + ) + for el in segment_models ] + expected = [ - {"a": "A1", "b": "B1", "start_date": A1_start_date}, - {"a": "A1", "b": "B2", "start_date": A1_start_date}, - {"a": "A2", "b": "B1", "start_date": A2_start_date}, - {"a": "A2", "b": "B2", "start_date": A2_start_date}, + {"a": "A1", "b": "B1", "growth": "logistic", "value": 10}, + {"a": "A1", "b": "B2", "growth": "logistic", "value": 10}, + {"a": "A2", "b": "B1", "growth": "A2", "value": 90}, + {"a": "A2", "b": "B2", "growth": "A2", "value": 90}, ] # can't make a set of dicts for comparison @@ -916,76 +918,106 @@ def test_set_segment_models(): for checkval, expectedval in compare_sorted: assert checkval == expectedval + # test that the seed was set for all models during fitting + assert all([el["model"]._set_seed for el in segment_models]) -def test_set_segment_models_multiple(): + # test that the fit was applied properly to all models + # to do this check the is_fit attribute, which will equal + # A1_start_date for A1 segments and A2_start_date for A2 segments + + # check that it fit by making sure model.history is not null + for segment in segment_models: + subset = observed_data[ + (observed_data["a"] == segment["segment"]["a"]) + & (observed_data["b"] == segment["segment"]["b"]) + ] + subset = subset.rename(columns={"submission_date": "ds", "value": "y"}) + if segment["segment"]["a"] == "A1": + if segment["segment"]["b"] == "B1": + floor = 0.5 * 1 + cap = 1.5 * 1 + else: + floor = 0.5 * 2 + cap = 1.5 * 2 + subset["floor"] = floor + subset["cap"] = cap + pd.testing.assert_frame_equal(subset, segment["model"].model.history) + + +def test_funnelforecast_fit_multiple(mocker): """test the set_segment_models method with segments on multiple columns""" - # set arbitrary dates - # they're only used to make sure segments are set correctly - A1B1_start_date = "2018-01-01" - A1B2_start_date = "2019-01-01" - A2B1_start_date = "2020-02-02" - A2B2_start_date = "2021-02-02" + # arbitrarily choose growth as a parameter + # to set in order to check the test parameter_list = [ { "segment": {"a": "A1", "b": "B1"}, - "start_date": A1B1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, + "parameters": { + "growth": "logistic", + "grid_parameters": { + "seasonality_prior_scale": [1, 2], + "holidays_prior_scale": [20, 10], + }, + }, }, { - "segment": {"a": "A1", "b": "B2"}, - "start_date": A1B2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, + "segment": {"a": "A2", "b": "B1"}, + "parameters": { + "growth": "A2B1", + "grid_parameters": { + "seasonality_prior_scale": [3, 4], + "holidays_prior_scale": [40, 30], + }, + }, }, { - "segment": {"a": "A2", "b": "B1"}, - "start_date": A2B1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, + "segment": {"a": "A1", "b": "B2"}, + "parameters": { + "growth": "logistic", + "grid_parameters": { + "seasonality_prior_scale": [10, 20], + "holidays_prior_scale": [200, 100], + }, + }, }, { "segment": {"a": "A2", "b": "B2"}, - "start_date": 
A2B2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, + "parameters": { + "growth": "A2B2", + "grid_parameters": { + "seasonality_prior_scale": [30, 40], + "holidays_prior_scale": [400, 300], + }, + }, }, ] - predict_start_date = TEST_DATE_STR - predict_end_date = TEST_PREDICT_END_STR - - forecast = FunnelForecast( - model_type="test", - parameters=parameter_list, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, + mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) + mocker.patch.object( + ProphetAutotunerForecast, + "_get_crossvalidation_metric", + mock_get_crossvalidation_metric, ) + ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) observed_data = pd.DataFrame( - {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + { + "a": ["A1", "A1", "A2", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2", "B2"], + "submission_date": [ + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + ], + "value": [1, 2, 3, 4, 5], + } ) - segment_list = ["a", "b"] + ensemble_object.fit(observed_data) - forecast._set_segment_models( - observed_df=observed_data, segment_column_list=segment_list - ) + segment_models = ensemble_object.segment_models # put the segments and the start date in the same dictionary to make # comparison easier @@ -994,14 +1026,18 @@ def test_set_segment_models_multiple(): # start_date is a stand-in for these parameters and # is determined by the value of a as specified in parameter_dict check_segment_models = [ - dict(**el.segment, **{"start_date": el.start_date}) - for el in forecast.segment_models + dict( + **el["segment"], + **{"value": el["model"].model.value, "growth": el["model"].growth}, + ) + for el in segment_models ] + expected = [ - {"a": "A1", "b": "B1", "start_date": A1B1_start_date}, - {"a": "A1", "b": "B2", "start_date": A1B2_start_date}, - {"a": "A2", "b": "B1", "start_date": A2B1_start_date}, - {"a": "A2", "b": "B2", "start_date": A2B2_start_date}, + {"a": "A1", "b": "B1", "growth": "logistic", "value": 10}, + {"a": "A1", "b": "B2", "growth": "logistic", "value": 1000}, + {"a": "A2", "b": "B1", "growth": "A2B1", "value": 90}, + {"a": "A2", "b": "B2", "growth": "A2B2", "value": 9000}, ] # can't make a set of dicts for comparison @@ -1014,625 +1050,376 @@ def test_set_segment_models_multiple(): for checkval, expectedval in compare_sorted: assert checkval == expectedval + # test that the seed was set for all models during fitting + assert all([el["model"]._set_seed for el in segment_models]) -def test_set_segment_models_exception(): - """test the exception for segment_models where - and exception is raised if a model_setting_split_dim - is specified that isn't in the data""" - A1_start_date = "2018-01-01" - A2_start_date = "2020-02-02" + # test that the fit was applied properly to all models + # to do this check the is_fit attribute, which will equal + # A1_start_date for A1 segments and A2_start_date for A2 segments + + # check that it fit by making sure model.history is not null + for segment in segment_models: + subset = observed_data[ + (observed_data["a"] == segment["segment"]["a"]) + & (observed_data["b"] == segment["segment"]["b"]) + ] + subset = subset.rename(columns={"submission_date": "ds", "value": "y"}) + if segment["segment"]["a"] == "A1": + if segment["segment"]["b"] == "B1": + floor = 0.5 * 1 + cap = 1.5 * 1 + else: + floor = 0.5 * 2 + 
cap = 1.5 * 2 + subset["floor"] = floor + subset["cap"] = cap + pd.testing.assert_frame_equal(subset, segment["model"].model.history) + + +def test_funnel_predict(mocker): + """test the predict method. This is similar to test_under_predict + but multiple segments are acted upon""" + + # arbitrarily choose growth as a parameter + # to set in order to check the test parameter_list = [ { - "segment": {"c": "A1"}, - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, + "segment": {"a": "A1"}, + "parameters": { + "growth": "logistic", + "grid_parameters": { + "seasonality_prior_scale": [1, 2], + "holidays_prior_scale": [20, 10], + }, + }, }, { - "segment": {"c": "A2"}, - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, + "segment": {"a": "A2"}, + "parameters": { + "growth": "A2", + "grid_parameters": { + "seasonality_prior_scale": [3, 4], + "holidays_prior_scale": [40, 30], + }, + }, }, ] - predict_start_date = TEST_DATE_STR - predict_end_date = TEST_PREDICT_END_STR - - forecast = FunnelForecast( - model_type="test", - parameters=parameter_list, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, + mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) + mocker.patch.object( + ProphetAutotunerForecast, + "_get_crossvalidation_metric", + mock_get_crossvalidation_metric, ) + ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) observed_data = pd.DataFrame( - {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} - ) - - segment_list = ["a", "b"] - - with pytest.raises( - ValueError, - match="Segment keys missing from metric hub segments: c", - ): - forecast._set_segment_models( - observed_df=observed_data, segment_column_list=segment_list - ) - - -def test_fill_regressor_dates(forecast): - """test _fill_regressor_dates - the name in the regressor info indicates which case is being tested - Dates are chosen arbitrarily""" - # get the set start and end dates for the forecast fixture - # as datetime objects - default_start_datetime = datetime(TEST_DATE.year, TEST_DATE.month, TEST_DATE.day) - default_end_datetime = datetime( - TEST_PREDICT_END.year, TEST_PREDICT_END.month, TEST_PREDICT_END.day - ) - - # set the start date with an arbitrary date - regressor_info = { - "name": "only_start", - "description": "only has a start", - "start_date": "2020-08-15", - } - regressor = ProphetRegressor(**regressor_info) - forecast._fill_regressor_dates(regressor) - assert regressor.start_date == pd.to_datetime("2020-08-15") - - # this is the end dat for the forecast fixture - assert regressor.end_date == default_end_datetime - - # set the end date with an arbitrary date - regressor_info = { - "name": "only_end", - "description": "only has a end", - "end_date": "2125-08-15", - } - regressor = ProphetRegressor(**regressor_info) - forecast._fill_regressor_dates(regressor) - # the start date for the forecast fixture is TEST_DATE - assert regressor.start_date == default_start_datetime - assert regressor.end_date == pd.to_datetime("2125-08-15") - - # set both the start and end dates to arbitrary dates - regressor_info = { - "name": "both", - "description": "only has a start", - "start_date": "2020-08-15", - "end_date": "2020-09-15", - } - regressor = ProphetRegressor(**regressor_info) - forecast._fill_regressor_dates(regressor) - assert 
regressor.start_date == pd.to_datetime("2020-08-15") - assert regressor.end_date == pd.to_datetime("2020-09-15") - - # use the defaults for both - regressor_info = { - "name": "neither", - "description": "nothin to see here", - } - regressor = ProphetRegressor(**regressor_info) - forecast._fill_regressor_dates(regressor) - assert regressor.start_date == default_start_datetime - assert regressor.end_date == default_end_datetime - - # use arbitrary out of order dates to set - regressor_info = { - "name": "out_of_order", - "description": "best better break", - "start_date": "2020-08-15", - "end_date": "2000-09-15", - } - regressor = ProphetRegressor(**regressor_info) - with pytest.raises( - Exception, - match="Regressor out_of_order start date comes after end date", - ): - forecast._fill_regressor_dates(regressor) - - -def test_add_regressors(forecast): - """test add regressors - test case for each element of regressor_list_raw is indicated in name""" - - # choose arbitrary dates for dates - # name indicates the relationship of the window - # to the timeframe of the data as defined in the ds - # column of df below - regressor_list_raw = [ - { - "name": "all_in", - "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", - }, { - "name": "all_out", - "description": "it's all out", - "start_date": "2124-02-01", - "end_date": "2124-02-06", - }, - { - "name": "just_end", - "description": "just the second half", - "start_date": "2124-01-03", - "end_date": "2124-02-06", - }, - { - "name": "just_middle", - "description": "just the middle two", - "start_date": "2124-01-02", - "end_date": "2124-01-03", - }, - ] - - regressor_list = [ProphetRegressor(**r) for r in regressor_list_raw] - - df = pd.DataFrame( - { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-03").date(), - pd.to_datetime("2124-01-04").date(), - ], + "a": ["A1", "A1", "A2", "A2", "A2"] * 2, + "b": ["B1", "B2", "B1", "B2", "B2"] * 2, + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE, + TEST_DATE, + TEST_DATE, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + ] + ), + "value": [1, 2, 3, 4, 5] * 2, } ) - output_df = forecast._add_regressors(df, regressors=regressor_list) + ensemble_object.fit(observed_data) - expected_df = pd.DataFrame( + dates_to_predict = pd.DataFrame( { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-03").date(), - pd.to_datetime("2124-01-04").date(), - ], - "all_in": [0, 0, 0, 0], - "all_out": [1, 1, 1, 1], - "just_end": [1, 1, 0, 0], - "just_middle": [1, 0, 0, 1], + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] + ) } ) - assert set(output_df.columns) == set(expected_df.columns) - pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) - + out = ensemble_object.predict(dates_to_predict).reset_index(drop=True) -def test_build_train_dataframe_no_regressors(forecast): - """test _build_train_dataframe with no regressors""" - regressor_list = [] - - grid_parameters = { - "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], - "changepoint_range": [0.8, 0.9, 1], - "n_changepoints": [30], - "weekly_seasonality": True, - "yearly_seasonality": True, - "growth": "logistic", - } - cv_settings = { - "initial": "366 days", - "period": "30 days", - "horizon": "30 days", - "parallel": "processes", - } - segment_settings = 
SegmentModelSettings( - segment={"a": 1, "b": 2}, - start_date=TEST_DATE_STR, - end_date=TEST_PREDICT_END_STR, - holidays=[], - regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=grid_parameters, - cv_settings=cv_settings, - ) + for segment in ensemble_object.segment_models: + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + out_subset = out[ + (out["a"] == segment["segment"]["a"]) + & (out["b"] == segment["segment"]["b"]) + ] + model_value = segment["model"].model.value + expected = pd.DataFrame( + { + 0: [0, model_value], + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] + ), + "a": [segment["segment"]["a"], segment["segment"]["a"]], + "b": [segment["segment"]["b"], segment["segment"]["b"]], + "forecast_parameters": [json.dumps(segment["model"]._get_parameters())] + * 2, + } + ) - observed_df = pd.DataFrame( - { - "a": [1, 1, 1, 1, 3, 3], - "b": [1, 1, 2, 2, 2, 2], - "y": [1, 2, 3, 4, 5, 6], - "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE + relativedelta(months=1), - TEST_DATE_NEXT_DAY + relativedelta(months=1), - ], - } - ) + pd.testing.assert_frame_equal( + out_subset.reset_index(drop=True), expected.reset_index(drop=True) + ) - output_train_df = forecast._build_train_dataframe( - observed_df, segment_settings=segment_settings - ) - expected_train_df = pd.DataFrame( - { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "ds": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - pd.testing.assert_frame_equal( - output_train_df.reset_index(drop=True), expected_train_df - ) + # check the components + expected_components = ( + observed_data.loc[ + (observed_data["a"] == segment["segment"]["a"]) + & (observed_data["b"] == segment["segment"]["b"]), + ["submission_date", "value"], + ] + .rename(columns={"value": "y"}) + .copy() + ) + expected_components[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 - # test again but with add_logistic_growth_cols set to true - output_train_wlog_df = forecast._build_train_dataframe( - observed_df, segment_settings=segment_settings, add_logistic_growth_cols=True - ) - expected_train_wlog_df = pd.DataFrame( - { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "ds": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "floor": [1.5, 1.5], - "cap": [6.0, 6.0], - } - ) + components_df = segment["model"].components_df + assert set(expected_components.columns) == set(components_df.columns) + pd.testing.assert_frame_equal( + components_df.reset_index(drop=True), + expected_components[components_df.columns].reset_index(drop=True), + ) - assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) - pd.testing.assert_frame_equal( - output_train_wlog_df.reset_index(drop=True), - expected_train_wlog_df[output_train_wlog_df.columns], - ) +def test_funnel_predict_growth(mocker): + """test the predict method when growth is set in the + grid parameters. 
Extra attributes need to be updated with this one""" -def test_build_train_dataframe(forecast): - """test _build_train_dataframe and include regressors""" - regressor_list = [ - { - "name": "all_in", - "description": "it's all in", - "start_date": TEST_DATE_STR, - "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), - }, + # arbitrarily choose growth as a parameter + # to set in order to check the test + parameter_list = [ { - "name": "all_out", - "description": "it's all in", - "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), + "segment": {"a": "A1"}, + "parameters": { + "grid_parameters": { + "seasonality_prior_scale": [1, 2], + "holidays_prior_scale": [20, 10], + "growth": "logistic", + }, + }, }, { - "name": "just_end", - "description": "just the second one", - "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), + "segment": {"a": "A2"}, + "parameters": { + "growth": "A2", + "grid_parameters": { + "seasonality_prior_scale": [3, 4], + "holidays_prior_scale": [40, 30], + }, + }, }, ] - grid_parameters = { - "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], - "changepoint_range": [0.8, 0.9, 1], - "n_changepoints": [30], - "weekly_seasonality": True, - "yearly_seasonality": True, - "growth": "logistic", - } - cv_settings = { - "initial": "366 days", - "period": "30 days", - "horizon": "30 days", - "parallel": "processes", - } - segment_settings = SegmentModelSettings( - segment={"a": 1, "b": 2}, - start_date=TEST_DATE_STR, - end_date=(TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), - holidays=[], - regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=grid_parameters, - cv_settings=cv_settings, + mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) + mocker.patch.object( + ProphetAutotunerForecast, + "_get_crossvalidation_metric", + mock_get_crossvalidation_metric, ) + ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) - observed_df = pd.DataFrame( - { - "a": [1, 1, 1, 1, 3, 3], - "b": [1, 1, 2, 2, 2, 2], - "y": [1, 2, 3, 4, 5, 6], - "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE + relativedelta(months=1), - TEST_DATE_NEXT_DAY + relativedelta(months=1), - ], - } - ) - output_train_df = forecast._build_train_dataframe( - observed_df, segment_settings=segment_settings - ) - expected_train_df = pd.DataFrame( + observed_data = pd.DataFrame( { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "ds": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "all_in": [0, 0], - "all_out": [ - 1, - 1, - ], - "just_end": [1, 0], + "a": ["A1", "A1", "A2", "A2", "A2"] * 2, + "b": ["B1", "B2", "B1", "B2", "B2"] * 2, + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE, + TEST_DATE, + TEST_DATE, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + TEST_DATE_NEXT_DAY, + ] + ), + "value": [1, 2, 3, 4, 5] * 2, } ) - pd.testing.assert_frame_equal( - output_train_df.reset_index(drop=True), expected_train_df - ) - output_train_wlog_df = forecast._build_train_dataframe( - observed_df, segment_settings=segment_settings, add_logistic_growth_cols=True - ) - expected_train_wlog_df = pd.DataFrame( + ensemble_object.fit(observed_data) + + 
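    # Note on the assertions below: for the A1 segment, "growth": "logistic" is
    # passed inside grid_parameters rather than as a top-level parameter, so the
    # autotuned model is expected to pick up per-segment logistic-growth bounds
    # when it is fit. The checks further down assert floor/cap of 0.5/1.5 for
    # A1-B1 (observed y == [1, 1]) and 1.0/3.0 for A1-B2 (observed y == [2, 2]),
    # i.e. the same 0.5x / 1.5x scaling of observed values used for the expected
    # floor and cap columns in the fit test above. The exact scaling here is
    # inferred from these assertions, not from the implementation itself.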
dates_to_predict = pd.DataFrame( { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], - "ds": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], - "floor": [1.5, 1.5], - "cap": [6.0, 6.0], + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] + ) } ) - assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) - pd.testing.assert_frame_equal( - output_train_wlog_df.reset_index(drop=True), - expected_train_wlog_df[output_train_wlog_df.columns], - ) - - -def test_build_predict_dataframe_no_regressors(forecast): - """test _build_predict with no regressors""" - regressor_list = [] - - grid_parameters = { - "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], - "changepoint_range": [0.8, 0.9, 1], - "n_changepoints": [30], - "weekly_seasonality": True, - "yearly_seasonality": True, - "growth": "logistic", - } - cv_settings = { - "initial": "366 days", - "period": "30 days", - "horizon": "30 days", - "parallel": "processes", - } - segment_settings = SegmentModelSettings( - segment={"a": 1, "b": 2}, - start_date=TEST_DATE_STR, - end_date=TEST_PREDICT_END_STR, - holidays=[], - regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=grid_parameters, - cv_settings=cv_settings, - ) + out = ensemble_object.predict(dates_to_predict).reset_index(drop=True) - # manually set trained_parameters, normally this would happen during training - segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} + for segment in ensemble_object.segment_models: + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + out_subset = out[ + (out["a"] == segment["segment"]["a"]) + & (out["b"] == segment["segment"]["b"]) + ] + model_value = segment["model"].model.value + expected = pd.DataFrame( + { + 0: [0, model_value], + "submission_date": pd.to_datetime( + [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] + ), + "a": [segment["segment"]["a"], segment["segment"]["a"]], + "b": [segment["segment"]["b"], segment["segment"]["b"]], + "forecast_parameters": [json.dumps(segment["model"]._get_parameters())] + * 2, + } + ) - dates_to_predict = pd.DataFrame( - { - "submission_date": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) + pd.testing.assert_frame_equal( + out_subset.reset_index(drop=True), expected.reset_index(drop=True) + ) - output_predict_df = forecast._build_predict_dataframe( - dates_to_predict, segment_settings=segment_settings - ) - expected_predict_df = pd.DataFrame( - { - "ds": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } - ) - pd.testing.assert_frame_equal( - output_predict_df.reset_index(drop=True), expected_predict_df - ) + # check that the growth attributes were set + if segment["segment"]["a"] == "A1": + if segment["segment"]["b"] == "B1": + assert segment["model"].logistic_growth_floor == 0.5 + assert segment["model"].logistic_growth_cap == 1.5 + elif segment["segment"]["b"] == "B2": + assert segment["model"].logistic_growth_floor == 1.0 + assert segment["model"].logistic_growth_cap == 3.0 - # test against but with add_logistic_growth_cols set to true - output_predict_wlog_df = 
forecast._build_predict_dataframe( - dates_to_predict, - segment_settings=segment_settings, - add_logistic_growth_cols=True, - ) - expected_predict_wlog_df = pd.DataFrame( - { - "ds": [ - TEST_DATE - relativedelta(months=1), - TEST_DATE_NEXT_DAY - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], - "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], - } - ) + # check the components + expected_components = ( + observed_data.loc[ + (observed_data["a"] == segment["segment"]["a"]) + & (observed_data["b"] == segment["segment"]["b"]), + ["submission_date", "value"], + ] + .rename(columns={"value": "y"}) + .copy() + ) + expected_components[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 - assert set(output_predict_wlog_df.columns) == set(expected_predict_wlog_df.columns) - pd.testing.assert_frame_equal( - output_predict_wlog_df.reset_index(drop=True), - expected_predict_wlog_df[output_predict_wlog_df.columns], - ) + components_df = segment["model"].components_df + assert set(expected_components.columns) == set(components_df.columns) + pd.testing.assert_frame_equal( + components_df.reset_index(drop=True), + expected_components[components_df.columns].reset_index(drop=True), + ) -def test_build_predict_dataframe(forecast): - """test _build_predict_dataframe including regressors""" - regressor_list = [ - { - "name": "all_in", - "description": "it's all in", - "start_date": TEST_DATE_STR, - "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), - }, +def test_set_segment_models_exception(mocker): + """test the exception for segment_models where + and exception is raised if a model_setting_split_dim + is specified that isn't in the data""" + # arbitrarily choose growth as a parameter + # to set in order to check the test + parameter_list = [ { - "name": "all_out", - "description": "it's all in", - "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), + "segment": {"c": "A1"}, + "parameters": { + "growth": "logistic", + "grid_parameters": { + "seasonality_prior_scale": [1, 2], + "holidays_prior_scale": [20, 10], + }, + }, }, { - "name": "just_end", - "description": "just the second one", - "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), + "segment": {"c": "A2"}, + "parameters": { + "growth": "A2", + "grid_parameters": { + "seasonality_prior_scale": [3, 4], + "holidays_prior_scale": [40, 30], + }, + }, }, ] - grid_parameters = { - "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], - "changepoint_range": [0.8, 0.9, 1], - "n_changepoints": [30], - "weekly_seasonality": True, - "yearly_seasonality": True, - "growth": "logistic", - } - cv_settings = { - "initial": "366 days", - "period": "30 days", - "horizon": "30 days", - "parallel": "processes", - } - segment_settings = SegmentModelSettings( - segment={"a": 1, "b": 2}, - start_date=TEST_DATE_STR, - end_date=TEST_PREDICT_END_STR, - holidays=[], - regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=grid_parameters, - cv_settings=cv_settings, - ) - - # set training_parameters, which is usually done in the fit method - segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} - - 
dates_to_predict = pd.DataFrame( - { - "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], - } - ) - - output_train_df = forecast._build_predict_dataframe( - dates_to_predict, - segment_settings=segment_settings, - ) - expected_train_df = pd.DataFrame( - { - "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], - } - ) - pd.testing.assert_frame_equal( - output_train_df.reset_index(drop=True), expected_train_df + mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) + mocker.patch.object( + ProphetAutotunerForecast, + "_get_crossvalidation_metric", + mock_get_crossvalidation_metric, ) + ensemble_object = FunnelForecast(parameters=parameter_list, segments=["a", "b"]) - # test again but with add_logistic_growth_cols set to true - output_train_wlog_df = forecast._build_predict_dataframe( - dates_to_predict, - segment_settings=segment_settings, - add_logistic_growth_cols=True, - ) - expected_train_wlog_df = pd.DataFrame( + observed_data = pd.DataFrame( { - "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], - "floor": [-1.0, -1.0], - "cap": [10.0, 10.0], + "a": ["A1", "A1", "A2", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2", "B2"], + "submission_date": [ + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + TEST_DATE_STR, + ], + "value": [1, 2, 3, 4, 5], } ) - assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) - pd.testing.assert_frame_equal( - output_train_wlog_df.reset_index(drop=True), - expected_train_wlog_df[output_train_wlog_df.columns], - ) + with pytest.raises( + ValueError, + match="Segment keys missing from metric hub segments: c", + ): + ensemble_object.fit(observed_df=observed_data) -def test_build_model(forecast): +def test_build_model(): """test build_model just runs the function and ensures no error is raised""" - regressor_list = [ - { - "name": "all_in", - "description": "it's all in", - "start_date": TEST_DATE_STR, - "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), - }, - { - "name": "all_out", - "description": "it's all in", - "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), - }, - { - "name": "just_end", - "description": "just the second one", - "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), - "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( - "%Y-%m-%d" - ), - }, - ] + regressor_list = ["post_esr_migration", "in_covid", "ad_click_bug"] # use holidays from holiday config file holiday_list = { @@ -1681,29 +1468,16 @@ def test_build_model(forecast): "horizon": "30 days", "parallel": "processes", } - segment_settings = SegmentModelSettings( - segment={"a": 1, "b": 2}, - start_date=TEST_DATE_STR, - end_date=TEST_PREDICT_END_STR, - holidays=[ProphetHoliday(**h) for h in holiday_list.values()], - regressors=[ProphetRegressor(**r) for r in regressor_list], + forecast = ProphetAutotunerForecast( + holidays=holiday_list.keys(), + regressors=regressor_list, grid_parameters=grid_parameters, cv_settings=cv_settings, ) - model = forecast._build_model( - segment_settings=segment_settings, - parameters={ - "changepoint_prior_scale": 0.01, - "changepoint_range": 0.8, - "n_changepoints": 30, - "weekly_seasonality": True, - "yearly_seasonality": True, - "growth": "logistic", - }, - ) + _ = forecast._build_model() - holiday_df = model.holidays + holiday_df = 
forecast.holidays expected_holidays = pd.concat( [ pd.DataFrame( diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py index c512e0c9..842740e6 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py @@ -52,6 +52,61 @@ def test_only_numeric(): pd.testing.assert_frame_equal(month_output, expected_month) +def test_only_numeric_with_additional(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-01-02", + "2020-01-02", + ], + "additional_col": ["A", "B", "A", "A", "B"], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + } + ) + + day_output = aggregate_to_period( + df, "day", additional_aggregation_columns=["additional_col"] + ) + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-01-02"), + ], + "additional_col": ["A", "B", "A", "B"], + "ints": [1, 2, 7, 5], + "floats": [10.0, 20.0, 70.0, 50.0], + } + ) + + pd.testing.assert_frame_equal(day_output, expected_day) + + month_output = aggregate_to_period( + df, "month", additional_aggregation_columns=["additional_col"] + ) + + expected_month = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-01"), + ], + "additional_col": ["A", "B"], + "ints": [8, 7], + "floats": [80.0, 70.0], + } + ) + + pd.testing.assert_frame_equal(month_output, expected_month) + + def test_with_string_and_numeric(): df = pd.DataFrame( { @@ -103,6 +158,55 @@ def test_with_string_and_numeric(): pd.testing.assert_frame_equal(month_output, expected_month) +def test_with_string_and_numeri_with_additional(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-01-02", + "2020-01-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + "string": ["A01", "B01", "A02", "A02", "B02"], + "additional_col": ["A", "B", "A", "A", "B"], + } + ) + + day_output = aggregate_to_period( + df, "day", additional_aggregation_columns=["additional_col"] + ) + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-01-02"), + ], + "additional_col": ["A", "B", "A", "B"], + "ints": [1, 2, 7, 5], + "floats": [10.0, 20.0, 70.0, 50.0], + "string": ["A01", "B01", "A02", "B02"], + } + ) + + pd.testing.assert_frame_equal(day_output, expected_day) + + # strings no longer have the same value within an aggregation category + # so error is expected + with pytest.raises( + ValueError, + match="String and Numeric dataframes have different length, likely due to strings not being unique up to aggregation", + ): + _ = aggregate_to_period( + df, "month", additional_aggregation_columns=["additional_col"] + ) + + def test_only_string(): df = pd.DataFrame( { @@ -148,6 +252,60 @@ def test_only_string(): pd.testing.assert_frame_equal(month_output, expected_month) +def test_only_string_with_additional(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "string": ["jan", "jan", "jan", "feb", "feb"], + "additional_col": ["jan", "jan", "jan", "feb", "feb"], + } + ) + + day_output = aggregate_to_period( + df, "day", 
additional_aggregation_columns=["additional_col"] + ) + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-02-01"), + pd.to_datetime("2020-02-02"), + ], + "string": ["jan", "jan", "feb", "feb"], + "additional_col": ["jan", "jan", "feb", "feb"], + } + ) + + assert set(day_output.columns) == set(expected_day.columns) + pd.testing.assert_frame_equal(day_output, expected_day[day_output.columns]) + + month_output = aggregate_to_period( + df, "month", additional_aggregation_columns=["additional_col"] + ) + + expected_month = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-02-01"), + ], + "string": ["jan", "feb"], + "additional_col": ["jan", "feb"], + } + ) + + assert set(month_output.columns) == set(expected_month.columns) + pd.testing.assert_frame_equal(month_output, expected_month[month_output.columns]) + + def test_non_unique_string_exception(): df = pd.DataFrame( { diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index adc9c4ba..928b7ba3 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -4,53 +4,31 @@ import pandas as pd import numpy as np import pytest -import collections -from kpi_forecasting.models.prophet_forecast import ProphetForecast +from kpi_forecasting.models.prophet_forecast import ( + ProphetForecast, + combine_forecast_observed, + aggregate_forecast_observed, + summarize, +) +from kpi_forecasting.configs.model_inputs import ProphetRegressor + # Arbitrarily choose some date to use for the tests TEST_DATE = date(2024, 1, 1) TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") -TEST_DATE_NEXT_DAY = date(2024, 1, 1) +TEST_DATE_NEXT_DAY = date(2024, 1, 2) TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") - - -@pytest.fixture -def forecast(): - A1_start_date = TEST_DATE_STR - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, - "cv_settings": {}, - }, - }, - } - - predict_start_date = TEST_DATE_NEXT_DAY_STR - # arbitarily set it a couple months in the future - predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") - return ProphetForecast( - model_type="test", - parameters=parameter_dict, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, - ) +TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) +TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") class MockModel: """Used in place of prophet.Prophet for testing purposes""" - def __init__(self, param1=0, param2=0, **kwargs): - self.value = param1 * param2 + def __init__(self, **kwargs): + self.value = 2 self.history = None def fit(self, df, *args, **kwargs): @@ -84,52 +62,66 @@ def predictive_samples(self, dates_to_predict): return {"yhat": {0: output}} -def mock_build_model(parameters): +def mock_build_model(self): """mocks the FunnelForecast build_model method""" - return MockModel( - **parameters, - ) + return MockModel(holidays=self.holidays, regressors=self.regressors) + +@pytest.fixture +def forecast(mocker): + parameter_dict = {"uncertainty_samples": 1} -def mock_aggregate_forecast_observed( - 
forecast_df, observed_df, period, numpy_aggregations, percentiles -): - """Mocks the aggregate_forecast_observed function defined in ProphetForecast - and inherited in FunnelForecast. - This function is tested extensively in test_prophet_forecast - so we can make dummy outputs for tests related to it""" + mocker.patch.object(ProphetForecast, "_build_model", mock_build_model) - # add dummy columns where aggregated metrics woudl go - percentile_columns = [f"p{el}" for el in percentiles] - output_forecast_df = forecast_df.copy() - output_forecast_df[numpy_aggregations + percentile_columns] = 0 - return output_forecast_df, observed_df.copy() + # arbitarily set it a couple months in the future + return ProphetForecast(**parameter_dict) -def test_under_fit(forecast, mocker): - """test the _fit method""" +def test_predict(forecast): + """testing _predict""" - observed_data = pd.DataFrame( + observed_df = pd.DataFrame( { + "y": [0, 1], "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, + ], + } + ) + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, - ], + ] } ) - mocker.patch.object(forecast, "_build_model", mock_build_model) - forecast._fit(observed_data) + forecast.fit(observed_df) - # checking that history is set in the mocked Model ensures fit was called on it - pd.testing.assert_frame_equal( - observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history + # to make sure the validation works set the number of simulations + out = forecast.predict(dates_to_predict).reset_index(drop=True) + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + # self.value is 2 + expected = pd.DataFrame( + { + 0: [0, 2], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } ) + pd.testing.assert_frame_equal(out, expected) + -def test_fit(forecast, mocker): +def test_fit(forecast): """test the fit function. It is inherited from BaseForecast and calls _fit with the proper object attributes. 
Test looks very similar to that for _fit""" @@ -143,216 +135,324 @@ def test_fit(forecast, mocker): ], } ) - mocker.patch.object(forecast, "_build_model", mock_build_model) - forecast.observed_df = observed_data - forecast.fit() + forecast.fit(observed_data) # checking that history is set in the mocked Model ensures fit was called on it pd.testing.assert_frame_equal( observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history ) - assert forecast.trained_at is not None - -def test_combine_forecast_observed(mocker, forecast): - """tests the _combine_forecast_observed method""" - # forecast predictions are set with the - # mock_aggregate_forecast_observed function so they - # can be ommited here +def test_aggregate_forecast_to_day(): + """tests the aggregate_forecast_observed method in the case + where the observed and forecasted have no overlap and the aggregation + happens at the day level""" + test_date_samples = np.arange(1000) + test_next_date_samples = np.arange(1000) * 2 forecast_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } + [ + { + **{"submission_date": TEST_DATE}, + **{i: el for i, el in enumerate(test_date_samples)}, + }, + { + **{"submission_date": TEST_DATE_NEXT_DAY}, + **{i: el for i, el in enumerate(test_next_date_samples)}, + }, + ] ) # rows with negative values are those expected to be removed # by filters in summarize + # arbitrarily subtract 1 month so there's not overlap observed_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), ], "value": [10, 20], } ) - mocker.patch.object( - forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed - ) - numpy_aggregations = ["mean"] percentiles = [10, 50, 90] - output_df = forecast._combine_forecast_observed( - forecast_df, - observed_df, - period="period", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, - ) - observed_expected_df = pd.DataFrame( + forecast_summarized_output, observed_summarized_output = ( + aggregate_forecast_observed( + forecast_df, + observed_df, + period="day", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + ) + observed_summarized_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, + pd.to_datetime(TEST_DATE - relativedelta(months=1)), + pd.to_datetime(TEST_DATE_NEXT_DAY - relativedelta(months=1)), ], "value": [10, 20], - "measure": ["observed", "observed"], - "source": ["historical", "historical"], + "aggregation_period": ["day", "day"], } ) - # 4x2 columns, 4 metrics (mean, p10, p50, p90) - forecast_expected_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], - "value": [0] * 8, - "source": ["forecast"] * 8, - } + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": pd.to_datetime(TEST_DATE), + "mean": np.mean(test_date_samples), + "p10": np.percentile(test_date_samples, 10), + "p50": np.percentile(test_date_samples, 50), + "p90": np.percentile(test_date_samples, 90), + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), + "mean": np.mean(test_next_date_samples), + "p10": np.percentile(test_next_date_samples, 10), + "p50": np.percentile(test_next_date_samples, 50), + 
"p90": np.percentile(test_next_date_samples, 90), + "aggregation_period": "day", + }, + ] ) - # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( - ["submission_date", "measure"] - ) - assert set(expected.columns) == set(output_df.columns) - # force value columns to be floats in both cases to make check easier - numeric_cols = ["value", "value_low", "value_mid", "value_high"] pd.testing.assert_frame_equal( - output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), - expected[output_df.columns].reset_index(drop=True), + forecast_summarized_output, forecast_summarized_expected_df ) - # should not be any nulls outside the metric column - non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] - assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + pd.testing.assert_frame_equal( + observed_summarized_output, observed_summarized_expected_df + ) -def test_under_summarize(mocker, forecast): - """testing _summarize""" - # forecast predictions are set with the - # mock_aggregate_forecast_observed function so they - # can be ommited here +def test_aggregate_forecast_to_month(): + """tests the aggregate_forecast_observed method in the case + where the observed and forecasted have no overlap and the aggregation + happens at the day level""" + test_date_samples = np.arange(1000) + test_next_date_samples = np.arange(1000) * 2 forecast_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } + [ + { + **{"submission_date": TEST_DATE, "forecast_parameters": "test_month"}, + **{i: el for i, el in enumerate(test_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "forecast_parameters": "test_month", + }, + **{i: el for i, el in enumerate(test_next_date_samples)}, + }, + ] ) # rows with negative values are those expected to be removed # by filters in summarize + # arbitrarily subtract 1 month so there's not overlap observed_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), ], "value": [10, 20], } ) - mocker.patch.object( - forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed - ) - numpy_aggregations = ["mean"] percentiles = [10, 50, 90] - output_df = forecast._summarize( - forecast_df, - observed_df, - period="period", - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, + forecast_summarized_output, observed_summarized_output = ( + aggregate_forecast_observed( + forecast_df, + observed_df, + period="month", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + ) + + # TEST_DATE should be the first of the month + observed_summarized_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime(TEST_DATE - relativedelta(months=1)), + ], + "value": [30], + "aggregation_period": ["month"], + } ) - observed_expected_df = pd.DataFrame( + + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": pd.to_datetime(TEST_DATE), + "mean": np.mean(test_date_samples + test_next_date_samples), + "p10": np.percentile(test_date_samples + test_next_date_samples, 10), + "p50": np.percentile(test_date_samples + test_next_date_samples, 50), + "p90": np.percentile(test_date_samples + test_next_date_samples, 90), + "forecast_parameters": "test_month", + "aggregation_period": "month", + }, + ] + ) + + pd.testing.assert_frame_equal( + 
forecast_summarized_output, forecast_summarized_expected_df + ) + + pd.testing.assert_frame_equal( + observed_summarized_output, observed_summarized_expected_df + ) + + +def test_aggregate_forecast_to_month_extra_agg_col(): + """tests the aggregate_forecast_observed method in the case + where the observed and forecasted have no overlap and the aggregation + happens at the day level""" + test_date_samples = np.arange(1000) + test_next_date_samples = np.arange(1000) * 2 + forecast_df = pd.DataFrame( + [ + { + **{ + "submission_date": TEST_DATE, + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_next_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE, + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(2 * test_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(2 * test_next_date_samples)}, + }, + ] + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + # arbitrarily subtract 1 month so there's not overlap + observed_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), ], "value": [10, 20], - "measure": ["observed", "observed"], - "source": ["historical", "historical"], + "a": ["A1", "A1"], } ) - # 4x2 columns, 4 metrics (mean, p10, p50, p90) - forecast_expected_df = pd.DataFrame( + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast_summarized_output, observed_summarized_output = ( + aggregate_forecast_observed( + forecast_df, + observed_df, + period="month", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + additional_aggregation_columns=["a"], + ) + ) + + # TEST_DATE should be the first of the month + observed_summarized_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, + pd.to_datetime(TEST_DATE - relativedelta(months=1)), ], - "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], - "value": [0] * 8, - "source": ["forecast"] * 8, + "value": [30], + "a": ["A1"], + "aggregation_period": "month", } ) - # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( - ["submission_date", "measure"] + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": pd.to_datetime(TEST_DATE), + "mean": np.mean(test_date_samples + test_next_date_samples), + "p10": np.percentile(test_date_samples + test_next_date_samples, 10), + "p50": np.percentile(test_date_samples + test_next_date_samples, 50), + "p90": np.percentile(test_date_samples + test_next_date_samples, 90), + "a": "A1", + "forecast_parameters": "A1", + "aggregation_period": "month", + }, + { + "submission_date": pd.to_datetime(TEST_DATE), + "mean": 2 * np.mean(test_date_samples + test_next_date_samples), + "p10": 2 + * np.percentile(test_date_samples + test_next_date_samples, 10), + "p50": 2 + * np.percentile(test_date_samples + test_next_date_samples, 50), + "p90": 2 + * np.percentile(test_date_samples + test_next_date_samples, 90), + "a": "A2", + "forecast_parameters": "A2", + 
"aggregation_period": "month", + }, + ] ) - expected["aggregation_period"] = "period" - assert set(expected.columns) == set(output_df.columns) - # force value columns to be floats in both cases to make check easier - numeric_cols = ["value", "value_low", "value_mid", "value_high"] + assert set(forecast_summarized_output.columns) == set( + forecast_summarized_output.columns + ) pd.testing.assert_frame_equal( - output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), - expected[output_df.columns].reset_index(drop=True), + forecast_summarized_output[forecast_summarized_expected_df.columns], + forecast_summarized_expected_df, ) - # should not be any nulls outside the metric column - non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] - assert not pd.isna(output_df[non_metric_columns]).any(axis=None) - - -def test_summarize(mocker, forecast): - """testing summarize""" - # create dummy metric hub object to when meta data from - # it is added we don't get an error - MetricHub = collections.namedtuple( - "MetricHub", - ["alias", "app_name", "slug", "min_date", "max_date"], + assert set(observed_summarized_output.columns) == set( + observed_summarized_expected_df.columns + ) + pd.testing.assert_frame_equal( + observed_summarized_output[observed_summarized_expected_df.columns], + observed_summarized_expected_df, ) - dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) - # forecast predictions are set with the - # mock_aggregate_forecast_observed function so they - # can be ommited here +def test_aggregate_forecast_observed_overlap_to_day(): + """tests the aggregate_forecast_observed method in the case + where the observed and forecasted overlap and the aggregation + happens at the day level""" + test_date_samples = np.arange(1000) + test_next_date_samples = np.arange(1000) * 2 forecast_df = pd.DataFrame( - { - "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ], - } + [ + { + **{"submission_date": TEST_DATE}, + **{i: el for i, el in enumerate(test_date_samples)}, + }, + { + **{"submission_date": TEST_DATE_NEXT_DAY}, + **{i: el for i, el in enumerate(test_next_date_samples)}, + }, + ] ) # rows with negative values are those expected to be removed @@ -367,174 +467,640 @@ def test_summarize(mocker, forecast): } ) - mocker.patch.object( - forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed - ) - numpy_aggregations = ["mean"] percentiles = [10, 50, 90] + forecast_summarized_output, observed_summarized_output = ( + aggregate_forecast_observed( + forecast_df, + observed_df, + period="day", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + ) + observed_summarized_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime(TEST_DATE), + pd.to_datetime(TEST_DATE_NEXT_DAY), + ], + "value": [10, 20], + "aggregation_period": ["day", "day"], + } + ) - forecast.observed_df = observed_df - forecast.forecast_df = forecast_df - forecast.metric_hub = dummy_metric_hub + # add values from observed because of overlap + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": pd.to_datetime(TEST_DATE), + "mean": np.mean(test_date_samples + 10), + "p10": np.percentile(test_date_samples + 10, 10), + "p50": np.percentile(test_date_samples + 10, 50), + "p90": np.percentile(test_date_samples + 10, 90), + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), + "mean": np.mean(test_next_date_samples + 20), + "p10": 
np.percentile(test_next_date_samples + 20, 10), + "p50": np.percentile(test_next_date_samples + 20, 50), + "p90": np.percentile(test_next_date_samples + 20, 90), + "aggregation_period": "day", + }, + ] + ) - # timestamp attributes created by fit and predict - # must be added manuall - forecast.collected_at = "" - forecast.trained_at = "" - forecast.predicted_at = "" - forecast.metadata_params = "" + pd.testing.assert_frame_equal( + forecast_summarized_output, forecast_summarized_expected_df + ) - numpy_aggregations = ["mean"] - percentiles = [10, 50, 90] - forecast.summarize( - periods=["period1", "period2"], - numpy_aggregations=numpy_aggregations, - percentiles=percentiles, + pd.testing.assert_frame_equal( + observed_summarized_output, observed_summarized_expected_df ) - output_df = forecast.summary_df - observed_expected_df = pd.DataFrame( +def test_aggregate_forecast_observed_overlap_to_day_with_additional(): + """tests the aggregate_forecast_observed method in the case + where the observed and forecasted overlap and the aggregation + happens at the day level""" + test_date_samples = np.arange(1000) + test_next_date_samples = np.arange(1000) * 2 + forecast_df = pd.DataFrame( + [ + { + **{ + "submission_date": TEST_DATE, + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A1", + "forecast_parameters": "A1", + }, + **{i: el for i, el in enumerate(test_next_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE, + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(2 * test_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "a": "A2", + "forecast_parameters": "A2", + }, + **{i: el for i, el in enumerate(2 * test_next_date_samples)}, + }, + ] + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( { "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, ], "value": [10, 20], - "measure": ["observed", "observed"], - "source": ["historical", "historical"], + "a": ["A1", "A2"], } ) - # 4x2 columns, 4 metrics (mean, p10, p50, p90) - forecast_expected_df = pd.DataFrame( + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast_summarized_output, observed_summarized_output = ( + aggregate_forecast_observed( + forecast_df, + observed_df, + period="day", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + additional_aggregation_columns=["a"], + ) + ) + observed_summarized_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, + pd.to_datetime(TEST_DATE), + pd.to_datetime(TEST_DATE_NEXT_DAY), ], - "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], - "value": [0] * 8, - "source": ["forecast"] * 8, + "value": [10, 20], + "a": ["A1", "A2"], + "aggregation_period": ["day", "day"], } ) - # concat in same order to make our lives easier - expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( - ["submission_date", "measure"] + # add values from observed because of overlap + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": pd.to_datetime(TEST_DATE), + "a": "A1", + "forecast_parameters": "A1", + "mean": np.mean(test_date_samples + 10), + "p10": np.percentile(test_date_samples + 10, 10), + "p50": np.percentile(test_date_samples + 10, 
50), + "p90": np.percentile(test_date_samples + 10, 90), + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), + "a": "A1", + "forecast_parameters": "A1", + "mean": np.mean(test_next_date_samples), + "p10": np.percentile(test_next_date_samples, 10), + "p50": np.percentile(test_next_date_samples, 50), + "p90": np.percentile(test_next_date_samples, 90), + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE), + "a": "A2", + "forecast_parameters": "A2", + "mean": np.mean(2 * test_date_samples), + "p10": np.percentile(2 * test_date_samples, 10), + "p50": np.percentile(2 * test_date_samples, 50), + "p90": np.percentile(2 * test_date_samples, 90), + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), + "a": "A2", + "forecast_parameters": "A2", + "mean": np.mean(2 * test_next_date_samples + 20), + "p10": np.percentile(2 * test_next_date_samples + 20, 10), + "p50": np.percentile(2 * test_next_date_samples + 20, 50), + "p90": np.percentile(2 * test_next_date_samples + 20, 90), + "aggregation_period": "day", + }, + ] + ) + + assert set(forecast_summarized_expected_df.columns) == set( + forecast_summarized_output.columns ) - expected1 = expected.copy() - expected2 = expected.copy() - expected1["aggregation_period"] = "period1" - expected2["aggregation_period"] = "period2" - - expected = pd.concat([expected1, expected2]) - - # not going to check all the metadata columns - # in assert_frame_equal. Just make sure they're there - metadata_columns = { - "metric_alias", - "metric_hub_app_name", - "metric_hub_slug", - "metric_start_date", - "metric_end_date", - "metric_collected_at", - "forecast_start_date", - "forecast_end_date", - "forecast_trained_at", - "forecast_predicted_at", - "forecast_parameters", - } - assert set(expected.columns) | metadata_columns == set(output_df.columns) - # force value columns to be floats in both cases to make check easier - numeric_cols = ["value", "value_low", "value_mid", "value_high"] pd.testing.assert_frame_equal( - output_df.sort_values(["submission_date", "aggregation_period", "measure"])[ - expected.columns - ].reset_index(drop=True), - expected.sort_values( - ["submission_date", "aggregation_period", "measure"] + forecast_summarized_output[forecast_summarized_expected_df.columns] + .sort_values(["submission_date", "a"]) + .reset_index(drop=True), + forecast_summarized_expected_df.sort_values( + ["submission_date", "a"] ).reset_index(drop=True), ) - # should not be any nulls outside the metric column - non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] - assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + assert set(observed_summarized_expected_df.columns) == set( + observed_summarized_output.columns + ) + pd.testing.assert_frame_equal( + observed_summarized_output[observed_summarized_expected_df.columns] + .sort_values(["submission_date", "a"]) + .reset_index(drop=True), + observed_summarized_expected_df.sort_values( + ["submission_date", "a"] + ).reset_index(drop=True), + ) -def test_under_predict(mocker, forecast): - """testing _predict""" - # this ensures forecast is using MockModel - mocker.patch.object(forecast, "_build_model", mock_build_model) +def test_aggregate_forecast_observed_overlap_to_month(): + """tests the aggregate_forecast_observed method in the case + where the observed and forecasted overlap and the aggregation + happens at the day level""" + test_date_samples = np.arange(1000) + 
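    # Each forecast_df row built below carries 1000 simulated draws in numbered
    # columns (0..999). Because the observed and forecast dates overlap and the
    # aggregation is to the month, the expected values are constructed by summing
    # the two days' samples element-wise and adding the overlapping observed
    # total (10 + 20 = 30) before taking the mean and percentiles, e.g.
    #   np.mean(test_date_samples + test_next_date_samples + 30)
    # This describes how the expectations below are assembled, not the
    # aggregation code itself.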
test_next_date_samples = np.arange(1000) * 2 + forecast_df = pd.DataFrame( + [ + { + **{"submission_date": TEST_DATE}, + **{i: el for i, el in enumerate(test_date_samples)}, + }, + { + **{"submission_date": TEST_DATE_NEXT_DAY}, + **{i: el for i, el in enumerate(test_next_date_samples)}, + }, + ] + ) + # rows with negative values are those expected to be removed + # by filters in summarize observed_df = pd.DataFrame( { - "y": [0, 1], "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, ], + "value": [10, 20], } ) - dates_to_predict = pd.DataFrame( + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast_summarized_output, observed_summarized_output = ( + aggregate_forecast_observed( + forecast_df, + observed_df, + period="month", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + ) + observed_summarized_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - ] + pd.to_datetime(TEST_DATE), + ], + "value": [30], + "aggregation_period": ["month"], } ) - forecast.observed_df = observed_df - forecast.parameters = {"param1": 1, "param2": 2} - forecast.fit() - out = forecast._predict(dates_to_predict).reset_index(drop=True) - # in MockModel, the predictive_samples method sets the output to - # np.arange(len(dates_to_predict)) * self.value for one column called 0 - # this helps ensure the forecast_df in segment_models is set properly - expected = pd.DataFrame( + # add values from observed because of overlap + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": pd.to_datetime(TEST_DATE), + "mean": np.mean(test_date_samples + test_next_date_samples + 30), + "p10": np.percentile( + test_date_samples + test_next_date_samples + 30, 10 + ), + "p50": np.percentile( + test_date_samples + test_next_date_samples + 30, 50 + ), + "p90": np.percentile( + test_date_samples + test_next_date_samples + 30, 90 + ), + "aggregation_period": "month", + }, + ] + ) + + pd.testing.assert_frame_equal( + forecast_summarized_output, forecast_summarized_expected_df + ) + + pd.testing.assert_frame_equal( + observed_summarized_output, observed_summarized_expected_df + ) + + +def test_aggregate_forecast_observed_overlap_to_month_with_additional(): + """tests the aggregate_forecast_observed method in the case + where the observed and forecasted overlap and the aggregation + happens at the day level""" + test_date_samples = np.arange(1000) + test_next_date_samples = np.arange(1000) * 2 + forecast_df = pd.DataFrame( + [ + { + **{ + "submission_date": TEST_DATE, + "forecast_parameters": "A1", + "a": "A1", + }, + **{i: el for i, el in enumerate(test_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "forecast_parameters": "A1", + "a": "A1", + }, + **{i: el for i, el in enumerate(test_next_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE, + "forecast_parameters": "A2", + "a": "A2", + }, + **{i: el for i, el in enumerate(2 * test_date_samples)}, + }, + { + **{ + "submission_date": TEST_DATE_NEXT_DAY, + "forecast_parameters": "A2", + "a": "A2", + }, + **{i: el for i, el in enumerate(2 * test_next_date_samples)}, + }, + ] + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( { - 0: [0, 2], "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, ], + "value": [10, 20], + "a": ["A1", "A2"], } ) - pd.testing.assert_frame_equal(out, expected) + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast_summarized_output, 
observed_summarized_output = ( + aggregate_forecast_observed( + forecast_df, + observed_df, + period="month", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + additional_aggregation_columns=["a"], + ) + ) + observed_summarized_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime(TEST_DATE), + pd.to_datetime(TEST_DATE), + ], + "value": [10, 20], + "a": ["A1", "A2"], + "aggregation_period": ["month", "month"], + } + ) - # test predict while we're here + # add values from observed because of overlap + forecast_summarized_expected_df = pd.DataFrame( + [ + { + "submission_date": pd.to_datetime(TEST_DATE), + "forecast_parameters": "A1", + "a": "A1", + "mean": np.mean(test_date_samples + test_next_date_samples + 10), + "p10": np.percentile( + test_date_samples + test_next_date_samples + 10, 10 + ), + "p50": np.percentile( + test_date_samples + test_next_date_samples + 10, 50 + ), + "p90": np.percentile( + test_date_samples + test_next_date_samples + 10, 90 + ), + "aggregation_period": "month", + }, + { + "submission_date": pd.to_datetime(TEST_DATE), + "forecast_parameters": "A2", + "a": "A2", + "mean": np.mean( + 2 * test_date_samples + 2 * test_next_date_samples + 20 + ), + "p10": np.percentile( + 2 * test_date_samples + 2 * test_next_date_samples + 20, 10 + ), + "p50": np.percentile( + 2 * test_date_samples + 2 * test_next_date_samples + 20, 50 + ), + "p90": np.percentile( + 2 * test_date_samples + 2 * test_next_date_samples + 20, 90 + ), + "aggregation_period": "month", + }, + ] + ) - forecast.dates_to_predict = dates_to_predict - forecast.number_of_simulations = 1 # so that _validate doesn't break - forecast.predict() + assert set(forecast_summarized_expected_df.columns) == set( + forecast_summarized_output.columns + ) + pd.testing.assert_frame_equal( + forecast_summarized_output[forecast_summarized_expected_df.columns] + .sort_values(["submission_date", "a"]) + .reset_index(drop=True), + forecast_summarized_expected_df.sort_values( + ["submission_date", "a"] + ).reset_index(drop=True), + ) - out = forecast.forecast_df + assert set(observed_summarized_expected_df.columns) == set( + observed_summarized_output.columns + ) + pd.testing.assert_frame_equal( + observed_summarized_output[observed_summarized_expected_df.columns] + .sort_values(["submission_date", "a"]) + .reset_index(drop=True), + observed_summarized_expected_df.sort_values( + ["submission_date", "a"] + ).reset_index(drop=True), + ) - # in MockModel, the predictive_samples method sets the output to - # np.arange(len(dates_to_predict)) * self.value for one column called 0 - # this helps ensure the forecast_df in segment_models is set properly - expected = pd.DataFrame( + +def test_combine_forecast_observed(): + """tests the combine_forecast_observed method""" + forecast_df = pd.DataFrame( { - 0: [0, 2], "submission_date": [ TEST_DATE, TEST_DATE_NEXT_DAY, ], + "mean": [0, 0], + "p10": [0, 0], + "p50": [0, 0], + "p90": [0, 0], + "aggregation_period": ["I get removed"] * 2, } ) - pd.testing.assert_frame_equal(out, expected) - assert forecast.predicted_at is not None + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "value": [10, 20], + "aggregation_period": ["I get removed"] * 2, + } + ) + + output_df = combine_forecast_observed(forecast_df, observed_df) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + 
"value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + assert set(expected.columns) == set(output_df.columns) + + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + assert not pd.isna(output_df).any(axis=None) + + +def test_summarize(): + """testing _summarize""" + test_date_samples = np.arange(1000) + test_next_date_samples = np.arange(1000) * 2 + forecast_df = pd.DataFrame( + [ + { + **{"submission_date": TEST_DATE}, + **{i: el for i, el in enumerate(test_date_samples)}, + }, + { + **{"submission_date": TEST_DATE_NEXT_DAY}, + **{i: el for i, el in enumerate(test_next_date_samples)}, + }, + ] + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "value": [10, 20], + } + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = summarize( + forecast_df, + observed_df, + periods=["day"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + forecast_parameters="", + ) + + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime(TEST_DATE), + pd.to_datetime(TEST_DATE_NEXT_DAY), + ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + "aggregation_period": ["day", "day"], + } + ) + + # add values from observed because of overlap + forecast_expected_df = pd.DataFrame( + [ + { + "submission_date": pd.to_datetime(TEST_DATE), + "measure": "mean", + "value": np.mean(test_date_samples + 10), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), + "measure": "mean", + "value": np.mean(test_next_date_samples + 20), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE), + "measure": "p10", + "value": np.percentile(test_date_samples + 10, 10), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), + "measure": "p10", + "value": np.percentile(test_next_date_samples + 20, 10), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE), + "measure": "p50", + "value": np.percentile(test_date_samples + 10, 50), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE_NEXT_DAY), + "measure": "p50", + "value": np.percentile(test_next_date_samples + 20, 50), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": pd.to_datetime(TEST_DATE), + "measure": "p90", + "value": np.percentile(test_date_samples + 10, 90), + "source": "forecast", + "aggregation_period": "day", + }, + { + "submission_date": 
pd.to_datetime(TEST_DATE_NEXT_DAY), + "measure": "p90", + "value": np.percentile(test_next_date_samples + 20, 90), + "source": "forecast", + "aggregation_period": "day", + }, + ] + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + expected["aggregation_period"] = "day" + expected["forecast_parameters"] = "" + + assert set(expected.columns) == set(output_df.columns) + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + assert not pd.isna(output_df).any(axis=None) def test_summarize_non_overlapping_day(): @@ -546,18 +1112,12 @@ def test_summarize_non_overlapping_day(): ) predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") - forecast = ProphetForecast( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, - ) observed_submission_dates = pd.date_range( pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) ).date - predict_submission_dates = forecast.dates_to_predict["submission_date"].values + predict_submission_dates = pd.date_range( + pd.to_datetime(predict_start_date), pd.to_datetime(predict_end_date) + ).date observed_df = pd.DataFrame( { @@ -584,8 +1144,8 @@ def test_summarize_non_overlapping_day(): dict(**{"submission_date": predict_submission_dates}, **forecast_data) ) - output_df = forecast._combine_forecast_observed( - forecast_df, observed_df, "day", ["mean", "median"], [50] + output_df = summarize( + forecast_df, observed_df, ["day"], ["mean", "median"], [50], "" ) expected_observed_df = observed_df.copy() @@ -630,6 +1190,9 @@ def test_summarize_non_overlapping_day(): [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] ) + expected_df["aggregation_period"] = "day" + expected_df["forecast_parameters"] = "" + assert set(expected_df.columns) == set(output_df.columns) columns = expected_df.columns expected_df_compare = ( @@ -661,21 +1224,12 @@ def test_summarize_non_overlapping_month(): predict_start_date = "2124-04-01" predict_end_date = "2124-05-31" - print(observed_start_date, observed_end_date) - print(predict_start_date, predict_end_date) - - forecast = ProphetForecast( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, - ) observed_submission_dates = pd.date_range( pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) ).date - predict_submission_dates = forecast.dates_to_predict["submission_date"].values + predict_submission_dates = pd.date_range( + pd.to_datetime(predict_start_date), pd.to_datetime(predict_end_date) + ).date observed_df = pd.DataFrame( { @@ -697,8 +1251,8 @@ def test_summarize_non_overlapping_month(): dict(**{"submission_date": predict_submission_dates}, **forecast_data) ) - output_df = forecast._combine_forecast_observed( - forecast_df, observed_df, "month", ["mean", "median"], [50] + output_df = summarize( + forecast_df, observed_df, ["month"], ["mean", "median"], [50], "" ) expected_observed_dates = sorted( @@ -759,6 +1313,9 @@ def test_summarize_non_overlapping_month(): [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] ) + expected_df["aggregation_period"] = "month" + expected_df["forecast_parameters"] = "" + assert 
set(expected_df.columns) == set(output_df.columns) columns = expected_df.columns expected_df_compare = ( @@ -783,19 +1340,12 @@ def test_summarize_overlapping_day(): predict_start_date = TEST_DATE_STR predict_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") - forecast = ProphetForecast( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, - ) observed_submission_dates = pd.date_range( pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) ).date - predict_submission_dates = forecast.dates_to_predict["submission_date"].values - + predict_submission_dates = pd.date_range( + pd.to_datetime(predict_start_date), pd.to_datetime(predict_end_date) + ).date observed_df = pd.DataFrame( { "submission_date": observed_submission_dates, @@ -821,8 +1371,8 @@ def test_summarize_overlapping_day(): dict(**{"submission_date": predict_submission_dates}, **forecast_data) ) - output_df = forecast._combine_forecast_observed( - forecast_df, observed_df, "day", ["mean", "median"], [50] + output_df = summarize( + forecast_df, observed_df, ["day"], ["mean", "median"], [50], "" ) expected_observed_df = observed_df.copy() @@ -869,6 +1419,9 @@ def test_summarize_overlapping_day(): [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] ) + expected_df["aggregation_period"] = "day" + expected_df["forecast_parameters"] = "" + assert set(expected_df.columns) == set(output_df.columns) columns = expected_df.columns expected_df_compare = ( @@ -897,19 +1450,12 @@ def test_summarize_overlapping_month(): predict_start_date = "2124-01-01" predict_end_date = "2124-02-28" - forecast = ProphetForecast( - model_type="test", - parameters={}, - use_all_us_holidays=None, - start_date=predict_start_date, - end_date=predict_end_date, - metric_hub=None, - ) observed_submission_dates = pd.date_range( pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) ).date - predict_submission_dates = forecast.dates_to_predict["submission_date"].values - + predict_submission_dates = pd.date_range( + pd.to_datetime(predict_start_date), pd.to_datetime(predict_end_date) + ).date observed_df = pd.DataFrame( { "submission_date": observed_submission_dates, @@ -935,8 +1481,8 @@ def test_summarize_overlapping_month(): dict(**{"submission_date": predict_submission_dates}, **forecast_data) ) - output_df = forecast._combine_forecast_observed( - forecast_df, observed_df, "month", ["mean", "median"], [50] + output_df = summarize( + forecast_df, observed_df, ["month"], ["mean", "median"], [50], "" ) expected_observed_dates = sorted( @@ -1003,6 +1549,9 @@ def test_summarize_overlapping_month(): [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] ) + expected_df["aggregation_period"] = "month" + expected_df["forecast_parameters"] = "" + assert set(expected_df.columns) == set(output_df.columns) columns = expected_df.columns expected_df_compare = ( @@ -1018,3 +1567,552 @@ def test_summarize_overlapping_month(): pd.testing.assert_frame_equal( expected_df_compare, output_df_compare, check_exact=False ) + + +def test_summarize_overlapping_month_and_day(): + # choose arbitrary year for the start and end dates + # the first date of two different months is chosen + # this is a simple way to check that the aggregation + # for multiple periods is working + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-01" + + observed_submission_dates = [ + 
pd.to_datetime(observed_start_date), + pd.to_datetime(observed_end_date), + ] + predict_submission_dates = [ + pd.to_datetime(observed_start_date), + pd.to_datetime(observed_end_date), + ] + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": [1] * len(observed_submission_dates), + } + ) + + # these are the samples generated + # the mean and median are the aggregates used + test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = np.median(test_samples) + + # the same samples are used for both dates, so every aggregate + # should be identical across rows and aggregation periods + forecast_array = np.stack( + [test_samples] * len(predict_submission_dates), + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = summarize( + forecast_df, observed_df, ["month", "day"], ["mean", "median"], [50], "" + ) + + expected_observed_dates = sorted( + pd.to_datetime(observed_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + expected_observed_df = pd.DataFrame( + { + "submission_date": expected_observed_dates, + "source": ["historical", "historical"], + "measure": ["observed", "observed"], + "value": [1, 1], + } + ) + + forecast_observed_dates = sorted( + pd.to_datetime(forecast_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + + forecast_mean_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["mean", "mean"], + "value": [ + test_mean + 1, + test_mean + 1, + ], + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["median", "median"], + "value": [ + test_median + 1, + test_median + 1, + ], + } + ) + + forecast_p50_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["p50", "p50"], + "value": [ + test_median + 1, + test_median + 1, + ], + } + ) + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + expected_df["aggregation_period"] = "month" + + # day will have the same values because the two days are + # the first day of two different months, so the monthly aggregation + # leaves the dates unchanged; the only thing that changes + # is the aggregation_period + expected_df_day = expected_df.copy() + expected_df_day["aggregation_period"] = "day" + + expected_df = pd.concat([expected_df_day, expected_df]) + expected_df["forecast_parameters"] = "" + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + .sort_values(["submission_date", "source", "measure", "aggregation_period"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", "measure", "aggregation_period"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) + + +def test_add_regressors(forecast): + """test _add_regressors + the test case for each element of regressor_list_raw is indicated by its name""" + + # choose arbitrary dates for the regressor windows + # name indicates the relationship of the window + # to the timeframe of the data as defined in the ds + # column of
df below + regressor_list_raw = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all out", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second half", + "start_date": "2124-01-03", + "end_date": "2124-02-06", + }, + { + "name": "just_middle", + "description": "just the middle two", + "start_date": "2124-01-02", + "end_date": "2124-01-03", + }, + ] + + regressor_list = [ProphetRegressor(**r) for r in regressor_list_raw] + + df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + } + ) + + output_df = forecast._add_regressors(df, regressors=regressor_list) + + expected_df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + "all_in": [0, 0, 0, 0], + "all_out": [1, 1, 1, 1], + "just_end": [1, 1, 0, 0], + "just_middle": [1, 0, 0, 1], + } + ) + + assert set(output_df.columns) == set(expected_df.columns) + pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) + + +def test_add_regressors_partial(forecast): + """test add regressors when some fields aren't set + test case for each element of regressor_list_raw is indicated in name""" + + # choose arbitrary dates for dates + # name indicates the relationship of the window + # to the timeframe of the data as defined in the ds + # column of df below + regressor_list_raw = [ + { + "name": "just_end", + "description": "just the second half", + "start_date": "2124-01-03", + }, + { + "name": "just_start", + "description": "just the beginning", + "end_date": "2124-01-03", + }, + ] + + regressor_list = [ProphetRegressor(**r) for r in regressor_list_raw] + + df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + } + ) + + output_df = forecast._add_regressors(df, regressors=regressor_list) + + expected_df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + "just_end": [1, 1, 0, 0], + "just_start": [0, 0, 0, 1], + } + ) + + assert set(output_df.columns) == set(expected_df.columns) + pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) + + +def test_build_train_dataframe_no_regressors(forecast): + """test _build_train_dataframe with no regressors""" + # only the growth and regressors attributes matter for train_dataframe + # so they can be manually set here + regressor_list = [] + forecast.regressors = regressor_list + + observed_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + output_train_df = forecast._build_train_dataframe(observed_df) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + # test again but with add_logistic_growth_cols set to true + forecast.growth = "logistic" + output_train_wlog_df = 
forecast._build_train_dataframe(observed_df) + expected_train_wlog_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_train_dataframe(forecast): + """test _build_train_dataframe and include regressors""" + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), + }, + ] + # only the growth and regressors attributes matter for train_dataframe + # so they can be manually set here + forecast.regressors = [ProphetRegressor(**r) for r in regressor_list] + + observed_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + output_train_df = forecast._build_train_dataframe(observed_df) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "all_in": [0, 0], + "all_out": [ + 1, + 1, + ], + "just_end": [1, 0], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + # now with logistic growth set + forecast.growth = "logistic" + output_train_wlog_df = forecast._build_train_dataframe(observed_df) + expected_train_wlog_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_predict_dataframe_no_regressors(forecast): + """test _build_predict with no regressors""" + # only the growth and regressors attributes matter for train_dataframe + # so they can be manually set here + regressor_list = [] + forecast.regressors = regressor_list + + # manually set trained_parameters, normally this would happen during training + forecast.logistic_growth_floor = -1.0 + forecast.logistic_growth_cap = 10.0 + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + output_predict_df = forecast._build_predict_dataframe(dates_to_predict) + expected_predict_df = pd.DataFrame( + { + "ds": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + pd.testing.assert_frame_equal( + 
output_predict_df.reset_index(drop=True), expected_predict_df + ) + + # test again, this time with growth set to logistic + forecast.growth = "logistic" + output_predict_wlog_df = forecast._build_predict_dataframe(dates_to_predict) + expected_predict_wlog_df = pd.DataFrame( + { + "ds": [ + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], + "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], + } + ) + + assert set(output_predict_wlog_df.columns) == set(expected_predict_wlog_df.columns) + pd.testing.assert_frame_equal( + output_predict_wlog_df.reset_index(drop=True), + expected_predict_wlog_df[output_predict_wlog_df.columns], + ) + + +def test_build_predict_dataframe(forecast): + """test _build_predict_dataframe including regressors""" + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), + }, + ] + + # only the growth and regressors attributes matter for _build_predict_dataframe + # so they can be manually set here + forecast.regressors = [ProphetRegressor(**r) for r in regressor_list] + + # manually set trained_parameters, normally this would happen during training + forecast.logistic_growth_floor = -1.0 + forecast.logistic_growth_cap = 10.0 + + dates_to_predict = pd.DataFrame( + { + "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], + } + ) + + output_train_df = forecast._build_predict_dataframe(dates_to_predict) + expected_train_df = pd.DataFrame( + { + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + # test again, this time with growth set to logistic + forecast.growth = "logistic" + output_train_wlog_df = forecast._build_predict_dataframe(dates_to_predict) + expected_train_wlog_df = pd.DataFrame( + { + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + "floor": [-1.0, -1.0], + "cap": [10.0, 10.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + )
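For reference, the module-level summarize helper exercised by the tests above can be driven directly. A minimal sketch follows; the import path is assumed to match the application code, and the dates and sample values are illustrative only:

    import numpy as np
    import pandas as pd

    # assumed import path for the refactored module-level helper
    from kpi_forecasting.models.prophet_forecast import summarize

    # one row per forecast date, one column per posterior sample
    samples = np.arange(1000)
    forecast_df = pd.DataFrame(
        [{**{"submission_date": pd.to_datetime("2124-01-01")}, **dict(enumerate(samples))}]
    )
    observed_df = pd.DataFrame(
        {"submission_date": [pd.to_datetime("2123-12-31")], "value": [10]}
    )

    # aggregates the samples per period (mean, p10, p50, p90), stacks them with
    # the observed rows, and tags each row with source, measure,
    # aggregation_period, and forecast_parameters, as asserted in the tests above
    summary = summarize(
        forecast_df,
        observed_df,
        periods=["day", "month"],
        numpy_aggregations=["mean"],
        percentiles=[10, 50, 90],
        forecast_parameters="",
    )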