CDCgov
diff --git a/iup/__init__.py b/iup/__init__.py
Lines changed: 31 additions & 17 deletions
diff --git a/iup/models.py b/iup/models.py
Lines changed: 100 additions & 165 deletions
@@ -160,30 +160,44 @@ def validate(self):
         )
 
 
-def date_to_season(
-    date: pl.Expr, season_start_month: int, season_start_day: int = 1
+def to_season(
+    date: pl.Expr,
+    season_start_month: int,
+    season_end_month: int,
+    season_start_day: int = 1,
+    season_end_day: int = 1,
 ) -> pl.Expr:
-    """Extract the overwinter disease season from a date.
+    """
+    Identify the overwinter season from a date.
 
-    Dates in year Y before the season start (e.g., Sep 1) are in the second part of
-    the season (i.e., in season Y-1/Y). Dates in year Y after the season start are in
-    season Y/Y+1. E.g., 2023-10-07 and 2024-04-18 are both in "2023/2024".
+    Every year, there is a season end (e.g., May 1) and a season start (e.g., Sep 1).
+    Dates on or before the season end belong to the season that began the prior year
+    (e.g., Feb 1, 2020 is in 2019/2020). Dates on or after the season start belong to
+    the season beginning that year (e.g., Oct 1, 2020 is in 2020/2021). Dates strictly
+    between season end and season start are in no season (e.g., June 1) and yield null.
 
     Args:
-        date: Dates in an coverage data frame.
-        season_start_month: First month of the overwinter disease season.
-        season_start_day: First day of the first month of the overwinter disease season.
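+        date: Expression of dates to assign to seasons.
+        season_start_month: Month in which the season starts.
+        season_end_month: Month in which the season ends.
+        season_start_day: Day of `season_start_month` on which the season starts.
+        season_end_day: Day of `season_end_month` on which the season ends.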
 
     Returns:
-        Seasons for each date.
+        Season label like "2020/2021", or null for dates outside any season.
     """
+    assert (season_start_month, season_start_day) > (
+        season_end_month,
+        season_end_day,
+    ), "Only overwinter seasons are supported"
 
-    # for every date, figure out the season breakpoint in that year
-    season_start = pl.date(date.dt.year(), season_start_month, season_start_day)
+    # year of this date
+    y = date.dt.year()
+    # start and end dates of seasons in this year
+    end = pl.date(y, season_end_month, season_end_day)
+    start = pl.date(y, season_start_month, season_start_day)
 
-    # what is the first year in the two-year season indicator?
-    date_year = date.dt.year()
-    year1 = pl.when(date < season_start).then(date_year - 1).otherwise(date_year)
+    # first year of the two-year season
+    sy1 = pl.when(date <= end).then(y - 1).when(date >= start).then(y).otherwise(None)
 
-    year2 = year1 + 1
-    return pl.format("{}/{}", year1, year2)
+    return pl.when(sy1.is_null()).then(None).otherwise(pl.format("{}/{}", sy1, sy1 + 1))
@@ -4,10 +4,9 @@
 os.environ["JAX_PLATFORMS"] = "cpu"
 
 import abc
-import calendar
 import datetime
 import inspect
-from typing import Any, List
+from typing import Any
 
 import jax.numpy as jnp
 import numpy as np
@@ -381,202 +380,138 @@ def __init__(
         self.quantiles = quantiles
         self.season = season
         self.params = params
-        self.months = self._month_order(self.season["start_month"])
-        self.end_month_index = self.months.index(
-            datetime.date(
-                self.season["end_year"],
-                self.season["end_month"],
-                self.season["end_day"],
-            ).strftime("%b")
-        )
 
         # other params include max_depth, min_samples_split, min_samples_leaf
         rf_keys = {"n_estimators"}
-
         self.rf_params = {k: v for k, v in params.items() if k in rf_keys}
 
-        self.data = self._preprocess(
-            self.raw_data,
-            self.months,
-            self.end_month_index,
-            self.date_column,
-        )
+        data_t = self.raw_data.with_columns(
+            t=pl.col(self.date_column).map_elements(self._month_in_season)
+        ).sort(["season", "geography", "t"])
 
-    @classmethod
-    def _preprocess(
-        cls, data: pl.DataFrame, months, end_month_index, date_column
-    ) -> pl.DataFrame:
-        out = (
-            data.with_columns(
-                t=pl.col(date_column)
-                .dt.to_string("%b")
-                .map_elements(lambda x: months.index(x) - end_month_index, pl.Int64)
-            )
-            .filter(pl.col("t").is_between(1 - end_month_index, 0))
-            .select(["season", "geography", "t", "estimate"])
-            .with_columns(pl.format("t={}", pl.col("t")))
-            .pivot(on="t", values="estimate")
+        # preprocessing
+        self.date_crosswalk = data_t.select("season", date_column, "t").unique()
+
+        self.data = (
+            data_t.select(["season", "geography", "t", "estimate"])
+            .pivot(on="t", values="estimate", sort_columns=True)
+            # impute zero uptake at start of season
+            .with_columns(pl.coalesce(pl.col("0"), 0.0))
+            # drop season/geography rows that still have any missing values
             .drop_nulls()
             .sort(["season", "geography"])
         )
 
-        return out
-
-    def fit(self) -> Self:
-        self.enc = CoverageEncoder()
-        self.enc.fit(self.data)
-
-        target_season = iup.date_to_season(
-            pl.lit(self.forecast_date),
-            season_start_month=self.season["start_month"],
-            season_start_day=self.season["start_day"],
-        )
-
-        forecast_t = (
-            self.months.index(self.forecast_date.strftime("%b")) - self.end_month_index
-        )
+        self.forecast_season = pl.select(
+            iup.to_season(
+                pl.lit(self.forecast_date),
+                season_start_month=self.season["start_month"],
+                season_end_month=self.season["end_month"],
+                season_end_day=self.season["end_day"],
+                season_start_day=self.season["start_day"],
+            )
+        ).item()
+        self.forecast_month = self._month_in_season(self.forecast_date)
+
+    def _month_in_season(self, date: datetime.date) -> int:
+        assert date.day == 1
+        year = date.year
+        # start of a season that's in this year
+        ssiy = datetime.date(year, self.season["start_month"], self.season["start_day"])
+
+        # calendar year in which this date's season started
+        if date < ssiy:
+            ssy = year - 1
+        else:
+            ssy = year
 
-        end_date = datetime.date(
-            self.season["end_year"], self.season["end_month"], self.season["end_day"]
-        )
+        return (year - ssy) * 12 + (date.month - self.season["start_month"])
 
-        # this is true only when target_season is the last season in the data, which is our case for now
-        assert self.data.select(target_season).item() == self.data["season"].max()
-        data_fit = self.data.filter(pl.col("season") != target_season)
+    def fit(self) -> Self:
+        self.enc = Encoder().fit(self.data)
 
-        # fit all the data after forecast_t
-        features = ["season", "geography"] + [
-            f"t={t}"
-            for t in range(
-                1 - self.months.index(end_date.strftime("%b")), forecast_t + 1
-            )
+        self.X_features = ["season", "geography"] + [
+            str(t)
+            for t in range(0, self.forecast_month + 1)
+            if str(t) in self.data.columns
         ]
+        self.y_features = [
+            str(t)
+            for t in range(self.forecast_month + 1, 12)
+            if str(t) in self.data.columns
+        ]
+
+        # fit the model
+        data_fit = self.data.filter(pl.col("season") < self.forecast_season)
+        X_fit = self.enc.encode(data_fit.select(self.X_features))
+        y_fit = data_fit.select(self.y_features).to_numpy()
 
-        X_fit = self.enc.encode(data_fit.select(features))
-        y_fit = data_fit.select(
-            [f"t={target_t}" for target_t in range(forecast_t + 1, 1)]
-        ).to_numpy()
+        # sklearn complains if you pass a column vector rather than a 1d array
+        if y_fit.shape[1] == 1:
+            y_fit = y_fit.ravel()
 
         self.model = RandomForestRegressor(**self.rf_params).fit(X_fit, y_fit)
 
         return self
 
     def predict(self) -> pl.DataFrame:
-        assert self.model is not None
-
-        # include in-sample and out-of-sample prediction
-        data_pred = self.data
-
-        forecast_t = (
-            self.months.index(self.forecast_date.strftime("%b")) - self.end_month_index
-        )
-
-        end_date = datetime.date(
-            self.season["end_year"], self.season["end_month"], self.season["end_day"]
-        )
-
-        features = ["season", "geography"] + [
-            f"t={t}"
-            for t in range(
-                1 - self.months.index(end_date.strftime("%b")), forecast_t + 1
-            )
-        ]
-
-        X_pred = self.enc.encode(data_pred.select(features))
-        t_cols = [f"t={t}" for t in range(forecast_t + 1, 1)]
-        index_cols = ["season", "geography", "quantile"]
-
-        pred = np.array([tree.predict(X_pred) for tree in self.model.estimators_])
-        pred = {f"q={k}": np.quantile(pred, k, axis=0) for k in self.quantiles}
-        all_pred = pl.DataFrame()
-
-        for k, v in pred.items():
-            df = pl.DataFrame(v, schema=[f"t={t}" for t in range(forecast_t + 1, 1)])
-            df = df.with_columns(
-                quantile=pl.lit(k).str.replace("q=", "").cast(pl.Float64)
-            )
-
-            pred_df = pl.concat(
-                [data_pred.select(["season", "geography"]), df], how="horizontal"
+        # make the forecast
+        data_pred = self.data.filter(pl.col("season") >= self.forecast_season)
+
+        X_data = data_pred.select(self.X_features)
+        assert X_data.shape[0] > 0, f"RF prediction for {self.forecast_date} failed"
+        X_pred = self.enc.encode(X_data)
+
+        # make predictions using each tree
+        y_tree = np.stack([tree.predict(X_pred) for tree in self.model.estimators_])
+
+        return iup.QuantileForecast(
+            pl.concat(
+                [
+                    self._postprocess(
+                        data_pred=data_pred,
+                        y_pred=np.quantile(y_tree, q=q, axis=0),
+                        quantile=q,
+                    )
+                    for q in self.quantiles
+                ]
             )
+        )
 
-            all_pred = pl.concat([all_pred, pred_df])
+    def _postprocess(
+        self, data_pred: pl.DataFrame, y_pred: np.ndarray, quantile: float
+    ) -> pl.DataFrame:
+        if len(y_pred.shape) == 1:
+            y_pred = y_pred.reshape(-1, 1)
 
-        all_pred = (
-            all_pred.unpivot(
-                on=t_cols,
-                index=index_cols,
-                variable_name="target_t",
+        return (
+            data_pred.select(["season", "geography"])
+            .hstack(pl.DataFrame(y_pred, schema=self.y_features))
+            .unpivot(
+                on=self.y_features,
+                index=["season", "geography"],
+                variable_name="t",
                 value_name="estimate",
             )
-            .with_columns(
-                forecast_date=self.forecast_date,
-                target_index=(
-                    pl.col("target_t").str.replace("t=", "").cast(pl.Int8)
-                    + self.end_month_index
-                ),  # convert back to month index
-                target_year=pl.col("season").str.extract(r"^(\d{4})/\d{4}"),
-            )
-            .with_columns(
-                season_start_date=pl.date(
-                    pl.col("target_year"),
-                    self.season["start_month"],
-                    self.season["start_day"],
-                ),
-                target_index=pl.format("{}mo", pl.col("target_index")),
-            )
-            .with_columns(
-                pl.col("season_start_date")
-                .dt.offset_by(pl.col("target_index"))
-                .alias("time_end")
-            )
-            .drop(["target_index", "target_year", "season_start_date", "target_t"])
+            .with_columns(pl.col("t").cast(pl.Int64))
+            .join(self.date_crosswalk, on=["season", "t"], how="left")
+            .drop("t")
+            .with_columns(forecast_date=self.forecast_date, quantile=quantile)
         )
 
-        return all_pred
-
-    @staticmethod
-    def _month_order(season_start_month: int) -> List[str]:
-        return [
-            calendar.month_abbr[i]
-            for i in list(range(season_start_month, 12 + 1))
-            + list(range(1, season_start_month))
-        ]
 
-
-class CoverageEncoder:
-    def __init__(self, categorical_feature_names: tuple = ("season", "geography")):
-        self.categorical_feature_names = categorical_feature_names
+class Encoder:
+    def __init__(self, categorical_features: tuple = ("season", "geography")):
+        self.categorical_features = categorical_features
         self.enc = OneHotEncoder(sparse_output=False)
-        self.categorical_features = None
-
-    def fit(self, data: pl.DataFrame):
-        self.enc.fit(data.select(self.categorical_feature_names).to_numpy())
 
-        self.categorical_features = list(
-            self._iter_features(self.categorical_feature_names, self.enc.categories_)
-        )
-
-    @staticmethod
-    def _iter_features(names, categories):
-        for feature, values in zip(names, categories):
-            for value in values:
-                yield (feature, value)
+    def fit(self, data: pl.DataFrame) -> Self:
+        self.enc.fit(data.select(self.categorical_features).to_numpy())
+        return self
 
     def encode(self, data: pl.DataFrame) -> np.ndarray:
-        X_enc = self.enc.transform(
-            data.select(self.categorical_feature_names).to_numpy()
-        )
-        X_pass = data.drop(self.categorical_feature_names).to_numpy()
+        X_enc = self.enc.transform(data.select(self.categorical_features).to_numpy())
+        X_pass = data.drop(self.categorical_features).to_numpy()
 
         assert isinstance(X_enc, np.ndarray)
         return np.asarray(np.hstack((X_enc, X_pass)))
-
-    def categories(self, data: pl.DataFrame):
-        if self.categorical_features is None:
-            raise RuntimeError
-        else:
-            return self.categorical_features + [
-                ("unencoded", col)
-                for col in data.drop(self.categorical_feature_names).columns
-            ]