Jettison data classes

swo · swo · commit 1812ba765548 · 2026-04-26T11:26:49.000-04:00
diff --git a/iup/__init__.py b/iup/__init__.py
@@ -1,163 +1,4 @@
-from typing import List
-
 import polars as pl
-from polars.datatypes.classes import DataTypeClass
-
-
-class Data(pl.DataFrame):
-    """
-    Abstract class for observed data and forecast data.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.validate()
-
-    def validate(self):
-        raise NotImplementedError("Subclasses must implement this method.")
-
-    def assert_in_schema(self, names_types: dict[str, DataTypeClass]):
-        """Verify that columns of the expected types are present in the data frame.
-
-        Args:
-            names_types: Column names and types mapping.
-        """
-        for name, type_ in names_types.items():
-            if name not in self.schema.names():
-                raise RuntimeError(f"Column '{name}' not found")
-            elif (
-                name in self.schema.names() and (name, type_) not in self.schema.items()
-            ):
-                actual_type = self.schema.to_python()[name]
-                raise RuntimeError(
-                    f"Column '{name}' has type {actual_type}, not {type_}"
-                )
-            else:
-                assert (name, type_) in self.schema.items()
-
-
-class CoverageData(Data):
-    def validate(self):
-        """Must have time_end and estimate columns; can have more."""
-        self.assert_in_schema({"time_end": pl.Date, "estimate": pl.Float64})
-
-
-class IncidentCoverageData(CoverageData):
-    def validate(self):
-        super().validate()
-        if not self["estimate"].is_between(-1.0, 1.0).all():
-            bad_values = (
-                self.filter(pl.col("estimate").is_between(-1.0, 1.0).not_())["estimate"]
-                .unique()
-                .to_list()
-            )
-            raise ValueError(
-                f"Incident coverage `estimate` must be have values between -1 and +1. "
-                f"Values included {bad_values}"
-            )
-
-    def to_cumulative(
-        self, groups: List[str,] | None, prev_cumulative: pl.DataFrame | None = None
-    ) -> "CumulativeCoverageData":
-        """Convert incident to cumulative coverage data.
-
-        Cumulative sum of incident coverage gives the cumulative coverage.
-        Optionally, additional cumulative coverage from before the start of
-        the incident data may be provided.
-        Even if no groups are specified, the data must at least be grouped by season.
-
-        Args:
-            groups: Names of the columns of grouping factors, or None. If `None`, then
-                data will be grouped by `"season"`.
-            prev_cumulative: Cumulative coverage from before the start of the incident
-                data, for each group, or None. If `None`, group by `"season"`.
-
-        Returns:
-            Cumulative coverage on each date in the input incident coverage data.
-        """
-        if groups is None:
-            groups = ["season"]
-
-        out = self.with_columns(estimate=pl.col("estimate").cum_sum().over(groups))
-
-        if prev_cumulative is not None:
-            out = out.join(prev_cumulative, on=groups)
-
-            out = out.with_columns(
-                estimate=pl.col("estimate") + pl.col("last_cumulative")
-            ).drop("last_cumulative")
-
-        return CumulativeCoverageData(out)
-
-
-class CumulativeCoverageData(CoverageData):
-    def validate(self):
-        super().validate()
-        assert self["estimate"].is_between(0.0, 1.0).all(), (
-            "Cumulative coverage `estimate` must be a proportion"
-        )
-
-    def to_incident(self, groups: List[str,] | None) -> IncidentCoverageData:
-        """Convert cumulative to incident coverage data.
-
-        Because the first report date for each group is often rollout,
-        incident coverage on the first report date is 0.
-
-        Args:
-            groups: Names of the columns of grouping factors, or None. If `None`,
-                then data will be grouped by `"season"`.
-
-        Returns:
-            Incident coverage on each date in the input cumulative coverage data.
-        """
-        if groups is None:
-            groups = ["season"]
-
-        out = self.with_columns(
-            estimate=pl.col("estimate").diff().over(groups).fill_null(0)
-        )
-
-        return IncidentCoverageData(out)
-
-
-class QuantileForecast(Data):
-    """
-    Class for forecast with quantiles.
-    Save for future.
-    """
-
-    def validate(self):
-        self.assert_in_schema(
-            {"time_end": pl.Date, "quantile": pl.Float64, "estimate": pl.Float64}
-        )
-
-        assert self["quantile"].is_between(0.0, 1.0).all(), (
-            "quantiles must be between 0 and 1"
-        )
-
-
-class PointForecast(QuantileForecast):
-    """
-    Class for forecast with point estimate
-    A subclass when quantile is 50%
-    For now, enforce the "quantile50" to be "estimate"
-    """
-
-    def validate(self):
-        super().validate()
-        assert (self["quantile"] == 0.50).all()
-
-
-class SampleForecast(Data):
-    """
-    Class for forecast with posterior distribution.
-    Save for future.
-    """
-
-    def validate(self):
-        self.assert_in_schema(
-            {"time_end": pl.Date, "sample_id": pl.UInt64, "estimate": pl.Float64}
-        )
 
 
 def to_season(
diff --git a/iup/models.py b/iup/models.py
@@ -51,7 +51,7 @@ class LPLModel(CoverageModel):
 
     def __init__(
         self,
-        data: iup.CumulativeCoverageData,
+        data: pl.DataFrame,
         forecast_date: datetime.date,
         params: dict[str, Any],
         season: dict[str, Any],
@@ -361,13 +361,13 @@ def predict(self) -> pl.DataFrame:
             )
         )
 
-        return iup.QuantileForecast(data_pred.explode(["quantile", "estimate"]))
+        return data_pred.explode(["quantile", "estimate"])
 
 
 class RFModel(CoverageModel):
     def __init__(
         self,
-        data: iup.CumulativeCoverageData,
+        data: pl.DataFrame,
         params: dict[str, Any],
         season: dict[str, Any],
         forecast_date: datetime.date,
@@ -465,17 +465,15 @@ def predict(self) -> pl.DataFrame:
         # make predictions using each tree
         y_tree = np.stack([tree.predict(X_pred) for tree in self.model.estimators_])
 
-        return iup.QuantileForecast(
-            pl.concat(
-                [
-                    self._postprocess(
-                        data_pred=data_pred,
-                        y_pred=np.quantile(y_tree, q=q, axis=0),
-                        quantile=q,
-                    )
-                    for q in self.quantiles
-                ]
-            )
+        return pl.concat(
+            [
+                self._postprocess(
+                    data_pred=data_pred,
+                    y_pred=np.quantile(y_tree, q=q, axis=0),
+                    quantile=q,
+                )
+                for q in self.quantiles
+            ]
         )
 
     def _postprocess(
diff --git a/scripts/preprocess.py b/scripts/preprocess.py
@@ -5,11 +5,11 @@
 import polars as pl
 import yaml
 
-from iup import CumulativeCoverageData, to_season
+from iup import to_season
 
 
 def preprocess(
-    raw_data: pl.LazyFrame,
+    raw_data: pl.DataFrame,
     start_year: int,
     end_year: int,
     season_start_month: int,
@@ -18,7 +18,7 @@ def preprocess(
     season_end_day: int,
     geographies: Optional[List[str] | None],
     date_col: str = "time_end",
-) -> CumulativeCoverageData:
+) -> pl.DataFrame:
     """
     Preprocess the raw data (Filter the raw data with certain states and seasons, add season column).
 
@@ -37,13 +37,13 @@ def preprocess(
 
     """
 
-    def geo_filter(df: pl.LazyFrame) -> pl.LazyFrame:
+    def geo_filter(df: pl.DataFrame) -> pl.DataFrame:
         if geographies is None:
             return df
         else:
             return df.filter(pl.col("geography").is_in(geographies))
 
-    data = (
+    return (
         raw_data.filter(
             pl.col("geography_type") == pl.lit("admin1"),
             pl.col("geography")
@@ -66,11 +66,8 @@ def geo_filter(df: pl.LazyFrame) -> pl.LazyFrame:
             pl.col("season").is_null().not_(),
         )
         .pipe(geo_filter)
-        .collect()
     )
 
-    return CumulativeCoverageData(data)
-
 
 if __name__ == "__main__":
     p = argparse.ArgumentParser()
@@ -82,7 +79,7 @@ def geo_filter(df: pl.LazyFrame) -> pl.LazyFrame:
     with open(args.config) as f:
         config = yaml.safe_load(f)
 
-    raw_data = pl.scan_parquet(args.input)
+    raw_data = pl.read_parquet(args.input)
 
     assert isinstance(config, dict)
     geographies = config.get("geographies", None)
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,8 +1,6 @@
 import polars as pl
 import pytest
 
-import iup
-
 
 @pytest.fixture
 def frame():
@@ -80,6 +78,4 @@ def frame():
         schema_overrides={"time_end": pl.Date},
     )
 
-    frame = iup.CumulativeCoverageData(frame)
-
     return frame
diff --git a/tests/test_data.py b/tests/test_data.py