Skip to content

Commit c676f00

Browse files
authored
Added check and stratification
Dev checks
2 parents 37ceb19 + 4b686ef commit c676f00

File tree

3 files changed

+202
-42
lines changed

3 files changed

+202
-42
lines changed

src/vaskify/createdata.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,4 +81,6 @@ def create_test_data(
8181
2,
8282
) # check if all get same random or not...
8383

84+
data["id_company"] = data["id_company"].astype(str)
85+
8486
return data

src/vaskify/detect.py

Lines changed: 167 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
# %%
1111
import logging
12+
import re
1213

1314
import numpy as np
1415
import pandas as pd
@@ -31,9 +32,14 @@ def __init__(
3132
id_nr: String variable for the name of the variable to identify units with.
3233
logger_level: Detail level for information output. Choose between 'debug','info','warning','error' and 'critical'.
3334
"""
35+
# Check data
36+
self._check_data(data, id_nr=id_nr)
37+
38+
# Create self variables
3439
self.data = data
3540
self.id_nr = id_nr
3641

42+
# Start logging
3743
logging_dict = {
3844
"debug": 10,
3945
"info": 20,
@@ -52,6 +58,54 @@ def __init__(
5258
console_handler.setFormatter(formatter)
5359
self.logger.addHandler(console_handler)
5460

61+
@staticmethod
62+
def _check_data(
63+
data: pd.DataFrame,
64+
y_var: str = "",
65+
time_var: str = "",
66+
id_nr: str = "",
67+
) -> None:
68+
"""Check if the data contains the necessary columns, correct data types, and valid date format.
69+
70+
Args:
71+
data: The DataFrame to check.
72+
y_var: The variable of interest to check.
73+
time_var: String variable for indicating the time period.
74+
id_nr: String variable for the identifier.
75+
76+
Raises:
77+
ValueError: If any of the checks fail.
78+
"""
79+
required_columns = [y_var, time_var, id_nr]
80+
for col in required_columns:
81+
if col and col not in data.columns:
82+
mes = f"Missing column: {col}"
83+
raise ValueError(mes)
84+
if id_nr and not pd.api.types.is_string_dtype(data[id_nr]):
85+
mes = f"{id_nr} should be a string."
86+
raise ValueError(mes)
87+
88+
if y_var and not pd.api.types.is_numeric_dtype(data[y_var]):
89+
mes = f"{y_var} should be numeric."
90+
raise ValueError(mes)
91+
92+
if time_var:
93+
if not pd.api.types.is_string_dtype(data[time_var]):
94+
mes = f"{time_var} should be a string."
95+
raise ValueError(mes)
96+
97+
date_format_pattern = re.compile(
98+
r"^\d{4}(-\d{2}(-\d{2})?|-(Q[1-4]|W(0[1-9]|[1-4][0-9]|5[0-3]))|-\d{3)$",
99+
)
100+
101+
if (
102+
not data[time_var]
103+
.apply(lambda x: bool(date_format_pattern.match(x)))
104+
.all()
105+
):
106+
mes = f"{time_var} should be in the format 'YYYY', 'YYYY-Qq', 'YYYY-MM','YYYY-Www','YYYY-MM-DD', 'YYYY-DDD'."
107+
raise ValueError(mes)
108+
55109
def change_logging_level(self, logger_level: str) -> None:
56110
"""Change the logging print level.
57111
@@ -82,7 +136,7 @@ def thousand_error(
82136
83137
Args:
84138
y_var: The variable of interest to check.
85-
time_var: String variable for indicating the time period. This should be in a standard format: 'YYYY', 'YYYY-Mm', 'YYYY-Kk'.
139+
time_var: String variable for indicating the time period. This should be in a ISO 8601 standard format for example: 'YYYY', 'YYYY-MM', 'YYYY-MM-DD' or a SSB standard like 'YYYY-Qq'.
86140
lower_bound: Float variable for the lower bound log factor for defining an outlier.
87141
upper_bound: Float variable for the upper bound log factor for defining an outlier.
88142
flag: String for the name of the flag variable to add to the data. Default is 'flag_thousand'.
@@ -93,13 +147,14 @@ def thousand_error(
93147
Returns:
94148
Data frame containing a flag variable for identified outliers or a dataframe containing only the outliers.
95149
"""
150+
# Check data
151+
self._check_data(self.data, y_var=y_var, time_var=time_var)
152+
96153
if (not impute_var) and (impute):
97154
impute_var = f"{y_var}_imputed"
98155
mes = f"No impute variable given so using {impute_var}"
99156
self.logger.info(mes)
100157

101-
# check data - add in
102-
103158
# Find differences by sorting first - not efficient but works
104159
data = self.data.sort_values(by=[self.id_nr, time_var]).reset_index(drop=True)
105160
log10_diff = data.groupby(self.id_nr)[y_var].transform(
@@ -130,7 +185,9 @@ def thousand_error(
130185
mask_outlier_units = data[self.id_nr].isin(outlier_ids)
131186
output = data.loc[mask_outlier_units, :]
132187
else:
133-
self.logger.warning("output_format is not valid. Use 'data' or 'outliers'")
188+
output = data
189+
mes = "output_format is not valid. Use 'data' or 'outliers'. Returning 'data' format."
190+
self.logger.warning(mes)
134191

135192
return output
136193

@@ -148,7 +205,7 @@ def accumulation_error(
148205
149206
Args:
150207
y_var: The variable of interest to check.
151-
time_var: String variable for indicating the time period. This should be in a standard format: 'YYYY', 'YYYY-Mm', 'YYYY-Kk'.
208+
time_var: String variable for indicating the time period. This should be in a ISO 8601 standard format for example: 'YYYY', 'YYYY-MM', 'YYYY-MM-DD' or a SSB standard like 'YYYY-Qq'.
152209
error: Float for the allowed error factor.
153210
flag: String for the name of the flag variable to add to the data. Default is 'flag_thousand'.
154211
impute: Boolean for whether to impute the flagged observations. Default is False. (NOT IMPLEMENTED)
@@ -158,13 +215,14 @@ def accumulation_error(
158215
Returns:
159216
Data frame containing a flag variable for identified outliers or a dataframe containing only the outliers.
160217
"""
218+
# Check data
219+
self._check_data(self.data, y_var=y_var, time_var=time_var)
220+
161221
if (not impute_var) and (impute):
162222
impute_var = f"{y_var}_imputed"
163223
mes = f"No imputed variable name given so {impute_var} is being used"
164224
self.logger.info(mes)
165225

166-
# check data
167-
168226
# Sort and get previous period data
169227
data = self.data.sort_values(by=[self.id_nr, time_var]).reset_index(drop=True)
170228
expected_turnover = data.groupby(self.id_nr)[y_var].shift(1)
@@ -201,10 +259,48 @@ def accumulation_error(
201259

202260
return output
203261

262+
@staticmethod
263+
def _calculate_hb(
264+
x1: pd.Series, # type: ignore[type-arg]
265+
x2: pd.Series, # type: ignore[type-arg]
266+
pu: float,
267+
pa: float,
268+
pc: float,
269+
percentiles: tuple[float, float],
270+
) -> pd.DataFrame:
271+
"""Calculate HB method."""
272+
rat = x1 / x2
273+
med_ratio = rat.median()
274+
s_ratio = np.where(
275+
rat >= med_ratio,
276+
rat / med_ratio - 1,
277+
1 - med_ratio / rat,
278+
)
279+
280+
max_y = pd.concat([x1, x2], axis=1).max(axis=1)
281+
e_ratio = s_ratio * max_y**pu
282+
283+
e_ratio_q = e_ratio.quantile([percentiles[0], 0.5, percentiles[1]]).to_numpy()
284+
q1, q2, q3 = e_ratio_q
285+
286+
if q2 != 0:
287+
ell = q2 - pc * max(q2 - q1, abs(q2 * pa))
288+
eul = q2 + pc * max(q3 - q2, abs(q2 * pa))
289+
else:
290+
ell = q2 - pc * max(q2 - q1, pa)
291+
eul = q2 + pc * max(q3 - q2, pa)
292+
293+
lower_limit = med_ratio * max_y**pu / (max_y**pu - ell)
294+
upper_limit = med_ratio * (max_y**pu + eul) / max_y**pu
295+
296+
return pd.DataFrame({"lower_limit": lower_limit, "upper_limit": upper_limit})
297+
204298
def hb(
205299
self,
206300
y_var: str,
207301
time_var: str,
302+
time_periods: list[str] | None = None,
303+
strata_var: str = "",
208304
pu: float = 0.5,
209305
pa: float = 0.05,
210306
pc: float = 20,
@@ -218,74 +314,102 @@ def hb(
218314
219315
Args:
220316
y_var: String for the name of the variable of interest to check.
221-
time_var: String variable for indicating the time period. This should be in a standard format: 'YYYY', 'YYYY-Mm', 'YYYY-Kk'.
317+
time_var: String variable for indicating the time period. This should be in a ISO 8601 standard format for example: 'YYYY', 'YYYY-MM', 'YYYY-MM-DD' or a SSB standard like 'YYYY-Qq'.
318+
time_periods: List of strings for the two time periods to compare. Default None, in which case it is assumed that the time variable contains exactly two time periods.
319+
strata_var: String variable for stratification. Default is blank ("").
222320
pu: Parameter that adjusts for different level of the variables. Default value 0.5.
223321
pa: Parameter that adjusts for small differences between the median and the 1st or 3rd quartile. Default value 0.05.
224-
pc: Parameter that controls the width of the confidence interval. Default value 4.
322+
pc: Parameter that controls the width of the confidence interval. Default value 20.
225323
percentiles: Tuple for percentile values to use.
226324
flag: String variable name to use to indicate outliers.
227325
output_format: String for format to return. Can be 'wide','long','outliers'.
228326
229327
Returns:
230328
Dataframe with flags or with identified units
231329
"""
232-
# check data ...
330+
# Check data
331+
self._check_data(self.data, y_var=y_var, time_var=time_var)
233332
data = self.data.copy()
234333

235-
# Get time levesl
334+
# Add in check if number of companies in each strata is too low.
335+
336+
# Filter time periods
337+
if time_periods:
338+
if len(time_periods) != 2:
339+
mes = "Two time periods should be specified."
340+
self.logger.error(mes)
341+
data = data.loc[data[time_var].isin(time_periods), :]
342+
343+
# Get time levels
236344
time_levels = np.unique(data[time_var])
237345
if len(time_levels) != 2:
238346
mes = "The time variable must have exactly two unique levels."
239347
self.logger.error(mes)
240-
x1 = time_levels[1] # t
241-
x2 = time_levels[0] # t-1
348+
time1 = time_levels[1] # t
349+
time0 = time_levels[0] # t-1
242350

243351
# Convert to wide
352+
wide_index = [self.id_nr, strata_var] if strata_var else self.id_nr
244353
wide_data = data.pivot_table(
245-
index=self.id_nr,
354+
index=wide_index,
246355
columns=time_var,
247356
values=y_var,
248357
aggfunc="first",
249358
).reset_index()
250359
wide_data.columns.name = None
251360

252361
# Check for valid rows
253-
valid_rows = wide_data[(wide_data[x1] > 0) & (wide_data[x2] > 0)]
362+
valid_rows = wide_data[(wide_data[time1] > 0) & (wide_data[time0] > 0)]
254363
if valid_rows.empty:
255364
mes = "No valid rows with y_var > 0 for both time periods."
256365
self.logger.error(mes)
257366

258-
# Calculate the ratio and related metrics
259-
valid_rows["ratio"] = valid_rows[x1] / valid_rows[x2]
260-
med_ratio = valid_rows["ratio"].median()
261-
s_ratio = np.where(
262-
valid_rows["ratio"] >= med_ratio,
263-
valid_rows["ratio"] / med_ratio - 1,
264-
1 - med_ratio / valid_rows["ratio"],
265-
)
266-
267-
max_y = valid_rows[[x1, x2]].max(axis=1)
268-
e_ratio = s_ratio * max_y**pu
367+
# Add in ratio
368+
valid_rows["ratio"] = valid_rows[time1] / valid_rows[time0]
369+
370+
# Apply the HB function to each strata group
371+
if strata_var:
372+
limits = (
373+
valid_rows.groupby(strata_var)
374+
.apply(
375+
lambda group: self._calculate_hb(
376+
group[time1],
377+
group[time0],
378+
pu,
379+
pa,
380+
pc,
381+
percentiles,
382+
),
383+
)
384+
.reset_index(level=strata_var, drop=True)
385+
)
386+
else:
387+
limits = self._calculate_hb(
388+
valid_rows[time1],
389+
valid_rows[time0],
390+
pu,
391+
pa,
392+
pc,
393+
percentiles,
394+
)
269395

270-
# Compute quantiles for e ratio
271-
percentiles = (0.25, 0.75) # Can also be 0.1, 0.9
272-
e_ratio_q = e_ratio.quantile([percentiles[0], 0.5, percentiles[1]]).to_numpy()
273-
q1, q2, q3 = e_ratio_q
396+
# Merge the limits back into the valid_rows
397+
valid_rows = valid_rows.merge(
398+
limits,
399+
left_index=True,
400+
right_index=True,
401+
how="left",
402+
)
274403

275-
if q2 != 0:
276-
ell = q2 - pc * max(q2 - q1, abs(q2 * pa))
277-
eul = q2 + pc * max(q3 - q2, abs(q2 * pa))
278-
else:
279-
ell = q2 - pc * max(q2 - q1, pa)
280-
eul = q2 + pc * max(q3 - q2, pa)
281-
valid_rows["lower_limit"] = med_ratio * max_y**pu / (max_y**pu - ell)
282-
valid_rows["upper_limit"] = med_ratio * (max_y**pu + eul) / max_y**pu
404+
# Add in flag
283405
valid_rows[flag] = np.where(
284406
(valid_rows["ratio"] < valid_rows["lower_limit"])
285407
| (valid_rows["ratio"] > valid_rows["upper_limit"]),
286408
1,
287409
0,
288410
)
411+
412+
# Format in correct output format
289413
if output_format == "wide":
290414
output: pd.DataFrame = valid_rows
291415
elif output_format == "outliers":
@@ -300,10 +424,11 @@ def hb(
300424
var_name=time_var,
301425
value_name=y_var,
302426
)
303-
# Add in NAs for first time period here ...
427+
mask = output[time_var] == time_levels[0]
428+
output.loc[mask, ["lower_limit", "upper_limit", flag]] = np.nan
304429
else:
305-
self.logger.warning(
306-
"output_format is not valid. Use 'wide' or 'outliers' or 'long'",
307-
)
430+
mes = "output_format is not valid. Use 'wide', 'outliers' or 'long'. Wide being returned."
431+
self.logger.warning(mes)
432+
output = valid_rows
308433

309434
return output

tests/test_detect.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,39 @@ def test_hb() -> None:
6767
assert dt_controlled.shape[0] == expected_shape, "Long format returned"
6868

6969

70+
def test_hb_strata() -> None:
    """Check that hb detection with stratification works across output formats."""
    dt = create_test_data(n=50, seed=10)
    # Keep exactly two consecutive periods, as hb requires.
    dt2 = dt.loc[dt.time_period.isin(["2020-04", "2020-05"]), :]

    detect = Detect(dt2, id_nr="id_company")
    dt_controlled = detect.hb(
        y_var="turnover",
        time_var="time_period",
        strata_var="nace",
    )

    assert any(dt_controlled.columns.isin(["flag_hb"])), "Flag variable created"
    expected_shape = 50
    assert dt_controlled.shape[0] == expected_shape, "Wide format returned as default"

    dt_controlled = detect.hb(
        y_var="turnover",
        strata_var="nace",
        time_var="time_period",
        output_format="outliers",
    )
    expected_shape = 2
    # FIX: assertion message typo corrected ("Oulier" -> "Outlier").
    assert dt_controlled.shape[0] == expected_shape, "Outlier format returned"

    dt_controlled = detect.hb(
        y_var="turnover",
        time_var="time_period",
        output_format="long",
    )
    expected_shape = 100
    assert dt_controlled.shape[0] == expected_shape, "Long format returned"
101+
102+
70103
# %%
71104
def test_logger() -> None:
72105
dt = create_test_data(n=5, n_periods=2, freq="monthly", seed=42)

0 commit comments

Comments
 (0)