Fix Bug in Resid / Predict when Model is Emtpy (#597)

s3alfisc · web-flow · commit 05f44bd5b0a4 · 2024-08-31T17:34:24.000+02:00
* flatten residual

* add tests for empty models

* bump version, build lock file

* udpate tests

* add test typo

* update tests

* fix test typo

* cleanups

* fix resid for weights

* only test predict when fepois without fixef

* fix bug in predict method with WLS
diff --git a/.coverage b/.coverage
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py
@@ -329,7 +329,7 @@ def _estimate_all_models(
                     # special case: sometimes it is useful to fit models as
                     # "Y ~ 0 | f1 + f2" to demean Y and to use the predict() method
                     if FIT._X_is_empty:
-                        FIT._u_hat = Y.to_numpy() - Yd_array
+                        FIT._u_hat = Yd_array
                     else:
                         FIT.get_fit()
 
diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py
@@ -1465,7 +1465,7 @@ def predict(
             )
 
         if newdata is None:
-            return self._Y_untransformed.to_numpy().flatten() - self._u_hat.flatten()
+            return self._Y_untransformed.to_numpy().flatten() - self.resid()
 
         newdata = _polars_to_pandas(newdata).reset_index(drop=False)
 
@@ -1803,7 +1803,7 @@ def resid(self) -> np.ndarray:
         np.ndarray
             A np.ndarray with the residuals of the estimated regression model.
         """
-        return self._u_hat
+        return self._u_hat.flatten() / np.sqrt(self._weights).flatten()
 
     def ritest(
         self,
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pyfixest"
-version = "0.24.1"
+version = "0.24.2"
 
 description = "Fast high dimensional fixed effect estimation following syntax of the fixest R package. Supports OLS, IV and Poisson regression and a range of inference procedures (HC1-3, CRV1 & CRV3, wild bootstrap, randomization inference, simultaneous CIs, Romano-Wolf's multiple testing correction). Additionally, supports (some of) the regression based new Difference-in-Differences Estimators (Did2s, Linear Projections)."
 authors = ["Alexander Fischer <alexander-fischer1801@t-online.de>", "Styfen Schär"]
diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py
@@ -81,6 +81,11 @@
     ("Y~X1|f2^f3"),
     ("Y~X1|f1 + f2^f3"),
     ("Y~X1|f2^f3^f1"),
+    # empty models
+    ("Y ~ 1 | f1"),
+    ("Y ~ 1 | f1 + f2"),
+    ("Y ~ 0 | f1"),
+    ("Y ~ 0 | f1 + f2"),
 ]
 
 iv_fmls = [
@@ -122,15 +127,11 @@ def check_absolute_diff(x1, x2, tol, msg=None):
 @pytest.mark.parametrize("error_type", ["2"])
 @pytest.mark.parametrize("dropna", [False])
 @pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "group_id"}])
-# @pytest.mark.parametrize("inference", ["iid", {"CRV1": "group_id"}])
 @pytest.mark.parametrize("weights", [None, "weights"])
 @pytest.mark.parametrize("f3_type", ["str", "object", "int", "categorical", "float"])
 @pytest.mark.parametrize("fml", ols_fmls + ols_but_not_poisson_fml)
 @pytest.mark.parametrize("adj", [False, True])
-# see here for why not test against cluster_adj = True
-# it triggers the N / (N-1) correction, not sure why
-# https://github.com/lrberge/fixest/issues/518#issuecomment-2227365516
-@pytest.mark.parametrize("cluster_adj", [False])
+@pytest.mark.parametrize("cluster_adj", [False, True])
 def test_single_fit_feols(
     N,
     seed,
@@ -197,9 +198,8 @@ def test_single_fit_feols(
     py_confint = mod.confint().xs("X1").values
     py_nobs = mod._N
     py_vcov = mod._vcov[0, 0]
-
-    py_resid = mod._u_hat.flatten()  # noqa: F841
-    # TODO: test residuals
+    py_resid = mod.resid()
+    py_predict = mod.predict()
 
     df_X1 = _get_r_df(r_fixest)
 
@@ -209,16 +209,32 @@ def test_single_fit_feols(
     r_tstat = df_X1["statistic"]
     r_confint = df_X1[["conf.low", "conf.high"]].values.astype(np.float64)
     r_nobs = int(stats.nobs(r_fixest)[0])
-    r_resid = r_fixest.rx2("working_residuals")  # noqa: F841
     r_vcov = stats.vcov(r_fixest)[0, 0]
+    r_resid = stats.residuals(r_fixest)
+    r_predict = stats.predict(r_fixest)
+
+    if not mod._X_is_empty:
+        if inference == "iid" and adj and cluster_adj:
+            check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
+            check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
+
+        check_absolute_diff(py_vcov, r_vcov, 1e-08, "py_vcov != r_vcov")
+        check_absolute_diff(py_se, r_se, 1e-08, "py_se != r_se")
+        check_absolute_diff(py_pval, r_pval, 1e-08, "py_pval != r_pval")
+        check_absolute_diff(py_tstat, r_tstat, 1e-07, "py_tstat != r_tstat")
+        check_absolute_diff(py_confint, r_confint, 1e-08, "py_confint != r_confint")
+
+    # residuals invariant so to vcov type
+    if inference == "iid" and adj and not cluster_adj:
+        check_absolute_diff(
+            (py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_resid != r_resid"
+        )
+        check_absolute_diff(
+            py_predict[0:5], r_predict[0:5], 1e-07, "py_predict != r_predict"
+        )
 
-    check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
-    check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
-    check_absolute_diff(py_vcov, r_vcov, 1e-08, "py_vcov != r_vcov")
-    check_absolute_diff(py_se, r_se, 1e-08, "py_se != r_se")
-    check_absolute_diff(py_pval, r_pval, 1e-08, "py_pval != r_pval")
-    check_absolute_diff(py_tstat, r_tstat, 1e-07, "py_tstat != r_tstat")
-    check_absolute_diff(py_confint, r_confint, 1e-08, "py_confint != r_confint")
+    if mod._X_is_empty:
+        assert mod._beta_hat.size == 0
 
     if not weights:
         py_r2 = mod._r2
@@ -296,9 +312,7 @@ def test_single_fit_fepois(
     py_nobs = mod._N
     py_vcov = mod._vcov[0, 0]
     py_deviance = mod.deviance
-
-    py_resid = mod._u_hat.flatten()  # noqa: F841
-    # TODO: test residuals
+    py_resid = mod.resid()
 
     df_X1 = _get_r_df(r_fixest)
 
@@ -308,19 +322,29 @@ def test_single_fit_fepois(
     r_tstat = df_X1["statistic"]
     r_confint = df_X1[["conf.low", "conf.high"]].values.astype(np.float64)
     r_nobs = int(stats.nobs(r_fixest)[0])
-    r_resid = r_fixest.rx2("working_residuals")  # noqa: F841
+    r_resid = stats.residuals(r_fixest)
     r_vcov = stats.vcov(r_fixest)[0, 0]
     r_deviance = r_fixest.rx2("deviance")
 
-    check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
-    check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
+    if inference == "iid" and adj and cluster_adj:
+        check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
+        check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
+        check_absolute_diff((py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_coef != r_coef")
+
     check_absolute_diff(py_vcov, r_vcov, 1e-06, "py_vcov != r_vcov")
     check_absolute_diff(py_se, r_se, 1e-06, "py_se != r_se")
     check_absolute_diff(py_pval, r_pval, 1e-06, "py_pval != r_pval")
     check_absolute_diff(py_tstat, r_tstat, 1e-06, "py_tstat != r_tstat")
     check_absolute_diff(py_confint, r_confint, 1e-06, "py_confint != r_confint")
     check_absolute_diff(py_deviance, r_deviance, 1e-08, "py_deviance != r_deviance")
 
+    if not mod._has_fixef:
+        py_predict = mod.predict()
+        r_predict = stats.predict(r_fixest)
+        check_absolute_diff(
+            py_predict[0:5], r_predict[0:5], 1e-07, "py_predict != r_predict"
+        )
+
 
 @pytest.mark.parametrize("N", [1000])
 @pytest.mark.parametrize("seed", [76540251])
@@ -398,9 +422,8 @@ def test_single_fit_iv(
     py_confint = mod.confint().xs("X1").values
     py_nobs = mod._N
     py_vcov = mod._vcov[0, 0]
-
-    py_resid = mod._u_hat.flatten()  # noqa: F841
-    # TODO: test residuals
+    py_resid = mod.resid()
+    py_predict = mod.predict()
 
     df_X1 = _get_r_df(r_fixest)
 
@@ -410,11 +433,16 @@ def test_single_fit_iv(
     r_tstat = df_X1["statistic"]
     r_confint = df_X1[["conf.low", "conf.high"]].values.astype(np.float64)
     r_nobs = int(stats.nobs(r_fixest)[0])
-    r_resid = r_fixest.rx2("working_residuals")  # noqa: F841
+    r_resid = stats.resid(r_fixest)
+    r_predict = stats.predict(r_fixest)
     r_vcov = stats.vcov(r_fixest)[0, 0]
 
-    check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
-    check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
+    if inference == "iid" and adj and cluster_adj:
+        check_absolute_diff(py_nobs, r_nobs, 1e-08, "py_nobs != r_nobs")
+        check_absolute_diff(py_coef, r_coef, 1e-08, "py_coef != r_coef")
+        check_absolute_diff(py_predict[0:5], r_predict[0:5], 1e-07, "py_coef != r_coef")
+        check_absolute_diff((py_resid)[0:5], (r_resid)[0:5], 1e-07, "py_coef != r_coef")
+
     check_absolute_diff(py_vcov, r_vcov, 1e-07, "py_vcov != r_vcov")
     check_absolute_diff(py_se, r_se, 1e-07, "py_se != r_se")
     check_absolute_diff(py_pval, r_pval, 1e-06, "py_pval != r_pval")
diff --git a/tests/texfiles/test.tex b/tests/texfiles/test.tex
@@ -12,9 +12,9 @@
 \midrule
 f1 & x & x \\
 \midrule
-$R^2$ & 0.489 & - \\
-S.E. type & by: f1 & by: f1+f2 \\
 Observations & 1994 & 997 \\
+S.E. type & by: f1 & by: f1+f2 \\
+$R^2$ & 0.489 & - \\
 \bottomrule
 \end{tabular}
 \footnotesize Significance levels: $*$ p $<$ 0.05, $**$ p $<$ 0.01, $***$ p $<$ 0.001. Format of coefficient cell: Coefficient