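Update new_simpleNIPA.py: re-enable the original crossvalpcr (previously wrapped in a string literal) and remove the duplicate definition that did per-fold feature selection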

giulio-palcic · web-flow · commit 86f4939de665 · 2025-09-02T15:38:54.000+02:00
diff --git a/albatross/new_simpleNIPA.py b/albatross/new_simpleNIPA.py
@@ -166,7 +166,7 @@ def gridCheck(self, lim=5, ntim=2, debug=False):
 
         return
 
-    """def crossvalpcr(self, xval=True, explained_variance_threshold=0.95):
+    def crossvalpcr(self, xval=True, explained_variance_threshold=0.95):
         import numpy as np
         from scipy.stats import pearsonr as corr
         from sklearn.linear_model import LinearRegression
@@ -292,180 +292,14 @@ def gridCheck(self, lim=5, ntim=2, debug=False):
         pcs_full = raw_glo_var.dot(eofs)
         reg_full = LinearRegression().fit(pcs_full, predictand)
 
-        self.pcs = pcs_full
-        self.lin_model = {
-            "eofs": eofs,
-            "regression": reg_full,
-            "n_pc": n_pc_best
-        }"""
-
-    def crossvalpcr(self, xval=True, explained_variance_threshold=0.95):
-        import numpy as np
-        from scipy.stats import pearsonr as corr
-        from sklearn.linear_model import LinearRegression
-        from sklearn.model_selection import KFold
-        from albatross.utils import weight_glo_var, vcorr, sig_test
-
-        predictand = self.clim_data
-        n_samples = len(predictand)
-        yhat = np.zeros(n_samples)
-
-        # NOTE: The full corr_grid and raw_glo_var are no longer calculated here.
-        # They will be calculated *inside* each CV loop to prevent data leakage.
-
-        if not xval:
-            # Standard PCA regression (no CV)
-            # This part still needs the full corr_grid, but it's not for CV
-            # so data leakage isn't an issue here.
-            self.bootcorr(corrconf=0.95)
-            self.gridCheck()
-            if self.corr_grid.mask.sum() >= len(self.glo_var.lat) * len(self.glo_var.lon) - 4:
-                self.flags [ "noSST" ] = True
-                self.hindcast = None
-                self.pcs = None
-                self.lin_model = None
-                self.correlation = None
-                print("Insufficient SST data for PCA regression.")
-                return
-
-            glo_var_idx = ~self.corr_grid.mask
-            raw_glo_var = weight_glo_var(self.glo_var).data [ :, glo_var_idx ]
-
-            cov_matrix = np.cov(raw_glo_var.T)
-            eigval, eigvec = np.linalg.eig(cov_matrix)
-            eigval, eigvec = np.real(eigval), np.real(eigvec)
-
-            sorted_idx = np.argsort(eigval) [ ::-1 ]
-            eigvec = eigvec [ :, sorted_idx ]
-
-            explained_ratio = eigval / eigval.sum()
-            cumulative_var = np.cumsum(explained_ratio)
-            n_pc = np.searchsorted(cumulative_var, explained_variance_threshold) + 1
-
-            eofs = eigvec [ :, :n_pc ]
-            pcs = raw_glo_var.dot(eofs)
-
-            reg = LinearRegression().fit(pcs, predictand)
-            yhat = reg.predict(pcs)
-
-            self.pcs = pcs
-            self.hindcast = yhat
-            self.correlation = corr(predictand, yhat) [ 0 ]
-            self.lin_model = {
-                "eofs": eofs,
-                "regression": reg,
-                "n_pc": n_pc
-            }
-            return
-
-        # Cross-validation PCA regression
-        models = [ ]
-        kf = KFold(n_splits=5, shuffle=True, random_state=42)
-        p_value_threshold = 1 - 0.95  # This should match the value used in bootcorr
-
-        # Get full global data for slicing later
-        full_glo_var_data = weight_glo_var(self.glo_var).data
-
-        for train_idx, test_idx in kf.split(predictand):
-            X_train_full = full_glo_var_data [ train_idx ]
-            X_test_full = full_glo_var_data [ test_idx ]
-            y_train = predictand [ train_idx ]
-
-            # --- CORRECTION: Step 1 (Feature Selection within the loop) ---
-            # Calculate correlation mask on *training data only*
-            corr_grid_train = vcorr(X=X_train_full, y=y_train)
-            n_yrs_train = len(y_train)
-            p_value = sig_test(corr_grid_train, n_yrs_train)
-
-            # This is a simplified version of bootcorr for demonstration.
-            # You may want to call a modified `bootcorr_fold` function.
-            glo_var_idx_train = ~np.ma.masked_array(corr_grid_train, ~(p_value < p_value_threshold)).mask
-
-            # Check for insufficient data in the training set
-            if glo_var_idx_train.sum()==0:
-                print("Skipping fold: Insufficient SST data in training set.")
-                continue
-
-            # --- CORRECTION: Step 2 (Data Filtering) ---
-            # Apply the mask from the training set to both training and test data
-            X_train = X_train_full [ :, glo_var_idx_train ]
-            X_test = X_test_full [ :, glo_var_idx_train ]
-
-            # --- CORRECTION: Step 3 (PCA and Regression within the loop) ---
-            # Perform PCA and regression on the training data
-            cov_matrix = np.cov(X_train.T)
-            eigval, eigvec = np.linalg.eig(cov_matrix)
-            eigval, eigvec = np.real(eigval), np.real(eigvec)
-
-            sorted_idx = np.argsort(eigval) [ ::-1 ]
-            eigvec = eigvec [ :, sorted_idx ]
-
-            explained_ratio = eigval / eigval.sum()
-            cumulative_var = np.cumsum(explained_ratio)
-            n_pc = np.searchsorted(cumulative_var, explained_variance_threshold) + 1
-
-            if n_pc==0 or np.isnan(eigval [ :n_pc ]).any():
-                continue
-
-            eofs = eigvec [ :, :n_pc ]
-            pcs_train = X_train.dot(eofs)
-            pcs_test = X_test.dot(eofs)
-
-            if pcs_train.shape [ 0 ] < n_pc:
-                continue
-
-            reg = LinearRegression().fit(pcs_train, y_train)
-            preds = reg.predict(pcs_test)
-
-            yhat [ test_idx ] = preds
-
-            models.append({
-                "eofs": eofs,
-                "regression": reg,
-                "n_pc": n_pc,
-                "corr": corr(y_train, reg.predict(pcs_train)) [ 0 ]
-            })
-
-        if not models:
-            self.hindcast = None
-            self.pcs = None
-            self.lin_model = None
-            self.correlation = None
-            self.flags [ "noSST" ] = True
-            return
-
-        # Store hindcast from CV
-        self.hindcast = yhat
-        self.correlation = corr(predictand, yhat) [ 0 ]
-
-        # Select best number of PCs and refit on all data
-        best_model = max(models, key=lambda m: m [ "corr" ])
-        n_pc_best = best_model [ "n_pc" ]
-
-        # --- Refit on full dataset (this can still use the old method) ---
-        self.bootcorr(corrconf=0.95)
-        self.gridCheck()
-        glo_var_idx_full = ~self.corr_grid.mask
-        raw_glo_var_full = weight_glo_var(self.glo_var).data [ :, glo_var_idx_full ]
-
-        cov_matrix = np.cov(raw_glo_var_full.T)
-        eigval, eigvec = np.linalg.eig(cov_matrix)
-        eigval, eigvec = np.real(eigval), np.real(eigvec)
-
-        sorted_idx = np.argsort(eigval) [ ::-1 ]
-        eigvec = eigvec [ :, sorted_idx ]
-
-        eofs = eigvec [ :, :n_pc_best ]
-        pcs_full = raw_glo_var_full.dot(eofs)
-        reg_full = LinearRegression().fit(pcs_full, predictand)
-
         self.pcs = pcs_full
         self.lin_model = {
             "eofs": eofs,
             "regression": reg_full,
             "n_pc": n_pc_best
         }
 
+
     def save_regressor(self, workdir):
         """
         Save EOFs and regression coefficients to CSV files.