@@ -166,7 +166,7 @@ def gridCheck(self, lim=5, ntim=2, debug=False):
166166
167167 return
168168
169- """ def crossvalpcr(self, xval=True, explained_variance_threshold=0.95):
169+ def crossvalpcr (self , xval = True , explained_variance_threshold = 0.95 ):
170170 import numpy as np
171171 from scipy .stats import pearsonr as corr
172172 from sklearn .linear_model import LinearRegression
@@ -292,180 +292,14 @@ def gridCheck(self, lim=5, ntim=2, debug=False):
292292 pcs_full = raw_glo_var .dot (eofs )
293293 reg_full = LinearRegression ().fit (pcs_full , predictand )
294294
295- self.pcs = pcs_full
296- self.lin_model = {
297- "eofs": eofs,
298- "regression": reg_full,
299- "n_pc": n_pc_best
300- }"""
301-
302- def crossvalpcr (self , xval = True , explained_variance_threshold = 0.95 ):
303- import numpy as np
304- from scipy .stats import pearsonr as corr
305- from sklearn .linear_model import LinearRegression
306- from sklearn .model_selection import KFold
307- from albatross .utils import weight_glo_var , vcorr , sig_test
308-
309- predictand = self .clim_data
310- n_samples = len (predictand )
311- yhat = np .zeros (n_samples )
312-
313- # NOTE: The full corr_grid and raw_glo_var are no longer calculated here.
314- # They will be calculated *inside* each CV loop to prevent data leakage.
315-
316- if not xval :
317- # Standard PCA regression (no CV)
318- # This part still needs the full corr_grid, but it's not for CV
319- # so data leakage isn't an issue here.
320- self .bootcorr (corrconf = 0.95 )
321- self .gridCheck ()
322- if self .corr_grid .mask .sum () >= len (self .glo_var .lat ) * len (self .glo_var .lon ) - 4 :
323- self .flags [ "noSST" ] = True
324- self .hindcast = None
325- self .pcs = None
326- self .lin_model = None
327- self .correlation = None
328- print ("Insufficient SST data for PCA regression." )
329- return
330-
331- glo_var_idx = ~ self .corr_grid .mask
332- raw_glo_var = weight_glo_var (self .glo_var ).data [ :, glo_var_idx ]
333-
334- cov_matrix = np .cov (raw_glo_var .T )
335- eigval , eigvec = np .linalg .eig (cov_matrix )
336- eigval , eigvec = np .real (eigval ), np .real (eigvec )
337-
338- sorted_idx = np .argsort (eigval ) [ ::- 1 ]
339- eigvec = eigvec [ :, sorted_idx ]
340-
341- explained_ratio = eigval / eigval .sum ()
342- cumulative_var = np .cumsum (explained_ratio )
343- n_pc = np .searchsorted (cumulative_var , explained_variance_threshold ) + 1
344-
345- eofs = eigvec [ :, :n_pc ]
346- pcs = raw_glo_var .dot (eofs )
347-
348- reg = LinearRegression ().fit (pcs , predictand )
349- yhat = reg .predict (pcs )
350-
351- self .pcs = pcs
352- self .hindcast = yhat
353- self .correlation = corr (predictand , yhat ) [ 0 ]
354- self .lin_model = {
355- "eofs" : eofs ,
356- "regression" : reg ,
357- "n_pc" : n_pc
358- }
359- return
360-
361- # Cross-validation PCA regression
362- models = [ ]
363- kf = KFold (n_splits = 5 , shuffle = True , random_state = 42 )
364- p_value_threshold = 1 - 0.95 # This should match the value used in bootcorr
365-
366- # Get full global data for slicing later
367- full_glo_var_data = weight_glo_var (self .glo_var ).data
368-
369- for train_idx , test_idx in kf .split (predictand ):
370- X_train_full = full_glo_var_data [ train_idx ]
371- X_test_full = full_glo_var_data [ test_idx ]
372- y_train = predictand [ train_idx ]
373-
374- # --- CORRECTION: Step 1 (Feature Selection within the loop) ---
375- # Calculate correlation mask on *training data only*
376- corr_grid_train = vcorr (X = X_train_full , y = y_train )
377- n_yrs_train = len (y_train )
378- p_value = sig_test (corr_grid_train , n_yrs_train )
379-
380- # This is a simplified version of bootcorr for demonstration.
381- # You may want to call a modified `bootcorr_fold` function.
382- glo_var_idx_train = ~ np .ma .masked_array (corr_grid_train , ~ (p_value < p_value_threshold )).mask
383-
384- # Check for insufficient data in the training set
385- if glo_var_idx_train .sum ()== 0 :
386- print ("Skipping fold: Insufficient SST data in training set." )
387- continue
388-
389- # --- CORRECTION: Step 2 (Data Filtering) ---
390- # Apply the mask from the training set to both training and test data
391- X_train = X_train_full [ :, glo_var_idx_train ]
392- X_test = X_test_full [ :, glo_var_idx_train ]
393-
394- # --- CORRECTION: Step 3 (PCA and Regression within the loop) ---
395- # Perform PCA and regression on the training data
396- cov_matrix = np .cov (X_train .T )
397- eigval , eigvec = np .linalg .eig (cov_matrix )
398- eigval , eigvec = np .real (eigval ), np .real (eigvec )
399-
400- sorted_idx = np .argsort (eigval ) [ ::- 1 ]
401- eigvec = eigvec [ :, sorted_idx ]
402-
403- explained_ratio = eigval / eigval .sum ()
404- cumulative_var = np .cumsum (explained_ratio )
405- n_pc = np .searchsorted (cumulative_var , explained_variance_threshold ) + 1
406-
407- if n_pc == 0 or np .isnan (eigval [ :n_pc ]).any ():
408- continue
409-
410- eofs = eigvec [ :, :n_pc ]
411- pcs_train = X_train .dot (eofs )
412- pcs_test = X_test .dot (eofs )
413-
414- if pcs_train .shape [ 0 ] < n_pc :
415- continue
416-
417- reg = LinearRegression ().fit (pcs_train , y_train )
418- preds = reg .predict (pcs_test )
419-
420- yhat [ test_idx ] = preds
421-
422- models .append ({
423- "eofs" : eofs ,
424- "regression" : reg ,
425- "n_pc" : n_pc ,
426- "corr" : corr (y_train , reg .predict (pcs_train )) [ 0 ]
427- })
428-
429- if not models :
430- self .hindcast = None
431- self .pcs = None
432- self .lin_model = None
433- self .correlation = None
434- self .flags [ "noSST" ] = True
435- return
436-
437- # Store hindcast from CV
438- self .hindcast = yhat
439- self .correlation = corr (predictand , yhat ) [ 0 ]
440-
441- # Select best number of PCs and refit on all data
442- best_model = max (models , key = lambda m : m [ "corr" ])
443- n_pc_best = best_model [ "n_pc" ]
444-
445- # --- Refit on full dataset (this can still use the old method) ---
446- self .bootcorr (corrconf = 0.95 )
447- self .gridCheck ()
448- glo_var_idx_full = ~ self .corr_grid .mask
449- raw_glo_var_full = weight_glo_var (self .glo_var ).data [ :, glo_var_idx_full ]
450-
451- cov_matrix = np .cov (raw_glo_var_full .T )
452- eigval , eigvec = np .linalg .eig (cov_matrix )
453- eigval , eigvec = np .real (eigval ), np .real (eigvec )
454-
455- sorted_idx = np .argsort (eigval ) [ ::- 1 ]
456- eigvec = eigvec [ :, sorted_idx ]
457-
458- eofs = eigvec [ :, :n_pc_best ]
459- pcs_full = raw_glo_var_full .dot (eofs )
460- reg_full = LinearRegression ().fit (pcs_full , predictand )
461-
462295 self .pcs = pcs_full
463296 self .lin_model = {
464297 "eofs" : eofs ,
465298 "regression" : reg_full ,
466299 "n_pc" : n_pc_best
467300 }
468301
302+
469303 def save_regressor (self , workdir ):
470304 """
471305 Save EOFs and regression coefficients to CSV files.
0 commit comments