Skip to content

Commit 3700139

Browse files
authored
Merge pull request #40 from usnistgov/39-autosas_model_selection_broke
39 autosas model selection broke
2 parents 0bbb722 + d037fc5 commit 3700139

1 file changed

Lines changed: 152 additions & 74 deletions

File tree

AFL/double_agent/AutoSAS.py

Lines changed: 152 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -956,12 +956,12 @@ class ModelSelectParsimony(PipelineOp):
956956
The variable name for model names in the dataset
957957
sample_dim : str
958958
The dimension containing each sample
959-
cutoff_threshold : float
959+
cutoff : float
960960
The chi-squared threshold for acceptable fits (default: 1.0)
961-
model_complexity : dict[str, int] | None
961+
model_priority : dict[str, int] | None
962962
Dictionary mapping model names to their selection priority (lower value = simpler/preferred model); defaults to ranking by number of free parameters
963963
model_inputs : list[dict[str, Any]] | None
964-
List of model configurations, used to determine complexity if model_complexity is None
964+
List of model configurations, used to determine complexity if model_priority is None
965965
server_id : str | None
966966
Server ID in the format "host:port" for remote execution, or None for local execution
967967
output_prefix : str
@@ -972,8 +972,8 @@ def __init__(
972972
all_chisq_var,
973973
model_names_var,
974974
sample_dim,
975-
cutoff_threshold=1.0,
976-
model_complexity=None,
975+
cutoff=1.0,
976+
model_priority=None,
977977
model_inputs=None, # Added to support local complexity calculation
978978
server_id=None, # Made optional to support local operation
979979
output_prefix='Parsimony',
@@ -993,8 +993,8 @@ def __init__(
993993

994994
self.sample_dim = sample_dim
995995
self.model_names_var = model_names_var
996-
self.cutoff_threshold = cutoff_threshold
997-
self.model_complexity = model_complexity
996+
self.cutoff = cutoff
997+
self.model_priority = model_priority
998998
self.model_inputs = model_inputs
999999
self.all_chisq_var = all_chisq_var
10001000
self.server_id = server_id
@@ -1016,67 +1016,77 @@ def calculate(self, dataset):
10161016
"""Method for selecting the model based on parsimony given a user defined ChiSq threshold """
10171017

10181018
self.dataset = dataset.copy(deep=True)
1019-
1020-
bestChiSq_labels = self.dataset[self.all_chisq_var].argmin(self.model_names_var)
1021-
bestChiSq_label_names = np.array([self.dataset[self.model_names_var][i].values for i in bestChiSq_labels.values])
1022-
1019+
10231020
### default behavior is that complexity is determined by number of free parameters.
10241021
### this is an issue if the number of parameters is the same between models. You bank on them having wildly different ChiSq vals
10251022
### could use a neighbor approach or some more intelligent selection methods
1026-
if self.model_complexity is None:
1027-
print('aggregating complexity')
1023+
10281024

1029-
# Determine model complexity either from server or local model_inputs
1030-
if self.server_id is not None:
1031-
# Get complexity from server
1032-
self.construct_clients()
1033-
aSAS_config = self.AutoSAS_client.get_config('all', interactive=True)['return_val']
1034-
model_inputs = aSAS_config['model_inputs']
1035-
elif self.model_inputs is not None:
1036-
# Use local model_inputs
1037-
model_inputs = self.model_inputs
1038-
else:
1039-
raise ValueError("Either server_id or model_inputs must be provided to calculate model complexity")
1040-
1041-
# Calculate complexity based on number of free parameters
1042-
order = []
1043-
for model in model_inputs:
1044-
n_params = 0
1045-
for p in model['fit_params']:
1046-
if model['fit_params'][p]['bounds'] != None:
1047-
n_params += 1
1048-
order.append(n_params)
1049-
print(order)
1050-
print(np.argsort(order))
1051-
self.model_complexity = np.argsort(order).tolist()
1052-
1053-
# As written in dev full of jank...
1054-
replacement_labels = bestChiSq_labels.copy(deep=True)
1055-
all_chisq = self.dataset[self.all_chisq_var]
1056-
sorted_chisq = all_chisq.sortby(self.model_names_var, ascending=False).values
1057-
1058-
min_diff_chisq = np.array([row[1] - row[0] for row in sorted_chisq])
1059-
next_best_idx = np.array([np.argpartition(row,1)[1] for row in all_chisq])
1060-
1061-
for idx in range(len(replacement_labels)):
1062-
chisq_set = all_chisq.min(dim=self.model_names_var).values
1063-
1064-
if (min_diff_chisq[idx] <= self.cutoff_threshold):
1065-
best_model_index = replacement_labels[idx]
1066-
next_best_index = next_best_idx[idx]
1067-
bm_rank = self.model_complexity.index(best_model_index)
1068-
nbm_rank = self.model_complexity.index(next_best_index)
1069-
1070-
if (bm_rank > nbm_rank):
1071-
replacement_labels[idx] = next_best_index
1025+
# Determine model complexity either from server or local model_inputs
1026+
if self.server_id is not None:
1027+
# Get complexity from server
1028+
self.construct_clients()
1029+
aSAS_config = self.AutoSAS_client.get_config('all', interactive=True)['return_val']
1030+
model_inputs = aSAS_config['model_inputs']
1031+
elif self.model_inputs is not None:
1032+
# Use local model_inputs
1033+
model_inputs = self.model_inputs
1034+
else:
1035+
raise ValueError("Either server_id or model_inputs must be provided to calculate model complexity")
1036+
1037+
# Calculate complexity based on number of free parameters
1038+
model_params = []
1039+
for model in model_inputs:
1040+
n_params = 0
1041+
for p in model['fit_params']:
1042+
if model['fit_params'][p]['bounds'] != None:
1043+
n_params += 1
1044+
model_params.append(n_params)
1045+
1046+
if self.model_priority is None:
1047+
self.model_priority = np.argsort(model_params).tolist()
1048+
1049+
models = self.dataset[self.model_names_var].values #extract models
1050+
1051+
# Sort models by priority
1052+
priority_order = [m for _,m in sorted(zip(self.model_priority,models))]
1053+
1054+
# Sort chi-squared and params accordingly
1055+
sorted_chisq = self.dataset[self.all_chisq_var].sortby(self.model_names_var)
1056+
1057+
# Find best model per sample based on chi-squared
1058+
best_indices = sorted_chisq.argmin(dim=self.model_names_var)
1059+
best_chisq = sorted_chisq.min(dim=self.model_names_var)
1060+
1061+
# Iterate over samples to apply parsimony rule
1062+
selected_indices = []
1063+
for i in range(self.dataset.sizes[self.sample_dim]):
1064+
chisq_values = sorted_chisq.isel(sample=i).values
1065+
min_chisq = best_chisq.isel(sample=i).item()
1066+
1067+
# Find all models within cutoff
1068+
within_cutoff = np.where(chisq_values - min_chisq <= self.cutoff)[0]
10721069

1070+
# Choose the simplest model among them
1071+
simplest_idx = within_cutoff[np.argmin([self.model_priority[i] for i in within_cutoff])]
1072+
1073+
# print(chisq_values)
1074+
# print(chisq_values - min_chisq)
1075+
# print(within_cutoff)
1076+
# print(self.model_priority)
1077+
# print(simplest_idx,'\n')
1078+
selected_indices.append(simplest_idx)
1079+
1080+
selected_indices = np.array(selected_indices)
1081+
1082+
10731083
self.output[self._prefix_output("labels")] = xr.DataArray(
1074-
data=replacement_labels,
1084+
data=selected_indices,
10751085
dims=[self.sample_dim]
10761086
)
10771087

10781088
self.output[self._prefix_output("label_names")] = xr.DataArray(
1079-
data=[self.dataset[self.model_names_var].values[i] for i in replacement_labels],
1089+
data=[priority_order[i] for i in selected_indices],
10801090
dims=[self.sample_dim]
10811091
)
10821092
return self
@@ -1153,9 +1163,6 @@ def calculate(self, dataset):
11531163

11541164
self.dataset = dataset.copy(deep=True)
11551165

1156-
bestChiSq_labels = self.dataset[self.all_chisq_var].argmin(self.model_names_var).values
1157-
bestChiSq_label_names = np.array([self.dataset[self.model_names_var][i].values for i in bestChiSq_labels])
1158-
11591166
# Determine model complexity either from server or local model_inputs
11601167
if self.server_id is not None:
11611168
# Get complexity from server
@@ -1168,18 +1175,18 @@ def calculate(self, dataset):
11681175
else:
11691176
raise ValueError("Either server_id or model_inputs must be provided to calculate model complexity")
11701177

1171-
# Calculate number of parameters for each model
1172-
n = []
1178+
# Calculate number of parameters for each model, d
1179+
d = []
11731180
for model in model_inputs:
11741181
n_params = 0
11751182
for p in model['fit_params']:
11761183
if model['fit_params'][p]['bounds'] != None:
11771184
n_params += 1
1178-
n.append(n_params)
1179-
n = np.array(n)
1185+
d.append(n_params)
1186+
d = np.array(d)
11801187

1181-
### chisq + 2*ln(d) = AIC
1182-
AIC = np.array([2*np.log(i) + 2*n for i in self.dataset[self.all_chisq_var].values])
1188+
### AIC = 2*d + 2*chisq as implemented below (note: the classical AIC form is 2*d + chisq — the extra factor of 2 on chisq should be verified)
1189+
AIC = np.array([2*i + 2*d for i in self.dataset[self.all_chisq_var].values])
11831190

11841191
AIC_labels = np.argmin(AIC, axis=1)
11851192
AIC_label_names = np.array([self.dataset[self.model_names_var][i].values for i in AIC_labels])
@@ -1348,13 +1355,84 @@ def calculate(self, dataset: xr.Dataset) -> Self:
13481355

13491356

13501357

1358+
class ModelSelectMostProbable(PipelineOp):
1359+
"""ModelSelectMostProbable is a pipeline operation for selecting the most probable model.
1360+
1361+
This class selects the model with the highest probability for each sample by
1362+
argmaxing over the 'probabilities' data variable.
1363+
1364+
Attributes
1365+
----------
1366+
model_names_var : str
1367+
The variable name for model names in the dataset
1368+
sample_dim : str
1369+
The dimension containing each sample
1370+
model_inputs : list[dict[str, Any]] | None
1371+
List of model configurations (optional)
1372+
server_id : str | None
1373+
Server ID in the format "host:port" for remote execution, or None for local execution
1374+
output_prefix : str
1375+
Prefix to add to output variable names
1376+
"""
1377+
1378+
def __init__(
1379+
self,
1380+
model_names_var,
1381+
sample_dim,
1382+
output_prefix='MostProbable',
1383+
name="ModelSelection_MostProbable",
1384+
**kwargs
1385+
):
1386+
output_variables = ["labels", "label_names"]
1387+
super().__init__(
1388+
name=name,
1389+
input_variable=[model_names_var],
1390+
output_variable=[
1391+
output_prefix + "_" + o for o in listify(output_variables)
1392+
],
1393+
output_prefix=output_prefix,
1394+
)
1395+
1396+
self.sample_dim = sample_dim
1397+
self.model_names_var = model_names_var
13511398

13521399

1400+
def calculate(self, dataset):
1401+
"""Method for selecting the model with the highest probability for each sample
1402+
1403+
Raises
1404+
------
1405+
ValueError
1406+
If 'probabilities' variable is not present in the dataset
1407+
"""
1408+
1409+
self.dataset = dataset.copy(deep=True)
1410+
1411+
# Check if 'probabilities' variable exists in the dataset
1412+
if 'probabilities' not in self.dataset:
1413+
raise ValueError(
1414+
f"The 'probabilities' variable is required for {self.__class__.__name__}. "
1415+
"Please ensure the dataset contains a 'probabilities' variable with model probabilities."
1416+
)
1417+
1418+
# Determine the most probable model by argmaxing over probabilities
1419+
probabilities = self.dataset['probabilities']
1420+
1421+
# Find the index of the maximum probability for each sample
1422+
most_probable_indices = probabilities.argmax(dim=self.model_names_var)
1423+
print(most_probable_indices)
1424+
# Get the corresponding model names
1425+
model_names = self.dataset[self.model_names_var]
1426+
most_probable_labels = model_names.isel(**{self.model_names_var: most_probable_indices})
1427+
1428+
self.output[self._prefix_output("labels")] = xr.DataArray(
1429+
data=most_probable_indices.values,
1430+
dims=[self.sample_dim]
1431+
)
1432+
1433+
self.output[self._prefix_output("label_names")] = xr.DataArray(
1434+
data=most_probable_labels.values,
1435+
dims=[self.sample_dim]
1436+
)
1437+
return self
13531438

1354-
1355-
1356-
1357-
1358-
1359-
1360-

0 commit comments

Comments
 (0)