Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions src/boomer/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,13 +414,82 @@ class GridSearchResult(BaseModel):
evaluation: EvalStats | None = None
pr_filter: float | None = None


class PFactConsensus(BaseModel):
    """
    Consensus information for a probabilistic fact across grid search configurations.

    Tracks how consistently a fact is accepted/rejected across different parameter
    settings, providing a robustness measure for each mapping.
    """
    # The probabilistic fact this consensus record describes.
    pfact: PFact
    acceptance_rate: float = Field(..., description="Proportion of configs that accepted this fact")
    mean_posterior: float = Field(..., description="Mean posterior probability when accepted")
    std_posterior: float = Field(..., description="Std dev of posterior probability")
    consensus_score: float = Field(..., description="Weighted consensus score (0-1)")
    # Builtin generics (PEP 585) for consistency with GridSearch's `list[...]` annotations.
    configurations_accepted: list[int] = Field(default_factory=list, description="Indices of configs that accepted")
    configurations_total: int = Field(..., description="Total number of configurations evaluated")


class SynthesizedSolution(BaseModel):
    """
    A synthesized solution combining results across multiple grid search configurations.

    Rather than picking a single "best" configuration, this aggregates evidence across
    all configurations to identify robustly supported mappings.
    """
    # Builtin generics (PEP 585) for consistency with GridSearch's `list[...]` annotations.
    pfact_consensus: list[PFactConsensus] = Field(..., description="Consensus for each pfact")
    aggregation_method: str = Field("weighted_vote", description="Method used for aggregation")
    min_consensus_threshold: float = Field(0.5, description="Minimum consensus score for acceptance")
    contributing_configs: int = Field(..., description="Number of configs contributing to synthesis")
    high_confidence_facts: list[PFact] = Field(default_factory=list, description="Facts accepted with >80% consensus")
    uncertain_facts: list[PFact] = Field(default_factory=list, description="Facts with 40-60% consensus")


class AggregateStats(BaseModel):
    """
    Aggregate statistics across all grid search configurations.

    Provides summary statistics to understand performance distribution and stability.
    """
    # Performance metrics (averaged over configs that carry an evaluation)
    mean_precision: float = Field(..., description="Mean precision across configs")
    std_precision: float = Field(..., description="Std dev of precision")
    mean_recall: float = Field(..., description="Mean recall across configs")
    std_recall: float = Field(..., description="Std dev of recall")
    mean_f1: float = Field(..., description="Mean F1 score across configs")
    std_f1: float = Field(..., description="Std dev of F1 score")

    # Solution quality metrics
    mean_confidence: float = Field(..., description="Mean solution confidence")
    std_confidence: float = Field(..., description="Std dev of confidence")
    mean_posterior_prob: float = Field(..., description="Mean posterior probability")

    # Computational metrics
    mean_time: float = Field(..., description="Mean execution time (seconds)")
    std_time: float = Field(..., description="Std dev of execution time")
    mean_combinations_explored: int = Field(..., description="Mean combinations explored")

    # Success metrics
    success_rate: float = Field(..., description="Proportion of configs that found solutions")
    timeout_rate: float = Field(..., description="Proportion of configs that timed out")

    # Parameter impact analysis (optional)
    # Builtin generics (PEP 585) for consistency with `dict[str, ...]` elsewhere in the file.
    parameter_impacts: dict[str, float] | None = Field(None, description="Impact score for each parameter")


class GridSearch(BaseModel):
"""
A grid search is a grid search over a set of hyperparameters.
"""
configurations: list[SearchConfig]
configuration_matrix: dict[str, list[Any]] | None = None
results: list[GridSearchResult] | None = None
aggregate_stats: AggregateStats | None = None
synthesized_solution: SynthesizedSolution | None = None
best_config: SearchConfig | None = Field(None, description="Best config by chosen metric")
best_config_metric: str | None = Field(None, description="Metric used to select best config")
pareto_frontier: List[GridSearchResult] | None = Field(None, description="Pareto optimal configs (speed vs accuracy)")

def to_flat_dicts(self) -> dict[str, Any]:
"""
Expand Down
202 changes: 202 additions & 0 deletions src/boomer/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
Grounding,
Solution,
SearchConfig,
PFactConsensus,
SynthesizedSolution,
AggregateStats,
)
from typing import Iterator, List, Set, Tuple

Expand Down Expand Up @@ -531,6 +534,197 @@ def evaluate_hypotheses(
return solutions


def compute_aggregate_stats(results: List[GridSearchResult]) -> AggregateStats:
    """
    Compute aggregate statistics across all grid search results.

    Precision/recall/F1 are averaged over the subset of results that carry an
    evaluation; confidence, posterior, timing, and rate metrics use that same
    subset when it is non-empty, otherwise all results.

    :param results: grid search results to aggregate; must be non-empty
    :return: aggregate statistics model
    :raises ValueError: if ``results`` is empty
    """
    import numpy as np

    if not results:
        # Without this guard an empty input surfaces as an opaque
        # ZeroDivisionError in the success_rate computation below.
        raise ValueError("compute_aggregate_stats requires at least one result")

    # Prefer results that have been evaluated, when any exist.
    eval_results = [r for r in results if r.evaluation is not None]
    all_results = eval_results if eval_results else results

    if eval_results:
        precisions = [r.evaluation.precision for r in eval_results]
        recalls = [r.evaluation.recall for r in eval_results]
        f1s = [r.evaluation.f1 for r in eval_results]
    else:
        # No evaluations available: report zeroed quality metrics.
        precisions = recalls = f1s = [0.0]

    confidences = [r.result.confidence for r in all_results]
    posterior_probs = [r.result.posterior_prob for r in all_results]
    times = [r.result.time_elapsed or 0.0 for r in all_results]
    combinations = [r.result.number_of_combinations for r in all_results]

    # A config "succeeded" if it produced any non-zero-confidence solution.
    success_rate = sum(1 for r in all_results if r.result.confidence > 0) / len(all_results)
    timeout_rate = sum(1 for r in all_results if r.result.timed_out) / len(all_results)

    # float()/int() casts strip numpy scalar types before model validation.
    return AggregateStats(
        mean_precision=float(np.mean(precisions)),
        std_precision=float(np.std(precisions)),
        mean_recall=float(np.mean(recalls)),
        std_recall=float(np.std(recalls)),
        mean_f1=float(np.mean(f1s)),
        std_f1=float(np.std(f1s)),
        mean_confidence=float(np.mean(confidences)),
        std_confidence=float(np.std(confidences)),
        mean_posterior_prob=float(np.mean(posterior_probs)),
        mean_time=float(np.mean(times)),
        std_time=float(np.std(times)),
        mean_combinations_explored=int(np.mean(combinations)),
        success_rate=success_rate,
        timeout_rate=timeout_rate,
    )


def synthesize_solution(kb: KB, results: List[GridSearchResult]) -> SynthesizedSolution:
    """
    Synthesize a consensus solution across all grid search results.

    Creates a robust solution by aggregating evidence across all configurations,
    identifying mappings that are consistently accepted regardless of parameter
    settings.

    NOTE(review): pfacts are matched across configurations by their index in
    ``result.solved_pfacts`` — this assumes every configuration reports the same
    pfacts in the same order; confirm against the solver's output contract.

    :param kb: knowledge base (currently unused; kept for interface stability)
    :param results: one GridSearchResult per configuration
    :return: consensus solution with high-confidence and uncertain fact buckets
    """
    import numpy as np

    # Per-pfact accumulation of acceptance evidence across configurations.
    pfact_data: dict[int, dict] = defaultdict(lambda: {
        "accepted_configs": [],
        "posteriors": [],
        "truth_values": [],
    })

    for config_idx, result in enumerate(results):
        for pfact_idx, spfact in enumerate(result.result.solved_pfacts):
            data = pfact_data[pfact_idx]
            data["pfact"] = spfact.pfact
            data["truth_values"].append(spfact.truth_value)

            if spfact.truth_value:
                data["accepted_configs"].append(config_idx)
                data["posteriors"].append(spfact.posterior_prob)

    # Build consensus for each pfact
    pfact_consensus_list = []
    high_confidence = []  # consensus_score > 0.8
    uncertain = []        # 0.4 <= consensus_score <= 0.6

    for pfact_idx in sorted(pfact_data.keys()):
        data = pfact_data[pfact_idx]
        n_accepted = len(data["accepted_configs"])
        n_total = len(data["truth_values"])

        acceptance_rate = n_accepted / n_total if n_total > 0 else 0.0

        if data["posteriors"]:
            # float() strips numpy scalar types before model validation.
            mean_posterior = float(np.mean(data["posteriors"]))
            std_posterior = float(np.std(data["posteriors"]))
        else:
            mean_posterior = 0.0
            std_posterior = 0.0

        # Consensus score weights acceptance rate by mean posterior probability,
        # clamped at zero (equivalent to the previous conditional form).
        consensus_score = acceptance_rate * max(mean_posterior, 0.0)

        consensus = PFactConsensus(
            pfact=data["pfact"],
            acceptance_rate=acceptance_rate,
            mean_posterior=mean_posterior,
            std_posterior=std_posterior,
            consensus_score=consensus_score,
            configurations_accepted=data["accepted_configs"],
            configurations_total=n_total,
        )
        pfact_consensus_list.append(consensus)

        # Categorize by consensus strength
        if consensus_score > 0.8:
            high_confidence.append(data["pfact"])
        elif 0.4 <= consensus_score <= 0.6:
            uncertain.append(data["pfact"])

    return SynthesizedSolution(
        pfact_consensus=pfact_consensus_list,
        aggregation_method="weighted_vote",
        min_consensus_threshold=0.5,
        contributing_configs=len(results),
        high_confidence_facts=high_confidence,
        uncertain_facts=uncertain,
    )


def find_best_config(results: List[GridSearchResult]) -> Tuple[SearchConfig | None, str | None]:
    """
    Pick the best configuration from a set of grid search results.

    Selection prefers F1 score whenever at least one result carries an
    evaluation; otherwise it falls back to solution confidence.

    :param results: grid search results to rank
    :return: (best config, metric name used), or (None, None) for empty input
    """
    if not results:
        return None, None

    evaluated = [entry for entry in results if entry.evaluation is not None]

    if evaluated:
        winner = max(evaluated, key=lambda entry: entry.evaluation.f1)
        return winner.config, "f1_score"

    winner = max(results, key=lambda entry: entry.result.confidence)
    return winner.config, "confidence"


def find_pareto_frontier(results: List[GridSearchResult]) -> List[GridSearchResult]:
    """
    Find the Pareto frontier of configurations (speed vs accuracy trade-off).

    A configuration belongs to the frontier when no other configuration is both
    faster AND more accurate. Input order is preserved in the returned list.

    :param results: grid search results to filter
    :return: the Pareto-optimal subset of ``results``
    """
    if not results:
        return []

    def accuracy_of(entry: GridSearchResult) -> float:
        # F1 when an evaluation is present; confidence otherwise.
        return entry.evaluation.f1 if entry.evaluation else entry.result.confidence

    def speed_of(entry: GridSearchResult) -> float:
        # Reciprocal of elapsed time (higher is faster); missing time clamps to 1 ms.
        return 1.0 / (entry.result.time_elapsed or 0.001)

    def dominates(acc_a: float, spd_a: float, acc_b: float, spd_b: float) -> bool:
        # a dominates b: at least as good in both dimensions, strictly better in one.
        return acc_a >= acc_b and spd_a >= spd_b and (acc_a > acc_b or spd_a > spd_b)

    frontier: List[GridSearchResult] = []
    for candidate in results:
        cand_acc = accuracy_of(candidate)
        cand_spd = speed_of(candidate)

        if not any(
            dominates(accuracy_of(rival), speed_of(rival), cand_acc, cand_spd)
            for rival in results
            if rival is not candidate
        ):
            frontier.append(candidate)

    return frontier


def grid_search(
kb: KB,
grid: GridSearch,
Expand Down Expand Up @@ -581,4 +775,12 @@ def grid_search(
else:
results.append(GridSearchResult(config=cfg, result=sol))
grid.results = results

# Compute aggregate statistics and synthesized solution
if results:
grid.aggregate_stats = compute_aggregate_stats(results)
grid.synthesized_solution = synthesize_solution(kb, results)
grid.best_config, grid.best_config_metric = find_best_config(results)
grid.pareto_frontier = find_pareto_frontier(results)

return grid
6 changes: 3 additions & 3 deletions tests/__snapshots__/animals/solution.json
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@
}
],
"sub_solutions": [],
"time_started": 1758781080.561839,
"time_finished": 1758781080.607877,
"time_started": 1769821846.459719,
"time_finished": 1769821846.505715,
"timed_out": false,
"time_elapsed": 0.046037912368774414
"time_elapsed": 0.04599595069885254
}
4 changes: 2 additions & 2 deletions tests/__snapshots__/animals/solution.tsv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# BOOMER Solution TSV Output
#
# Metadata:
# generated_date: 2025-09-24T23:18:00.608215
# generated_date: 2026-01-30T17:10:46.506172
# combinations: 10
# satisfiable_combinations: 2
# confidence: 0.8999999999999999
# prior_probability: 0.38742048900000015
# posterior_probability: 0.9
# time_elapsed_seconds: 0.046037912368774414
# time_elapsed_seconds: 0.04599595069885254
# timed_out: False
#
# Format: fact_type followed by arguments, then truth_value and probabilities
Expand Down
6 changes: 3 additions & 3 deletions tests/__snapshots__/animals/solution.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ solved_pfacts:
posterior_prob: 0.0
metadata: null
sub_solutions: []
time_started: 1758781080.561839
time_finished: 1758781080.607877
time_started: 1769821846.459719
time_finished: 1769821846.505715
timed_out: false
time_elapsed: 0.046037912368774414
time_elapsed: 0.04599595069885254
6 changes: 3 additions & 3 deletions tests/__snapshots__/bfo/solution.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"ground_pfacts": [],
"solved_pfacts": [],
"sub_solutions": [],
"time_started": 1758781080.6131022,
"time_finished": 1758781080.61313,
"time_started": 1769821846.516468,
"time_finished": 1769821846.5165062,
"timed_out": false,
"time_elapsed": 2.7894973754882812e-05
"time_elapsed": 3.814697265625e-05
}
4 changes: 2 additions & 2 deletions tests/__snapshots__/bfo/solution.tsv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# BOOMER Solution TSV Output
#
# Metadata:
# generated_date: 2025-09-24T23:18:00.613286
# generated_date: 2026-01-30T17:10:46.516719
# combinations: 0
# satisfiable_combinations: 0
# confidence: 0.0
# prior_probability: 0.0
# posterior_probability: 0.0
# time_elapsed_seconds: 2.7894973754882812e-05
# time_elapsed_seconds: 3.814697265625e-05
# timed_out: False
#
# Format: fact_type followed by arguments, then truth_value and probabilities
Expand Down
6 changes: 3 additions & 3 deletions tests/__snapshots__/bfo/solution.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ proportion_of_combinations_explored: 0.0
ground_pfacts: []
solved_pfacts: []
sub_solutions: []
time_started: 1758781080.6131022
time_finished: 1758781080.61313
time_started: 1769821846.516468
time_finished: 1769821846.5165062
timed_out: false
time_elapsed: 2.7894973754882812e-05
time_elapsed: 3.814697265625e-05
Loading
Loading