Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions src/boomer/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,13 +414,82 @@ class GridSearchResult(BaseModel):
evaluation: EvalStats | None = None
pr_filter: float | None = None


class PFactConsensus(BaseModel):
    """
    Consensus information for a probabilistic fact across grid search configurations.

    Tracks how consistently a fact is accepted/rejected across different parameter
    settings, providing a robustness measure for each mapping.
    """
    # The probabilistic fact this consensus record describes.
    pfact: PFact
    acceptance_rate: float = Field(..., description="Proportion of configs that accepted this fact")
    mean_posterior: float = Field(..., description="Mean posterior probability when accepted")
    std_posterior: float = Field(..., description="Std dev of posterior probability")
    consensus_score: float = Field(..., description="Weighted consensus score (0-1)")
    # Builtin generics (PEP 585) for consistency with GridSearch's `list[...]` annotations.
    configurations_accepted: list[int] = Field(default_factory=list, description="Indices of configs that accepted")
    configurations_total: int = Field(..., description="Total number of configurations evaluated")


class SynthesizedSolution(BaseModel):
    """
    A synthesized solution combining results across multiple grid search configurations.

    Rather than picking a single "best" configuration, this aggregates evidence across
    all configurations to identify robustly supported mappings.
    """
    # Builtin generics (PEP 585) for consistency with GridSearch's `list[...]` annotations.
    pfact_consensus: list[PFactConsensus] = Field(..., description="Consensus for each pfact")
    aggregation_method: str = Field("weighted_vote", description="Method used for aggregation")
    min_consensus_threshold: float = Field(0.5, description="Minimum consensus score for acceptance")
    contributing_configs: int = Field(..., description="Number of configs contributing to synthesis")
    high_confidence_facts: list[PFact] = Field(default_factory=list, description="Facts accepted with >80% consensus")
    uncertain_facts: list[PFact] = Field(default_factory=list, description="Facts with 40-60% consensus")


class AggregateStats(BaseModel):
    """
    Aggregate statistics across all grid search configurations.

    Provides summary statistics to understand performance distribution and stability.
    """
    # Performance metrics (averaged over configs that carry an evaluation)
    mean_precision: float = Field(..., description="Mean precision across configs")
    std_precision: float = Field(..., description="Std dev of precision")
    mean_recall: float = Field(..., description="Mean recall across configs")
    std_recall: float = Field(..., description="Std dev of recall")
    mean_f1: float = Field(..., description="Mean F1 score across configs")
    std_f1: float = Field(..., description="Std dev of F1 score")

    # Solution quality metrics
    mean_confidence: float = Field(..., description="Mean solution confidence")
    std_confidence: float = Field(..., description="Std dev of confidence")
    mean_posterior_prob: float = Field(..., description="Mean posterior probability")

    # Computational metrics
    mean_time: float = Field(..., description="Mean execution time (seconds)")
    std_time: float = Field(..., description="Std dev of execution time")
    mean_combinations_explored: int = Field(..., description="Mean combinations explored")

    # Success metrics
    success_rate: float = Field(..., description="Proportion of configs that found solutions")
    timeout_rate: float = Field(..., description="Proportion of configs that timed out")

    # Parameter impact analysis (optional)
    # Builtin generics (PEP 585) for consistency with `dict[str, ...]` elsewhere in the file.
    parameter_impacts: dict[str, float] | None = Field(None, description="Impact score for each parameter")


class GridSearch(BaseModel):
"""
A grid search is a grid search over a set of hyperparameters.
"""
configurations: list[SearchConfig]
configuration_matrix: dict[str, list[Any]] | None = None
results: list[GridSearchResult] | None = None
aggregate_stats: AggregateStats | None = None
synthesized_solution: SynthesizedSolution | None = None
best_config: SearchConfig | None = Field(None, description="Best config by chosen metric")
best_config_metric: str | None = Field(None, description="Metric used to select best config")
pareto_frontier: List[GridSearchResult] | None = Field(None, description="Pareto optimal configs (speed vs accuracy)")

def to_flat_dicts(self) -> dict[str, Any]:
"""
Expand Down
202 changes: 202 additions & 0 deletions src/boomer/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
Grounding,
Solution,
SearchConfig,
PFactConsensus,
SynthesizedSolution,
AggregateStats,
)
from typing import Iterator, List, Set, Tuple

Expand Down Expand Up @@ -531,6 +534,197 @@ def evaluate_hypotheses(
return solutions


def compute_aggregate_stats(results: List[GridSearchResult]) -> AggregateStats:
    """
    Compute aggregate statistics across all grid search results.

    Precision/recall/F1 are averaged over the subset of results that carry an
    evaluation; confidence, posterior, timing, and rate metrics use that same
    subset when it is non-empty, otherwise all results.

    :param results: grid search results to aggregate; must be non-empty
    :return: aggregate statistics model
    :raises ValueError: if ``results`` is empty
    """
    import numpy as np

    if not results:
        # Without this guard an empty input surfaces as an opaque
        # ZeroDivisionError in the success_rate computation below.
        raise ValueError("compute_aggregate_stats requires at least one result")

    # Prefer results that have been evaluated, when any exist.
    eval_results = [r for r in results if r.evaluation is not None]
    all_results = eval_results if eval_results else results

    if eval_results:
        precisions = [r.evaluation.precision for r in eval_results]
        recalls = [r.evaluation.recall for r in eval_results]
        f1s = [r.evaluation.f1 for r in eval_results]
    else:
        # No evaluations available: report zeroed quality metrics.
        precisions = recalls = f1s = [0.0]

    confidences = [r.result.confidence for r in all_results]
    posterior_probs = [r.result.posterior_prob for r in all_results]
    times = [r.result.time_elapsed or 0.0 for r in all_results]
    combinations = [r.result.number_of_combinations for r in all_results]

    # A config "succeeded" if it produced any non-zero-confidence solution.
    success_rate = sum(1 for r in all_results if r.result.confidence > 0) / len(all_results)
    timeout_rate = sum(1 for r in all_results if r.result.timed_out) / len(all_results)

    # float()/int() casts strip numpy scalar types before model validation.
    return AggregateStats(
        mean_precision=float(np.mean(precisions)),
        std_precision=float(np.std(precisions)),
        mean_recall=float(np.mean(recalls)),
        std_recall=float(np.std(recalls)),
        mean_f1=float(np.mean(f1s)),
        std_f1=float(np.std(f1s)),
        mean_confidence=float(np.mean(confidences)),
        std_confidence=float(np.std(confidences)),
        mean_posterior_prob=float(np.mean(posterior_probs)),
        mean_time=float(np.mean(times)),
        std_time=float(np.std(times)),
        mean_combinations_explored=int(np.mean(combinations)),
        success_rate=success_rate,
        timeout_rate=timeout_rate,
    )


def synthesize_solution(kb: KB, results: List[GridSearchResult]) -> SynthesizedSolution:
    """
    Synthesize a consensus solution across all grid search results.

    Creates a robust solution by aggregating evidence across all configurations,
    identifying mappings that are consistently accepted regardless of parameter
    settings.

    NOTE(review): pfacts are matched across configurations by their index in
    ``result.solved_pfacts`` — this assumes every configuration reports the same
    pfacts in the same order; confirm against the solver's output contract.

    :param kb: knowledge base (currently unused; kept for interface stability)
    :param results: one GridSearchResult per configuration
    :return: consensus solution with high-confidence and uncertain fact buckets
    """
    import numpy as np

    # Per-pfact accumulation of acceptance evidence across configurations.
    pfact_data: dict[int, dict] = defaultdict(lambda: {
        "accepted_configs": [],
        "posteriors": [],
        "truth_values": [],
    })

    for config_idx, result in enumerate(results):
        for pfact_idx, spfact in enumerate(result.result.solved_pfacts):
            data = pfact_data[pfact_idx]
            data["pfact"] = spfact.pfact
            data["truth_values"].append(spfact.truth_value)

            if spfact.truth_value:
                data["accepted_configs"].append(config_idx)
                data["posteriors"].append(spfact.posterior_prob)

    # Build consensus for each pfact
    pfact_consensus_list = []
    high_confidence = []  # consensus_score > 0.8
    uncertain = []        # 0.4 <= consensus_score <= 0.6

    for pfact_idx in sorted(pfact_data.keys()):
        data = pfact_data[pfact_idx]
        n_accepted = len(data["accepted_configs"])
        n_total = len(data["truth_values"])

        acceptance_rate = n_accepted / n_total if n_total > 0 else 0.0

        if data["posteriors"]:
            # float() strips numpy scalar types before model validation.
            mean_posterior = float(np.mean(data["posteriors"]))
            std_posterior = float(np.std(data["posteriors"]))
        else:
            mean_posterior = 0.0
            std_posterior = 0.0

        # Consensus score weights acceptance rate by mean posterior probability,
        # clamped at zero (equivalent to the previous conditional form).
        consensus_score = acceptance_rate * max(mean_posterior, 0.0)

        consensus = PFactConsensus(
            pfact=data["pfact"],
            acceptance_rate=acceptance_rate,
            mean_posterior=mean_posterior,
            std_posterior=std_posterior,
            consensus_score=consensus_score,
            configurations_accepted=data["accepted_configs"],
            configurations_total=n_total,
        )
        pfact_consensus_list.append(consensus)

        # Categorize by consensus strength
        if consensus_score > 0.8:
            high_confidence.append(data["pfact"])
        elif 0.4 <= consensus_score <= 0.6:
            uncertain.append(data["pfact"])

    return SynthesizedSolution(
        pfact_consensus=pfact_consensus_list,
        aggregation_method="weighted_vote",
        min_consensus_threshold=0.5,
        contributing_configs=len(results),
        high_confidence_facts=high_confidence,
        uncertain_facts=uncertain,
    )


def find_best_config(results: List[GridSearchResult]) -> Tuple[SearchConfig | None, str | None]:
    """
    Pick the best configuration from a set of grid search results.

    Selection prefers F1 score whenever at least one result carries an
    evaluation; otherwise it falls back to solution confidence.

    :param results: grid search results to rank
    :return: (best config, metric name used), or (None, None) for empty input
    """
    if not results:
        return None, None

    evaluated = [entry for entry in results if entry.evaluation is not None]

    if evaluated:
        winner = max(evaluated, key=lambda entry: entry.evaluation.f1)
        return winner.config, "f1_score"

    winner = max(results, key=lambda entry: entry.result.confidence)
    return winner.config, "confidence"


def find_pareto_frontier(results: List[GridSearchResult]) -> List[GridSearchResult]:
    """
    Find the Pareto frontier of configurations (speed vs accuracy trade-off).

    A configuration belongs to the frontier when no other configuration is both
    faster AND more accurate. Input order is preserved in the returned list.

    :param results: grid search results to filter
    :return: the Pareto-optimal subset of ``results``
    """
    if not results:
        return []

    def accuracy_of(entry: GridSearchResult) -> float:
        # F1 when an evaluation is present; confidence otherwise.
        return entry.evaluation.f1 if entry.evaluation else entry.result.confidence

    def speed_of(entry: GridSearchResult) -> float:
        # Reciprocal of elapsed time (higher is faster); missing time clamps to 1 ms.
        return 1.0 / (entry.result.time_elapsed or 0.001)

    def dominates(acc_a: float, spd_a: float, acc_b: float, spd_b: float) -> bool:
        # a dominates b: at least as good in both dimensions, strictly better in one.
        return acc_a >= acc_b and spd_a >= spd_b and (acc_a > acc_b or spd_a > spd_b)

    frontier: List[GridSearchResult] = []
    for candidate in results:
        cand_acc = accuracy_of(candidate)
        cand_spd = speed_of(candidate)

        if not any(
            dominates(accuracy_of(rival), speed_of(rival), cand_acc, cand_spd)
            for rival in results
            if rival is not candidate
        ):
            frontier.append(candidate)

    return frontier


def grid_search(
kb: KB,
grid: GridSearch,
Expand Down Expand Up @@ -581,4 +775,12 @@ def grid_search(
else:
results.append(GridSearchResult(config=cfg, result=sol))
grid.results = results

# Compute aggregate statistics and synthesized solution
if results:
grid.aggregate_stats = compute_aggregate_stats(results)
grid.synthesized_solution = synthesize_solution(kb, results)
grid.best_config, grid.best_config_metric = find_best_config(results)
grid.pareto_frontier = find_pareto_frontier(results)

return grid
6 changes: 3 additions & 3 deletions tests/__snapshots__/animals/solution.json
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@
}
],
"sub_solutions": [],
"time_started": 1758781080.561839,
"time_finished": 1758781080.607877,
"time_started": 1769821846.459719,
"time_finished": 1769821846.505715,
"timed_out": false,
"time_elapsed": 0.046037912368774414
"time_elapsed": 0.04599595069885254
}
4 changes: 2 additions & 2 deletions tests/__snapshots__/animals/solution.tsv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# BOOMER Solution TSV Output
#
# Metadata:
# generated_date: 2025-09-24T23:18:00.608215
# generated_date: 2026-01-30T17:10:46.506172
# combinations: 10
# satisfiable_combinations: 2
# confidence: 0.8999999999999999
# prior_probability: 0.38742048900000015
# posterior_probability: 0.9
# time_elapsed_seconds: 0.046037912368774414
# time_elapsed_seconds: 0.04599595069885254
# timed_out: False
#
# Format: fact_type followed by arguments, then truth_value and probabilities
Expand Down
6 changes: 3 additions & 3 deletions tests/__snapshots__/animals/solution.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ solved_pfacts:
posterior_prob: 0.0
metadata: null
sub_solutions: []
time_started: 1758781080.561839
time_finished: 1758781080.607877
time_started: 1769821846.459719
time_finished: 1769821846.505715
timed_out: false
time_elapsed: 0.046037912368774414
time_elapsed: 0.04599595069885254
6 changes: 3 additions & 3 deletions tests/__snapshots__/bfo/solution.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"ground_pfacts": [],
"solved_pfacts": [],
"sub_solutions": [],
"time_started": 1758781080.6131022,
"time_finished": 1758781080.61313,
"time_started": 1769821846.516468,
"time_finished": 1769821846.5165062,
"timed_out": false,
"time_elapsed": 2.7894973754882812e-05
"time_elapsed": 3.814697265625e-05
}
4 changes: 2 additions & 2 deletions tests/__snapshots__/bfo/solution.tsv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# BOOMER Solution TSV Output
#
# Metadata:
# generated_date: 2025-09-24T23:18:00.613286
# generated_date: 2026-01-30T17:10:46.516719
# combinations: 0
# satisfiable_combinations: 0
# confidence: 0.0
# prior_probability: 0.0
# posterior_probability: 0.0
# time_elapsed_seconds: 2.7894973754882812e-05
# time_elapsed_seconds: 3.814697265625e-05
# timed_out: False
#
# Format: fact_type followed by arguments, then truth_value and probabilities
Expand Down
6 changes: 3 additions & 3 deletions tests/__snapshots__/bfo/solution.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ proportion_of_combinations_explored: 0.0
ground_pfacts: []
solved_pfacts: []
sub_solutions: []
time_started: 1758781080.6131022
time_finished: 1758781080.61313
time_started: 1769821846.516468
time_finished: 1769821846.5165062
timed_out: false
time_elapsed: 2.7894973754882812e-05
time_elapsed: 3.814697265625e-05
Loading
Loading