Skip to content

Commit 80c49b4

Browse files
committed
Add plugin implementation for conservation eval during training
1 parent 1fa3a22 commit 80c49b4

File tree

8 files changed: +396 −190 lines changed

8 files changed: +396 −190 lines changed

experiments/defaults.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,7 @@ def default_train(
386386
data_seed=train_config.data_seed,
387387
eval_harness_steps=train_config.steps_per_task_eval or 10000,
388388
eval_harness=harness_config,
389+
eval_plugins=train_config.eval_plugins,
389390
)
390391

391392
# Create the pod config

experiments/plantcad/evaluation.py

Lines changed: 50 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import json
2323
import dataclasses
2424
from dataclasses import dataclass
25-
from typing import Any
2625
from collections.abc import Callable
2726
from datasets import Dataset
2827

@@ -38,7 +37,6 @@
3837
from huggingface_hub import HfApi
3938
from transformers import AutoModelForCausalLM
4039
from levanter.callbacks import StepInfo
41-
from levanter.utils.tree_utils import inference_mode
4240
from marin.utilities.json_encoder import CustomJsonEncoder
4341

4442
from experiments.plantcad.utils import get_available_gpus, get_nucleotide_token_ids, get_plantcad_tokenizer
@@ -48,21 +46,12 @@
4846

4947

5048
@dataclass
51-
class DnaEvalConfig:
52-
"""Configuration for DNA model evolutionary conservation evaluation"""
53-
54-
checkpoint_path: str | InputName
55-
"""Path to the model checkpoint directory"""
49+
class DnaEvalBaseConfig:
50+
"""Base configuration for DNA evaluation with fields needed for training callbacks"""
5651

5752
model_config: str
5853
"""Model configuration size (e.g., '300m', '100m', etc.)"""
5954

60-
device: str = "cuda"
61-
"""Device to use for model inference (e.g., 'cuda', 'cpu')"""
62-
63-
dtype: str | None = None
64-
"""Dtype to use for model inference (e.g., 'float32', 'float16', 'bfloat16' or any torch dtype)"""
65-
6655
dataset_path: str = "plantcad/evolutionary-constraint-example"
6756
"""Dataset repository path"""
6857

@@ -75,15 +64,29 @@ class DnaEvalConfig:
7564
batch_size: int = 32
7665
"""Batch size to use for inference"""
7766

78-
num_workers: int | None = None
79-
"""Number of workers to use for parallel evaluation (defaults to number of GPUs if None)"""
80-
8167
max_samples: int | None = None
8268
"""Maximum number of samples to evaluate (for quick testing)"""
8369

84-
random_seed: int = versioned(42)
70+
random_seed: int = 42
8571
"""Random seed for data shuffling prior to downsampling"""
8672

73+
74+
@dataclass
75+
class DnaEvalConfig(DnaEvalBaseConfig):
76+
"""Configuration for standalone DNA model evolutionary conservation evaluation"""
77+
78+
checkpoint_path: str | InputName | None = None
79+
"""Path to the model checkpoint directory (None for training callbacks)"""
80+
81+
device: str = "cuda"
82+
"""Device to use for model inference (e.g., 'cuda', 'cpu')"""
83+
84+
dtype: str | None = None
85+
"""Dtype to use for model inference (e.g., 'float32', 'float16', 'bfloat16' or any torch dtype)"""
86+
87+
num_workers: int | None = None
88+
"""Number of workers to use for parallel evaluation (defaults to number of GPUs if None)"""
89+
8790
revision: str = versioned("0.1")
8891
"""Revision number to force re-runs when needed"""
8992

@@ -406,6 +409,7 @@ def score_eval_dataset(
406409
eval_dataset: Dataset,
407410
logit_function: Callable[[TokenArray], LogitArray],
408411
batch_size: int = 32,
412+
log_progress: bool = True,
409413
) -> ConservationResult:
410414
"""Score evaluation dataset based on zero-shot conservation prediction."""
411415

@@ -420,7 +424,8 @@ def score_eval_dataset(
420424
batches = eval_dataset.with_format(None).batch(batch_size=batch_size)
421425
total_batches = len(batches)
422426
progress_interval = max(1, total_batches // 20) # Every 5%
423-
logger.info(f"Processing {len(eval_dataset)} samples in {total_batches} batches (batch_size={batch_size})")
427+
if log_progress:
428+
logger.info(f"Processing {len(eval_dataset)} samples in {total_batches} batches (batch_size={batch_size})")
424429

425430
for batch_index, batch_data in enumerate(batches):
426431
# Tokenize sequences
@@ -451,7 +456,7 @@ def score_eval_dataset(
451456
total_processed += len(sequences)
452457

453458
# Log progress every 5% of batches
454-
if batch_index % progress_interval == 0 or batch_index == total_batches - 1:
459+
if log_progress and (batch_index % progress_interval == 0 or batch_index == total_batches - 1):
455460
progress_pct = ((batch_index + 1) / total_batches) * 100
456461
logger.info(
457462
f"Progress: {batch_index + 1}/{total_batches} batches ({progress_pct:.1f}%) - "
@@ -466,44 +471,7 @@ def score_eval_dataset(
466471
# ------------------------------------------------------------------------------------------------
467472

468473

469-
def evaluate_dna_conservation(
470-
tokenizer: AutoTokenizer,
471-
logit_function: Callable[[Any], Any],
472-
eval_dataset: Dataset,
473-
batch_size: int = 32,
474-
step: int | None = None,
475-
) -> dict[str, float]:
476-
"""
477-
Core evaluation logic - works for both training callbacks and standalone evaluation.
478-
479-
Args:
480-
logit_function: Function that takes tokens and returns logits
481-
eval_dataset: HuggingFace dataset with 'seq' field and binary 'label' field
482-
batch_size: Batch size for evaluation
483-
step: Training step (for logging), None for standalone
484-
485-
Returns:
486-
Dictionary with evaluation metrics including ROC AUC
487-
"""
488-
# Collect scores and labels using shared function
489-
result = score_eval_dataset(
490-
tokenizer=tokenizer, logit_function=logit_function, eval_dataset=eval_dataset, batch_size=batch_size
491-
)
492-
493-
# Calculate metrics using shared function
494-
results = evaluate_conservation_scores(result)
495-
496-
# Log during training, log for standalone
497-
if step is not None:
498-
levanter.tracker.log({"eval/dna_conservation/roc": results["roc_auc"]}, step=step)
499-
logger.info(f"Step {step}: ROC AUC = {results['roc_auc']:.3f}")
500-
else:
501-
logger.info(f"ROC AUC = {results['roc_auc']:.4f} ({results['n_total']} valid nucleotides)")
502-
503-
return results
504-
505-
506-
def create_dna_eval_callback(config: DnaEvalConfig) -> Callable[[StepInfo], None]:
474+
def create_dna_eval_callback(config: DnaEvalBaseConfig) -> Callable[[StepInfo], None]:
507475
"""Create a training callback for DNA evaluation."""
508476

509477
# Load tokenizer
@@ -514,25 +482,39 @@ def create_dna_eval_callback(config: DnaEvalConfig) -> Callable[[StepInfo], None
514482
dataset = load_eval_dataset(config)
515483

516484
def dna_conservation_callback(step_info: StepInfo) -> None:
517-
# Put model in inference mode
518-
eval_model = inference_mode(step_info.state.model, True)
485+
logger.info(f"Running PlantCAD DNA conservation evaluation (step={step_info.step})")
486+
eval_model = step_info.state.eval_model
519487

520488
# Create logit function for Levanter model
521489
def logit_function(
522490
tokens: ht.Int[ht.NamedArray, "batch position"],
523491
) -> ht.Float[ht.NamedArray, "batch position vocab"]:
524-
# TODO: validate input / output types
525492
return eval_model(tokens)
526493

527-
# Run evaluation
528-
evaluate_dna_conservation(
494+
# Compute scores with binary labels
495+
scores = score_eval_dataset(
529496
tokenizer=tokenizer,
530497
logit_function=logit_function,
531-
eval_dataset=dataset, # Use the loaded dataset
498+
eval_dataset=dataset,
532499
batch_size=config.batch_size,
500+
log_progress=False,
501+
)
502+
503+
# Evaluate scores and labels
504+
metrics = evaluate_conservation_scores(scores)
505+
506+
# Log results
507+
levanter.tracker.log(
508+
{
509+
"eval/dna_conservation_roc": metrics["roc_auc"],
510+
},
533511
step=step_info.step,
534512
)
535513

514+
logger.info(
515+
f"PlantCAD evaluation complete: ROC AUC = {metrics['roc_auc']:.4f}, " f"n_samples = {metrics['n_total']}"
516+
)
517+
536518
return dna_conservation_callback
537519

538520

@@ -614,7 +596,11 @@ def logit_function(
614596

615597
# Generate raw conservation scores and labels
616598
result = score_eval_dataset(
617-
tokenizer=tokenizer, logit_function=logit_function, eval_dataset=dataset, batch_size=config.batch_size
599+
tokenizer=tokenizer,
600+
logit_function=logit_function,
601+
eval_dataset=dataset,
602+
batch_size=config.batch_size,
603+
log_progress=True,
618604
)
619605

620606
logger.info(f"Generated {len(result.scores)} conservation scores")
@@ -640,10 +626,7 @@ def evaluate_conservation_scores(scores: ConservationResult) -> dict[str, float]
640626
if len(scores.scores) == 0:
641627
raise ValueError("No valid conservation scores found")
642628

643-
# Log total before filtering and filter out NaN scores
644629
n_unmasked_total = len(scores.scores)
645-
logger.info(f"n_unmasked_total: {n_unmasked_total}")
646-
647630
valid_mask = ~np.isnan(scores.scores)
648631
filtered_scores = np.array(scores.scores)[valid_mask]
649632
filtered_labels = np.array(scores.labels)[valid_mask]

0 commit comments

Comments (0)