Add --analyze mode to three-phase experiment and scope W&B run matching by experiment name

Calvin-Xu · Calvin-Xu · commit 46c07bd0bb05 · 2026-01-19T23:07:40.000-08:00
diff --git a/experiments/domain_phase_mix/analysis.py b/experiments/domain_phase_mix/analysis.py
@@ -49,11 +49,14 @@
 DEFAULT_METRICS = [
     "eval/loss",
     "eval/paloma/c4_en/bpb",
-    "eval/paloma/wikipedia_en/bpb",
-    "eval_harness/gsm8k/acc",
-    "eval_harness/mmlu/acc",
-    "eval_harness/hellaswag/acc",
-    "eval_harness/arc_challenge/acc",
+    "eval/paloma/m2d2_wikipedia_unsplit/bpb",
+    "lm_eval/arc_challenge/acc",
+    "lm_eval/arc_challenge/acc_norm",
+    "lm_eval/hellaswag_0shot/acc",
+    "lm_eval/hellaswag_0shot/acc_norm",
+    "lm_eval/piqa/acc",
+    "lm_eval/boolq/acc",
+    "lm_eval/averages/macro_avg_acc",
 ]
 
 
@@ -114,7 +117,7 @@ def collect_results(config: CollectResultsConfig):
     logger.info(f"Found {len(runs)} W&B runs")
 
     # 3. Match runs to configs by run_id
-    matched = match_runs_to_configs(runs, configs)
+    matched = match_runs_to_configs(runs, configs, experiment_name=experiment_name)
     logger.info(f"Matched {sum(1 for m in matched if m.get('wandb_run_id'))} runs to configs")
 
     # 4. Build DataFrame with all weights and metrics
@@ -203,22 +206,29 @@ def query_wandb_runs(
     return results
 
 
-def match_runs_to_configs(runs: list[dict], configs: list[dict]) -> list[dict]:
+def match_runs_to_configs(
+    runs: list[dict], configs: list[dict], experiment_name: str
+) -> list[dict]:
     """Match W&B runs to weight configurations by run_id pattern.
 
-    Extracts run_id from W&B run names (e.g., "experiment/run_042" -> 42)
-    and matches to the corresponding config.
+    Extracts run_id from W&B run names and matches to the corresponding config.
+    Tries multiple patterns to handle different W&B naming conventions:
+    1. Full path: "pinlin_calvin_xu/data_mixture/3_partitions_3_phases/run_00042"
+    2. Short name: "run_00042-abc123" (W&B may truncate long names)
 
     Args:
         runs: List of W&B run dictionaries.
         configs: List of weight configuration dictionaries.
+        experiment_name: Experiment name prefix to filter runs (required to avoid false positives).
 
     Returns:
         List of matched dictionaries with config + run info.
     """
     # Build lookup from run_id to W&B run
     run_by_id: dict[int, dict] = {}
-    run_id_pattern = re.compile(r"run_(\d+)")
+
+    escaped_name = re.escape(experiment_name)
+    run_id_pattern = re.compile(rf"{escaped_name}/run_(\d+)")
 
     for run in runs:
         name = run.get("wandb_run_name", "")
diff --git a/experiments/domain_phase_mix/three_phase_experiment.py b/experiments/domain_phase_mix/three_phase_experiment.py
@@ -21,14 +21,19 @@
 - Three data domains: pretrain (Nemotron), midtrain (full Dolmino), SFT
 
 Usage:
+    # Run training
     python -m experiments.domain_phase_mix.three_phase_experiment [--n_runs N] [--seed SEED]
+
+    # Run analysis (after training completes)
+    python -m experiments.domain_phase_mix.three_phase_experiment --analyze
 """
 
 import logging
 import os
 
 from experiments.evals.task_configs import CORE_TASKS
 from experiments.domain_phase_mix.proxy_sweep import regmix_60m_proxy
+from experiments.domain_phase_mix.analysis import create_analysis_step
 from marin.execution.executor import executor_main
 
 from experiments.domain_phase_mix.config import PhaseSchedule
@@ -63,7 +68,7 @@
 
 
 def create_three_phase_experiment(
-    name: str = "pinlin_calvin_xu/data_mixture/domain_phase_mix",
+    name: str = "pinlin_calvin_xu/data_mixture/3_partitions_3_phases",
     experiment_budget: int = EXPERIMENT_BUDGET,
     target_budget: int = TARGET_BUDGET,
     batch_size: int = BATCH_SIZE,
@@ -117,40 +122,55 @@ def create_three_phase_experiment(
 def main(
     n_runs: int = 100,
     seed: int = 42,
-    name_prefix: str = "pinlin_calvin_xu/data_mixture/domain_phase_mix",
+    name_prefix: str = "pinlin_calvin_xu/data_mixture/3_partitions_3_phases",
+    analyze: bool = False,
 ):
     """Main entry point for running the swarm experiment.
 
     Args:
         n_runs: Number of training runs.
         seed: Random seed for weight sampling.
         name_prefix: Prefix for run names.
+        analyze: If True, skip training and run only the weight-configs and analysis steps (collect results from W&B).
     """
     if os.getenv("CI", None) is not None:
         logger.info("Skipping experiment execution on CI environment.")
         return
 
-    # Create experiment
     experiment = create_three_phase_experiment(name=name_prefix)
 
-    # Create steps (weight_configs_step saves to GCS, training_steps run the models)
     weight_configs_step, training_steps = experiment.create_swarm_steps(
         n_runs=n_runs, seed=seed, name_prefix=name_prefix
     )
+    
+    analysis_step = create_analysis_step(
+        weight_configs_step=weight_configs_step,
+        name_prefix=name_prefix,
+    )
+
+    if analyze:
+        # Analysis-only mode: skip training steps; weight_configs_step stays because analysis_step takes it as input
+        logger.info("Running analysis only (collecting results from W&B)")
+        all_steps = [weight_configs_step, analysis_step]
+        executor_main(
+            steps=all_steps,
+            description=f"Analysis for {name_prefix}",
+        )
+        return
 
     # Log experiment details
     tokens_per_step = BATCH_SIZE * SEQ_LEN
     total_steps = EXPERIMENT_BUDGET // tokens_per_step
     phase1_end = int(total_steps * PHASE_BOUNDARIES[0])
     phase2_end = int(total_steps * PHASE_BOUNDARIES[1])
 
-    logger.info(f"Created {len(training_steps)} training steps + 1 weight configs step")
+    logger.info(f"Created {len(training_steps)} training steps + 1 weight configs step + 1 analysis step")
     logger.info(f"Total tokens per run: {EXPERIMENT_BUDGET:,}")
     logger.info(f"Total steps per run: {total_steps:,}")
     logger.info(f"Phase boundaries: step {phase1_end} (33%), step {phase2_end} (67%)")
 
-    # All steps: weight configs first, then training runs
-    all_steps = [weight_configs_step, *training_steps]
+    # All steps: weight configs first, then training runs, then analysis
+    all_steps = [weight_configs_step, *training_steps, analysis_step]
 
     executor_main(
         steps=all_steps,
@@ -177,9 +197,14 @@ def _parse_args():
     parser.add_argument(
         "--name_prefix",
         type=str,
-        default="pinlin_calvin_xu/data_mixture/domain_phase_mix",
+        default="pinlin_calvin_xu/data_mixture/3_partitions_3_phases",
         help="Prefix for run names.",
     )
+    parser.add_argument(
+        "--analyze",
+        action="store_true",
+        help="Run analysis only (collect results from W&B and export CSV).",
+    )
     return parser.parse_known_args()
 
 
@@ -193,4 +218,5 @@ def _parse_args():
         n_runs=args.n_runs,
         seed=args.seed,
         name_prefix=args.name_prefix,
+        analyze=args.analyze,
     )