Add YAML configuration/parsing of Extraction Configuration (#12)

patricktnast · web-flow · commit 72e75529447e · 2026-01-16T09:29:26.000-08:00
* [COPILOT] refactor extraction code to separate module

* format

* [COPILOT] consolidate benchmark and phase configs.

* refactor extraction to create a 'configuration'

* remove unused imports

* fix method sig

* minor fixes

* cleanup

* add basic unit tests

* add back result summary columns

* make callpattern more ergonomic

* condense

* add cli for summarization

* [COPILOT] Add tests

* edits for readability

* change nan check to warning

* format

* add summarize run at the end of the run_benchmark loop

* [COPILOT] extract plotting functions to new module

* [COPILOT] refactor plots

* adjust so that we only create fractions for bottleneck patterns, which are defined in a particular way.

* make bottleneck patterns more strict

* [COPILOT] add nb generation

* adjust organization

* format

* [COPILOT] add explicit extraction configuration

* remove preset

* remove other examples

* consolidate tests

* format

* add context manager

* use a fixture instead

* fix typo

* remove duplicate param

* rename callpattern

* add line number to extraction

* add test to ensure we can select correct line

* use pipeline call as ex instead

* format

* updates

* format

* rename callpattern

* adjust test

* add name to tests
diff --git a/extraction_config_example.yaml b/extraction_config_example.yaml
@@ -0,0 +1,145 @@
+# Example Extraction Configuration
+#
+# This file demonstrates the YAML syntax for configuring extraction patterns
+# that define which profiling metrics to extract from benchmark results.
+#
+# Usage:
+#   run_benchmark -m models/*.yaml -r 10 -b 20 --extraction-config extraction_config.yaml
+#   summarize results/profile_*/benchmark_results.csv --extraction-config extraction_config.yaml
+#
+# Each pattern defines:
+# - What function to match in cProfile output (filename + function_name)
+# - Which metrics to extract (cumtime, percall, ncalls) - all optional with defaults
+# - How to name the resulting columns (templates) - optional with defaults
+#
+# Common use cases:
+#
+# 1. Bottlenecks: Extract all 3 metrics (cumtime, percall, ncalls)
+#    Use for performance hotspots you want to track in detail
+#
+# 2. Simulation phases: Extract only cumtime with "rt_{name}_s" column naming
+#    Use for high-level simulation phases
+#
+# 3. Custom patterns: Mix and match metrics and templates as needed
+
+patterns:
+  # ===========================================================================
+  # BOTTLENECK PATTERNS
+  # ===========================================================================
+  # These extract cumtime, percall, and ncalls for detailed bottleneck analysis
+  
+  - name: gather_results
+    filename: results/manager.py
+    function_name: gather_results
+    extract_cumtime: true
+    extract_percall: true
+    extract_ncalls: true
+    # Generates columns: gather_results_cumtime, gather_results_percall, gather_results_ncalls
+
+  - name: pipeline_call
+    filename: values/pipeline.py
+    function_name: __call__
+    extract_cumtime: true
+    extract_percall: true
+    extract_ncalls: true
+    # Generates columns: pipeline_call_cumtime, pipeline_call_percall, pipeline_call_ncalls
+
+  - name: population_get
+    filename: population/population_view.py
+    function_name: get
+    extract_cumtime: true
+    extract_percall: true
+    extract_ncalls: true
+    # Generates columns: population_get_cumtime, population_get_percall, population_get_ncalls
+
+  # ===========================================================================
+  # SIMULATION PHASE PATTERNS
+  # ===========================================================================
+  # These extract only cumtime for high-level simulation phases
+  # Uses custom template for runtime column naming: rt_{name}_s
+  
+  - name: setup
+    filename: /vivarium/framework/engine.py
+    function_name: setup
+    extract_cumtime: true
+    cumtime_template: "rt_{name}_s"
+    # extract_percall and extract_ncalls default to false
+    # Generates column: rt_setup_s
+
+  - name: initialize_simulants
+    filename: /vivarium/framework/engine.py
+    function_name: initialize_simulants
+    extract_cumtime: true
+    cumtime_template: "rt_{name}_s"
+    # Generates column: rt_initialize_simulants_s
+
+  - name: run
+    filename: /vivarium/framework/engine.py
+    function_name: run
+    extract_cumtime: true
+    cumtime_template: "rt_{name}_s"
+    # Generates column: rt_run_s
+
+  - name: finalize
+    filename: /vivarium/framework/engine.py
+    function_name: finalize
+    extract_cumtime: true
+    cumtime_template: "rt_{name}_s"
+    # Generates column: rt_finalize_s
+
+  - name: report
+    filename: /vivarium/framework/engine.py
+    function_name: report
+    extract_cumtime: true
+    cumtime_template: "rt_{name}_s"
+    # Generates column: rt_report_s
+
+  # ===========================================================================
+  # CUSTOM PATTERN EXAMPLES
+  # ===========================================================================
+  
+  # Example: Custom function with selective metric extraction and custom templates
+  # - name: my_bottleneck
+  #   filename: my/module.py
+  #   function_name: my_function
+  #   extract_cumtime: true
+  #   extract_percall: true
+  #   extract_ncalls: false
+  #   cumtime_template: "{name}_total_time"
+  #   percall_template: "{name}_avg_time"
+  #   # Generates columns: my_bottleneck_total_time, my_bottleneck_avg_time
+  
+  # Example: Minimal pattern (only cumtime with default template)
+  # - name: simple_func
+  #   filename: simple/module.py
+  #   function_name: simple_function
+  #   # extract_cumtime defaults to true
+  #   # extract_percall and extract_ncalls default to false
+  #   # cumtime_template defaults to "{name}_cumtime"
+  #   # Generates column: simple_func_cumtime
+
+# ===========================================================================
+# FIELD REFERENCE
+# ===========================================================================
+#
+# Required fields:
+#   name: Logical name used in column name templates (e.g., "setup", "gather_results")
+#   filename: Path pattern to match the source file (can be partial path)
+#   function_name: Name of the function to match in cProfile output
+#
+# Optional fields (with defaults):
+#   extract_cumtime: Extract cumulative time (default: true)
+#   extract_percall: Extract time per call (default: false)
+#   extract_ncalls: Extract number of calls (default: false)
+#   cumtime_template: Column name template for cumtime (default: "{name}_cumtime")
+#   percall_template: Column name template for percall (default: "{name}_percall")
+#   ncalls_template: Column name template for ncalls (default: "{name}_ncalls")
+#
+# Template variables:
+#   {name}: Replaced with the pattern's name field
+#
+# File path matching:
+#   - Patterns match any path ending with the specified filename
+#   - Use forward slashes even on Windows
+#   - Special regex characters (., *, etc.) are automatically escaped
+#   - Example: "results/manager.py" matches "/full/path/to/results/manager.py"
diff --git a/src/vivarium_profiling/tools/cli.py b/src/vivarium_profiling/tools/cli.py
@@ -11,6 +11,7 @@
 
 from vivarium_profiling.constants import metadata, paths
 from vivarium_profiling.tools import build_artifacts, configure_logging_to_terminal
+from vivarium_profiling.tools.extraction import ExtractionConfig
 from vivarium_profiling.tools.run_benchmark import run_benchmark_loop
 from vivarium_profiling.tools.summarize import run_summarize_analysis
 
@@ -228,6 +229,11 @@ def make_artifacts(
     type=click.Path(exists=True, file_okay=False, dir_okay=True),
     help="Directory where the timestamped results directory will be created.",
 )
+@click.option(
+    "--extraction-config",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+    help="Path to YAML file defining extraction patterns. If not provided, uses default patterns.",
+)
 @click.option("-v", "verbose", count=True, help="Configure logging verbosity.")
 @click.option(
     "--pdb",
@@ -240,6 +246,7 @@ def run_benchmark(
     model_runs: int,
     baseline_model_runs: int,
     output_dir: str,
+    extraction_config: str | None,
     verbose: int,
     with_debugger: bool,
 ) -> None:
@@ -256,7 +263,14 @@ def run_benchmark(
 
     # Run benchmarks with error handling
     main = handle_exceptions(run_benchmark_loop, logger, with_debugger=with_debugger)
-    main(model_specifications, model_runs, baseline_model_runs, output_dir, verbose)
+    main(
+        model_specifications,
+        model_runs,
+        baseline_model_runs,
+        output_dir,
+        extraction_config,
+        verbose,
+    )
 
 
 def _expand_model_specs(model_patterns: list[str]) -> list[Path]:
@@ -293,6 +307,11 @@ def _expand_model_specs(model_patterns: list[str]) -> list[Path]:
     is_flag=True,
     help="Drop into python debugger if an error occurs.",
 )
+@click.option(
+    "--extraction-config",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+    help="Path to YAML file defining extraction patterns. If not provided, uses default patterns.",
+)
 @click.option(
     "--nb",
     is_flag=True,
@@ -305,6 +324,7 @@ def summarize(
     benchmark_results: str,
     verbose: int,
     with_debugger: bool,
+    extraction_config: str | None,
     nb: bool,
 ) -> None:
     """Summarize benchmark results and create visualizations.
@@ -327,6 +347,12 @@ def summarize(
         summarize results/profile_2026_01_07/benchmark_results.csv
     """
     configure_logging_to_terminal(verbose)
+
+    # Parse extraction config if provided
+    config = None
+    if extraction_config is not None:
+        config = ExtractionConfig.from_yaml(extraction_config)
+
     benchmark_results_path = Path(benchmark_results)
     main = handle_exceptions(run_summarize_analysis, logger, with_debugger=with_debugger)
-    main(benchmark_results_path, nb=nb)
+    main(benchmark_results_path, config=config, nb=nb)
diff --git a/src/vivarium_profiling/tools/extraction.py b/src/vivarium_profiling/tools/extraction.py
@@ -13,9 +13,21 @@
 from dataclasses import dataclass
 from itertools import chain
 from pathlib import Path
+from typing import Any
 
+import yaml
 from loguru import logger
 
+# Default values for optional FunctionCallConfiguration fields (overridden per pattern in from_yaml)
+DEFAULT_PATTERN_CONFIG = {
+    "extract_cumtime": True,
+    "extract_percall": False,
+    "extract_ncalls": False,
+    "cumtime_template": "{name}_cumtime",
+    "percall_template": "{name}_percall",
+    "ncalls_template": "{name}_ncalls",
+}
+
 
 @dataclass
 class FunctionCallConfiguration:
@@ -203,6 +215,78 @@ def __init__(self, patterns: list[FunctionCallConfiguration] | None = None):
             patterns = DEFAULT_BOTTLENECKS + DEFAULT_PHASES
         self.patterns = patterns
 
+    @classmethod
+    def from_yaml(cls, yaml_path: str | Path) -> ExtractionConfig:
+        """Create an ExtractionConfig from a YAML file.
+
+        Parameters
+        ----------
+        yaml_path
+            Path to the YAML configuration file.
+
+        Returns
+        -------
+            ExtractionConfig object with patterns defined in the YAML file.
+
+        Raises
+        ------
+        ValueError
+            If the YAML file is invalid or missing required fields.
+        FileNotFoundError
+            If the YAML file doesn't exist.
+
+        Examples
+        --------
+        YAML format::
+
+            patterns:
+              - name: custom_func
+                filename: my/module.py
+                function_name: my_function
+                extract_cumtime: true
+                extract_percall: true
+                cumtime_template: "custom_{name}_time"
+
+        """
+        yaml_path = Path(yaml_path)
+        if not yaml_path.exists():
+            raise FileNotFoundError(f"YAML config file not found: {yaml_path}")
+
+        with open(yaml_path, "r") as f:
+            config_data = yaml.safe_load(f)
+
+        if not isinstance(config_data, dict) or "patterns" not in config_data:
+            raise ValueError(
+                "YAML file must contain a 'patterns' key with a list of pattern definitions"
+            )
+
+        patterns = []
+        for i, pattern_dict in enumerate(config_data["patterns"]):
+            if not isinstance(pattern_dict, dict):
+                raise ValueError(f"Pattern at index {i} must be a dictionary")
+
+            # Merge defaults with user config
+            merged_config = DEFAULT_PATTERN_CONFIG.copy()
+            merged_config.update(pattern_dict)
+
+            # Validate required fields
+            if "name" not in merged_config:
+                raise ValueError(f"Pattern at index {i} is missing required field 'name'")
+
+            name = merged_config["name"]
+
+            if "filename" not in merged_config or "function_name" not in merged_config:
+                raise ValueError(
+                    f"Pattern '{name}' requires 'filename' and 'function_name' fields"
+                )
+
+            # Create FunctionCallConfiguration from merged config
+            pattern = FunctionCallConfiguration(**merged_config)
+
+            patterns.append(pattern)
+
+        return cls(patterns=patterns)
+
     @property
     def metric_names(self) -> list[str]:
         """Get the names of all configured metrics."""
diff --git a/src/vivarium_profiling/tools/run_benchmark.py b/src/vivarium_profiling/tools/run_benchmark.py
@@ -148,6 +148,7 @@ def run_benchmark_loop(
     model_runs: int,
     baseline_model_runs: int,
     output_dir: str = ".",
+    extraction_config: str | Path | None = None,
     verbose: int = 0,
 ) -> str:
     """Main function to run benchmarks on model specifications.
@@ -162,17 +163,20 @@ def run_benchmark_loop(
         Number of runs for the baseline model.
     output_dir
         Directory to save results.
+    extraction_config
+        Path to YAML file defining extraction patterns. If None, uses default patterns.
     verbose
         Verbosity level for logging.
-    config
-        Extraction configuration. Defaults to DEFAULT_CONFIG.
 
     Returns
     -------
         Path to the results directory.
 
     """
-    config = ExtractionConfig()
+    if extraction_config is not None:
+        config = ExtractionConfig.from_yaml(extraction_config)
+    else:
+        config = ExtractionConfig()
 
     configure_logging_to_terminal(verbose)
 
diff --git a/tests/test_extraction.py b/tests/test_extraction.py