Commit 602c62d

Analytics cli (#894)

* run-stuff is straight copy of run-csv-items
* only 1 sut per run
* fix small bug
* correct output directory structure
* The runners are responsible for setting up the output directory
* write empty metadata
* rename sub_dir_name to run_id
* basic metadata
* annotation and response counts in metadata
* No more --cache-dir
* run-stuff is now run-csv
* black
* mypy
* more details about how validate_uid works
* log instead of print
* use logger instead of warnings in main
* rename to run-job

1 parent 18e16c1 commit 602c62d

File tree: 6 files changed (+430, -18 lines)

src/modelgauge/annotation_pipeline.py (+6)

```diff
@@ -141,19 +141,25 @@ def handle_uncached_item(self, item):
 
 class AnnotatorSink(Sink):
     unfinished: defaultdict[SutInteraction, dict[str, str]]
+    sut_response_counts: defaultdict[str, int]
+    annotation_counts: defaultdict[str, int]
 
     def __init__(self, annotators: dict[str, Annotator], writer: JsonlAnnotatorOutput):
         super().__init__()
         self.annotators = annotators
         self.writer = writer
         self.unfinished = defaultdict(lambda: dict())
+        self.sut_response_counts = defaultdict(lambda: 0)
+        self.annotation_counts = defaultdict(lambda: 0)
 
     def run(self):
         with self.writer:
             super().run()
 
     def handle_item(self, item):
         sut_interaction, annotator_uid, annotation = item
+        self.sut_response_counts[sut_interaction.sut_uid] += 1
+        self.annotation_counts[annotator_uid] += 1
         if isinstance(annotation, BaseModel):
             annotation = annotation.model_dump()
         self.unfinished[sut_interaction][annotator_uid] = annotation
```
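The counters follow the standard `defaultdict` tally pattern: one increment per item passing through `handle_item`, keyed by SUT or annotator UID. A minimal standalone sketch (UIDs are hypothetical):

```python
# Sketch of the tally pattern AnnotatorSink now uses. Each item reaching the
# sink is one (response, annotator) pair, so a SUT response is counted once
# per annotation of it. The UIDs here are hypothetical.
from collections import defaultdict

sut_response_counts = defaultdict(lambda: 0)
annotation_counts = defaultdict(lambda: 0)

items = [("demo_sut", "annotator_a"), ("demo_sut", "annotator_b")]
for sut_uid, annotator_uid in items:
    sut_response_counts[sut_uid] += 1
    annotation_counts[annotator_uid] += 1

print(dict(sut_response_counts))  # {'demo_sut': 2}
print(dict(annotation_counts))    # {'annotator_a': 1, 'annotator_b': 1}
```

These counts are what the pipeline runners read back when assembling run metadata (see pipeline_runner.py below).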

src/modelgauge/command_line.py (+3)

```diff
@@ -101,7 +101,10 @@ def validate_uid(ctx, param, value):
     """Callback function for click.option UID validation.
     Raises a BadParameter exception if the user-supplied arg(s) are not valid UIDs.
     Applicable for parameters '--sut', '--test', and '--annotator'.
+    If no UID is provided (e.g. an empty list or `None`), the value is returned as-is.
     """
+    if not value:
+        return value
     # Identify what object we are validating UIDs for.
     if "--sut" in param.opts:
         registry = SUTS
```
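The early return matters because both UID options are `required=False`: click hands the callback `None` for an omitted single-value option and an empty tuple for an omitted `multiple=True` option. A minimal standalone sketch of the guard (the real callback goes on to look the UIDs up in a registry):

```python
# Minimal sketch of the early-return guard, outside modelgauge. With
# required=False, click passes None (single option) or () (multiple=True)
# when the user supplies nothing, and validation should be skipped.
import click

def validate_uid(ctx, param, value):
    if not value:  # None or empty tuple: nothing to validate
        return value
    # ...the real callback validates the UID(s) against a registry here...
    return value

@click.command()
@click.option("--sut", "sut_uid", required=False, callback=validate_uid)
def cmd(sut_uid):
    click.echo(f"sut={sut_uid}")
```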

src/modelgauge/main.py (+108, -8)

```diff
@@ -1,4 +1,4 @@
-import datetime
+import logging
 import os
 import pathlib
 import warnings
@@ -35,6 +35,8 @@
 from modelgauge.sut_registry import SUTS
 from modelgauge.test_registry import TESTS
 
+logger = logging.getLogger(__name__)
+
 
 @modelgauge_cli.command(name="list")
 @LOCAL_PLUGIN_DIR_OPTION
@@ -233,6 +235,109 @@ def run_test(
     print("Full TestRecord json written to", output_file)
 
 
+@modelgauge_cli.command()
+@sut_options_options
+@click.option(
+    "sut_uid",
+    "-s",
+    "--sut",
+    help="Which registered SUT to run.",
+    multiple=False,
+    required=False,
+    callback=validate_uid,
+)
+@click.option(
+    "annotator_uids",
+    "-a",
+    "--annotator",
+    help="Which registered annotator(s) to run",
+    multiple=True,
+    required=False,
+    callback=validate_uid,
+)
+@click.option(
+    "--workers",
+    type=int,
+    default=None,
+    help="Number of worker threads, default is 10 * number of SUTs.",
+)
+@click.option(
+    "--output-dir",
+    "-o",
+    default="airr_data/runs",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
+)
+@click.option("--tag", type=str, help="Tag to include in the output directory name.")
+@click.option("--debug", is_flag=True, help="Show internal pipeline debugging information.")
+@click.argument(
+    "input_path",
+    type=click.Path(exists=True, path_type=pathlib.Path),
+)
+def run_job(sut_uid, annotator_uids, workers, output_dir, tag, debug, input_path, max_tokens, temp, top_p, top_k):
+    """Run rows in a CSV through some SUTs and/or annotators.
+
+    If running a SUT, the file must have 'UID' and 'Text' columns. The output will be saved to a CSV file.
+    If running ONLY annotators, the file must have 'UID', 'Prompt', 'SUT', and 'Response' columns. The output will be saved to a json lines file.
+    """
+    # Check all objects for missing secrets.
+    secrets = load_secrets_from_config()
+    if sut_uid:
+        check_secrets(secrets, sut_uids=[sut_uid], annotator_uids=annotator_uids)
+    else:
+        check_secrets(secrets, annotator_uids=annotator_uids)
+
+    if sut_uid:
+        sut = SUTS.make_instance(sut_uid, secrets=secrets)
+        if AcceptsTextPrompt not in sut.capabilities:
+            raise click.BadParameter(f"{sut_uid} does not accept text prompts")
+        suts = {sut_uid: sut}
+
+    annotators = {
+        annotator_uid: ANNOTATORS.make_instance(annotator_uid, secrets=secrets) for annotator_uid in annotator_uids
+    }
+
+    # Get all SUT options
+    sut_options = create_sut_options(max_tokens, temp, top_p, top_k)
+
+    # Create correct pipeline runner based on input.
+    if sut_uid and annotators:
+        pipeline_runner = PromptPlusAnnotatorRunner(
+            workers,
+            input_path,
+            output_dir,
+            None,
+            sut_options,
+            tag,
+            suts=suts,
+            annotators=annotators,
+        )
+    elif sut_uid:
+        pipeline_runner = PromptRunner(workers, input_path, output_dir, None, sut_options, tag, suts=suts)
+    elif annotators:
+        if max_tokens is not None or temp is not None or top_p is not None or top_k is not None:
+            logger.warning(f"Received SUT options but only running annotators. Options will not be used.")
+        pipeline_runner = AnnotatorRunner(workers, input_path, output_dir, None, None, tag, annotators=annotators)
+    else:
+        raise ValueError("Must specify at least one SUT or annotator.")
+
+    with click.progressbar(
+        length=pipeline_runner.num_total_items,
+        label=f"Processing {pipeline_runner.num_input_items} input items"
+        + (f" * 1 SUT" if sut_uid else "")
+        + (f" * {len(annotators)} annotators" if annotators else "")
+        + ":",
+    ) as bar:
+        last_complete_count = 0
+
+        def show_progress(data):
+            nonlocal last_complete_count
+            complete_count = data["completed"]
+            bar.update(complete_count - last_complete_count)
+            last_complete_count = complete_count
+
+        pipeline_runner.run(show_progress, debug)
+
+
 @modelgauge_cli.command()
 @sut_options_options
 @click.option(
@@ -300,9 +405,8 @@ def run_csv_items(sut_uids, annotator_uids, workers, cache_dir, debug, input_pat
     print(sut_options)
 
     # Create correct pipeline runner based on input.
-    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+    output_path = input_path.parent
     if suts and annotators:
-        output_path = input_path.parent / pathlib.Path(input_path.stem + "-annotated-responses" + timestamp + ".jsonl")
         pipeline_runner = PromptPlusAnnotatorRunner(
             workers,
             input_path,
@@ -313,13 +417,11 @@ def run_csv_items(sut_uids, annotator_uids, workers, cache_dir, debug, input_pat
             annotators=annotators,
         )
     elif suts:
-        output_path = input_path.parent / pathlib.Path(input_path.stem + "-responses-" + timestamp + ".csv")
         pipeline_runner = PromptRunner(workers, input_path, output_path, cache_dir, sut_options, suts=suts)
     elif annotators:
         if max_tokens is not None or temp is not None or top_p is not None or top_k is not None:
             warnings.warn(f"Received SUT options but only running annotators. Options will not be used.")
-        output_path = input_path.parent / pathlib.Path(input_path.stem + "-annotations-" + timestamp + ".jsonl")
-        pipeline_runner = AnnotatorRunner(workers, input_path, output_path, cache_dir, annotators=annotators)
+        pipeline_runner = AnnotatorRunner(workers, input_path, output_path, cache_dir, None, annotators=annotators)
     else:
         raise ValueError("Must specify at least one SUT or annotator.")
 
@@ -340,8 +442,6 @@ def show_progress(data):
 
         pipeline_runner.run(show_progress, debug)
 
-    print(f"output saved to {output_path}")
-
 
 def main():
     modelgauge_cli()
```
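The new command can be smoke-tested with click's own test runner; a sketch with hypothetical SUT/annotator UIDs and input file, assuming the corresponding secrets are configured:

```python
# Sketch of invoking the new run-job command via click.testing.CliRunner.
# "demo_sut", "demo_annotator", and prompts.csv are hypothetical placeholders.
from click.testing import CliRunner

from modelgauge.main import modelgauge_cli

runner = CliRunner()
result = runner.invoke(
    modelgauge_cli,
    [
        "run-job",
        "--sut", "demo_sut",              # hypothetical SUT UID
        "--annotator", "demo_annotator",  # hypothetical annotator UID
        "--tag", "smoke",
        "-o", "airr_data/runs",
        "prompts.csv",                    # needs 'UID' and 'Text' columns
    ],
)
print(result.output)
```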

src/modelgauge/pipeline_runner.py (+120, -4)

```diff
@@ -1,4 +1,7 @@
 from abc import ABC, abstractmethod
+import datetime
+import json
+import logging
 
 from modelgauge.annotation_pipeline import (
     AnnotatorAssigner,
@@ -18,15 +21,20 @@
     CsvPromptOutput,
 )
 
+logger = logging.getLogger(__name__)
+
 
 class PipelineRunner(ABC):
-    def __init__(self, num_workers, input_path, output_path, cache_dir, sut_options=None):
+    def __init__(self, num_workers, input_path, output_dir, cache_dir, sut_options, tag=None):
         self.num_workers = num_workers
         self.input_path = input_path
-        self.output_path = output_path
+        self.root_dir = output_dir
         self.cache_dir = cache_dir
         self.sut_options = sut_options
+        self.tag = tag
         self.pipeline_segments = []
+        self.start_time = datetime.datetime.now()
+        self.finish_time = None
 
         self._initialize_segments()
 
@@ -44,13 +52,46 @@ def num_total_items(self):
         """Total number of items to process."""
         pass
 
+    def metadata(self):
+        duration = self.finish_time - self.start_time
+        hours, minutes, seconds = str(duration).split(":")
+        duration_string = f"{hours}h{minutes}m{seconds}s"
+
+        metadata = {
+            "run_id": self.run_id,
+            "run_info": {
+                "started": str(self.start_time),
+                "finished": str(self.finish_time),
+                "duration": duration_string,
+            },
+            "input": {
+                "source": self.input_path.name,
+                "num_items": self.num_input_items,
+            },
+        }
+        return metadata
+
+    def output_dir(self):
+        output_path = self.root_dir / self.run_id
+        if not output_path.exists():
+            logger.info(f"Creating output dir {output_path}")
+            output_path.mkdir(parents=True)
+        return output_path
+
     def run(self, progress_callback, debug):
         pipeline = Pipeline(
             *self.pipeline_segments,
             progress_callback=progress_callback,
             debug=debug,
         )
         pipeline.run()
+        self.finish_time = datetime.datetime.now()
+        logger.info(f"\noutput saved to {self.output_dir() / self.output_file_name}")
+        self._write_metadata()
+
+    @staticmethod
+    def format_date(date):
+        return date.strftime("%Y%m%d-%H%M%S")
 
     @abstractmethod
     def _initialize_segments(self):
```
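The duration string leans on `timedelta`'s default rendering, which is `H:MM:SS[.ffffff]` for durations under a day; a worked example of the formatting above:

```python
# str(timedelta) renders as "H:MM:SS[.ffffff]" (for durations under 24 hours),
# so splitting on ":" yields the hours, minutes, and seconds fields.
import datetime

duration = datetime.timedelta(minutes=1, seconds=23, microseconds=456789)
hours, minutes, seconds = str(duration).split(":")
print(f"{hours}h{minutes}m{seconds}s")  # 0h01m23.456789s
```

The next hunks route all sink output through `output_dir()` and add the per-run metadata helpers: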
```diff
@@ -62,7 +103,7 @@ def _add_prompt_segments(self, suts, include_sink=True):
         self.pipeline_segments.append(PromptSutAssigner(suts))
         self.pipeline_segments.append(PromptSutWorkers(suts, self.num_workers, cache_path=self.cache_dir))
         if include_sink:
-            output = CsvPromptOutput(self.output_path, suts)
+            output = CsvPromptOutput(self.output_dir() / self.output_file_name, suts)
             self.pipeline_segments.append(PromptSink(suts, output))
 
     def _add_annotator_segments(self, annotators, include_source=True):
@@ -71,9 +112,45 @@
             self.pipeline_segments.append(AnnotatorSource(input))
         self.pipeline_segments.append(AnnotatorAssigner(annotators))
         self.pipeline_segments.append(AnnotatorWorkers(annotators, self.num_workers))
-        output = JsonlAnnotatorOutput(self.output_path)
+        output = JsonlAnnotatorOutput(self.output_dir() / self.output_file_name)
         self.pipeline_segments.append(AnnotatorSink(annotators, output))
 
+    def _annotator_metadata(self):
+        counts = self.pipeline_segments[-1].annotation_counts
+        return {
+            "annotators": [
+                {
+                    "uid": uid,
+                }
+                for uid, annotator in self.annotators.items()
+            ],
+            "annotations": {
+                "count": sum(counts.values()),
+                "by_annotator": {uid: {"count": count} for uid, count in counts.items()},
+            },
+        }
+
+    def _sut_metadata(self):
+        counts = self.pipeline_segments[-1].sut_response_counts
+        return {
+            "suts": [
+                {
+                    "uid": uid,
+                    "initialization_record": sut.initialization_record.model_dump(),
+                    "sut_options": self.sut_options.model_dump(exclude_none=True),
+                }
+                for uid, sut in self.suts.items()
+            ],
+            "responses": {
+                "count": sum(counts.values()),
+                "by_sut": {uid: {"count": count} for uid, count in counts.items()},
+            },
+        }
+
+    def _write_metadata(self):
+        with open(self.output_dir() / "metadata.json", "w") as f:
+            json.dump(self.metadata(), f, indent=4)
+
 
 class PromptRunner(PipelineRunner):
     def __init__(self, *args, suts):
```
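With these helpers, every run gets its own directory under the output root, holding the pipeline output file plus `metadata.json`. A sketch of the resulting layout, with a hypothetical `run_id` (its composition is defined per runner in the hunks below):

```python
# Sketch of where a run's artifacts land. output_dir() creates
# <root>/<run_id>/ on first use; _write_metadata() adds metadata.json.
import pathlib

root_dir = pathlib.Path("airr_data/runs")          # CLI default
run_id = "20240101-120000-smoke-demo_sut"          # hypothetical
print(root_dir / run_id / "prompt-responses.csv")  # PromptRunner output
print(root_dir / run_id / "metadata.json")         # run metadata
```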
```diff
@@ -84,6 +161,19 @@ def __init__(self, *args, suts):
     def num_total_items(self):
         return self.num_input_items * len(self.suts)
 
+    @property
+    def output_file_name(self):
+        return "prompt-responses.csv"
+
+    @property
+    def run_id(self):
+        timestamp = self.format_date(self.start_time)
+        base_subdir_name = timestamp + "-" + self.tag if self.tag else timestamp
+        return f"{base_subdir_name}-{'-'.join(self.suts.keys())}"
+
+    def metadata(self):
+        return {**super().metadata(), **self._sut_metadata()}
+
     def _initialize_segments(self):
         self._add_prompt_segments(self.suts, include_sink=True)
 
@@ -98,6 +188,19 @@ def __init__(self, *args, suts, annotators):
     def num_total_items(self):
         return self.num_input_items * len(self.suts) * len(self.annotators)
 
+    @property
+    def output_file_name(self):
+        return "prompt-responses-annotated.jsonl"
+
+    @property
+    def run_id(self):
+        timestamp = self.format_date(self.start_time)
+        base_subdir_name = timestamp + "-" + self.tag if self.tag else timestamp
+        return f"{base_subdir_name}-{'-'.join(self.suts.keys())}-{'-'.join(self.annotators.keys())}"
+
+    def metadata(self):
+        return {**super().metadata(), **self._sut_metadata(), **self._annotator_metadata()}
+
     def _initialize_segments(self):
         # Hybrid pipeline: prompt source + annotator sink
         self._add_prompt_segments(self.suts, include_sink=False)
@@ -113,5 +216,18 @@ def __init__(self, *args, annotators):
     def num_total_items(self):
         return self.num_input_items * len(self.annotators)
 
+    @property
+    def output_file_name(self):
+        return "annotations.jsonl"
+
+    @property
+    def run_id(self):
+        timestamp = self.format_date(self.start_time)
+        base_subdir_name = timestamp + "-" + self.tag if self.tag else timestamp
+        return f"{base_subdir_name}-{'-'.join(self.annotators.keys())}"
+
+    def metadata(self):
+        return {**super().metadata(), **self._annotator_metadata()}
+
     def _initialize_segments(self):
         self._add_annotator_segments(self.annotators, include_source=True)
```
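Putting it together, a `PromptPlusAnnotatorRunner` run should write a `metadata.json` shaped roughly like the sketch below; UIDs, timestamps, and counts are hypothetical, and the `initialization_record`/`sut_options` values come from the respective `model_dump()` calls:

```python
# Hypothetical metadata.json contents for one SUT and one annotator over a
# 100-row input file; all concrete values here are illustrative only.
example_metadata = {
    "run_id": "20240101-120000-smoke-demo_sut-demo_annotator",
    "run_info": {
        "started": "2024-01-01 12:00:00.000000",
        "finished": "2024-01-01 12:01:23.456789",
        "duration": "0h01m23.456789s",
    },
    "input": {"source": "prompts.csv", "num_items": 100},
    "suts": [
        {
            "uid": "demo_sut",
            "initialization_record": {},         # sut.initialization_record.model_dump()
            "sut_options": {"max_tokens": 100},  # sut_options.model_dump(exclude_none=True)
        }
    ],
    "responses": {"count": 100, "by_sut": {"demo_sut": {"count": 100}}},
    "annotators": [{"uid": "demo_annotator"}],
    "annotations": {"count": 100, "by_annotator": {"demo_annotator": {"count": 100}}},
}
```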
