awslabs
diff --git a/keys_values/evaluation/evaluator.py b/keys_values/evaluation/evaluator.py
Lines changed: 8 additions & 3 deletions
diff --git a/keys_values/evaluation/tasks.py b/keys_values/evaluation/tasks.py
Lines changed: 35 additions & 18 deletions
diff --git a/keys_values/finetune/longcontext_eval.py b/keys_values/finetune/longcontext_eval.py
Lines changed: 13 additions & 0 deletions
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, Optional, List, Union
+from typing import Any, Dict, Optional, List, Union, Tuple
 
 import torch
 
@@ -129,7 +129,8 @@ def __call__(
         model: LongContextInferenceModel,
         prompts: torch.Tensor,
         targets: List[TargetType],
-    ) -> Dict[str, torch.Tensor]:
+        return_samples: bool = False,
+    ) -> Tuple[Dict[str, torch.Tensor], Optional[List[str]]]:
         """
         Computes metric values for data case `(input_ids, targets)`. The
         metrics to be computed are in `metrics`.
@@ -141,11 +142,15 @@ def __call__(
             targets: List of targets of length `batch_size`. Each entry is a
                 string or list of strings. Some metrics allow for lists of
                 strings, others require a single string
+            return_samples: If `True`, we also return a list of generated
+                sequences (of length `batch_size`)
 
         Returns:
             Dictionary with entries `{name: values}`, where `name in self.metrics`
             and `values.shape = (batch_size,)`, the metric values for each
             entry in the batch.
+            If `return_samples == True`, the second return value is the list
+            of generated sequences, otherwise it is `None`.
 
         """
         assert prompts.ndim == 2
@@ -186,4 +191,4 @@ def __call__(
                 device=prompts.device,
             )
             for metric in self.metrics
-        }
+        }, (outputs if return_samples else None)
@@ -14,7 +14,7 @@
 from filelock import FileLock, Timeout
 from pathlib import Path
 import re
-from typing import List, Dict, Any, Optional, Iterable, Tuple
+from typing import List, Dict, Any, Optional, Iterable, Tuple, Literal
 
 from keys_values.data.base import (
     LIT_MODEL_FNAME,
@@ -25,8 +25,6 @@
 
 EVAL_METRICS_FNAME = "eval/eval_metrics_{}.csv"
 
-EVAL_METRICS_GLOB = EVAL_METRICS_FNAME.replace("{}", "*")
-
 REGEX_TASKNAME = re.compile(r"step-[0-9]{6}|final")
 
 _REQUIRED_FILES = [
@@ -57,12 +55,17 @@ def __init__(
         model_type: str,
         tasks: Optional[List[str]] = None,
         collect_results: bool = False,
+        eval_metrics_filename: Optional[str] = None,
     ):
         if isinstance(out_dir, str):
             out_dir = Path(out_dir)
         self._out_dir = out_dir
         self.model_type = model_type
         self._tasks = tasks.copy() if tasks is not None else None
+        if eval_metrics_filename is None:
+            eval_metrics_filename = EVAL_METRICS_FNAME
+        self._eval_metrics_filename = eval_metrics_filename
+        self._eval_metrics_glob = eval_metrics_filename.replace("{}", "*")
         self._init_task_names(collect_results)
 
     def _init_task_names(self, collect_results: bool):
@@ -100,9 +103,8 @@ def _init_task_names(self, collect_results: bool):
                 elif self._num_result_files(path) == 0:
                     raise ValueError(f"{path} contains no evaluation result files")
 
-    @staticmethod
-    def _num_result_files(path: Path) -> int:
-        return len(list(path.glob(EVAL_METRICS_GLOB)))
+    def _num_result_files(self, path: Path) -> int:
+        return len(list(path.glob(self._eval_metrics_glob)))
 
     @property
     def tasks(self) -> List[str]:
@@ -127,37 +129,44 @@ def check_complete(task_path: Path, model_type: str) -> bool:
 
     def eval_result_files(
         self,
-        return_incompletes: bool = False,
+        mode: Literal["non-lock", "lock", "all"] = "non-lock",
     ) -> Iterable[Tuple[str, List[Path]]]:
         """
         Args:
-            return_incompletes: If `True`, we return the complete lock files.
-                Defaults to `False`, so lock files are filtered out.
+            mode: For "non-lock", we return complete files (not locks). For
+                "lock", we return incomplete lock files. For "all", we
+                return all files.
         Yields:
             `(task_name, result_file_paths)`, where `result_file_paths`
             is list of paths of evaluation result files for this task name.
-            These files are filtered to not contain incomplete lock files.
-            But if `return_incompletes == True`, only incomplete files are
-            returned.
+            This list is filtered depending on `mode`.
 
         """
+        choices = ("non-lock", "lock", "all")
+        if mode not in choices:
+            raise ValueError(f"Invalid mode = {mode}, must be in {choices}")
         for task_name in self._tasks:
             result_file_paths = self._filter_incomplete_files(
-                (self._out_dir / task_name).glob(EVAL_METRICS_GLOB),
-                return_incompletes=return_incompletes,
+                (self._out_dir / task_name).glob(self._eval_metrics_glob),
+                mode=mode,
             )
             if result_file_paths:
                 yield task_name, result_file_paths
 
     @staticmethod
     def _filter_incomplete_files(
         paths: Iterable[Path],
-        return_incompletes: bool = False,
+        mode: Literal["non-lock", "lock", "all"],
     ) -> List[Path]:
         result = []
+        return_all = mode == "all"
+        return_incompletes = mode == "lock"
         for path in paths:
             with path.open("r") as fp:
-                if fp.readline().startswith(FILE_LOCK_TEXT) == return_incompletes:
+                if (
+                    return_all
+                    or fp.readline().startswith(FILE_LOCK_TEXT) == return_incompletes
+                ):
                     result.append(path)
         return result
 
@@ -172,11 +181,19 @@ class EvaluationWithTasksHelper:
     dataloader we use.
     """
 
-    def __init__(self, out_dir: Path, tag: Optional[str] = None):
+    def __init__(
+        self,
+        out_dir: Path,
+        tag: Optional[str] = None,
+        eval_metrics_filename: Optional[str] = None,
+    ):
         self._out_dir = out_dir
         if tag is None:
             tag = ""
         self._tag = tag
+        if eval_metrics_filename is None:
+            eval_metrics_filename = EVAL_METRICS_FNAME
+        self._eval_metrics_filename = eval_metrics_filename
 
     def evaluation_metrics_path(self, batch: Dict[str, Any]) -> Path:
         """
@@ -197,7 +214,7 @@ def evaluation_metrics_path(self, batch: Dict[str, Any]) -> Path:
                 f"batch[{TASK_NAME}] = {task}."
             )
         suffix = self._tag + str(orig_idxs[0])
-        fname = EVAL_METRICS_FNAME.format(suffix)
+        fname = self._eval_metrics_filename.format(suffix)
         return self._out_dir / task / fname
 
     def get_lock(self, batch: Dict[str, Any]) -> Optional[Path]:
 
@@ -36,6 +36,8 @@ def setup(
     use_sample_metric: bool = True,
     sample_metric_max_generated_tokens: int = 20,
     sample_metric_kwargs: Optional[Dict[str, Any]] = None,
+    num_store_generated_samples: Optional[int] = None,
+    skip_eval: bool = False,
 ) -> None:
     """Evaluate a range of model checkpoints on a test set
 
@@ -101,6 +103,15 @@ def setup(
             for sample-based metric evaluation
         sample_metric_kwargs: Keyword arguments for token sampling (params
             can be "temperature", "top_k", "top_p")
+        num_store_generated_samples: If given and positive, we write files
+            containing the generated sequences along with SFT targets and raw
+            targets. These files are written alongside metric files, using the
+            same naming convention. They are written for the initial test set
+            batches, until `num_store_generated_samples` cases are covered
+            (rounded up to a multiple of `batch_size`). Must have
+            `use_sample_metric == True`.
+        skip_eval: If `True`, we skip evaluations and only write files related
+            to `num_store_generated_samples`.
 
     """
     entry = {
@@ -124,4 +135,6 @@ def setup(
         use_sample_metric=use_sample_metric,
         sample_metric_max_generated_tokens=sample_metric_max_generated_tokens,
         sample_metric_kwargs=sample_metric_kwargs,
+        num_store_generated_samples=num_store_generated_samples,
+        skip_eval=skip_eval,
     )