s

kddubey · kddubey · commit 76b0c075988d · 2026-05-19T15:40:29.000-07:00
diff --git a/benchmark/compare_compiled_uncompiled.py b/benchmark/compare_compiled_uncompiled.py
@@ -1,5 +1,6 @@
 """
-Check whether compiled and uncompiled SentenceTransformer embeddings can be mixed in the prod vector DB.
+Verify that compiled and uncompiled SentenceTransformer embeddings can be arbitrarily mixed in the prod vector DB. With
+v2.1 this is almost certainly fine because its distances live on a wider scale.
 
 For each test pair we have incoming query embeddings and candidate DB embeddings from a compiled model (C) and from the
 uncompiled model (U). Mid-migration, any of these four sims could be the prod similarity op:
@@ -36,7 +37,7 @@
 
 
 def _sync_gcs(gcs_dir: str, dir_local: Path) -> None:
-    """Sync a GCS similarities directory to a local cache. Mirrors eval/compare.py's `_sync_gcs` layout."""
+    """Sync a GCS similarities directory to a local cache."""
     dir_local.mkdir(parents=True, exist_ok=True)
     logger.info(f"Syncing {gcs_dir} -> {dir_local}")
     subprocess.run(["gcloud", "storage", "rsync", "-r", gcs_dir, str(dir_local)], check=True)
@@ -67,7 +68,7 @@ def _percentiles(x: np.ndarray) -> dict[str, float]:
 
 
 def _df_to_markdown(df: pl.DataFrame) -> str:
-    """Render a Polars DataFrame as a GitHub-flavored markdown table. Copied from eval/compare.py."""
+    """Render a Polars DataFrame as a GitHub-flavored markdown table."""
     with pl.Config(
         tbl_formatting="MARKDOWN",
         tbl_hide_column_data_types=True,
diff --git a/eval/README.md b/eval/README.md
@@ -1,6 +1,6 @@
 # Eval scripts
 
-Typical flow:
+Typical steps:
 
 1. If a training run did well enough on validation to warrant an evaluation on the test set, run
    [`./save_embeddings.py`](./save_embeddings.py) to launch a GPU to save embeddings and similarities for that model in
@@ -15,7 +15,7 @@ Typical flow:
 
    Consider tuning the token buckets for a compiled model using [`../benchmark`](../benchmark/).
 
-2. Run [`./compare.py`](./compare.py) to compare the model to another model on the test set.
+2. Run `eval.compare` to compare the model to another model on the test set.
 
    ```bash
    python -m eval.compare \
diff --git a/eval/compare/metrics.py b/eval/compare/metrics.py
@@ -7,7 +7,7 @@
 
 import polars as pl
 
-from .report import emit
+from .report import emit, emit_table
 
 
 @dataclass
@@ -373,8 +373,7 @@ def compute_stacktrace_token_percentiles(df: pl.DataFrame) -> pl.DataFrame:
 
     emit(f"\n### Stacktrace token percentiles ({len(df)} pairs)\n")
     emit("(Using len(stacktrace) // 4 as token approximation)")
-    with pl.Config(tbl_rows=-1, tbl_cols=-1):
-        emit(result)
+    emit_table(result)
 
     return result
 
@@ -405,8 +404,7 @@ def sweep_thresholds(
 
     result = pl.DataFrame(rows).with_columns(pl.col(pl.Float64).round(2))
     emit(f"\n### Threshold sweep for {model_name}\n")
-    with pl.Config(tbl_rows=-1, tbl_cols=-1):
-        emit(result)
+    emit_table(result)
     return result
 
 
@@ -504,8 +502,7 @@ def sweep_thresholds_by_project(
             )
         by_platform = pl.DataFrame(rows_by_platform).sort("platform").with_columns(pl.col(pl.Float64).round(2))
         emit(f"\n### Per-project precision_GROUP: platform-specific vs {baseline_key} by platform\n")
-        with pl.Config(tbl_rows=-1, tbl_cols=-1):
-            emit(by_platform)
+        emit_table(by_platform)
 
 
 def metrics_by_platform(
@@ -547,8 +544,7 @@ def metrics_by_platform(
 
     threshold_label = "platform-specific" if isinstance(threshold, dict) else f"threshold={threshold}"
     emit(f"\n### Metrics by platform, avg over projects ({model_name}, {threshold_label})\n")
-    with pl.Config(tbl_rows=-1, tbl_cols=-1):
-        emit(result)
+    emit_table(result)
 
     return result
 
@@ -638,8 +634,7 @@ def find_threshold_by_platform(
 
     precision_label = "per-platform" if precision_by_platform else f"{min_precision:.0%}"
     emit(f"\n### Min threshold for >= {precision_label} avg project precision_GROUP by platform ({model_name})\n")
-    with pl.Config(tbl_rows=-1, tbl_cols=-1):
-        emit(result)
+    emit_table(result)
 
     return result
 
diff --git a/eval/compare/report.py b/eval/compare/report.py
@@ -45,6 +45,14 @@ def emit(*args, **print_kwargs):
     _report_lines.append(" ".join(parts))
 
 
+def emit_table(df: pl.DataFrame) -> None:
+    """Emit a DataFrame untruncated. The scoped pl.Config only affects the console-print side of `emit()`;
+    `emit` itself routes DataFrames through `_df_to_markdown`, which already disables truncation.
+    """
+    with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=1000):
+        emit(df)
+
+
 def emit_plot(title: str, filename: str) -> None:
     """Buffer a collapsible image embed for the markdown report."""
     emit(f"\n<details>\n<summary>{title}</summary>\n\n![{title}]({filename})\n</details>\n")
diff --git a/eval/compare/sheets.py b/eval/compare/sheets.py
@@ -12,7 +12,7 @@
 from google.auth import default as google_auth_default
 from tqdm.auto import tqdm
 
-from .report import emit
+from .report import emit, emit_table
 
 
 def stratify_round_robin(df: pl.DataFrame, group_name: str, target_num_rows: int) -> pl.DataFrame:
@@ -67,8 +67,7 @@ def print_projects(
 
     stratify_msg = f" (stratified by {stratify_by})" if stratify_by else ""
     emit(f"\n### {description}{stratify_msg}\n")
-    with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=1000):
-        emit(_projects_to_display_df(projects))
+    emit_table(_projects_to_display_df(projects))
 
     return projects
 
diff --git a/eval/export_for_db.py b/eval/export_for_db.py
@@ -50,7 +50,7 @@ def _load_model_dir(similarities_dir: str) -> tuple[pl.DataFrame, np.ndarray]:
 def _sync_gcs_dir(gcs_dir: str) -> Path:
     """Sync a GCS similarities dir to eval/similarities/{run}/{dataset}/ and return the local dir.
 
-    Mirrors compare.py: expects ``gs://bucket/runs/{run}/similarities/{dataset}``.
+    Expects ``gs://bucket/runs/{run}/similarities/{dataset}``.
     """
     gcs_dir = gcs_dir.rstrip("/")
     parts = gcs_dir.split("/")
diff --git a/eval/save_bm25_scores.py b/eval/save_bm25_scores.py
@@ -3,19 +3,19 @@
 
 Score test pairs with per-project BM25 and save similarities to GCS, mirroring eval/save_embeddings.py.
 
-Output schema is a drop-in `gcs_model2` for eval/compare.py: a `similarities.csv` with a single `cos_sim_1` column
+Output schema is a drop-in `gcs_model2` for `eval.compare`: a `similarities.csv` with a single `cos_sim_1` column
 holding the symmetrized BM25 score for each pair. The "1" is a placeholder — BM25 has no MRL dim concept; pass
-`--dim_model2 1` to compare.py.
+`--dim_model2 1` to `eval.compare`.
 
 IDF is computed per-project to mirror prod's per-project vector DB lookup. Each pair's score is averaged across both
 orderings (query→candidate and candidate→query) so the result is independent of which side was the "query."
 
-Caveats on what to expect from compare.py on this baseline:
+Caveats on what to expect from `eval.compare` on this baseline:
 
 - BM25 looked weak on a 3000-pair smoke test (median per-project AUC ~0.64; per-platform AUC < 0.5 for go, php,
   csharp). The boundary-sampled data over-represents pairs where lexical similarity disagrees with the GROUP/SEPARATE
   label (e.g., templated Go errors like ``failed to generate unique username: name=<varies>`` — same semantic, very
-  different lexical). Per-platform thresholds in compare.py can absorb scale differences but not signal inversion.
+  different lexical). Per-platform thresholds in `eval.compare` can absorb scale differences but not signal inversion.
 - Switching to per-platform or global IDF moved global AUC by ~0.02 — within noise; not worth the memory cost at full
   scale (a global N x N score matrix is GB-sized).
 - camelCase/code-aware tokenization, BM25 b-parameter tuning, and key=value masking all left AUC essentially
@@ -104,7 +104,7 @@ def main(
     ----------
     run_gcs_dir
         GCS path under which to write ``similarities/{name_dataset}/``. Mirrors save_embeddings.py's layout so the
-        result is a drop-in ``gcs_model2`` for eval/compare.py. Example:
+        result is a drop-in ``gcs_model2`` for `eval.compare`. Example:
         ``gs://$GROUPING_TRAINER_BUCKET/runs/bm25``
     df_path
         Path to the validation/test CSV file.
diff --git a/eval/save_embeddings.py b/eval/save_embeddings.py
@@ -1,7 +1,7 @@
 """
 Download a model from GCS, encode test data, save embeddings and similarities, and upload results to GCS.
 
-After running this script for some `run_gcs_dir`, you can use it as a gcs_model1/gcs_model2 in eval/compare.py
+After running this script for some `run_gcs_dir`, you can use it as a `gcs_model1`/`gcs_model2` in `eval.compare`.
 
 For example to evaluate the baseline/prod model:
 
diff --git a/eval/save_gemini_embeddings.py b/eval/save_gemini_embeddings.py
@@ -1,7 +1,7 @@
 """
 Encode test data with Vertex Gemini's `gemini-embedding-2`, save embeddings and similarities, and upload to GCS.
 
-Output schema mirrors eval/save_embeddings.py so the resulting GCS dir is a drop-in `gcs_model2` for eval/compare.py.
+Output schema mirrors eval/save_embeddings.py so the resulting GCS dir is a drop-in `gcs_model2` for `eval.compare`.
 
 Full run:
 
@@ -87,7 +87,7 @@ def main(
     ----------
     run_gcs_dir
         GCS path under which to write `similarities/{name_dataset}/`. Mirrors save_embeddings.py's layout so the
-        result is a drop-in `gcs_model2` for eval/compare.py. Example:
+        result is a drop-in `gcs_model2` for `eval.compare`. Example:
         gs://$GROUPING_TRAINER_BUCKET/runs/gemini-embedding-2
     model
         Vertex model name. Default `gemini-embedding-2`.