Skip to content

Commit 76b0c07

Browse files
committed
s
1 parent 39467b5 commit 76b0c07

9 files changed

Lines changed: 31 additions & 28 deletions

File tree

benchmark/compare_compiled_uncompiled.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""
2-
Check whether compiled and uncompiled SentenceTransformer embeddings can be mixed in the prod vector DB.
2+
Verify that compiled and uncompiled SentenceTransformer embeddings can be arbitrarily mixed in the prod vector DB. With
3+
v2.1 this is almost certainly fine because its distances live on a wider scale.
34
45
For each test pair we have incoming query embeddings and candidate DB embeddings from a compiled model (C) and from the
56
uncompiled model (U). Mid-migration, any of these four sims could be the prod similarity op:
@@ -36,7 +37,7 @@
3637

3738

3839
def _sync_gcs(gcs_dir: str, dir_local: Path) -> None:
39-
"""Sync a GCS similarities directory to a local cache. Mirrors eval/compare.py's `_sync_gcs` layout."""
40+
"""Sync a GCS similarities directory to a local cache."""
4041
dir_local.mkdir(parents=True, exist_ok=True)
4142
logger.info(f"Syncing {gcs_dir} -> {dir_local}")
4243
subprocess.run(["gcloud", "storage", "rsync", "-r", gcs_dir, str(dir_local)], check=True)
@@ -67,7 +68,7 @@ def _percentiles(x: np.ndarray) -> dict[str, float]:
6768

6869

6970
def _df_to_markdown(df: pl.DataFrame) -> str:
70-
"""Render a Polars DataFrame as a GitHub-flavored markdown table. Copied from eval/compare.py."""
71+
"""Render a Polars DataFrame as a GitHub-flavored markdown table."""
7172
with pl.Config(
7273
tbl_formatting="MARKDOWN",
7374
tbl_hide_column_data_types=True,

eval/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Eval scripts
22

3-
Typical flow:
3+
Typical steps:
44

55
1. If a training run did well enough on validation to warrant an evaluation on the test set, run
66
[`./save_embeddings.py`](./save_embeddings.py) to launch a GPU to save embeddings and similarities for that model in
@@ -15,7 +15,7 @@ Typical flow:
1515

1616
Consider tuning the token buckets for a compiled model using [`../benchmark`](../benchmark/).
1717

18-
2. Run [`./compare.py`](./compare.py) to compare the model to another model on the test set.
18+
2. Run `eval.compare` to compare the model to another model on the test set.
1919

2020
```bash
2121
python -m eval.compare \

eval/compare/metrics.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import polars as pl
99

10-
from .report import emit
10+
from .report import emit, emit_table
1111

1212

1313
@dataclass
@@ -373,8 +373,7 @@ def compute_stacktrace_token_percentiles(df: pl.DataFrame) -> pl.DataFrame:
373373

374374
emit(f"\n### Stacktrace token percentiles ({len(df)} pairs)\n")
375375
emit("(Using len(stacktrace) // 4 as token approximation)")
376-
with pl.Config(tbl_rows=-1, tbl_cols=-1):
377-
emit(result)
376+
emit_table(result)
378377

379378
return result
380379

@@ -405,8 +404,7 @@ def sweep_thresholds(
405404

406405
result = pl.DataFrame(rows).with_columns(pl.col(pl.Float64).round(2))
407406
emit(f"\n### Threshold sweep for {model_name}\n")
408-
with pl.Config(tbl_rows=-1, tbl_cols=-1):
409-
emit(result)
407+
emit_table(result)
410408
return result
411409

412410

@@ -504,8 +502,7 @@ def sweep_thresholds_by_project(
504502
)
505503
by_platform = pl.DataFrame(rows_by_platform).sort("platform").with_columns(pl.col(pl.Float64).round(2))
506504
emit(f"\n### Per-project precision_GROUP: platform-specific vs {baseline_key} by platform\n")
507-
with pl.Config(tbl_rows=-1, tbl_cols=-1):
508-
emit(by_platform)
505+
emit_table(by_platform)
509506

510507

511508
def metrics_by_platform(
@@ -547,8 +544,7 @@ def metrics_by_platform(
547544

548545
threshold_label = "platform-specific" if isinstance(threshold, dict) else f"threshold={threshold}"
549546
emit(f"\n### Metrics by platform, avg over projects ({model_name}, {threshold_label})\n")
550-
with pl.Config(tbl_rows=-1, tbl_cols=-1):
551-
emit(result)
547+
emit_table(result)
552548

553549
return result
554550

@@ -638,8 +634,7 @@ def find_threshold_by_platform(
638634

639635
precision_label = "per-platform" if precision_by_platform else f"{min_precision:.0%}"
640636
emit(f"\n### Min threshold for >= {precision_label} avg project precision_GROUP by platform ({model_name})\n")
641-
with pl.Config(tbl_rows=-1, tbl_cols=-1):
642-
emit(result)
637+
emit_table(result)
643638

644639
return result
645640

eval/compare/report.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,14 @@ def emit(*args, **print_kwargs):
4545
_report_lines.append(" ".join(parts))
4646

4747

48+
def emit_table(df: pl.DataFrame) -> None:
49+
"""Emit a DataFrame untruncated. The scoped pl.Config affects the console-print side of `emit()`
50+
(`emit` itself routes DataFrames through `_df_to_markdown`, which already disables truncation).
51+
"""
52+
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=1000):
53+
emit(df)
54+
55+
4856
def emit_plot(title: str, filename: str) -> None:
4957
"""Buffer a collapsible image embed for the markdown report."""
5058
emit(f"\n<details>\n<summary>{title}</summary>\n\n![{title}]({filename})\n</details>\n")

eval/compare/sheets.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from google.auth import default as google_auth_default
1313
from tqdm.auto import tqdm
1414

15-
from .report import emit
15+
from .report import emit, emit_table
1616

1717

1818
def stratify_round_robin(df: pl.DataFrame, group_name: str, target_num_rows: int) -> pl.DataFrame:
@@ -67,8 +67,7 @@ def print_projects(
6767

6868
stratify_msg = f" (stratified by {stratify_by})" if stratify_by else ""
6969
emit(f"\n### {description}{stratify_msg}\n")
70-
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=1000):
71-
emit(_projects_to_display_df(projects))
70+
emit_table(_projects_to_display_df(projects))
7271

7372
return projects
7473

eval/export_for_db.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def _load_model_dir(similarities_dir: str) -> tuple[pl.DataFrame, np.ndarray]:
5050
def _sync_gcs_dir(gcs_dir: str) -> Path:
5151
"""Sync a GCS similarities dir to eval/similarities/{run}/{dataset}/ and return the local dir.
5252
53-
Mirrors compare.py: expects ``gs://bucket/runs/{run}/similarities/{dataset}``.
53+
Expects ``gs://bucket/runs/{run}/similarities/{dataset}``.
5454
"""
5555
gcs_dir = gcs_dir.rstrip("/")
5656
parts = gcs_dir.split("/")

eval/save_bm25_scores.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,19 @@
33
44
Score test pairs with per-project BM25 and save similarities to GCS, mirroring eval/save_embeddings.py.
55
6-
Output schema is a drop-in `gcs_model2` for eval/compare.py: a `similarities.csv` with a single `cos_sim_1` column
6+
Output schema is a drop-in `gcs_model2` for `eval.compare`: a `similarities.csv` with a single `cos_sim_1` column
77
holding the symmetrized BM25 score for each pair. The "1" is a placeholder — BM25 has no MRL dim concept; pass
8-
`--dim_model2 1` to compare.py.
8+
`--dim_model2 1` to `eval.compare`.
99
1010
IDF is computed per-project to mirror prod's per-project vector DB lookup. Each pair's score is averaged across both
1111
orderings (query→candidate and candidate→query) so the result is independent of which side was the "query."
1212
13-
Caveats on what to expect from compare.py on this baseline:
13+
Caveats on what to expect from `eval.compare` on this baseline:
1414
1515
- BM25 looked weak on a 3000-pair smoke test (median per-project AUC ~0.64; per-platform AUC < 0.5 for go, php,
1616
csharp). The boundary-sampled data over-represents pairs where lexical similarity disagrees with the GROUP/SEPARATE
1717
label (e.g., templated Go errors like ``failed to generate unique username: name=<varies>`` — same semantics, very
18-
different wording). Per-platform thresholds in compare.py can absorb scale differences but not signal inversion.
18+
different wording). Per-platform thresholds in `eval.compare` can absorb scale differences but not signal inversion.
1919
- Switching to per-platform or global IDF moved global AUC by ~0.02 — within noise; not worth the memory cost at full
2020
scale (a global N x N score matrix is GB-sized).
2121
- camelCase/code-aware tokenization, BM25 b-parameter tuning, and key=value masking all left AUC essentially
@@ -104,7 +104,7 @@ def main(
104104
----------
105105
run_gcs_dir
106106
GCS path under which to write ``similarities/{name_dataset}/``. Mirrors save_embeddings.py's layout so the
107-
result is a drop-in ``gcs_model2`` for eval/compare.py. Example:
107+
result is a drop-in ``gcs_model2`` for `eval.compare`. Example:
108108
``gs://$GROUPING_TRAINER_BUCKET/runs/bm25``
109109
df_path
110110
Path to the validation/test CSV file.

eval/save_embeddings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
Download a model from GCS, encode test data, save embeddings and similarities, and upload results to GCS.
33
4-
After running this script for some `run_gcs_dir`, you can use it as a gcs_model1/gcs_model2 in eval/compare.py
4+
After running this script for some `run_gcs_dir`, you can use it as a gcs_model1/gcs_model2 in `eval.compare`.
55
66
For example to evaluate the baseline/prod model:
77

eval/save_gemini_embeddings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
Encode test data with Vertex Gemini's `gemini-embedding-2`, save embeddings and similarities, and upload to GCS.
33
4-
Output schema mirrors eval/save_embeddings.py so the resulting GCS dir is a drop-in `gcs_model2` for eval/compare.py.
4+
Output schema mirrors eval/save_embeddings.py so the resulting GCS dir is a drop-in `gcs_model2` for `eval.compare`.
55
66
Full run:
77
@@ -87,7 +87,7 @@ def main(
8787
----------
8888
run_gcs_dir
8989
GCS path under which to write `similarities/{name_dataset}/`. Mirrors save_embeddings.py's layout so the
90-
result is a drop-in `gcs_model2` for eval/compare.py. Example:
90+
result is a drop-in `gcs_model2` for `eval.compare`. Example:
9191
gs://$GROUPING_TRAINER_BUCKET/runs/gemini-embedding-2
9292
model
9393
Vertex model name. Default `gemini-embedding-2`.

0 commit comments

Comments
 (0)