-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsave_bm25_scores.py
More file actions
149 lines (117 loc) · 6.75 KB
/
save_bm25_scores.py
File metadata and controls
149 lines (117 loc) · 6.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
NOTE: this script is vibe-coded.
Score test pairs with per-project BM25 and save similarities to GCS, mirroring eval/save_embeddings.py.
Output schema is a drop-in `gcs_model2` for `eval.compare`: a `similarities.csv` with a single `cos_sim_1` column
holding the symmetrized BM25 score for each pair. The "1" is a placeholder — BM25 has no MRL dim concept; pass
`--dim_model2 1` to `eval.compare`.
IDF is computed per-project to mirror prod's per-project vector DB lookup. Each pair's score is averaged across both
orderings (query→candidate and candidate→query) so the result is independent of which side was the "query."
Caveats on what to expect from `eval.compare` on this baseline:
- BM25 looked weak on a 3000-pair smoke test (median per-project AUC ~0.64; per-platform AUC < 0.5 for go, php,
csharp). The boundary-sampled data over-represents pairs where lexical similarity disagrees with the GROUP/SEPARATE
label (e.g., templated Go errors like ``failed to generate unique username: name=<varies>`` — same semantic, very
different lexical). Per-platform thresholds in `eval.compare` can absorb scale differences but not signal inversion.
- Switching to per-platform or global IDF moved global AUC by ~0.02 — within noise; not worth the memory cost at full
scale (a global N x N score matrix is GB-sized).
- camelCase/code-aware tokenization, BM25 b-parameter tuning, and key=value masking all left AUC essentially
unchanged. Don't bother engineering these unless you have a specific failure case in mind.
Install the optional dep:
uv sync --extra eval-bm25
Full run:
python eval/save_bm25_scores.py \
--run_gcs_dir gs://$GROUPING_TRAINER_BUCKET/runs/bm25
BM25 scores are not in [0, 1], so the right threshold needs to be swept rather than copied from a cos_sim model:
python -m eval.compare \
--name_model1 v1 \
--name_model2 bm25 \
--gcs_model1 gs://$GROUPING_TRAINER_BUCKET/runs/issue_grouping_v1/similarities/test_full3 \
--gcs_model2 gs://$GROUPING_TRAINER_BUCKET/runs/bm25/similarities/test_full3 \
--threshold_model1 0.99 \
--threshold_model2 <swept-value> \
--dim_model2 1
"""
import logging
import os.path
import subprocess
import tempfile
import time
import bm25s
import numpy as np
import polars as pl
from tap import tapify
from tqdm.auto import tqdm
import grouping_trainer as gt
logger = logging.getLogger(__name__)
def _score_project(texts_query: list[str], texts_candidate: list[str]) -> np.ndarray:
    """Score each (query, candidate) pair within a single project using symmetrized BM25.

    The corpus is the set of unique stacktraces appearing on either side of any pair in the project. IDF and average
    document length are derived from that corpus. Returned score for pair i is
    ``0.5 * (BM25(query_i, candidate_i) + BM25(candidate_i, query_i))`` — averaging both orderings makes the score
    order-independent, which matches how this dataset's pairs are unordered.

    Parameters
    ----------
    texts_query
        Stacktrace strings on the "query" side of each pair.
    texts_candidate
        Stacktrace strings on the "candidate" side of each pair; must be the same length as ``texts_query``.

    Returns
    -------
    np.ndarray
        1-D float32 array with one symmetrized BM25 score per pair, in input order. Empty input yields an empty array.

    Raises
    ------
    ValueError
        If ``texts_query`` and ``texts_candidate`` have different lengths.
    """
    num_pairs = len(texts_query)
    # Fail fast on misaligned inputs instead of discovering it after the (expensive) NxN retrieval below.
    if num_pairs != len(texts_candidate):
        raise ValueError(
            f"texts_query and texts_candidate must have the same length, got {num_pairs} and {len(texts_candidate)}"
        )
    # Nothing to score; also avoids asking bm25s to index an empty corpus.
    if num_pairs == 0:
        return np.empty(0, dtype=np.float32)

    # Sorting makes the text -> index mapping deterministic across runs.
    texts_unique = sorted({*texts_query, *texts_candidate})
    idx_by_text = {text: idx for idx, text in enumerate(texts_unique)}
    num_unique = len(texts_unique)

    # stopwords=[] disables stopword removal (default is English stopwords, which is wrong for stacktraces — frames
    # contain identifiers like "in", "as", "do" that we want to keep). stemmer defaults to None.
    tokens_corpus = bm25s.tokenize(texts_unique, stopwords=[], show_progress=False)
    retriever = bm25s.BM25()
    retriever.index(tokens_corpus, show_progress=False)

    # Retrieve every doc for every query to materialize the full NxN score matrix. Memory is num_unique^2 floats per
    # project; this is fine while per-project unique-stacktrace counts stay in the low thousands. If a future project
    # blows past that, switch to chunked retrieval here.
    docs_ranked, scores_ranked = retriever.retrieve(
        tokens_corpus, k=num_unique, show_progress=False, return_as="tuple"
    )

    # Results come back rank-ordered per query; scatter them so scores_dense[q, d] is BM25(query=q, doc=d).
    scores_dense = np.zeros((num_unique, num_unique), dtype=np.float32)
    for row_idx, (docs_row, scores_row) in enumerate(zip(docs_ranked, scores_ranked, strict=True)):
        scores_dense[row_idx, docs_row] = scores_row

    # Look up both orderings for every pair in one vectorized step instead of a per-pair Python loop.
    idx_query = np.fromiter((idx_by_text[text] for text in texts_query), dtype=np.intp, count=num_pairs)
    idx_candidate = np.fromiter((idx_by_text[text] for text in texts_candidate), dtype=np.intp, count=num_pairs)
    scores_pair = np.float32(0.5) * (scores_dense[idx_query, idx_candidate] + scores_dense[idx_candidate, idx_query])
    return scores_pair
def main(
    run_gcs_dir: str,
    df_path: str = "final_csvs/test_full3.csv",
    sample_size: int | None = None,
):
    """Compute per-project BM25 similarities for the test pairs and upload them to GCS.

    Parameters
    ----------
    run_gcs_dir
        GCS directory that will receive ``similarities/{name_dataset}/``. The layout follows save_embeddings.py so
        the output can be passed as ``gcs_model2`` to `eval.compare`. Example:
        ``gs://$GROUPING_TRAINER_BUCKET/runs/bm25``
    df_path
        Path to the validation/test CSV file; its basename (without extension) names the output subdirectory.
    sample_size
        Number of rows to sample. None (default) uses the full dataset.
    """
    gt.logging.configure_logging(process_type="save_bm25_scores")
    # Silence bm25s's own info-level logging.
    logging.getLogger("bm25s").setLevel(logging.WARNING)

    gcs_root = run_gcs_dir.rstrip("/")
    dataset_stem, _ = os.path.splitext(os.path.basename(df_path))
    dir_gcs_output = f"{gcs_root}/similarities/{dataset_stem}"

    df_pairs = gt.data.load_val_df(paths=(df_path,), sample_size=sample_size)
    logger.info(f"Test df shape: {df_pairs.shape}")

    start = time.monotonic()
    # Tag each row with its original position so per-project scores can be written back in input order,
    # regardless of the order in which group_by yields the projects.
    df_tagged = df_pairs.with_row_index("_pair_idx")
    project_groups = df_tagged.group_by("project_id")
    project_count = df_tagged["project_id"].n_unique()

    scores_ordered = np.empty(len(df_pairs), dtype=np.float32)
    for _, rows in tqdm(project_groups, desc="Scoring projects", total=project_count):
        queries = rows["query_stacktrace_string"].to_list()
        candidates = rows["candidate_stacktrace_string"].to_list()
        positions = rows["_pair_idx"].to_list()
        # Scatter this project's scores into their original row positions.
        scores_ordered[positions] = _score_project(queries, candidates)
    logger.info(f"Scored {len(df_pairs)} pairs in {time.monotonic() - start:.1f}s")

    # "cos_sim_1" is the column name eval.compare expects; the values are BM25 scores, not cosine similarities.
    column_scores = pl.Series(name="cos_sim_1", values=scores_ordered)
    df_out = df_pairs.with_columns(column_scores)

    with tempfile.TemporaryDirectory() as dir_tmp_output:
        df_out.write_csv(f"{dir_tmp_output}/similarities.csv")
        logger.info(f"Uploading to {dir_gcs_output}...")
        rsync_cmd = ["gcloud", "storage", "rsync", "-r", dir_tmp_output, dir_gcs_output]
        subprocess.run(rsync_cmd, check=True)
        logger.info(f"Uploaded to {dir_gcs_output}")
# Script entry point: only runs when the file is executed directly, not when it is imported.
# tapify hands main's parameters over to the command line (e.g. ``--run_gcs_dir``, as in the module docstring).
if __name__ == "__main__":
    tapify(main)