getsentry
diff --git a/‎.envrc‎
Lines changed: 11 additions & 0 deletions b/‎.envrc‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎decisions.md‎
Lines changed: 1 addition & 1 deletion b/‎decisions.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎eval/compare.py‎
Lines changed: 27 additions & 3 deletions b/‎eval/compare.py‎
Lines changed: 27 additions & 3 deletions
@@ -10,6 +10,17 @@ dotenv
 
 gcloud config set project "$GOOGLE_CLOUD_PROJECT"
 
+if ! gcloud auth print-access-token > /dev/null 2>&1; then
+    printf "gcloud credentials need refresh. Run 'gcloud auth login' now? [Y/n] " > /dev/tty
+    read -r answer < /dev/tty
+    if [ "$answer" != "n" ] && [ "$answer" != "N" ]; then
+        gcloud auth login < /dev/tty
+    else
+        echo "Aborting .envrc — gcloud auth required" > /dev/tty
+        exit 1
+    fi
+fi
+
 WANDB_API_KEY=$(gcloud secrets versions access latest --secret=wandb-api-key)
 export WANDB_API_KEY
 : "${WANDB_API_KEY:?upload to GCP Secret Manager as 'wandb-api-key' (wandb is free) — the remote startup script fetches from there too}"
@@ -45,7 +45,7 @@ way to select from the combinatorial explosion of triplets without dropping some
 relationships.
 
 In general, non-pairwise losses coerce a jagged but rich similarity structure into a rectangular one. The data already
-intentionally contains hard positives and negatives. Pairwise losses put their faith in the data and accomodate the
+intentionally contains hard positives and hard negatives. Pairwise losses put their faith in the data and accommodate the
 jagged structure by melting it into a rectangular one.
 
 Pairwise losses do have downsides. The statistical one is that we don't have many negatives per positive. To softly
 
@@ -1132,6 +1132,16 @@ def compare_metrics_by_stacktrace_length(
 COLUMNS_ANONYMIZED_DENYLIST = ("path",)
 
 
+def _parse_threshold_list(value: str | None) -> list[float] | None:
+    """Parse a comma-separated list of floats, e.g. "10,15,20,25" -> [10.0, 15.0, 20.0, 25.0].
+
+    None / empty string returns None (caller falls back to the function's default).
+    """
+    if not value:
+        return None
+    return [float(part.strip()) for part in value.split(",") if part.strip()]
+
+
 def _parse_threshold(value: str) -> float | dict[str, float]:
     """Parse a threshold CLI argument.
 
@@ -1306,6 +1316,8 @@ def _main(
     dim_model2: int = 768,
     threshold_model1: str = "0.99",
     threshold_model2: str = "0.90",
+    sweep_thresholds_model1: str | None = None,
+    sweep_thresholds_model2: str | None = None,
     min_group_rate_increase: float = 0.15,
     min_group_rate_decrease: float = 0.10,
     max_display_projects: int = 30,
@@ -1385,13 +1397,18 @@ def _main(
 
     report("\n## Threshold sweep\n")
 
-    # Threshold sweep for model2
-    sweep_thresholds(df, name_model2)
+    sweep_list_model1 = _parse_threshold_list(sweep_thresholds_model1)
+    sweep_list_model2 = _parse_threshold_list(sweep_thresholds_model2)
+    sweep_lists_by_name = {name_model1: sweep_list_model1, name_model2: sweep_list_model2}
+
+    sweep_thresholds(df, name_model1, thresholds=sweep_list_model1)
+    sweep_thresholds(df, name_model2, thresholds=sweep_list_model2)
     threshold1_parsed = thresholds[name_model1]
     threshold2_parsed = thresholds[name_model2]
     sweep_thresholds_by_project(
         df,
         name_model2,
+        thresholds=sweep_list_model2,
         thresholds_platform=threshold2_parsed if isinstance(threshold2_parsed, dict) else None,
         baseline_model=name_model1,
         baseline_threshold=threshold1_parsed,
@@ -1405,7 +1422,7 @@ def _main(
 
     # Find minimum threshold per platform for each model
     for name in [name_model1, name_model2]:
-        find_threshold_by_platform(df, name)
+        find_threshold_by_platform(df, name, thresholds=sweep_lists_by_name[name])
 
     fig = plot_metrics_by_platform(result.df, result.model_names)
     fig.savefig(dir_output / "metrics_by_platform.png", dpi=150, bbox_inches="tight")
@@ -1509,6 +1526,8 @@ def main(
     dim_model2: int = 768,
     threshold_model1: str = "0.99",
     threshold_model2: str = "0.90",
+    sweep_thresholds_model1: str | None = None,
+    sweep_thresholds_model2: str | None = None,
     min_group_rate_increase: float = 0.15,
     min_group_rate_decrease: float = 0.10,
     max_display_projects: int = 30,
@@ -1542,6 +1561,11 @@ def main(
         or comma-separated platform=value pairs (e.g. "default=0.92,cocoa=0.80,node=0.90").
     threshold_model2
         Cosine similarity threshold for model 2. Same format as threshold_model1.
+    sweep_thresholds_model1
+        Comma-separated thresholds to sweep for model 1's per-threshold metrics and per-platform threshold finder,
+        e.g. "0.95,0.97,0.99". Override the cosine-range defaults when model 1's score isn't in [0, 1].
+    sweep_thresholds_model2
+        Comma-separated thresholds to sweep for model 2. Same format as sweep_thresholds_model1.
     min_group_rate_increase
         Flag projects where model2 GROUP rate exceeds model1 by at least this amount.
     min_group_rate_decrease