Migrate tooling to uv, add a CI workflow, require Python 3.13, and enable stricter ruff lint rules

kddubey · kddubey · commit ef002ea25e29 · 2026-05-08T23:17:04.000-07:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,36 @@
+name: CI  # Lints with ruff and runs pytest on pushes to main and on every pull request.
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+concurrency:  # one concurrency group per workflow + ref
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true  # a newer run in the same group cancels the one still in progress
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+      - run: uv tool run ruff check .
+      - run: uv tool run ruff format --check .
+
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+          python-version: "3.13"  # same version as the repo's .python-version
+      - run: uv sync --locked --extra dev  # --locked: fail if uv.lock is stale instead of silently re-resolving
+      - uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-${{ runner.os }}-gte-modernbert-base  # static key: rename it to invalidate the cached files
+      - run: uv run pytest -q
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.13
diff --git a/bin/_startup.sh b/bin/_startup.sh
@@ -1,8 +1,12 @@
 #!/bin/bash
 # Sets up the environment for grouping-trainer instances.
+# Normally invoked as root via GCP instance startup. To step through manually
+# after SSH'ing in, run `sudo -i` first so $HOME=/root and paths line up.
 set -euo pipefail
 
-apt-get update -y && apt-get install -y python3.12-venv
+# Install uv (manages its own Python; respects .python-version in the repo).
+curl -LsSf https://astral.sh/uv/install.sh | sh
+export PATH="$HOME/.local/bin:$PATH"
 
 REPO_DIR="/root/grouping-trainer"
 
@@ -18,16 +22,15 @@ cd "$REPO_DIR"
 mkdir -p lightonai/modernbert-embed-large
 gcloud storage cp -r gs://grouping-data/base_models/lightonai/modernbert-embed-large/* lightonai/modernbert-embed-large
 
-python3.12 -m venv .venv
-# shellcheck disable=SC1091
-source .venv/bin/activate
-pip install --upgrade pip
-pip install -e .
+uv sync --locked
 
 gcloud storage cp -r gs://grouping-data/final_csvs/ .
 
-# Auto-cd into the repo and activate the venv on `sudo -i`.
-echo "cd $REPO_DIR && source .venv/bin/activate" >> /root/.bashrc
+# Auto-cd into the repo, put uv on PATH, and activate the venv on `sudo -i`.
+{
+    echo "export PATH=\"\$HOME/.local/bin:\$PATH\""
+    echo "cd $REPO_DIR && source .venv/bin/activate"
+} >> /root/.bashrc
 
 # screen -S run
 # ctrl+a d
@@ -46,4 +49,5 @@ if [ -n "$COMMAND" ]; then
     eval "$COMMAND" >>"$LOG_FILE" 2>&1 || true
     shutdown -h now
 fi
-# To follow the log: `sudo tail -f /var/log/grouping_trainer_run.log`
+# To follow the log:
+# sudo tail -f /var/log/grouping_trainer_run.log
diff --git a/bin/set_up_local.sh b/bin/set_up_local.sh
@@ -2,8 +2,5 @@
 set -eu
 
 direnv allow
-python3.13 -m venv .venv
-# shellcheck source=/dev/null
-. .venv/bin/activate
-python -m pip install -e ".[dev,sheets]"
-pre-commit install
+uv sync --extra dev --extra sheets
+uv run pre-commit install
diff --git a/eval/acc_across_dims.ipynb b/eval/acc_across_dims.ipynb
@@ -293,7 +293,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.12"
+   "version": "3.13.1"
   }
  },
  "nbformat": 4,
diff --git a/eval/compare.py b/eval/compare.py
@@ -349,7 +349,7 @@ def plot_metrics_by_platform(df: pl.DataFrame, model_names: list[str]) -> plt.Fi
     fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(4 * len(metrics_to_plot), 5))
     axes: list[plt.Axes] = list(axes)
 
-    for ax, metric in zip(axes, metrics_to_plot):
+    for ax, metric in zip(axes, metrics_to_plot, strict=True):
         pivot_df = metrics_pd.pivot(index="platform", columns="model", values=metric)
         pivot_df = pivot_df[model_names]  # ensure consistent column order
         pivot_df.plot(kind="bar", ax=ax, rot=45, legend=False, color=MODEL_COLORS)
@@ -388,7 +388,7 @@ def plot_similarity_distribution(
     if n == 1:
         axes = [axes]
 
-    for ax, platform in zip(axes, platforms):
+    for ax, platform in zip(axes, platforms, strict=True):
         data = df.filter(pl.col("platform") == platform)[sim_col].to_numpy()
         ax.hist(data, bins=bins, edgecolor="none", alpha=0.8)
         ax.set_ylabel(platform, rotation=0, labelpad=60, ha="right")
@@ -434,7 +434,7 @@ def plot_dumbbell_by_project(
     ).sort("_delta")
     y_labels = [f"{row['org_id']}|{row['project_id']}" for row in sorted_df.iter_rows(named=True)]
 
-    for ax, metric in zip(axes, metrics):
+    for ax, metric in zip(axes, metrics, strict=True):
         col1 = f"{model1}_{metric}"
         col2 = f"{model2}_{metric}"
 
@@ -443,7 +443,7 @@ def plot_dumbbell_by_project(
         y = range(len(sorted_df))
 
         # Draw lines colored by direction
-        for i, (v1, v2) in enumerate(zip(x1, x2)):
+        for i, (v1, v2) in enumerate(zip(x1, x2, strict=True)):
             color = "green" if v2 >= v1 else "red"
             ax.hlines(y=i, xmin=min(v1, v2), xmax=max(v1, v2), color=color, alpha=0.6)
 
@@ -485,7 +485,7 @@ def compare_models(
               used for platforms not explicitly listed.
             First key = model1 (baseline), second key = model2 (new model).
         output_dir: Directory for writing CSVs. Required if write_csvs is True.
-        min_group_rate_increase: Track projects where model2 GROUP rate is >= this value higher than model1. None to skip.
+        min_group_rate_increase: Track projects where model2 GROUP rate is >= this value higher than model1. None skips.
         min_group_rate_decrease: Track projects where model2 GROUP rate is >= this value lower than model1 (absolute).
             E.g., 0.10 means model2 has at least 10pp lower GROUP rate. None to skip.
         write_csvs: If True, write new.csv and merged.csv files for each project.
@@ -753,7 +753,7 @@ def compute_stacktrace_token_percentiles(df: pl.DataFrame) -> pl.DataFrame:
 def sweep_thresholds(
     df: pl.DataFrame,
     model_name: str,
-    thresholds: list[float] = [0.80, 0.85, 0.87, 0.90],
+    thresholds: list[float] | None = None,
 ) -> pl.DataFrame:
     """
     Show metrics for a single model at multiple similarity thresholds.
@@ -766,6 +766,8 @@ def sweep_thresholds(
     Returns:
         DataFrame with one row per threshold and metric columns.
     """
+    if thresholds is None:
+        thresholds = [0.80, 0.85, 0.87, 0.90]
     sim_col = f"cos_sim_{model_name}"
     rows = []
     for thresh in thresholds:
@@ -788,7 +790,7 @@ def sweep_thresholds(
 def sweep_thresholds_by_project(
     df: pl.DataFrame,
     model_name: str,
-    thresholds: list[float] = [0.80, 0.85, 0.87, 0.90],
+    thresholds: list[float] | None = None,
     precision_floor: float = 0.8,
     harm_threshold: float = 0.05,
     thresholds_platform: dict[str, float] | None = None,
@@ -811,8 +813,8 @@ def sweep_thresholds_by_project(
         baseline_threshold: Threshold for the baseline model. Can be a float or a
             per-platform dict (with a "default" key), same format as thresholds_platform.
     """
-    sim_col = f"cos_sim_{model_name}"
-    pred_col = f"pred_{model_name}"
+    if thresholds is None:
+        thresholds = [0.80, 0.85, 0.87, 0.90]
     thresholds_sorted = sorted(thresholds, reverse=True)
 
     def _compute_project_precisions(model: str, threshold: float) -> pl.DataFrame:
diff --git a/eval/export_for_db.py b/eval/export_for_db.py
@@ -220,6 +220,7 @@ def export_for_load_test(
         export_for_load_test(sys.argv[1], **kwargs)
     else:
         print(
-            f"Usage: python {sys.argv[0]} <similarities_dir> [--load-test [keep_fraction_candidates=X] [keep_fraction_queries=X]]"
+            f"Usage: python {sys.argv[0]} <similarities_dir> "
+            f"[--load-test [keep_fraction_candidates=X] [keep_fraction_queries=X]]"
         )
         sys.exit(1)
diff --git a/profile_dataloading.ipynb b/profile_dataloading.ipynb
@@ -17,13 +17,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "34b40b8b",
    "metadata": {},
    "outputs": [],
    "source": [
     "import time\n",
     "from collections import Counter\n",
+    "from functools import wraps\n",
     "\n",
     "import numpy as np\n",
     "import seaborn as sns\n",
@@ -46,13 +47,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "e8b10dc8",
    "metadata": {},
    "outputs": [],
    "source": [
     "def record_times(times: list[float]):\n",
     "    def decorator(func):\n",
+    "        @wraps(func)\n",
     "        def wrapper(*args, **kwargs):\n",
     "            start_time = time.monotonic()\n",
     "            result = func(*args, **kwargs)\n",
@@ -190,46 +192,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "410a876f",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "42310f0811c14a47afde208aa9d7a941",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/13130 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 37min 10s, sys: 1min 53s, total: 39min 3s\n",
-      "Wall time: 7min 4s\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%time\n",
     "train_dataloader = trainer.get_train_dataloader()\n",
     "for batch in tqdm(train_dataloader, total=len(train_dataloader)):\n",
-    "    for sub_batch_idx, sub_batch in enumerate(\n",
-    "        gt.train.batch_pairs_by_token_budget(batch, token_budget=training_config_full.per_device_token_budget)\n",
+    "    num_sub_batches = 0\n",
+    "    for sub_batch in gt.train.batch_pairs_by_token_budget(\n",
+    "        batch, token_budget=training_config_full.per_device_token_budget\n",
     "    ):\n",
     "        encodings = preprocess(\n",
     "            trainer.model.encoder,\n",
     "            sub_batch[\"query_stacktrace_string\"],\n",
     "            sub_batch[\"candidate_stacktrace_string\"],\n",
     "        )\n",
-    "    num_sub_batches_per_batch.append(sub_batch_idx + 1)"
+    "        num_sub_batches += 1\n",
+    "    num_sub_batches_per_batch.append(num_sub_batches)"
    ]
   },
   {
@@ -370,7 +351,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.12"
+   "version": "3.13.1"
   }
  },
  "nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
     "typed-argument-parser==1.11.0",
     "wandb==0.23.1",
 ]
-requires-python = ">=3.12"
+requires-python = ">=3.13"
 authors = [
     { name = "Kush Dubey", email = "kushdubey63@gmail.com" },
 ]
@@ -61,7 +61,7 @@ indent-width = 4
 extend-include = ["*.ipynb"]
 
 [tool.ruff.lint]
-select = ["I"]
+select = ["I", "F", "E", "B", "UP"]
 
 [project.urls]
 Homepage = "https://github.com/getsentry/grouping-trainer"
diff --git a/src/grouping_trainer/compiled.py b/src/grouping_trainer/compiled.py
@@ -178,8 +178,8 @@ def compile_and_warm_up(self):
 
     @_set_float32_matmul_precision(_COMPILED_MATMUL_PRECISION)
     def forward(self, input: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
-        # Only use the compiled forward if the sequence length matches one of our buckets. If we used the compiled forward
-        # for one that doesn't hit the bucket, we create a new CUDA graph for every unique sequence length above
+        # Only use the compiled forward if the sequence length matches one of our buckets. If we used the compiled
+        # forward for one that doesn't hit the bucket, we create a new CUDA graph for every unique sequence length above
         # 2048, which thrashes the cache.
 
         if self.training:
diff --git a/src/grouping_trainer/data.py b/src/grouping_trainer/data.py
@@ -1,8 +1,4 @@
-from typing import TypedDict
-
 import polars as pl
-import torch
-from datasets import DatasetDict
 
 import grouping_trainer as gt
 
diff --git a/src/grouping_trainer/evaluator.py b/src/grouping_trainer/evaluator.py
@@ -241,7 +241,7 @@ def find_recall_at_precision_thresholds(
         assert len(scores) == len(labels)
 
         # Sort by score descending (highest similarity first)
-        rows = list(zip(scores, labels))
+        rows = list(zip(scores, labels, strict=True))
         rows = sorted(rows, key=lambda x: x[0], reverse=True)
 
         total_positives = sum(labels)
diff --git a/src/grouping_trainer/loss.py b/src/grouping_trainer/loss.py
@@ -14,8 +14,6 @@
 import torch
 from sentence_transformers.util import pairwise_cos_sim
 
-import grouping_trainer as gt
-
 
 class Features(TypedDict):
     query_embeddings: torch.Tensor
@@ -163,11 +161,13 @@ def __init__(
         self,
         *,
         bias_init: float = 0.0,
-        log_of_scale_init: torch.Tensor = torch.tensor(5.0).log(),
+        log_of_scale_init: torch.Tensor | None = None,
         mrl_dim_to_weight: dict[int, float] | None = None,
         n_dims_per_step: int = -1,
     ):
         super().__init__()
+        if log_of_scale_init is None:
+            log_of_scale_init = torch.tensor(5.0).log()
         self.log_scale = torch.nn.Parameter(log_of_scale_init.clone().detach())
         self.bias = torch.nn.Parameter(torch.tensor(bias_init))
 
diff --git a/src/grouping_trainer/synthetic.py b/src/grouping_trainer/synthetic.py
@@ -22,7 +22,7 @@
 import subprocess
 import tempfile
 from dataclasses import asdict, dataclass
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any, Literal, cast
 
@@ -75,7 +75,7 @@ def top_combos(
     indices_selected = indices[indices_sorted]  # back to raveled space
     top_indices = indices_selected[:num_combos]  # still in raveled space
     unraveled = np.unravel_index(top_indices, distances.shape)  # finally unravel
-    return list(zip(*unraveled))  # :-]
+    return list(zip(*unraveled, strict=True))  # :-]
 
 
 def mine_semi_easy_negatives_from_distance_matrix(
@@ -302,7 +302,7 @@ def main(
     subprocess.run(["gcloud", "storage", "rsync", "-r", gcs_model_folder, dir_model], check=True)
     model = gt.utils.SentenceTransformer(dir_model, trust_remote_code=True, text_prefix=text_prefix)
 
-    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S")
+    timestamp = datetime.now(UTC).strftime("%Y-%m-%d-%H-%M-%S")
     df = gt.data.load_train_df(paths=csv_paths)
     df = df.sort(pl.col("query_stacktrace_string").str.len_chars().mean().over("org_id", "project_id"))
     for (org_id, project_id), df_project in tqdm(
diff --git a/src/grouping_trainer/train.py b/src/grouping_trainer/train.py
diff --git a/src/grouping_trainer/utils.py b/src/grouping_trainer/utils.py
diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
diff --git a/tests/test_train.py b/tests/test_train.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -293,7 +293,7 @@`
`293`	`293`	`"name": "python",`
`294`	`294`	`"nbconvert_exporter": "python",`
`295`	`295`	`"pygments_lexer": "ipython3",`
`296`		`- "version": "3.13.12"`
	`296`	`+ "version": "3.13.1"`
`297`	`297`	`}`
`298`	`298`	`},`
`299`	`299`	`"nbformat": 4,`
Original file line number	Diff line number	Diff line change
`@@ -220,6 +220,7 @@ def export_for_load_test(`
`220`	`220`	`export_for_load_test(sys.argv[1], **kwargs)`
`221`	`221`	`else:`
`222`	`222`	`print(`
`223`		`- f"Usage: python {sys.argv[0]} <similarities_dir> [--load-test [keep_fraction_candidates=X] [keep_fraction_queries=X]]"`
	`223`	`+ f"Usage: python {sys.argv[0]} <similarities_dir> "`
	`224`	`+ f"[--load-test [keep_fraction_candidates=X] [keep_fraction_queries=X]]"`
`224`	`225`	`)`
`225`	`226`	`sys.exit(1)`