Run the ONNX benchmark before the compiled and base models; pin onnxruntime-gpu to 1.25.0

kddubey · kddubey · commit 65c2554e7907 · 2026-04-25T12:50:14.000-07:00
diff --git a/benchmark/run.py b/benchmark/run.py
@@ -96,6 +96,23 @@ def main(
         logger.info(f"Downloading model from {path_gcs_inference} ...")
         subprocess.run(["gcloud", "storage", "rsync", "-r", path_gcs_inference, dir_tmp], check=True)
 
+        # ONNX model. Export ignores dtype/attn_implementation, so it runs in fp32 here.
+        logger.info("Loading ONNX model (first load triggers export, can take a couple minutes)")
+        start = time.monotonic()
+        model_onnx = gt.utils.SentenceTransformer(
+            dir_tmp,
+            backend="onnx",
+            trust_remote_code=True,
+            model_kwargs={"provider": "CUDAExecutionProvider"},
+            text_prefix=text_prefix,
+        )
+        _ = model_onnx.encode("warm up")
+        logger.info(f"ONNX model ready in {time.monotonic() - start:.1f}s")
+
+        times_onnx = _encode_timed(model_onnx, texts, desc="onnx")
+        (model_onnx,) = release_memory(model_onnx)
+
+        # Compiled model
         logger.info("Loading compiled model")
         start = time.monotonic()
         model_compiled = gt.compiled.SentenceTransformer(
@@ -109,6 +126,7 @@ def main(
         times_compiled = _encode_timed(model_compiled, texts, desc="compiled")
         (model_compiled,) = release_memory(model_compiled)
 
+        # Base model
         logger.info("Loading base model")
         start = time.monotonic()
         model_base = gt.utils.SentenceTransformer(
@@ -120,23 +138,6 @@ def main(
         times_base = _encode_timed(model_base, texts, desc="base")
         (model_base,) = release_memory(model_base)
 
-        # ONNX export ignores dtype/attn_implementation, so we run it fp32 here. A post-export fp16
-        # optimization pass is the next step if this looks promising.
-        logger.info("Loading ONNX model (first load triggers export, can take a couple minutes)")
-        start = time.monotonic()
-        model_onnx = gt.utils.SentenceTransformer(
-            dir_tmp,
-            backend="onnx",
-            trust_remote_code=True,
-            model_kwargs={"provider": "CUDAExecutionProvider"},
-            text_prefix=text_prefix,
-        )
-        _ = model_onnx.encode("warm up")
-        logger.info(f"ONNX model ready in {time.monotonic() - start:.1f}s")
-
-        times_onnx = _encode_timed(model_onnx, texts, desc="onnx")
-        (model_onnx,) = release_memory(model_onnx)
-
     df_out = pl.DataFrame(
         {
             "query_stacktrace_string": texts,
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ license = { file = "LICENSE" }
 dependencies = [
     "accelerate==1.12.0",
     "datasets==4.4.1",
-    "onnxruntime-gpu>=1.22",  # placeholder floor; pin once we know what resolves on cu128
+    "onnxruntime-gpu==1.25.0",
     "optimum==1.27.0",
     "polars==1.32.0",  # cudf lol
     "pydantic==2.11.9",