
Commit 8f234ae

Added extractor inference
1 parent 0751e6b

4 files changed: 308 additions & 39 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -33,3 +33,4 @@ wandb/
 runs/
 checkpoints/
 output/
+paper/

README.md

Lines changed: 14 additions & 26 deletions
@@ -12,7 +12,7 @@
 
 - Tool output pruner for LLM coding agents
 - Pipe any tool output (pytest, grep, git log, npm build, kubectl, ...) through squeez with a task description, get back only the relevant lines
-- Fine-tuned Qwen 3.5 2B, 0.80 F1, 92% compression
+- Two models, same CLI: a **generative** Qwen 3.5 2B (0.80 F1, 92% compression) or a smaller **extractive** ModernBERT alternative
 - CLI pipe, Python library, or vLLM server
 
 Existing context pruning tools ([SWE-Pruner](https://github.com/Ayanami1314/swe-pruner), [Zilliz Semantic Highlight](https://huggingface.co/zilliz/semantic-highlight-bilingual-v1), [Provence](https://arxiv.org/abs/2501.16214)) are built for source code or document paragraphs. They don't handle the mixed, unstructured format of tool output (stack traces interleaved with passing tests, grep matches with context lines, build logs with timestamps). Squeez is trained on 27 types of tool output from real SWE-bench workflows and synthetic multi-ecosystem observations.
@@ -236,41 +236,29 @@ Environment variables:
 | `SQUEEZ_LOCAL_MODEL` | Path to local model directory |
 | `SQUEEZ_SERVER_MODEL` | Model name on the server |
 | `SQUEEZ_API_KEY` | API key (if needed) |
-| `SQUEEZ_BACKEND` | Force backend: `transformers`, `vllm`, `encoder` |
+| `SQUEEZ_BACKEND` | Force backend (rarely needed; auto-detected from the model) |
 
 </details>
 
 <details>
-<summary><b>Encoder models</b></summary>
+<summary><b>Use the extractive model instead</b></summary>
 
-Squeez also supports encoder-based extraction (ModernBERT, etc.) as an alternative to the generative model. These are faster but less accurate.
+If you don't need the 2B generative model, point squeez at a smaller
+extractive one — same CLI, same Python API. Configure once, then use
+`squeez` normally:
 
-Two encoder approaches:
-- **Token encoder**: per-token binary classification, aggregated per line via max-pool
-- **Pooled encoder**: single-pass encoder with line-level mean-pool classification
-
-```python
-from squeez.inference.extractor import ToolOutputExtractor
+```bash
+export SQUEEZ_LOCAL_MODEL=KRLabsOrg/verbatim-rag-modern-bert-v2
 
-extractor = ToolOutputExtractor(model_path="./output/squeez_encoder")
-filtered = extractor.extract(task="Find the bug", tool_output=raw_output)
+pytest -q 2>&1 | squeez "find the failing test"
+git log --oneline -50 | squeez "find the auth commit"
 ```
 
-Standalone loading without squeez installed:
+`KRLabsOrg/verbatim-rag-modern-bert-v2` is a 150M ModernBERT span model
+trained on a multi-domain mix that includes Squeez tool-output. See
+[RESULTS.md](RESULTS.md) for the head-to-head with Squeez-2B.
 
-```python
-from transformers import AutoModel, AutoTokenizer
-
-model = AutoModel.from_pretrained("output/squeez_pooled", trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained("output/squeez_pooled")
-
-result = model.process(
-    task="Find the traceback",
-    tool_output=open("output.log").read(),
-    tokenizer=tokenizer,
-)
-print(result["highlighted_lines"])
-```
+To train your own extractive model, see [TRAINING.md](TRAINING.md).
 
 </details>

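The rewritten section drops the old standalone-loading snippet. If you still want to drive the extractive model without squeez installed, here is a minimal sketch, assuming only `transformers` is available; it mirrors the `AutoModel.from_pretrained(..., trust_remote_code=True)` and `model.process(...)` calls that `scripts/evaluate_baselines.py` makes below, and `output.log` stands in for any captured tool output:

```python
from transformers import AutoModel

# The checkpoint ships its own model class, hence trust_remote_code.
model = AutoModel.from_pretrained(
    "KRLabsOrg/verbatim-rag-modern-bert-v2", trust_remote_code=True
)

# process() returns character spans of the context relevant to the question;
# the values below are the recall-tuned defaults used by the eval script.
result = model.process(
    question="Find the traceback",
    context=open("output.log").read(),
    threshold=0.1,
    min_span_chars=10,
    merge_gap_chars=20,
)
for span in result.get("spans", []):
    print(span["start"], span["end"])
```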
scripts/evaluate_baselines.py

Lines changed: 86 additions & 1 deletion
@@ -43,7 +43,7 @@
 logger = logging.getLogger(__name__)
 
 ALL_NAIVE = ["random", "first_n", "last_n", "bm25"]
-ALL_MODEL = ["swe_pruner", "zilliz", "gliner2"]
+ALL_MODEL = ["swe_pruner", "zilliz", "gliner2", "verbatim_v2"]
 ALL_BASELINES = ALL_NAIVE + ALL_MODEL
 
 
@@ -272,6 +272,75 @@ def baseline_zilliz(model, task: str, tool_output: str, threshold: float = 0.5)
     return kept
 
 
+def _load_verbatim_v2(model_name: str = "KRLabsOrg/verbatim-rag-modern-bert-v2"):
+    """Load Verbatim-RAG ModernBERT v2 (needs: transformers + trust_remote_code).
+
+    Device selection:
+    - CUDA when available (intended path on the eval GPU node)
+    - CPU otherwise. We skip MPS by default because the long-context tool-output
+      forward pass routinely bumps into Metal's per-buffer size cap. Set
+      ``SQUEEZ_VERBATIM_DEVICE=mps`` (with ``PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0``)
+      to force MPS anyway.
+    """
+    import os
+
+    import torch
+    from transformers import AutoModel
+
+    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+    forced = os.environ.get("SQUEEZ_VERBATIM_DEVICE")
+    if forced:
+        device = forced
+    elif torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+    model = model.to(device)
+    model.eval()
+    return model
+
+
+def baseline_verbatim_v2(
+    model,
+    task: str,
+    tool_output: str,
+    threshold: float = 0.1,
+    min_span_chars: int = 10,
+    merge_gap_chars: int = 20,
+) -> list[str]:
+    """Verbatim-RAG ModernBERT v2 — keep any line touched by an extracted span.
+
+    Defaults to the recall-tuned config (threshold=0.1, min_span_chars=10) which
+    handles short structured answers (file paths, line numbers, log lines)
+    common in tool output. The model card documents this as the recommended
+    config for technical / structured content.
+    """
+    if not tool_output:
+        return []
+    result = model.process(
+        question=task,
+        context=tool_output,
+        threshold=threshold,
+        min_span_chars=min_span_chars,
+        merge_gap_chars=merge_gap_chars,
+    )
+    spans = result.get("spans", [])
+    if not spans:
+        return []
+    lines = tool_output.split("\n")
+    line_offsets, pos = [], 0
+    for line in lines:
+        line_offsets.append((pos, pos + len(line)))
+        pos += len(line) + 1
+    kept_indices: set[int] = set()
+    for sp in spans:
+        a, b = sp["start"], sp["end"]
+        for i, (lo, hi) in enumerate(line_offsets):
+            if not (b <= lo or a >= hi):
+                kept_indices.add(i)
+    return [lines[i] for i in sorted(kept_indices) if lines[i].strip()]
+
+
 def _load_gliner2():
     """Load GLiNER2 model (needs: pip install gliner2)."""
     from gliner2 import GLiNER2
@@ -507,6 +576,22 @@ def main():
         except Exception as e:
             logger.error(f"GLiNER2 failed: {e}")
 
+    if "verbatim_v2" in baselines:
+        logger.info("Loading Verbatim-RAG ModernBERT v2...")
+        try:
+            model = _load_verbatim_v2()
+            logger.info("Running: verbatim_v2")
+            results.append(
+                evaluate_baseline(
+                    "Verbatim-RAG ModernBERT v2",
+                    baseline_verbatim_v2,
+                    samples,
+                    model=model,
+                )
+            )
+        except Exception as e:
+            logger.error(f"verbatim_v2 failed: {type(e).__name__}: {e}")
+
     # Print and save
     print_results(results)
