Make tests work: lazy encoder imports, tokenizer Protocol, updated test expectations

adaamko · adaamko · commit 7e4081304851 · 2026-03-10T18:17:45.000+01:00
diff --git a/squeez/encoder/__init__.py b/squeez/encoder/__init__.py
@@ -1,5 +1,15 @@
 """Encoder-based line classifier for tool output extraction."""
 
-from squeez.encoder.model import SqueezEncoderConfig, SqueezEncoderForLineClassification
-
 __all__ = ["SqueezEncoderConfig", "SqueezEncoderForLineClassification"]
+
+
+def __getattr__(name: str):  # module-level attribute hook (PEP 562); invoked when normal lookup fails
+    """Lazily import encoder model classes so lightweight helpers stay optional."""
+    if name in __all__:
+        from squeez.encoder.model import SqueezEncoderConfig, SqueezEncoderForLineClassification  # deferred: the model module is not imported until a public name is requested
+
+        return {
+            "SqueezEncoderConfig": SqueezEncoderConfig,
+            "SqueezEncoderForLineClassification": SqueezEncoderForLineClassification,
+        }[name]
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")  # any name outside __all__ is reported as missing
diff --git a/squeez/encoder/chunking.py b/squeez/encoder/chunking.py
@@ -8,11 +8,16 @@
 
 from __future__ import annotations
 
-from transformers import PreTrainedTokenizer
+from typing import Protocol
+
+
+class _TokenizerLike(Protocol):  # structural type used for the tokenizer parameters in place of the removed transformers import
+    def __call__(self, text: str, **kwargs) -> dict:  # the only member required: callable as tokenizer(text, **kwargs) returning a dict
+        ...
 
 
 def encode_text(
-    tokenizer: PreTrainedTokenizer,
+    tokenizer: _TokenizerLike,
     text: str,
     truncation: bool = False,
     max_length: int | None = None,
@@ -32,7 +37,7 @@ def encode_text(
 
 
 def chunk_output_lines(
-    tokenizer: PreTrainedTokenizer,
+    tokenizer: _TokenizerLike,
     output_lines: list[str],
     max_tokens_per_chunk: int,
 ) -> tuple[list[list[int]], list[int]]:
diff --git a/tests/test_extractor.py b/tests/test_extractor.py
@@ -20,20 +20,19 @@ def test_format_prompt_truncates_long_task():
     prompt = _format_prompt(long_task, "output")
     assert len(long_task) > 3000
     assert "..." in prompt
-    # Should be truncated to 3000 + "..."
-    task_section = prompt.split("Task: ")[1].split("\n\n")[0]
+    task_section = prompt.split("<task>\n", 1)[1].split("\n</task>", 1)[0]
     assert len(task_section) == 3003  # 3000 + "..."
 
 
 def test_format_prompt_empty_task():
     prompt = _format_prompt("", "some output")
-    assert "Task: \n" in prompt
+    assert "<task>\n\n</task>" in prompt
     assert "some output" in prompt
 
 
-def test_system_prompt_has_json_format():
+def test_system_prompt_has_relevant_lines_format():
     assert "relevant_lines" in SYSTEM_PROMPT
-    assert "JSON" in SYSTEM_PROMPT
+    assert "<relevant_lines>" in SYSTEM_PROMPT
 
 
 def test_load_config_returns_dict():
@@ -53,8 +52,9 @@ def test_assign_split(self):
         from squeez.data.sample_assembler import _assign_split
 
         assert _assign_split("django__django") == "train"
-        assert _assign_split("pydata__xarray") == "eval"
-        assert _assign_split("pallets__flask") == "eval"
+        assert _assign_split("pydata__xarray") == "test"
+        assert _assign_split("pallets__flask") == "test"
+        assert _assign_split("psf__requests") == "dev"
         assert _assign_split("scikit-learn__scikit-learn") == "train"
 
     def test_format_prompt(self):