Drop thumbs-down-rated trajectories from skill training examples

flaviusburca · flaviusburca · commit 8bbe253b9566 · 2026-04-20T08:40:42.000+03:00
diff --git a/surogates/jobs/training_collector.py b/surogates/jobs/training_collector.py
@@ -30,6 +30,14 @@
 logger = logging.getLogger(__name__)
 
 
+_SKILL_TRAJECTORY_BOUNDARY: frozenset[str] = frozenset({
+    EventType.USER_MESSAGE.value,
+    EventType.SKILL_INVOKED.value,
+    EventType.SESSION_COMPLETE.value,
+    EventType.SESSION_FAIL.value,
+})
+
+
 def _strip_skill_prefix(raw_message: str, skill_name: str) -> str:
     """Remove the leading ``/<skill_name>`` from *raw_message*.
 
@@ -302,7 +310,12 @@ async def collect_for_skill(
         exclude_tainted:
             When True (default), sessions with ``policy.denied``,
             ``harness.crash``, ``saga.compensate``, or
-            ``expert.override`` events are skipped entirely.
+            ``expert.override`` events are skipped entirely, and any
+            individual trajectory whose assistant response received a
+            ``user.feedback`` with ``rating: "down"`` is rejected.
+            Per-trajectory granularity is intentional: one thumbs-down
+            on an unrelated response in the same session should not
+            poison sibling invocations that carry their own class labels.
         """
         invocations = await self._session_store.find_skill_invocations(
             org_id, skill_name, since=since,
@@ -322,6 +335,7 @@ async def collect_for_skill(
 
         examples: list[TrainingExample] = []
         skipped_tainted = 0
+        skipped_thumbs_down = 0
 
         for session_id, session_invocations in by_session.items():
             if exclude_tainted:
@@ -332,20 +346,37 @@ async def collect_for_skill(
             events = await self._session_store.get_events(session_id)
             events_by_id = {e.id: i for i, e in enumerate(events)}
 
+            # Gathered once per session; every trajectory shares this set.
+            down_rated_response_ids: set[int] = set()
+            if exclude_tainted:
+                for e in events:
+                    if e.type != EventType.USER_FEEDBACK.value:
+                        continue
+                    if e.data.get("rating") != "down":
+                        continue
+                    target = e.data.get("target_event_id")
+                    if isinstance(target, int):
+                        down_rated_response_ids.add(target)
+
             for inv in session_invocations:
                 start_idx = events_by_id.get(inv.id)
                 if start_idx is None:
                     continue
-                example = self._collect_skill_trajectory(
+                example, rejected_thumbs_down = self._collect_skill_trajectory(
                     events, start_idx, skill_name, session_id, inv,
+                    down_rated_response_ids=down_rated_response_ids,
                 )
-                if example is not None:
+                if rejected_thumbs_down:
+                    skipped_thumbs_down += 1
+                elif example is not None:
                     examples.append(example)
 
         logger.info(
             "Collected %d training examples for skill '%s' "
-            "(%d invocations, %d tainted sessions skipped)",
-            len(examples), skill_name, len(invocations), skipped_tainted,
+            "(%d invocations, %d tainted sessions skipped, "
+            "%d trajectories skipped for thumbs-down)",
+            len(examples), skill_name, len(invocations),
+            skipped_tainted, skipped_thumbs_down,
         )
         return examples
 
@@ -356,18 +387,28 @@ def _collect_skill_trajectory(
         skill_name: str,
         session_id: UUID,
         skill_event: Any,
-    ) -> TrainingExample | None:
+        *,
+        down_rated_response_ids: set[int] | None = None,
+    ) -> tuple[TrainingExample | None, bool]:
         """Walk events from a ``skill.invoked`` through its trajectory.
 
-        Returns a :class:`TrainingExample` when a complete trajectory
-        is found (at least one assistant response with content).  The
-        trajectory ends at the first of: next ``user.message``, next
-        ``skill.invoked``, ``session.complete`` or ``session.fail``.
+        Returns ``(example, rejected_thumbs_down)``.  ``example`` is a
+        :class:`TrainingExample` when a complete trajectory is found
+        (at least one assistant response with content), otherwise
+        ``None``.  ``rejected_thumbs_down`` is ``True`` iff the
+        trajectory was dropped because an ``llm.response`` within it
+        received a thumbs-down rating (id in *down_rated_response_ids*);
+        a down-rated assistant turn is the wrong class label for
+        *skill_name* and must not be exported.
+
+        The trajectory ends at the first of: next ``user.message``,
+        next ``skill.invoked``, ``session.complete`` or
+        ``session.fail``.
         """
         raw_message = skill_event.data.get("raw_message", "")
         user_text = _strip_skill_prefix(raw_message, skill_name)
         if not user_text:
-            return None
+            return None, False
 
         messages: list[dict[str, Any]] = [
             {"role": "user", "content": user_text},
@@ -378,15 +419,15 @@ def _collect_skill_trajectory(
             event = events[i]
             etype = event.type
 
-            if etype in (
-                EventType.USER_MESSAGE.value,
-                EventType.SKILL_INVOKED.value,
-                EventType.SESSION_COMPLETE.value,
-                EventType.SESSION_FAIL.value,
-            ):
-                break  # trajectory boundary
+            if etype in _SKILL_TRAJECTORY_BOUNDARY:
+                break
 
             if etype == EventType.LLM_RESPONSE.value:
+                if (
+                    down_rated_response_ids
+                    and event.id in down_rated_response_ids
+                ):
+                    return None, True
                 msg = event.data.get("message")
                 if not isinstance(msg, dict):
                     continue
@@ -409,14 +450,15 @@ def _collect_skill_trajectory(
             # ``llm.response`` message's ``tool_calls`` field.
 
         if not has_final_assistant_content:
-            return None
+            return None, False
 
-        return TrainingExample(
+        example = TrainingExample(
             messages=messages,
             session_id=session_id,
             expert_name=skill_name,
             created_at=getattr(skill_event, "created_at", None),
         )
+        return example, False
 
     async def export_jsonl(
         self,
diff --git a/tests/integration/test_training_collector.py b/tests/integration/test_training_collector.py
@@ -246,6 +246,75 @@ async def test_collect_for_skill_excludes_tainted_session(
     assert {e.session_id for e in all_examples} == {clean.id, tainted.id}
 
 
+async def test_collect_for_skill_excludes_trajectory_with_thumbs_down(
+    session_store, session_factory,
+):
+    """A ``user.feedback`` rating=down on an LLM response in a trajectory
+    rejects that trajectory only — sibling invocations in the same session
+    still yield training examples.
+
+    Regression: ``session_has_taint`` only checks ``policy.denied`` and
+    friends, so the judge's thumbs-down verdicts used to pass the filter
+    and poison the training set with negative class labels.
+    """
+    org_id = await create_org(session_factory)
+    user_id = await create_user(session_factory, org_id)
+    session = await session_store.create_session(
+        user_id=user_id, org_id=org_id, agent_id="test-agent",
+    )
+
+    # First invocation: rated down — should be excluded.
+    await session_store.emit_event(
+        session.id, EventType.USER_MESSAGE,
+        {"content": "/sql_writer bad query"},
+    )
+    await session_store.emit_event(
+        session.id, EventType.SKILL_INVOKED,
+        {"skill": "sql_writer", "raw_message": "/sql_writer bad query",
+         "staged_at": None},
+    )
+    bad_response_id = await session_store.emit_event(
+        session.id, EventType.LLM_RESPONSE,
+        {
+            "message": {"role": "assistant", "content": "SELECT 1;"},
+            "model": "gpt-4o",
+            "input_tokens": 1,
+            "output_tokens": 1,
+        },
+    )
+    await session_store.emit_event(
+        session.id, EventType.USER_FEEDBACK,
+        {
+            "target_event_id": bad_response_id,
+            "rating": "down",
+            "source": "service_account",
+            "rated_by_service_account_id": "00000000-0000-0000-0000-000000000001",
+            "reason": "query missed the WHERE clause",
+        },
+    )
+
+    # Second invocation: untouched — should survive.
+    await _seed_skill_invocation(
+        session_store, session.id,
+        raw_message="/sql_writer good query",
+        assistant_content="SELECT * FROM users;",
+    )
+
+    collector = TrainingDataCollector(session_store=session_store)
+    examples = await collector.collect_for_skill("sql_writer", org_id)
+
+    assert len(examples) == 1
+    assert examples[0].messages[0]["content"] == "good query"
+    assert examples[0].messages[-1]["content"] == "SELECT * FROM users;"
+
+    # With exclude_tainted=False the rejected trajectory comes back.
+    all_examples = await collector.collect_for_skill(
+        "sql_writer", org_id, exclude_tainted=False,
+    )
+    asks = sorted(ex.messages[0]["content"] for ex in all_examples)
+    assert asks == ["bad query", "good query"]
+
+
 async def test_collect_for_skill_skips_trajectory_with_no_final_assistant(
     session_store, session_factory,
 ):