Fix parseTranscript: real user messages dropped, injected blocks treated as signals

Abdallah01 · Abdallah01 · commit e0476c7e2a53 · 2026-05-25T12:58:34.000+04:00
Two related bugs in `parseTranscript` cause Soul signals to be derived
from the wrong content:

1. **String-shaped content was unhandled.** Claude Code's JSONL emits
   `message.content` as a bare string for genuine human-typed user turns
   (e.g. `{ "role": "user", "content": "Please fix the bug." }`), and
   as an array of content blocks for assistant turns and for system-
   injected user-role messages (tool_result blocks, system reminders,
   slash-command artifacts, Skill tool result bodies). The `TranscriptEntry`
   type declared `content: ContentBlock[]` unconditionally; calling
   `.filter()` on a string threw `TypeError: content.filter is not a
   function`, swallowed by the surrounding try/catch. Result: every
   real user message was silently dropped.

2. **Injected user-role blocks were treated as user intent.** The only
   "user" content surviving the bug above was array-shaped — overwhelmingly
   `tool_result` blocks (correctly filtered by the `c.type === "text"`
   guard) plus the occasional `{ type: "text", ... }` block injected by
   Claude Code: system reminders, slash-command wrappers
   (`<command-name>/clear</command-name>` etc.), and Skill tool result
   bodies (which carry the invoked skill's SKILL.md content as a text
   block). These pattern-matched against the `CORRECTION_PATTERN` /
   `GRATITUDE_PATTERN` / `COMPLETION_PATTERN` regexes, producing
   self-referential false signals (e.g. the `task-observer` skill body
   was repeatedly registering as user "gratitude" and "correction").
   Soul's own QUICK reflection self-diagnosed this as *"signal data is
   corrupted (contains system marker fragments and truncation)"*.

Fix:
- Widen `TranscriptEntry.message.content` to `string | ContentBlock[]`
  (matches reality).
- In `parseTranscript`, branch on `typeof content`. For strings: trim
  and use directly. For arrays: keep the existing text-block filter.
- Apply a shared `INJECTED_PREFIX` regex in both branches to drop any
  text that starts with a framework-injected marker: system reminders,
  slash-command artifacts (`<local-command-caveat>`, `<command-name>`,
  `<command-message>`, `<command-args>`), Skill tool result bodies
  (`Base directory for this skill:`), the request-interrupted marker,
  and the post-compact continuation banner. A matching string message
  is skipped entirely; a matching array text block is removed before
  the remaining blocks are joined. These are all framework-injected,
  not user-typed.

Tests: 7 new cases in `signal-extractor.test.ts` covering string content,
array content with intermixed tool_use blocks, both injection shapes
(system-reminder array block + slash-command string), Skill-body
injection, mixed-shape transcripts (regression guard against the
swallowed-TypeError pattern), and malformed-line resilience.

Verification against a live JSONL: pre-fix returned 2 array-text blocks
(both injected noise); post-fix returns 9 real user messages with zero
injection leakage.

Impact: this is upstream of every signal-based feature — framework
seeding, lesson selection, reflection cadence triggers. Pre-fix, those
were all running on injected system text instead of user dialogue.
diff --git a/packages/server/src/engine/signal-extractor.ts b/packages/server/src/engine/signal-extractor.ts
@@ -212,6 +212,12 @@ type ContentBlock =
 
 /**
  * A single entry from the transcript JSONL file.
+ *
+ * `message.content` is a union of two shapes: a bare string for genuine
+ * human-typed user turns, and an array of content blocks for assistant
+ * turns and for system-injected user-role messages (tool_result blocks,
+ * system reminders, slash-command artifacts, Skill tool result bodies).
+ * Treating it as always-array dropped every real user message.
  */
 type TranscriptEntry = {
   type: "user" | "assistant" | "system" | "summary" | string;
@@ -220,10 +226,19 @@ type TranscriptEntry = {
   sessionId: string;
   message?: {
     role: "user" | "assistant";
-    content: ContentBlock[];
+    content: string | ContentBlock[];
   };
 };
 
+/**
+ * Prefixes identifying user-role text that was injected by Claude Code
+ * rather than typed by the user. Keeping these out of the signal stream
+ * is what prevents Skill tool result bodies (e.g. SKILL.md content) from
+ * pattern-matching as user gratitude / corrections / completion signals.
+ */
+const INJECTED_PREFIX =
+  /^\s*(<system-reminder>|<local-command-caveat>|<command-name>|<command-message>|<command-args>|Base directory for this skill:|\[Request interrupted by user|This session is being continued from a previous conversation)/;
+
 /**
  * Parse a Claude Code transcript JSONL file into messages suitable for signal extraction.
  */
@@ -238,11 +253,25 @@ export function parseTranscript(jsonlContent: string): TranscriptMessage[] {
       if (entry.type !== "user" && entry.type !== "assistant") continue;
       if (!entry.message?.content) continue;
 
-      const textParts = entry.message.content
-        .filter((c): c is { type: "text"; text: string } => c.type === "text")
-        .map((c) => c.text);
+      const content = entry.message.content;
+      let text: string;
+
+      if (typeof content === "string") {
+        if (INJECTED_PREFIX.test(content)) continue;
+        text = content.trim();
+      } else if (Array.isArray(content)) {
+        const textParts = content
+          .filter((c): c is { type: "text"; text: string } =>
+            c.type === "text" && typeof (c as { text?: unknown }).text === "string",
+          )
+          .map((c) => c.text)
+          .filter((t) => !INJECTED_PREFIX.test(t));
+
+        text = textParts.join("\n").trim();
+      } else {
+        continue;
+      }
 
-      const text = textParts.join("\n").trim();
       if (!text) continue;
 
       messages.push({
diff --git a/packages/server/tests/signal-extractor.test.ts b/packages/server/tests/signal-extractor.test.ts
@@ -1,5 +1,9 @@
 import { describe, it, expect } from "vitest";
-import { extractSignalsFromMessages, type TranscriptMessage } from "../src/engine/signal-extractor.js";
+import {
+  extractSignalsFromMessages,
+  parseTranscript,
+  type TranscriptMessage,
+} from "../src/engine/signal-extractor.js";
 
 describe("signal-extractor", () => {
   const sessionKey = "test-session";
@@ -102,3 +106,191 @@ describe("signal-extractor", () => {
     expect(corrections).toHaveLength(1);
   });
 });
+
+describe("parseTranscript", () => {
+  // Helper: build a single JSONL line for a transcript entry.
+  const line = (entry: Record<string, unknown>) => JSON.stringify(entry);
+
+  it("extracts genuine user messages stored as bare strings", () => {
+    // Claude Code emits message.content as a string for user-typed turns.
+    // Before the fix, .filter() was called unconditionally on content,
+    // throwing TypeError which the surrounding try/catch swallowed —
+    // every real user message was silently dropped.
+    const jsonl = [
+      line({
+        type: "user",
+        uuid: "u1",
+        timestamp: "2026-05-25T10:00:00Z",
+        sessionId: "s1",
+        message: { role: "user", content: "Please fix the bug." },
+      }),
+      line({
+        type: "assistant",
+        uuid: "a1",
+        timestamp: "2026-05-25T10:00:01Z",
+        sessionId: "s1",
+        message: { role: "assistant", content: [{ type: "text", text: "Fixed." }] },
+      }),
+    ].join("\n");
+
+    const messages = parseTranscript(jsonl);
+    expect(messages).toHaveLength(2);
+    expect(messages[0]).toEqual({ role: "user", text: "Please fix the bug." });
+    expect(messages[1]).toEqual({ role: "assistant", text: "Fixed." });
+  });
+
+  it("extracts text blocks from array-shaped content", () => {
+    const jsonl = line({
+      type: "assistant",
+      uuid: "a1",
+      timestamp: "2026-05-25T10:00:00Z",
+      sessionId: "s1",
+      message: {
+        role: "assistant",
+        content: [
+          { type: "text", text: "Running the test." },
+          { type: "tool_use", id: "t1", name: "Bash", input: {} },
+          { type: "text", text: "Done." },
+        ],
+      },
+    });
+
+    const messages = parseTranscript(jsonl);
+    expect(messages).toHaveLength(1);
+    expect(messages[0].text).toBe("Running the test.\nDone.");
+  });
+
+  it("filters out system-reminder injections (array shape)", () => {
+    // System reminders arrive as {type:'text'} blocks inside an
+    // array-shaped user-role message. They are injected by Claude Code,
+    // not typed by the user, and must not pose as user intent.
+    const jsonl = line({
+      type: "user",
+      uuid: "u1",
+      timestamp: "2026-05-25T10:00:00Z",
+      sessionId: "s1",
+      message: {
+        role: "user",
+        content: [
+          { type: "text", text: "<system-reminder>\nTodos updated.\n</system-reminder>" },
+        ],
+      },
+    });
+
+    const messages = parseTranscript(jsonl);
+    expect(messages).toHaveLength(0);
+  });
+
+  it("filters out slash-command artifacts (string shape)", () => {
+    // Slash commands like /clear arrive as bare strings with
+    // <local-command-caveat> / <command-name> wrappers.
+    const jsonl = [
+      line({
+        type: "user",
+        uuid: "u1",
+        timestamp: "2026-05-25T10:00:00Z",
+        sessionId: "s1",
+        message: {
+          role: "user",
+          content:
+            "<local-command-caveat>Caveat: ...</local-command-caveat>",
+        },
+      }),
+      line({
+        type: "user",
+        uuid: "u2",
+        timestamp: "2026-05-25T10:00:01Z",
+        sessionId: "s1",
+        message: {
+          role: "user",
+          content: "<command-name>/clear</command-name>",
+        },
+      }),
+    ].join("\n");
+
+    const messages = parseTranscript(jsonl);
+    expect(messages).toHaveLength(0);
+  });
+
+  it("filters out Skill tool result bodies posing as user content", () => {
+    // The Skill tool returns SKILL.md content as a {type:'text'} block
+    // attached to a user-role message. Pre-fix this surfaced as user
+    // gratitude / correction signals — the source of the "all my Soul
+    // signals are corrupted self-references" symptom.
+    const jsonl = line({
+      type: "user",
+      uuid: "u1",
+      timestamp: "2026-05-25T10:00:00Z",
+      sessionId: "s1",
+      message: {
+        role: "user",
+        content: [
+          {
+            type: "text",
+            text:
+              "Base directory for this skill: ~/.claude/skills/example\n\nThanks for using this skill, perfect work!",
+          },
+        ],
+      },
+    });
+
+    const messages = parseTranscript(jsonl);
+    expect(messages).toHaveLength(0);
+  });
+
+  it("handles mixed-shape transcript without throwing", () => {
+    // The pre-fix bug surfaced as a swallowed TypeError on the FIRST
+    // string-content entry — subsequent entries on that same parse
+    // call were unaffected only because the try/catch ate the throw.
+    // This test guards against any regression that would crash on
+    // shape mismatch.
+    const jsonl = [
+      line({
+        type: "user",
+        uuid: "u1",
+        timestamp: "2026-05-25T10:00:00Z",
+        sessionId: "s1",
+        message: { role: "user", content: "Real user message." },
+      }),
+      line({
+        type: "user",
+        uuid: "u2",
+        timestamp: "2026-05-25T10:00:01Z",
+        sessionId: "s1",
+        message: {
+          role: "user",
+          content: [{ type: "tool_result", tool_use_id: "t1", content: "ok" }],
+        },
+      }),
+      line({
+        type: "assistant",
+        uuid: "a1",
+        timestamp: "2026-05-25T10:00:02Z",
+        sessionId: "s1",
+        message: { role: "assistant", content: [{ type: "text", text: "Acknowledged." }] },
+      }),
+    ].join("\n");
+
+    const messages = parseTranscript(jsonl);
+    expect(messages).toHaveLength(2);
+    expect(messages[0].text).toBe("Real user message.");
+    expect(messages[1].text).toBe("Acknowledged.");
+  });
+
+  it("skips malformed JSONL lines without crashing", () => {
+    const jsonl = [
+      "{not valid json",
+      line({
+        type: "user",
+        uuid: "u1",
+        timestamp: "2026-05-25T10:00:00Z",
+        sessionId: "s1",
+        message: { role: "user", content: "valid" },
+      }),
+    ].join("\n");
+
+    const messages = parseTranscript(jsonl);
+    expect(messages).toHaveLength(1);
+    expect(messages[0].text).toBe("valid");
+  });
+});