Merge pull request #236 from Hyperkid123/feat/RHCLOUD-48344

Hyperkid123 · web-flow · commit 920fdaf5837f · 2026-06-12T09:16:55.000+02:00
feat(bot): add transcript capture and progress instructions
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -111,6 +111,13 @@ Active: `in_progress`, `pr_open`, `pr_changes`. Terminal: `done`, `archived`, `p
 
 **Multi-repo**: One task per Jira ticket. Primary repo in `repo`, all in `metadata.repos`. PRs in `metadata.prs` as `[{"repo", "number", "url", "host"}]`.
 
+### Cycle Progress Tools
+
+| Tool | Purpose |
+|------|---------|
+| `progress_store` | Store structured cycle progress. Params: `task_id, instance_id, cycle_type, progress?, started_at?, finished_at?, tool_calls?, tokens_used?` |
+| `progress_load` | Load last N progress entries for a task. Params: `task_id, instance_id?, limit?` (`limit` defaults to 5) |
+
 ### Memory Tools
 
 | Tool | Purpose |
@@ -395,7 +402,23 @@ Keep task record updated throughout (not just end). `task_update` w/ `summary` +
 - `last_step`: `branch_created`/`implemented`/`tests_passing`/`push_failed`/`pr_opened`/`review_addressed`/`investigation_posted`/`archived`
 - `files_changed`, `commits`, `next_step`, `notes`, `repos`, `prs`
 
-**On startup — interrupted work**: Triage output shows all `in_progress` tasks w/ `last_step`. Any w/ `last_step` set? → `memory_search` repo + problem → resume from `next_step`. Task metadata = specific work state. RAG memory = cross-ticket learnings.
+### Cycle Progress (progress_load / progress_store)
+
+Persists structured progress across cycles. Separate from `task_update` — creates **history**, not just current state.
+
+**On resume** (existing task, not new):
+1. `task_get(jira_key)` → note `id` field = `task_id`
+2. `progress_load(task_id=<id>)` → last 5 cycle summaries
+3. Use returned progress → understand prior decisions, files, blockers, where left off
+
+**Before cycle ends** (after work on task):
+1. `progress_store(task_id=<id>, instance_id=<instance>, cycle_type="task_work", progress={...})`
+2. Progress keys: `last_step`, `next_step`, `files_changed`, `commits`, `key_decisions`, `blockers`, `notes`
+3. In addition to `task_update` — call both
+
+Idle/error cycles: `run.py` handles automatically. No agent action.
+
+**On startup — interrupted work**: `in_progress` w/ `last_step` set? → `progress_load(task_id)` for cycle history + `memory_search` repo + problem → resume from `next_step`. Cycle progress = per-cycle history. Task metadata = current state. RAG memory = cross-ticket learnings.
 
 ## Rules
 
diff --git a/bot/agent.py b/bot/agent.py
@@ -1,5 +1,6 @@
 """Core agent cycle — invokes Claude Agent SDK."""
 
+import json
 import logging
 import os
 from dataclasses import dataclass
@@ -12,6 +13,7 @@
     ResultMessage,
     SystemMessage,
     TextBlock,
+    ToolResultBlock,
     query,
 )
 
@@ -35,6 +37,7 @@ class CycleContext:
     repo: str | None = None
     work_type: str | None = None
     summary: str | None = None
+    task_id: int | None = None
 
 
 async def _push_status(
@@ -208,10 +211,11 @@ async def run_cycle(
                                 logger.info("[agent] %s", text[:300])
                                 # Push to dashboard
                                 await _push_status(http, "working", text[:500])
+                        elif isinstance(block, ToolResultBlock):
+                            _extract_task_id_from_result(block, ctx)
                         elif hasattr(block, "name"):
                             desc = _describe_tool_use(block)
                             logger.info("[tool] %s", desc)
-                            # Extract work context from MCP tool calls
                             _extract_context(block, ctx)
 
                 elif isinstance(message, ResultMessage):
@@ -304,3 +308,29 @@ def _extract_context(block, ctx: CycleContext) -> None:
     # Memory housekeeping
     elif name == "mcp__bot-memory__memory_delete":
         ctx.work_type = ctx.work_type or "memory_housekeeping"
+
+
+def _extract_task_id_from_result(block: ToolResultBlock, ctx: CycleContext) -> None:
+    """Record the task id found in an MCP tool result on ``ctx.task_id``.
+
+    A result counts as a task object when its text decodes to a JSON object
+    that has a ``jira_key`` key and an integer ``id`` (task_add/task_get/
+    task_update are expected to return such objects). ``block.content`` may
+    be a plain string or a list of content items; every item is inspected,
+    not only the first, so a task object that follows another content item is
+    still picked up. Anything that does not match is ignored and ``ctx`` is
+    left untouched.
+    """
+    content = block.content
+    if not content:
+        return
+    if isinstance(content, str):
+        candidates = [content]
+    elif isinstance(content, list):
+        # Items that are not dicts carry no "text" field to parse.
+        candidates = [
+            item.get("text", "") for item in content if isinstance(item, dict)
+        ]
+    else:
+        return
+    for text in candidates:
+        if not text:
+            continue
+        try:
+            data = json.loads(text)
+        except (ValueError, TypeError):
+            # Not JSON (or not a decodable type) — most tool output is free text.
+            continue
+        if not isinstance(data, dict) or "jira_key" not in data:
+            continue
+        task_id = data.get("id")
+        # bool is a subclass of int; a JSON true/false must not pass as an id.
+        if isinstance(task_id, int) and not isinstance(task_id, bool):
+            ctx.task_id = task_id
+            return
diff --git a/bot/run.py b/bot/run.py
@@ -20,6 +20,7 @@
 from .config import ALLOWED_TOOLS, Config, load_config, load_mcp_servers, sanitize_env
 from .costs import record_cost
 from .merge import apply_merged_config
+from .transcripts import record_transcript
 
 SCRIPT_DIR = Path(__file__).resolve().parent.parent
 DATA_DIR = SCRIPT_DIR / "data"
@@ -342,6 +343,13 @@ def shutdown(sig, frame):
                     result=result,
                     ctx=ctx,
                 )
+                record_transcript(
+                    label=args.label,
+                    result=result,
+                    ctx=ctx,
+                    cwd=str(SCRIPT_DIR),
+                    instance_id=instance_id,
+                )
             else:
                 logger.warning("Cycle produced no result")
 
diff --git a/bot/transcripts.py b/bot/transcripts.py
@@ -0,0 +1,126 @@
+"""Transcript capture — compresses and stores cycle transcripts via the dashboard API."""
+
+from __future__ import annotations
+
+import base64
+import logging
+import os
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import httpx
+
+if TYPE_CHECKING:
+    from .agent import CycleContext
+
+logger = logging.getLogger(__name__)
+
+# URL that cycle-run records are POSTed to; overridable through the
+# CYCLE_RUNS_API_URL environment variable.
+CYCLE_RUNS_API = os.environ.get(
+    "CYCLE_RUNS_API_URL", "http://localhost:8080/api/cycle-runs"
+)
+
+# Maps an agent work_type onto the cycle_type value reported for a cycle.
+# Work types missing from this table are handled by _resolve_cycle_type.
+_WORK_TYPE_TO_CYCLE_TYPE = {
+    "new_ticket": "task_work",
+    "pr_review": "task_work",
+    "ci_fix": "task_work",
+    "idle": "idle",
+    "memory_housekeeping": "idle",
+    "error": "error",
+}
+
+
+def _resolve_cycle_type(work_type: str | None, is_error: bool) -> str:
+    """Return the cycle_type label for a finished cycle.
+
+    An error result always yields "error". Otherwise a work_type is looked up
+    in _WORK_TYPE_TO_CYCLE_TYPE, with unrecognised values falling back to
+    "task_work"; a missing or empty work_type yields "triage_only".
+    """
+    # The error flag takes precedence over whatever work was detected.
+    if is_error:
+        return "error"
+    if work_type:
+        return _WORK_TYPE_TO_CYCLE_TYPE.get(work_type, "task_work")
+    return "triage_only"
+
+
+def _find_transcript(session_id: str, cwd: str) -> Path | None:
+    """Locate the Claude session transcript JSONL file.
+
+    Looks for ``~/.claude/projects/<slug>/<session_id>.jsonl`` where the slug
+    is derived from ``cwd``; if that file is absent, every entry under
+    ``~/.claude/projects`` is checked for ``<session_id>.jsonl``. Returns the
+    first existing path, or None when no file is found.
+    """
+    # Slug = cwd with "/" replaced by "-", forced to start with "-".
+    # NOTE(review): presumably mirrors how Claude Code names its per-project
+    # directories; other characters in cwd may be encoded differently, in
+    # which case the scan below is what finds the file — TODO confirm.
+    slug = cwd.replace("/", "-")
+    if not slug.startswith("-"):
+        slug = "-" + slug
+    home = Path.home()
+    path = home / ".claude" / "projects" / slug / f"{session_id}.jsonl"
+    if path.exists():
+        return path
+    # Fallback: scan project dirs for the session file
+    projects_dir = home / ".claude" / "projects"
+    if projects_dir.is_dir():
+        for candidate in projects_dir.iterdir():
+            f = candidate / f"{session_id}.jsonl"
+            if f.exists():
+                return f
+    return None
+
+
+def record_transcript(
+    label: str,
+    result,
+    ctx: CycleContext | None = None,
+    cwd: str = "",
+    instance_id: str | None = None,
+) -> None:
+    """Compress and store the cycle transcript + metadata to the dashboard API.
+
+    Builds a cycle-run record from ``result`` and ``ctx``, attaches the
+    zstd-compressed, base64-encoded session transcript when the file can be
+    found, and POSTs the record to ``CYCLE_RUNS_API``.
+
+    Args:
+        label: Fallback instance identifier used when ``instance_id`` is unset.
+        result: Cycle result object; ``session_id``, ``usage``, ``subtype``,
+            ``duration_ms`` and ``num_turns`` are read via ``getattr``.
+        ctx: Work context collected during the cycle, if any.
+        cwd: Working directory used to locate the transcript file.
+        instance_id: Identifier of the bot instance that ran the cycle.
+
+    Nothing is sent when ``result`` has no session id. Transcript and HTTP
+    failures (including non-2xx responses) are logged as warnings, never
+    raised.
+    """
+    session_id = getattr(result, "session_id", "")
+    if not session_id:
+        logger.debug("No session_id in result — skipping transcript capture")
+        return
+
+    usage = getattr(result, "usage", None) or {}
+    is_error = getattr(result, "subtype", "") != "success"
+    cycle_type = _resolve_cycle_type(ctx.work_type if ctx else None, is_error)
+
+    # The result only carries a duration, so the start time is back-computed
+    # from "now"; without a duration both timestamps are equal.
+    duration_ms = getattr(result, "duration_ms", None) or 0
+    now = datetime.now(timezone.utc)
+    started_at = now
+    if duration_ms:
+        started_at = now - timedelta(milliseconds=duration_ms)
+
+    # A token count can be present but null; coerce to 0 so the sum cannot
+    # raise TypeError and abort the caller's loop.
+    tokens_used = (usage.get("input_tokens") or 0) + (
+        usage.get("output_tokens") or 0
+    )
+
+    body: dict = {
+        "task_id": ctx.task_id if ctx else None,
+        "cycle_type": cycle_type,
+        "instance_id": instance_id or label,
+        "started_at": started_at.isoformat(),
+        "finished_at": now.isoformat(),
+        # NOTE(review): this reports num_turns under "tool_calls" — confirm
+        # the two are meant to be equivalent for the dashboard.
+        "tool_calls": getattr(result, "num_turns", 0),
+        "tokens_used": tokens_used,
+        "progress": {
+            "jira_key": ctx.jira_key if ctx else None,
+            "repo": ctx.repo if ctx else None,
+            "work_type": ctx.work_type if ctx else None,
+            "summary": ctx.summary if ctx else None,
+        },
+    }
+
+    transcript_path = _find_transcript(session_id, cwd)
+    if transcript_path:
+        try:
+            # Imported lazily so a missing zstandard only drops the transcript.
+            import zstandard as zstd
+
+            raw = transcript_path.read_bytes()
+            compressor = zstd.ZstdCompressor(level=19)
+            compressed = compressor.compress(raw)
+            body["transcript_b64"] = base64.b64encode(compressed).decode()
+            logger.info(
+                "Transcript: %d bytes → %d compressed (%.0f%% savings)",
+                len(raw),
+                len(compressed),
+                (1 - len(compressed) / len(raw)) * 100 if raw else 0,
+            )
+        except ImportError:
+            logger.warning(
+                "zstandard not installed — storing cycle run without transcript"
+            )
+        except Exception:
+            logger.warning("Failed to read/compress transcript", exc_info=True)
+    else:
+        logger.debug("Transcript file not found for session %s", session_id)
+
+    try:
+        response = httpx.post(CYCLE_RUNS_API, json=body, timeout=10.0)
+        # Without this a 4xx/5xx rejection would pass as a silent success.
+        response.raise_for_status()
+    except Exception:
+        logger.warning("Failed to push cycle run to API", exc_info=True)
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,6 +8,7 @@ dependencies = [
     "filelock",
     "httpx",
     "python-dotenv",
+    "zstandard>=0.23",
 ]
 
 [project.optional-dependencies]
diff --git a/tests/test_transcripts.py b/tests/test_transcripts.py

Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,7 @@ dependencies = [`
`8`	`8`	`"filelock",`
`9`	`9`	`"httpx",`
`10`	`10`	`"python-dotenv",`
	`11`	`+ "zstandard>=0.23",`
`11`	`12`	`]`
`12`	`13`
`13`	`14`	`[project.optional-dependencies]`