diff --git a/.gitignore b/.gitignore index 38efbb64..8723be1a 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,6 @@ context/ # Playwright .playwright-mcp +# Local development scratch dir (temp clones, build outputs, etc.) +.tmp/ + diff --git a/deploy/helm/humr/templates/onecli/app.yaml b/deploy/helm/humr/templates/onecli/app.yaml index 260556d6..cb40893c 100644 --- a/deploy/helm/humr/templates/onecli/app.yaml +++ b/deploy/helm/humr/templates/onecli/app.yaml @@ -92,6 +92,9 @@ spec: containers: - name: onecli image: {{ .Values.onecli.image }} + {{- with .Values.onecli.imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} ports: - containerPort: {{ .Values.onecli.gateway.port }} name: gateway diff --git a/deploy/helm/humr/templates/onedrive-transcript-template.yaml b/deploy/helm/humr/templates/onedrive-transcript-template.yaml new file mode 100644 index 00000000..1b7bcde2 --- /dev/null +++ b/deploy/helm/humr/templates/onedrive-transcript-template.yaml @@ -0,0 +1,40 @@ +{{- if .Values.onedriveTranscriptTemplate.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.onedriveTranscriptTemplate.name }} + namespace: {{ .Values.agentNamespace }} + labels: + {{- include "humr.labels" . | nindent 4 }} + humr.ai/type: agent-template +data: + spec.yaml: | + version: humr.ai/v1 + image: "{{ .Values.onedriveTranscriptTemplate.image.repository }}:{{ .Values.onedriveTranscriptTemplate.image.tag | default .Chart.AppVersion }}" + description: {{ .Values.onedriveTranscriptTemplate.description | quote }} + mounts: + - path: /home/agent + persist: true + - path: /tmp + persist: false + init: | + #!/bin/bash + # Seed home from image on first boot + if [ ! -f /home/agent/.initialized ]; then + cp -rn /app/working-dir/. /home/agent/ 2>/dev/null || true + touch /home/agent/.initialized + fi + mkdir -p /home/agent/work + env: + - name: PORT + value: "8080" + resources: + requests: + cpu: {{ .Values.onedriveTranscriptTemplate.resources.requests.cpu | quote }} + memory: {{ .Values.onedriveTranscriptTemplate.resources.requests.memory | quote }} + limits: + cpu: {{ .Values.onedriveTranscriptTemplate.resources.limits.cpu | quote }} + memory: {{ .Values.onedriveTranscriptTemplate.resources.limits.memory | quote }} + securityContext: + readOnlyRootFilesystem: false +{{- end }} diff --git a/deploy/helm/humr/values-local.yaml b/deploy/helm/humr/values-local.yaml index 85c7ef29..9929bfe9 100644 --- a/deploy/helm/humr/values-local.yaml +++ b/deploy/helm/humr/values-local.yaml @@ -56,6 +56,13 @@ codeGuardianTemplate: tag: latest pullPolicy: Never +onedriveTranscriptTemplate: + enabled: true + image: + repository: humr-onedrive-transcript + tag: latest + pullPolicy: Never + # Bootstrap a known dev/dev user for local cluster — never enabled in production. 
keycloak: testUser: diff --git a/deploy/helm/humr/values.yaml b/deploy/helm/humr/values.yaml index 7c21896a..be17de7d 100644 --- a/deploy/helm/humr/values.yaml +++ b/deploy/helm/humr/values.yaml @@ -381,3 +381,20 @@ codeGuardianTemplate: limits: cpu: "1" memory: "2Gi" + +# -- OneDrive transcript processing agent template +onedriveTranscriptTemplate: + enabled: false + name: onedrive-transcript + image: + repository: ghcr.io/kagenti/humr/onedrive-transcript + tag: "" + pullPolicy: IfNotPresent + description: "OneDrive Teams transcript processing agent" + resources: + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "1" + memory: "2Gi" diff --git a/deploy/tasks.toml b/deploy/tasks.toml index f994cd98..1b9b015a 100644 --- a/deploy/tasks.toml +++ b/deploy/tasks.toml @@ -60,7 +60,7 @@ dir = "{{config_root}}" run = 'docker build -f packages/ui/Dockerfile -t humr-ui:latest .' ["image:agent"] -description = "Build agent Docker images (humr-base + claude-code + google-workspace + pi-agent + code-guardian)" +description = "Build agent Docker images (humr-base + claude-code + google-workspace + pi-agent + code-guardian + onedrive-transcript)" dir = "{{config_root}}" run = ''' #!/usr/bin/env bash @@ -70,6 +70,7 @@ docker build -t humr-claude-code:latest packages/agents/claude-code docker build -t humr-google-workspace-agent:latest packages/agents/google-workspace docker build -t humr-pi-agent:latest packages/agents/pi-agent docker build -t humr-code-guardian:latest packages/agents/code-guardian +docker build -t humr-onedrive-transcript:latest packages/agents/onedrive-transcript ''' # -- Cluster lifecycle (k3s via lima) -- @@ -150,7 +151,7 @@ fi # 3. Load images into k3s (built by depends: image:*) echo "Loading images into k3s..." tar="/tmp/humr-images.tar" -docker save -o "$tar" humr-controller:latest humr-api-server:latest humr-ui:latest humr-claude-code:latest humr-google-workspace-agent:latest humr-pi-agent:latest humr-code-guardian:latest +docker save -o "$tar" humr-controller:latest humr-api-server:latest humr-ui:latest humr-claude-code:latest humr-google-workspace-agent:latest humr-pi-agent:latest humr-code-guardian:latest humr-onedrive-transcript:latest if [ -n "${IS_SANDBOX:-}" ]; then docker image prune --all --force >/dev/null 2>&1 || true sudo k3s ctr images import "$tar" @@ -267,7 +268,7 @@ set -eo pipefail echo "Loading into k3s..." 
tar="/tmp/humr-agent-images.tar" -docker save humr-claude-code:latest humr-google-workspace-agent:latest humr-pi-agent:latest humr-code-guardian:latest -o "$tar" +docker save humr-claude-code:latest humr-google-workspace-agent:latest humr-pi-agent:latest humr-code-guardian:latest humr-onedrive-transcript:latest -o "$tar" if [ -n "${IS_SANDBOX:-}" ]; then KUBECONFIG="/etc/rancher/k3s/k3s.yaml" diff --git a/packages/agents/onedrive-transcript/Dockerfile b/packages/agents/onedrive-transcript/Dockerfile new file mode 100644 index 00000000..af98e122 --- /dev/null +++ b/packages/agents/onedrive-transcript/Dockerfile @@ -0,0 +1,10 @@ +ARG BASE_IMAGE=humr-base +FROM ${BASE_IMAGE} + +# Claude Code harness + uv for running the Python VTT parser +RUN cd /app && npm install @agentclientprotocol/claude-agent-acp @anthropic-ai/claude-agent-sdk \ + && npm install -g @anthropic-ai/claude-code \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && ln -s /root/.local/bin/uv /usr/local/bin/uv + +COPY workspace/ /app/working-dir/ diff --git a/packages/agents/onedrive-transcript/README.md b/packages/agents/onedrive-transcript/README.md new file mode 100644 index 00000000..2ed163d3 --- /dev/null +++ b/packages/agents/onedrive-transcript/README.md @@ -0,0 +1,93 @@ +# OneDrive Transcript Processing Agent + +A Humr agent that polls Microsoft Graph for new Teams meeting transcripts, converts them into structured markdown meeting notes, and posts the results to a Slack channel. + +## How It Works + +On a cron schedule (default: every 30 minutes), the agent: + +1. Reads `state/processed.json` to skip transcripts it has already handled. +2. Lists the connected user's calendar events with Teams online meetings (`/me/events` with `isOnlineMeeting=true`). +3. For each meeting, resolves the `onlineMeeting` resource by its `joinUrl`, then lists transcripts. +4. Downloads each new transcript as VTT. +5. Parses the VTT into structured JSON (speakers, segments, duration). +6. Generates structured markdown meeting notes (subject, attendees, summary, key topics, action items, detailed notes). +7. Posts the notes to the configured Slack channel. +8. Records the processed transcript ID in `state/processed.json` (capped at 20 entries). + +Authentication to Microsoft Graph goes through OneCLI's MITM proxy — the agent uses `MICROSOFT_GRAPH_TOKEN=humr:sentinel` and the proxy swaps in a real OAuth bearer token transparently. + +### Scope and limits + +- ✅ **Scheduled Teams meetings** (those that appear on the user's calendar) are fully supported. +- ❌ **MeetNow / ad-hoc channel meetings** are not supported. They have no calendar entry, and the bulk `getAllTranscripts` API requires application permissions + a Teams Application Access Policy (heavy admin overhead). For transcripts to be processed, the meeting must be scheduled via the calendar (not started with "Meet now"). + +## Setup + +### 1. Register an Azure app + +1. Go to [Azure Portal > App registrations](https://portal.azure.com/#blade/Microsoft_AAD_RegisteredApps/ApplicationsListBlade) > **New registration**. +2. **Supported account types**: single-tenant. +3. **Redirect URI**: `http://localhost:4444/api/apps/microsoft-graph/callback` (for local dev). Production: `http:///api/apps/microsoft-graph/callback`. +4. 
Under **API permissions**, add Microsoft Graph **Delegated** permissions: + - `Calendars.Read` — list calendar events to find scheduled Teams meetings + - `OnlineMeetings.Read` — resolve a meeting ID from its Teams join URL + - `OnlineMeetingTranscript.Read.All` — list and download VTT transcripts (admin consent required by Microsoft policy, but the scope only grants per-user access) + - `User.Read` — sign in + - `offline_access` — refresh tokens +5. Click **Grant admin consent for {tenant}**. +6. Under **Certificates & secrets**, create a client secret. Copy the **Application (client) ID**, **Client Secret**, and **Tenant ID**. + +### 2. Connect Microsoft Graph in OneCLI + +1. Open OneCLI at http://localhost:4444 → **Apps** → **Microsoft Graph**. +2. Enter Client ID, Client Secret, Tenant ID. Click **Save**. +3. Click **Connect** to start the OAuth flow. Sign in as the user whose meeting transcripts you want to process. Approve the requested scopes. + +### 3. Grant the connection to the agent + +1. Open the Humr UI at http://humr.localhost:4444. +2. Add a new agent from the **onedrive-transcript** template. +3. Open **Configure** → **Connections** → check **Microsoft Graph**. Save. + +### 4. Configure Slack + +The agent posts via a Slack MCP server configured in the schedule's `mcpServers` field. You'll need a Slack app with `chat:write` permission and a bot token. Reference: [Slack MCP server](https://github.com/modelcontextprotocol/servers/tree/main/src/slack) (or any other Slack MCP server). + +### 5. Create a schedule + +In the Humr UI, create a schedule on the agent with: + +- **Cron**: `*/30 * * * *` (every 30 minutes) +- **Session mode**: `continuous` — the agent maintains context across runs +- **Task prompt**: e.g. + ``` + Check for new Teams meeting transcripts since the last run, process them + into meeting notes, and post each set of notes to the #meetings channel + in Slack. + ``` +- **MCP servers**: configure the Slack MCP server with the bot token + +## Workspace contents + +``` +/home/agent/work/ +├── CLAUDE.md # Agent operating manual +├── scripts/ +│ ├── fetch-new-transcripts.py # List events, resolve meetings, download new VTTs +│ ├── parse-vtt.py # VTT → structured JSON +│ └── mark-processed.py # Append entry to state/processed.json +└── state/ + └── processed.json # Last 20 processed transcripts (managed by scripts) +``` + +The workspace is persisted on the `/home/agent` PVC, so `state/processed.json` survives pod restarts. + +## Architecture + +The agent uses the Microsoft Graph REST API directly (no MCP server), with the OneCLI gateway handling token injection and refresh. The `microsoft-graph` provider in OneCLI is configured with tenant-aware token URL refresh — see `apps/web/src/lib/apps/microsoft-graph.ts` and `apps/gateway/src/apps.rs` in the OneCLI repo. + +## Future considerations + +- **Box upload**: post-processing to a Box folder (separate from Slack). Out of scope for this initial version. +- **Application permissions**: required for processing MeetNow / channel meetings. Would need a Teams Application Access Policy configured by the tenant admin (PowerShell). Not implemented today; delegated auth covers scheduled meetings only. 
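+
+## Verifying the Graph connection
+
+For a quick sanity check that the OneCLI token injection works from inside the agent container, a minimal request sketch (the query simply lists a few calendar events; the `$top`/`$select` parameters are illustrative, not something the agent requires):
+
+```python
+# Sketch: call Microsoft Graph through the OneCLI proxy. The sentinel value in
+# MICROSOFT_GRAPH_TOKEN is swapped for a real OAuth bearer token in transit.
+import json
+import os
+import urllib.request
+
+token = os.environ["MICROSOFT_GRAPH_TOKEN"]
+req = urllib.request.Request(
+    "https://graph.microsoft.com/v1.0/me/events?$top=3&$select=subject,isOnlineMeeting",
+    headers={"Authorization": f"Bearer {token}"},
+)
+with urllib.request.urlopen(req, timeout=30) as resp:
+    print(json.dumps(json.loads(resp.read()), indent=2))
+```
+
+A `200` response with event data confirms the connection is available to the agent; a `401`/`403` likely means the Microsoft Graph connection was not granted in the agent's **Configure** → **Connections** panel.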
diff --git a/packages/agents/onedrive-transcript/workspace/work/CLAUDE.md b/packages/agents/onedrive-transcript/workspace/work/CLAUDE.md new file mode 100644 index 00000000..70dce525 --- /dev/null +++ b/packages/agents/onedrive-transcript/workspace/work/CLAUDE.md @@ -0,0 +1,122 @@ +## OneDrive Transcript Processing Agent + +You are a meeting transcript processor. You retrieve Teams **scheduled** meeting transcripts via Microsoft Graph, convert them into structured meeting notes, and post the results to Slack. + +### Authentication + +Outbound HTTPS requests go through a credential-injection proxy that automatically replaces the sentinel token with a real OAuth bearer token. Use `$MICROSOFT_GRAPH_TOKEN` as the bearer token in all Graph API calls — the proxy swaps it transparently. + +### Scope and Limits + +You have **delegated** Microsoft Graph permissions. This means: + +- ✅ You can access transcripts for meetings the connected user organized or attended +- ✅ Scheduled Teams meetings (those that appear on the user's calendar) are fully supported +- ❌ MeetNow / ad-hoc channel meetings are **not supported** — they don't have calendar entries, and the delegated `getAllTranscripts` API is unavailable. If users want a transcript processed, they must schedule the meeting via the calendar (not click "Meet now") + +### Helper Scripts + +Three Python helpers live in `scripts/` and run via `uv run`. Use them instead of constructing curl pipelines by hand. + +#### `scripts/fetch-new-transcripts.py` + +Lists calendar events, resolves meeting IDs, lists transcripts, downloads VTTs to `/tmp`, and filters out anything already in `state/processed.json`. Prints a JSON array of new transcripts: + +```bash +uv run scripts/fetch-new-transcripts.py [--since ISO8601] [--state state/processed.json] +``` + +Output entries: `{subject, meetingId, transcriptId, vttPath, meetingStart}`. Default `--since` is 24 hours ago. + +#### `scripts/parse-vtt.py` + +Parses a VTT file into structured JSON (metadata, speakers, segments). Pass `--subject` and `--meeting-start` to embed meeting context in the metadata: + +```bash +uv run scripts/parse-vtt.py /tmp/transcript-XYZ.vtt \ + --subject "Meeting subject" --meeting-start "2026-04-27T13:40:00" +``` + +Prints JSON to stdout. Read this output directly — no temp file needed. + +#### `scripts/mark-processed.py` + +Appends an entry to `state/processed.json` (capped at 20): + +```bash +uv run scripts/mark-processed.py \ + --transcript-id ID --meeting-id ID --subject "Meeting subject" +``` + +### Workflow + +Each run follows this sequence: + +1. **Fetch new transcripts** — run `scripts/fetch-new-transcripts.py`. The script reads `state/processed.json` itself and only returns unprocessed entries. +2. **For each entry** in the JSON output: + - Run `scripts/parse-vtt.py` on the VTT, passing `--subject` and `--meeting-start`. + - Generate structured meeting notes from the parsed JSON (see format below). + - Post the notes to the configured Slack channel. + - Run `scripts/mark-processed.py` to record completion. +3. **No new transcripts** — exit quietly without posting anything. 
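+
+For reference, each entry in the fetch script's JSON output has this shape (the values below are illustrative placeholders, not real Graph identifiers):
+
+```json
+{
+  "subject": "Weekly architecture sync",
+  "meetingId": "<opaque Graph online-meeting id>",
+  "transcriptId": "<opaque Graph transcript id>",
+  "vttPath": "/tmp/transcript-<first 20 chars of transcript id>.vtt",
+  "meetingStart": "2026-04-27T13:40:00"
+}
+```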
+
+### Meeting Notes Format
+
+Generate notes in this markdown structure:
+
+```markdown
+# Meeting Notes: <Meeting Subject>
+
+**Date:** <date>
+**Duration:** <duration>
+**Attendees:** <full names of attendees>
+
+## Summary
+
+<2-4 sentence executive summary of the meeting>
+
+## Key Topics
+
+### <Topic 1>
+<key points discussed>
+
+### <Topic 2>
+<key points discussed>
+
+## Action Items
+
+- [ ] <action item> — **<owner>**
+- [ ] <action item> — **<owner>**
+
+## Detailed Notes
+
+<Detailed notes organized by topic. Use blockquotes for notable direct quotes.>
+```
+
+### Notes Guidelines
+
+- **Speaker attribution**: Use first names where possible. If the VTT uses full names ("John Smith"), use "John" in the body but list full names in Attendees.
+- **Summary**: Focus on decisions made and outcomes, not play-by-play.
+- **Action items**: Extract explicit commitments ("I'll do X", "Can you handle Y") with the responsible person.
+- **Key topics**: Group related discussion into logical topics rather than following strict chronological order.
+- **Direct quotes**: Use sparingly — only for important statements, decisions, or commitments.
+- **Filler removal**: Omit filler words, false starts, and crosstalk artifacts from the VTT.
+
+### State Tracking
+
+`state/processed.json` is managed entirely by the helper scripts — `fetch-new-transcripts.py` reads it to filter, `mark-processed.py` appends to it. Do not edit it manually. Structure:
+
+```json
+{
+  "processed": [
+    {"id": "<transcript id>", "meetingId": "<meeting id>", "subject": "<subject>", "processedAt": "<ISO8601 timestamp>"}
+  ]
+}
+```
+
+### Tips
+
+- A meeting can have multiple transcripts (transcription started/stopped multiple times). Process each independently.
+- If `fetch-new-transcripts.py` returns `[]`, there's nothing to do — exit quietly.
+- If the Slack post fails for one transcript, log the error but continue with the rest. Don't mark a transcript as processed if its notes weren't successfully delivered.
diff --git a/packages/agents/onedrive-transcript/workspace/work/scripts/fetch-new-transcripts.py b/packages/agents/onedrive-transcript/workspace/work/scripts/fetch-new-transcripts.py
new file mode 100644
index 00000000..bf05a236
--- /dev/null
+++ b/packages/agents/onedrive-transcript/workspace/work/scripts/fetch-new-transcripts.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.11"
+# ///
+"""Fetch all new Teams meeting transcripts not yet recorded in state/processed.json.
+
+Downloads each new transcript VTT to /tmp and prints a JSON array to stdout:
+    [{"subject", "meetingId", "transcriptId", "vttPath", "meetingStart"}, ...]
+
+Per-meeting and per-transcript failures are isolated: a failure on one
+transcript is logged to stderr and the script continues with the rest.
+The exit code is non-zero only if the initial calendar listing fails
+(nothing else can proceed without it).
+
+Usage:
+    uv run scripts/fetch-new-transcripts.py [--since ISO8601] [--state PATH]
+
+Defaults:
+    --since 24 hours ago
+    --state state/processed.json
+"""
+
+import argparse
+import json
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+
+
+GRAPH = "https://graph.microsoft.com/v1.0"
+
+# Network timeout for all Graph calls. Graph APIs typically respond in <1s
+# but can stall on backend issues; 30s is generous without hanging the agent.
+TIMEOUT = 30 + + +def graph_get(path: str, token: str) -> dict: + url = path if path.startswith("http") else f"{GRAPH}{path}" + req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"}) + with urllib.request.urlopen(req, timeout=TIMEOUT) as resp: + return json.loads(resp.read()) + + +def graph_get_bytes(path: str, token: str, accept: str) -> bytes: + url = path if path.startswith("http") else f"{GRAPH}{path}" + req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}", "Accept": accept}) + with urllib.request.urlopen(req, timeout=TIMEOUT) as resp: + return resp.read() + + +def get_all_pages(first_url: str, token: str) -> list: + items = [] + url = first_url + while url: + data = graph_get(url, token) + items.extend(data.get("value", [])) + url = data.get("@odata.nextLink") + return items + + +def load_processed_ids(state_path: Path) -> set: + if not state_path.exists(): + return set() + data = json.loads(state_path.read_text()) + return {e["id"] for e in data.get("processed", [])} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--since", default=None, help="ISO8601 start datetime (default: 24h ago)") + parser.add_argument("--state", default="state/processed.json", help="Path to processed.json") + args = parser.parse_args() + + token = os.environ.get("MICROSOFT_GRAPH_TOKEN") + if not token: + print("Error: MICROSOFT_GRAPH_TOKEN not set", file=sys.stderr) + sys.exit(1) + + since = args.since or ( + datetime.now(timezone.utc) - timedelta(hours=24) + ).strftime("%Y-%m-%dT%H:%M:%SZ") + + processed_ids = load_processed_ids(Path(args.state)) + + since_enc = urllib.parse.quote(since) + # Failure here is fatal — without the calendar list we have nothing to work with. + events = get_all_pages( + f"{GRAPH}/me/events?$filter=start/dateTime%20ge%20'{since_enc}'" + f"&$select=id,subject,start,isOnlineMeeting,onlineMeeting&$top=50&$orderby=start/dateTime%20desc", + token, + ) + + results = [] + + for event in events: + if not event.get("isOnlineMeeting"): + continue + join_url = (event.get("onlineMeeting") or {}).get("joinUrl") + if not join_url: + continue + subject = event.get("subject", "") + meeting_start = (event.get("start") or {}).get("dateTime", "") + event_id = event.get("id", "") + + # Resolve meeting resource ID from join URL. + try: + join_url_enc = urllib.parse.quote(join_url, safe="") + meeting_resp = graph_get( + f"{GRAPH}/me/onlineMeetings?$filter=JoinWebUrl%20eq%20'{join_url_enc}'", + token, + ) + except Exception as e: + print(f"warn: failed to resolve meeting for event {event_id} ({subject}): {e}", file=sys.stderr) + continue + meetings = meeting_resp.get("value", []) + if not meetings: + continue + meeting_id = meetings[0]["id"] + + # List transcripts for the meeting. + try: + transcripts_resp = graph_get( + f"{GRAPH}/me/onlineMeetings/{meeting_id}/transcripts", token + ) + except Exception as e: + print(f"warn: failed to list transcripts for {subject} ({meeting_id}): {e}", file=sys.stderr) + continue + + for transcript in transcripts_resp.get("value", []): + transcript_id = transcript["id"] + if transcript_id in processed_ids: + continue + + # Download VTT. Per-transcript failure must NOT lose the rest. 
+ try: + vtt_path = f"/tmp/transcript-{transcript_id[:20]}.vtt" + content = graph_get_bytes( + f"{GRAPH}/me/onlineMeetings/{meeting_id}/transcripts/{transcript_id}/content?$format=text/vtt", + token, + accept="text/vtt", + ) + Path(vtt_path).write_bytes(content) + except Exception as e: + print( + f"warn: failed to download transcript {transcript_id[:20]}… for {subject}: {e}", + file=sys.stderr, + ) + continue + + results.append({ + "subject": subject, + "meetingId": meeting_id, + "transcriptId": transcript_id, + "vttPath": vtt_path, + "meetingStart": meeting_start, + }) + + print(json.dumps(results, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/packages/agents/onedrive-transcript/workspace/work/scripts/mark-processed.py b/packages/agents/onedrive-transcript/workspace/work/scripts/mark-processed.py new file mode 100644 index 00000000..0bff0d59 --- /dev/null +++ b/packages/agents/onedrive-transcript/workspace/work/scripts/mark-processed.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.11" +# /// +"""Append a transcript entry to state/processed.json, keeping the last 20. + +Usage: + uv run scripts/mark-processed.py \\ + --transcript-id ID --meeting-id ID --subject TEXT [--state PATH] +""" + +import argparse +import json +from datetime import datetime, timezone +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--transcript-id", required=True) + parser.add_argument("--meeting-id", required=True) + parser.add_argument("--subject", required=True) + parser.add_argument("--state", default="state/processed.json") + args = parser.parse_args() + + state_path = Path(args.state) + state_path.parent.mkdir(parents=True, exist_ok=True) + + if state_path.exists(): + data = json.loads(state_path.read_text()) + else: + data = {"processed": []} + + data["processed"].append({ + "id": args.transcript_id, + "meetingId": args.meeting_id, + "subject": args.subject, + "processedAt": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + }) + data["processed"] = data["processed"][-20:] + + state_path.write_text(json.dumps(data, indent=2) + "\n") + print(f"Marked {args.transcript_id[:20]}… as processed ({len(data['processed'])} total)") + + +if __name__ == "__main__": + main() diff --git a/packages/agents/onedrive-transcript/workspace/work/scripts/parse-vtt.py b/packages/agents/onedrive-transcript/workspace/work/scripts/parse-vtt.py new file mode 100644 index 00000000..fdd41f5a --- /dev/null +++ b/packages/agents/onedrive-transcript/workspace/work/scripts/parse-vtt.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +"""Parse a WebVTT transcript into structured JSON. 
+
+Usage:
+    uv run scripts/parse-vtt.py <input.vtt> [--output <output.json>] \\
+        [--subject "Meeting subject"] [--meeting-start "2026-04-27T13:40:00"]
+
+Reads a VTT file (Teams meeting transcript format) and produces JSON with:
+- metadata (filename, total duration, speaker count, optional subject + meeting_start)
+- speakers (list of unique speakers)
+- segments (merged consecutive same-speaker blocks with timestamps)
+"""
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def parse_timestamp(ts: str) -> float:
+    """Convert VTT timestamp (HH:MM:SS.mmm) to seconds."""
+    parts = ts.strip().replace(",", ".").split(":")
+    if len(parts) == 3:
+        h, m, s = parts
+        return int(h) * 3600 + int(m) * 60 + float(s)
+    if len(parts) == 2:
+        m, s = parts
+        return int(m) * 60 + float(s)
+    return float(parts[0])
+
+
+def format_duration(seconds: float) -> str:
+    """Format seconds as HH:MM:SS."""
+    h = int(seconds // 3600)
+    m = int((seconds % 3600) // 60)
+    s = int(seconds % 60)
+    if h > 0:
+        return f"{h}:{m:02d}:{s:02d}"
+    return f"{m}:{s:02d}"
+
+
+def parse_vtt(content: str) -> dict:
+    """Parse VTT content into structured data."""
+    lines = content.strip().splitlines()
+
+    # Skip BOM and WEBVTT header
+    start = 0
+    for i, line in enumerate(lines):
+        cleaned = line.strip().lstrip("\ufeff")
+        if cleaned.startswith("WEBVTT"):
+            start = i + 1
+            break
+    else:
+        start = 0
+
+    # Skip any header metadata lines (NOTE, empty lines after WEBVTT)
+    while start < len(lines) and (not lines[start].strip() or lines[start].strip().startswith("NOTE")):
+        start += 1
+
+    # Parse cue blocks
+    timestamp_re = re.compile(
+        r"(\d{1,2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{1,2}:\d{2}:\d{2}[.,]\d{3})"
+    )
+    # Teams VTT speaker tag: <v Speaker Name>text</v> or <v Speaker Name>text
+    speaker_re = re.compile(r"<v\s+([^>]+)>(.*?)(?:</v>)?$")
+
+    raw_cues: list[dict] = []
+    i = start
+    while i < len(lines):
+        line = lines[i].strip()
+
+        # Look for timestamp line
+        m = timestamp_re.search(line)
+        if m:
+            start_ts = parse_timestamp(m.group(1))
+            end_ts = parse_timestamp(m.group(2))
+
+            # Collect text lines until next blank line or timestamp
+            text_lines = []
+            i += 1
+            while i < len(lines) and lines[i].strip() and not timestamp_re.search(lines[i]):
+                text_lines.append(lines[i].strip())
+                i += 1
+
+            text = " ".join(text_lines)
+
+            # Extract speaker if present
+            speaker = None
+            sm = speaker_re.match(text)
+            if sm:
+                speaker = sm.group(1).strip()
+                text = sm.group(2).strip()
+                # Handle remaining lines that may not have speaker tags
+                if not text and text_lines:
+                    text = " ".join(text_lines)
+
+            # Strip any remaining VTT tags
+            text = re.sub(r"<[^>]+>", "", text).strip()
+
+            if text:
+                raw_cues.append({
+                    "start": start_ts,
+                    "end": end_ts,
+                    "speaker": speaker,
+                    "text": text,
+                })
+        else:
+            i += 1
+
+    # Merge consecutive cues from the same speaker
+    segments: list[dict] = []
+    for cue in raw_cues:
+        if segments and segments[-1]["speaker"] == cue["speaker"]:
+            segments[-1]["end"] = cue["end"]
+            segments[-1]["text"] += " " + cue["text"]
+        else:
+            segments.append({
+                "start": cue["start"],
+                "end": cue["end"],
+                "speaker": cue["speaker"],
+                "text": cue["text"],
+            })
+
+    # Format timestamps in segments for output
+    for seg in segments:
+        seg["start_fmt"] = format_duration(seg["start"])
+        seg["end_fmt"] = format_duration(seg["end"])
+
+    speakers = sorted({s["speaker"] for s in segments if s["speaker"]})
+    total_duration = max((s["end"] for s in segments), default=0)
+
+    return {
+        "metadata": {
+            "speaker_count": len(speakers),
+            "segment_count": len(segments),
+            "duration": 
format_duration(total_duration), + "duration_seconds": round(total_duration, 1), + }, + "speakers": speakers, + "segments": [ + { + "speaker": s["speaker"], + "start": s["start_fmt"], + "end": s["end_fmt"], + "text": s["text"], + } + for s in segments + ], + } + + +def main(): + parser = argparse.ArgumentParser(description="Parse VTT transcript to structured JSON") + parser.add_argument("input", help="Path to .vtt file") + parser.add_argument("--output", "-o", help="Output JSON path (default: stdout)") + parser.add_argument("--subject", help="Meeting subject to embed in metadata") + parser.add_argument("--meeting-start", help="Meeting start datetime (ISO8601) to embed in metadata") + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.exists(): + print(f"Error: {input_path} not found", file=sys.stderr) + sys.exit(1) + + # Try common encodings for VTT files + content = None + for encoding in ("utf-8-sig", "utf-8", "utf-16", "latin-1"): + try: + content = input_path.read_text(encoding=encoding) + break + except (UnicodeDecodeError, UnicodeError): + continue + + if content is None: + print(f"Error: could not decode {input_path}", file=sys.stderr) + sys.exit(1) + + result = parse_vtt(content) + result["metadata"]["source_file"] = input_path.name + if args.subject: + result["metadata"]["subject"] = args.subject + if args.meeting_start: + result["metadata"]["meeting_start"] = args.meeting_start + + output = json.dumps(result, indent=2, ensure_ascii=False) + + if args.output: + Path(args.output).write_text(output + "\n") + print(f"Written to {args.output}", file=sys.stderr) + else: + print(output) + + +if __name__ == "__main__": + main()