fix(onedrive-transcript): harden fetch-new-transcripts; ignore .tmp/

JanPokorny · JanPokorny · commit 2e7f4eb80c16 · 2026-04-28T12:18:59.000+02:00
- Wrap each meeting/transcript Graph call in try/except so a single
  failure (rate limit, transient 5xx, missing meeting) only loses that
  one item instead of aborting the whole run.
- Add timeout=30 to all urlopen calls so a stalled Graph response can't
  hang the agent indefinitely.
- Add .tmp/ to .gitignore — local-development scratch dir for temp
  clones, build outputs, etc., should never be committed.

Signed-off-by: Jan Pokorný &lt;JenomPokorny@gmail.com&gt;
diff --git a/.gitignore b/.gitignore
@@ -42,3 +42,6 @@ context/
 # Playwright
 .playwright-mcp
 
+# Local development scratch dir (temp clones, build outputs, etc.)
+.tmp/
+
diff --git a/packages/agents/onedrive-transcript/workspace/work/scripts/fetch-new-transcripts.py b/packages/agents/onedrive-transcript/workspace/work/scripts/fetch-new-transcripts.py
@@ -7,6 +7,11 @@
 Downloads each new transcript VTT to /tmp and prints a JSON array to stdout:
   [{"subject", "meetingId", "transcriptId", "vttPath", "meetingStart"}, ...]
 
+Per-meeting and per-transcript failures are isolated: a failure on one
+transcript is logged to stderr and the script continues with the rest.
+The exit code is non-zero only if the initial calendar listing fails
+(nothing else can proceed without it).
+
 Usage:
     uv run scripts/fetch-new-transcripts.py [--since ISO8601] [--state PATH]
 
@@ -27,18 +32,22 @@
 
 GRAPH = "https://graph.microsoft.com/v1.0"
 
+# Network timeout for all Graph calls. Graph APIs typically respond in <1s
+# but can stall on backend issues; 30s is generous without hanging the agent.
+TIMEOUT = 30
+
 
 def graph_get(path: str, token: str) -> dict:
     url = path if path.startswith("http") else f"{GRAPH}{path}"
     req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
-    with urllib.request.urlopen(req) as resp:
+    with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
         return json.loads(resp.read())
 
 
 def graph_get_bytes(path: str, token: str, accept: str) -> bytes:
     url = path if path.startswith("http") else f"{GRAPH}{path}"
     req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}", "Accept": accept})
-    with urllib.request.urlopen(req) as resp:
+    with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
         return resp.read()
 
 
@@ -77,6 +86,7 @@ def main():
     processed_ids = load_processed_ids(Path(args.state))
 
     since_enc = urllib.parse.quote(since)
+    # Failure here is fatal — without the calendar list we have nothing to work with.
     events = get_all_pages(
         f"{GRAPH}/me/events?$filter=start/dateTime%20ge%20'{since_enc}'"
         f"&$select=id,subject,start,isOnlineMeeting,onlineMeeting&$top=50&$orderby=start/dateTime%20desc",
@@ -93,35 +103,52 @@ def main():
             continue
         subject = event.get("subject", "")
         meeting_start = (event.get("start") or {}).get("dateTime", "")
+        event_id = event.get("id", "<unknown>")
 
-        # Resolve meeting resource ID from join URL
-        join_url_enc = urllib.parse.quote(join_url, safe="")
-        meeting_resp = graph_get(
-            f"{GRAPH}/me/onlineMeetings?$filter=JoinWebUrl%20eq%20'{join_url_enc}'",
-            token,
-        )
+        # Resolve meeting resource ID from join URL.
+        try:
+            join_url_enc = urllib.parse.quote(join_url, safe="")
+            meeting_resp = graph_get(
+                f"{GRAPH}/me/onlineMeetings?$filter=JoinWebUrl%20eq%20'{join_url_enc}'",
+                token,
+            )
+        except Exception as e:
+            print(f"warn: failed to resolve meeting for event {event_id} ({subject}): {e}", file=sys.stderr)
+            continue
         meetings = meeting_resp.get("value", [])
         if not meetings:
             continue
         meeting_id = meetings[0]["id"]
 
-        # List transcripts
-        transcripts_resp = graph_get(
-            f"{GRAPH}/me/onlineMeetings/{meeting_id}/transcripts", token
-        )
+        # List transcripts for the meeting.
+        try:
+            transcripts_resp = graph_get(
+                f"{GRAPH}/me/onlineMeetings/{meeting_id}/transcripts", token
+            )
+        except Exception as e:
+            print(f"warn: failed to list transcripts for {subject} ({meeting_id}): {e}", file=sys.stderr)
+            continue
+
         for transcript in transcripts_resp.get("value", []):
             transcript_id = transcript["id"]
             if transcript_id in processed_ids:
                 continue
 
-            # Download VTT
-            vtt_path = f"/tmp/transcript-{transcript_id[:20]}.vtt"
-            content = graph_get_bytes(
-                f"{GRAPH}/me/onlineMeetings/{meeting_id}/transcripts/{transcript_id}/content?$format=text/vtt",
-                token,
-                accept="text/vtt",
-            )
-            Path(vtt_path).write_bytes(content)
+            # Download VTT. Per-transcript failure must NOT lose the rest.
+            try:
+                vtt_path = f"/tmp/transcript-{transcript_id[:20]}.vtt"
+                content = graph_get_bytes(
+                    f"{GRAPH}/me/onlineMeetings/{meeting_id}/transcripts/{transcript_id}/content?$format=text/vtt",
+                    token,
+                    accept="text/vtt",
+                )
+                Path(vtt_path).write_bytes(content)
+            except Exception as e:
+                print(
+                    f"warn: failed to download transcript {transcript_id[:20]}… for {subject}: {e}",
+                    file=sys.stderr,
+                )
+                continue
 
             results.append({
                 "subject": subject,