feat: add clip titles and richer classification metadata

renesansz · renesansz · commit 4d1aa71f8db9 · 2026-04-27T08:28:47.000+08:00
- Generate and normalize a short title from summarized captions; persist on item records
- Add videos.title column with ALTER migration; index title in search and return it from queries
- Surface title in the web UI, CLI search output, and demo fixtures
- Update caption/summary prompts (title + summary shape, expanded tag guidance)
- Default vision model to gemma4:e4b
- Remove output frames/items .gitkeep placeholders
- Add uv.lock file

Signed-off-by: Rene Padillo <developer@renesansz.me>
diff --git a/output/frames/.gitkeep b/output/frames/.gitkeep
diff --git a/output/items/.gitkeep b/output/items/.gitkeep
diff --git a/src/argus/captioner.py b/src/argus/captioner.py
@@ -15,15 +15,17 @@
     "You are classifying a frame from silent B-roll footage for a searchable local media library. "
     "Return strict JSON with keys short_caption, tags, and visible_text. "
     "short_caption must be one concise sentence with only visible facts. "
-    "tags must be 3 to 8 short lowercase visual tags. "
+    "tags must be 35 to 49 short lowercase visual tags and it should accurately contains or represents the shot. "
     "visible_text must be a list of short strings that are actually readable in frame; otherwise return an empty list. "
     "Do not use markdown, headings, preambles, apologies, or questions."
 )
 
 VIDEO_SUMMARY_PROMPT = (
     "You are classifying silent B-roll footage for a local media library. "
-    "Based on timestamped frame captions, produce a concise overall summary and a useful list of search tags. "
-    "Prefer concrete visual terms over abstract adjectives. "
+    "Based on timestamped frame captions, produce: (1) a short title, (2) a fuller summary, and (3) search tags. "
+    "The title must be an attention-grabbing one-line statement that includes the main topic keywords from the clip. "
+    "The summary should give more context and purpose of the clip than the title. "
+    "Prefer concrete visual terms over abstract adjectives; be original, unique, and succinct. "
     "Do not include conversational filler. "
     "Only include brand names or readable on-screen text if they are visually clear and central to the clip."
 )
@@ -204,6 +206,7 @@ def caption_item_record(
         )
         if summary_result["status"] == "ok":
             record["classification_status"] = "captions_ready"
+            record["title"] = summary_result["title"]
             record["summary"] = summary_result["summary"]
             record["suggested_tags"] = summary_result["suggested_tags"]
             record["classification"] = {
@@ -287,9 +290,12 @@ def summarize_captions(
                 "content": (
                     "Frame captions:\n"
                     + "\n".join(lines)
-                    + "\n\nReturn strict JSON with keys summary and suggested_tags. "
-                    + "summary should be 1 to 2 sentences. "
-                    + "suggested_tags should be 5 to 12 short lowercase tags."
+                    + "\n\nReturn strict JSON with keys title, summary, and suggested_tags. "
+                    + "title: one short line, attention-grabbing, must include main topic keywords; "
+                    + "maximum 100 characters, no line breaks, no quotes wrapping the whole title. "
+                    + "summary: Minimum 15 characters - Maximum 200 characters, Minimum 5 words; "
+                    + "more detailed than the title. "
+                    + "suggested_tags should be 35 to 49 short lowercase tags."
                 ),
             },
         ],
@@ -311,9 +317,21 @@ def summarize_captions(
     except json.JSONDecodeError as exc:
         return {"status": "error", "reason": f"invalid summary json: {exc}"}
 
+    title = parsed.get("title")
     summary = parsed.get("summary")
     suggested_tags = parsed.get("suggested_tags")
-    if not isinstance(summary, str) or not isinstance(suggested_tags, list):
+    if (
+        not isinstance(title, str)
+        or not isinstance(summary, str)
+        or not isinstance(suggested_tags, list)
+    ):
+        return {
+            "status": "error",
+            "reason": "summary response missing required fields",
+        }
+
+    normalized_title = normalize_clip_title(title)
+    if not normalized_title:
         return {
             "status": "error",
             "reason": "summary response missing required fields",
@@ -326,6 +344,7 @@ def summarize_captions(
     ]
     return {
         "status": "ok",
+        "title": normalized_title,
         "summary": normalize_sentence(summary),
         "suggested_tags": normalize_tags(cleaned_tags),
     }
@@ -510,6 +529,20 @@ def normalize_sentence(value: str) -> str:
     return collapsed
 
 
+def normalize_clip_title(value: str, *, max_len: int = 100) -> str:
+    """Normalize a title with normalize_sentence, then trim it to at most max_len characters."""
+    text = normalize_sentence(value)
+    if not text or len(text) <= max_len:
+        return text  # empty, or already within the limit: nothing to trim
+    chunk = text[: max_len + 1]  # one extra character so a space sitting exactly at the limit is seen
+    cut = chunk.rfind(" ", 0, max_len + 1)
+    if cut >= max(1, max_len // 2):  # break at the last space only if at least half the budget is kept
+        base = chunk[:cut].rstrip(" ,.;:!?")
+    else:
+        base = text[:max_len].rstrip(" ,.;:!?")  # no usable space: hard cut at the limit
+    return base[:max_len]
+
+
 def normalize_tags(values: list[str]) -> list[str]:
     normalized: list[str] = []
     seen: set[str] = set()
diff --git a/src/argus/cli.py b/src/argus/cli.py
@@ -410,6 +410,7 @@ def main(argv: list[str] | None = None) -> int:
             return 0
         for index, result in enumerate(results, start=1):
             tags = ", ".join(result["suggested_tags"][:6])
+            title = result.get("title") or ""
             summary = result["summary"] or ""
             match_text = result["match_text"] or ""
             print(f"{index}. {result['filename']}")
@@ -422,6 +423,8 @@ def main(argv: list[str] | None = None) -> int:
                 )
             if tags:
                 print(f"   Tags: {tags}")
+            if title:
+                print(f"   Title: {title}")
             if summary:
                 print(f"   Summary: {summary}")
             if match_text:
diff --git a/src/argus/config.py b/src/argus/config.py
@@ -1,4 +1,4 @@
 from __future__ import annotations
 
 DEFAULT_OLLAMA_HOST = "http://localhost:11434"
-DEFAULT_VISION_MODEL = "gemma3"
+DEFAULT_VISION_MODEL = "gemma4:e4b"
diff --git a/src/argus/database.py b/src/argus/database.py
@@ -52,6 +52,7 @@ def search_index(db_path: Path, query: str, *, limit: int = 10) -> list[dict]:
               videos.filename,
               videos.path,
               videos.classification_status,
+              videos.title,
               videos.summary,
               videos.suggested_tags_json,
               videos.duration_seconds,
@@ -78,6 +79,7 @@ def search_index(db_path: Path, query: str, *, limit: int = 10) -> list[dict]:
                 "filename": row["filename"],
                 "path": row["path"],
                 "classification_status": row["classification_status"],
+                "title": row["title"],
                 "summary": row["summary"],
                 "suggested_tags": json.loads(row["suggested_tags_json"] or "[]"),
                 "duration_seconds": row["duration_seconds"],
@@ -118,6 +120,7 @@ def query_videos(
               filename,
               path,
               classification_status,
+              title,
               summary,
               suggested_tags_json,
               duration_seconds,
@@ -191,6 +194,7 @@ def create_schema(connection: sqlite3.Connection) -> None:
           height INTEGER,
           frame_rate REAL,
           has_audio INTEGER,
+          title TEXT,
           summary TEXT,
           suggested_tags_json TEXT NOT NULL,
           classification_json TEXT,
@@ -217,6 +221,15 @@ def create_schema(connection: sqlite3.Connection) -> None:
         );
         """
     )
+    ensure_videos_title_column(connection)
+
+
+def ensure_videos_title_column(connection: sqlite3.Connection) -> None:
+    """Add the videos.title column when the existing table lacks it; do nothing otherwise."""
+    rows = connection.execute("PRAGMA table_info(videos)").fetchall()
+    column_names = {row[1] for row in rows}  # index 1 of each table_info row is the column name
+    if "title" not in column_names:
+        connection.execute("ALTER TABLE videos ADD COLUMN title TEXT")
 
 
 def reset_index(connection: sqlite3.Connection) -> None:
@@ -235,9 +248,9 @@ def index_item_record(connection: sqlite3.Connection, record: dict) -> int:
         INSERT INTO videos (
           id, filename, path, extension, file_created_at, file_modified_at,
           classification_status, audio_required, duration_seconds, codec,
-          width, height, frame_rate, has_audio, summary, suggested_tags_json,
+          width, height, frame_rate, has_audio, title, summary, suggested_tags_json,
           classification_json, raw_json
-        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
         """,
         (
             record.get("id"),
@@ -254,6 +267,7 @@ def index_item_record(connection: sqlite3.Connection, record: dict) -> int:
             video.get("height"),
             video.get("frame_rate"),
             none_to_int(media.get("has_audio")),
+            record.get("title"),
             record.get("summary"),
             json.dumps(suggested_tags),
             json.dumps(record.get("classification", {})),
@@ -266,6 +280,7 @@ def index_item_record(connection: sqlite3.Connection, record: dict) -> int:
     searchable_chunks = [
         record.get("filename", ""),
         record.get("path", ""),
+        record.get("title", "") or "",
         record.get("summary", ""),
         " ".join(suggested_tags),
     ]
@@ -375,6 +390,7 @@ def row_to_result(row: sqlite3.Row, *, match_text: str) -> dict:
         "filename": row["filename"],
         "path": row["path"],
         "classification_status": row["classification_status"],
+        "title": row["title"],
         "summary": row["summary"],
         "suggested_tags": json.loads(row["suggested_tags_json"] or "[]"),
         "duration_seconds": row["duration_seconds"],
diff --git a/src/argus/serve.py b/src/argus/serve.py
@@ -16,6 +16,7 @@
         "filename": "clip-001.mp4",
         "path": "/Volumes/Media/Project-A/clip-001.mp4",
         "classification_status": "captions_ready",
+        "title": "Office hallway walk with laptop in hand",
         "summary": "A person walks through a bright office hallway while carrying a laptop.",
         "suggested_tags": ["office", "hallway", "person", "walking", "laptop"],
         "duration_seconds": 14.2,
@@ -29,6 +30,7 @@
         "filename": "clip-002.mp4",
         "path": "/Volumes/Media/Project-B/clip-002.mp4",
         "classification_status": "captions_ready",
+        "title": "Hands packing product boxes on a worktable",
         "summary": "Close-up footage of hands arranging product boxes on a worktable.",
         "suggested_tags": ["close-up", "hands", "boxes", "table", "product"],
         "duration_seconds": 9.6,
@@ -42,6 +44,7 @@
         "filename": "clip-003.mp4",
         "path": "/Volumes/Media/Project-C/clip-003.mp4",
         "classification_status": "captions_ready",
+        "title": "Busy storefront exterior with shoppers coming and going",
         "summary": "Wide exterior shot of a storefront with people entering and leaving.",
         "suggested_tags": ["exterior", "wide shot", "storefront", "people"],
         "duration_seconds": 22.8,
@@ -346,6 +349,14 @@ def render_index_html(*, demo_mode: bool = False) -> str:
       font-size: 0.82rem;
       word-break: break-all;
     }
+    .clip-title {
+      margin: 0 0 0.35rem;
+      font-size: 1.05rem;
+      font-weight: 600;
+      line-height: 1.35;
+      color: var(--ink);
+      letter-spacing: -0.01em;
+    }
     .summary, .match {
       margin: 0;
       line-height: 1.55;
@@ -459,7 +470,7 @@ def render_index_html(*, demo_mode: bool = False) -> str:
     <section class="controls panel">
       <div class="control">
         <label for="query">Search</label>
-        <input id="query" type="search" placeholder="Search by filename, tag, summary, caption, or visible text" autocomplete="off">
+        <input id="query" type="search" placeholder="Search by filename, title, tag, summary, caption, or visible text" autocomplete="off">
       </div>
       <div class="control">
         <label for="status">Status</label>
@@ -544,6 +555,7 @@ def render_index_html(*, demo_mode: bool = False) -> str:
 
       resultsEl.innerHTML = results.map((result) => {
         const tags = (result.suggested_tags || []).map((tag) => `<span class="tag">${tag}</span>`).join("");
+        const clipTitle = result.title ? `<p class="clip-title">${result.title}</p>` : "";
         const summary = result.summary ? `<p class="summary">${result.summary}</p>` : "";
         const match = result.match_text ? `<p class="match">${highlightBrackets(result.match_text)}</p>` : "";
         const duration = typeof result.duration_seconds === "number" ? `${result.duration_seconds.toFixed(2)}s` : "unknown";
@@ -561,6 +573,7 @@ def render_index_html(*, demo_mode: bool = False) -> str:
               <span>${duration}</span>
               <span>${resolution}</span>
             </div>
+            ${clipTitle}
             ${summary}
             ${match}
             <div class="tags">${tags}</div>
@@ -584,6 +597,7 @@ def render_index_html(*, demo_mode: bool = False) -> str:
           const haystack = [
             result.filename,
             result.path,
+            result.title,
             result.summary,
             ...(result.suggested_tags || []),
             result.match_text
diff --git a/tests/test_scanner.py b/tests/test_scanner.py
@@ -10,6 +10,7 @@
 from argus.captioner import (
     caption_output_items,
     match_ollama_model,
+    normalize_clip_title,
     normalize_tags,
     summarize_captions,
 )
@@ -122,7 +123,10 @@ def test_dependency_report_handles_missing_ollama_api(self, urlopen_mock) -> Non
     def test_summarize_captions_parses_json_response(self, ollama_chat_mock) -> None:
         ollama_chat_mock.return_value = {
             "message": {
-                "content": '{"summary":"Wide exterior drone footage.","suggested_tags":["Drone","aerial","drone"]}'
+                "content": (
+                    '{"title":"Aerial drone view over roads and fields.",'
+                    '"summary":"Wide exterior drone footage.","suggested_tags":["Drone","aerial","drone"]}'
+                )
             }
         }
 
@@ -133,9 +137,33 @@ def test_summarize_captions_parses_json_response(self, ollama_chat_mock) -> None
         )
 
         self.assertEqual(result["status"], "ok")
+        self.assertEqual(result["title"], "Aerial drone view over roads and fields.")
+        self.assertLessEqual(len(result["title"]), 100)
         self.assertEqual(result["summary"], "Wide exterior drone footage.")
         self.assertEqual(result["suggested_tags"], ["drone", "aerial"])
 
+    @patch("argus.captioner.ollama_chat")
+    def test_summarize_captions_rejects_missing_title(self, ollama_chat_mock) -> None:
+        ollama_chat_mock.return_value = {  # mocked reply deliberately has no "title" key
+            "message": {
+                "content": '{"summary":"Wide exterior drone footage.","suggested_tags":["drone"]}'
+            }
+        }
+
+        result = summarize_captions(
+            [{"timestamp_seconds": 1.0, "caption": "Drone shot over a road."}],
+            model="gemma3",
+            ollama_host="http://localhost:11434",
+        )
+
+        self.assertEqual(result["status"], "error")  # a missing title must not produce an "ok" result
+        self.assertIn("required fields", result["reason"])
+
+    def test_normalize_clip_title_truncates_to_max_length(self) -> None:
+        long_title = "word " * 40  # 200 characters, twice the limit passed below
+        out = normalize_clip_title(long_title, max_len=100)
+        self.assertLessEqual(len(out), 100)
+
     def test_normalize_tags_lowercases_and_deduplicates(self) -> None:
         result = normalize_tags(["Drone", " aerial ", "drone", ""])
 
@@ -276,6 +304,7 @@ def test_index_output_items_and_search_index(self) -> None:
                         "frame_rate": 24.0,
                     },
                 },
+                "title": "Warehouse aisle chat beside stacked boxes",
                 "summary": "Two men talk in a warehouse aisle beside stacked cardboard boxes.",
                 "suggested_tags": ["warehouse", "boxes", "conversation"],
                 "classification": {"model": "gemma3"},
@@ -299,11 +328,15 @@ def test_index_output_items_and_search_index(self) -> None:
 
             report = index_output_items(output)
             results = search_index(Path(report["db_path"]), "warehouse", limit=5)
+            title_hits = search_index(Path(report["db_path"]), "aisle", limit=5)
 
         self.assertEqual(report["indexed_videos"], 1)
         self.assertEqual(report["indexed_frames"], 1)
         self.assertEqual(len(results), 1)
         self.assertEqual(results[0]["filename"], "warehouse.mp4")
+        self.assertEqual(results[0]["title"], "Warehouse aisle chat beside stacked boxes")
+        self.assertEqual(len(title_hits), 1)
+        self.assertEqual(title_hits[0]["title"], "Warehouse aisle chat beside stacked boxes")
 
     @patch("argus.cli.index_output_items")
     @patch("argus.cli.caption_output_items")
@@ -354,6 +387,7 @@ def test_query_videos_without_search_returns_recent_rows(self) -> None:
                     "classification_status": "captions_ready",
                     "audio_required": False,
                     "media": {"video": {}},
+                    "title": f"clip title {index}",
                     "summary": f"clip {index}",
                     "suggested_tags": [f"tag-{index}"],
                     "sample_frames": {"frames": []},
@@ -367,6 +401,7 @@ def test_query_videos_without_search_returns_recent_rows(self) -> None:
 
         self.assertEqual(len(results), 2)
         self.assertEqual(results[0]["filename"], "clip-1.mp4")
+        self.assertEqual(results[0]["title"], "clip title 1")
 
     @patch("argus.captioner.captioning_preflight")
     @patch("argus.captioner.caption_item_record")
diff --git a/uv.lock b/uv.lock