Skip to content

Commit 4d1aa71

Browse files
committed
feat: add clip titles and richer classification metadata
- Generate and normalize a short title from summarized captions; persist it on item records
- Add videos.title column with an ALTER migration; index title in search and return it from queries
- Surface title in the web UI, CLI search output, and demo fixtures
- Update caption/summary prompts (title + summary shape, expanded tag guidance)
- Default vision model to gemma4:e4b
- Remove output frames/items .gitkeep placeholders
- Add uv.lock file

Signed-off-by: Rene Padillo <developer@renesansz.me>
1 parent de0265e commit 4d1aa71

9 files changed

Lines changed: 121 additions & 14 deletions

File tree

output/frames/.gitkeep

Lines changed: 0 additions & 1 deletion
This file was deleted.

output/items/.gitkeep

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/argus/captioner.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,17 @@
1515
"You are classifying a frame from silent B-roll footage for a searchable local media library. "
1616
"Return strict JSON with keys short_caption, tags, and visible_text. "
1717
"short_caption must be one concise sentence with only visible facts. "
18-
"tags must be 3 to 8 short lowercase visual tags. "
18+
"tags must be 35 to 49 short lowercase visual tags and it should accurately contains or represents the shot. "
1919
"visible_text must be a list of short strings that are actually readable in frame; otherwise return an empty list. "
2020
"Do not use markdown, headings, preambles, apologies, or questions."
2121
)
2222

2323
VIDEO_SUMMARY_PROMPT = (
2424
"You are classifying silent B-roll footage for a local media library. "
25-
"Based on timestamped frame captions, produce a concise overall summary and a useful list of search tags. "
26-
"Prefer concrete visual terms over abstract adjectives. "
25+
"Based on timestamped frame captions, produce: (1) a short title, (2) a fuller summary, and (3) search tags. "
26+
"The title must be an attention-grabbing one-line statement that includes the main topic keywords from the clip. "
27+
"The summary should give more context and purpose of the clip than the title. "
28+
"Prefer concrete visual terms over abstract adjectives; be original, unique, and succinct. "
2729
"Do not include conversational filler. "
2830
"Only include brand names or readable on-screen text if they are visually clear and central to the clip."
2931
)
@@ -204,6 +206,7 @@ def caption_item_record(
204206
)
205207
if summary_result["status"] == "ok":
206208
record["classification_status"] = "captions_ready"
209+
record["title"] = summary_result["title"]
207210
record["summary"] = summary_result["summary"]
208211
record["suggested_tags"] = summary_result["suggested_tags"]
209212
record["classification"] = {
@@ -287,9 +290,12 @@ def summarize_captions(
287290
"content": (
288291
"Frame captions:\n"
289292
+ "\n".join(lines)
290-
+ "\n\nReturn strict JSON with keys summary and suggested_tags. "
291-
+ "summary should be 1 to 2 sentences. "
292-
+ "suggested_tags should be 5 to 12 short lowercase tags."
293+
+ "\n\nReturn strict JSON with keys title, summary, and suggested_tags. "
294+
+ "title: one short line, attention-grabbing, must include main topic keywords; "
295+
+ "maximum 100 characters, no line breaks, no quotes wrapping the whole title. "
296+
+ "summary: Minimum 15 characters - Maximum 200 characters, Minimum 5 words; "
297+
+ "more detailed than the title. "
298+
+ "suggested_tags should be 35 to 49 short lowercase tags."
293299
),
294300
},
295301
],
@@ -311,9 +317,21 @@ def summarize_captions(
311317
except json.JSONDecodeError as exc:
312318
return {"status": "error", "reason": f"invalid summary json: {exc}"}
313319

320+
title = parsed.get("title")
314321
summary = parsed.get("summary")
315322
suggested_tags = parsed.get("suggested_tags")
316-
if not isinstance(summary, str) or not isinstance(suggested_tags, list):
323+
if (
324+
not isinstance(title, str)
325+
or not isinstance(summary, str)
326+
or not isinstance(suggested_tags, list)
327+
):
328+
return {
329+
"status": "error",
330+
"reason": "summary response missing required fields",
331+
}
332+
333+
normalized_title = normalize_clip_title(title)
334+
if not normalized_title:
317335
return {
318336
"status": "error",
319337
"reason": "summary response missing required fields",
@@ -326,6 +344,7 @@ def summarize_captions(
326344
]
327345
return {
328346
"status": "ok",
347+
"title": normalized_title,
329348
"summary": normalize_sentence(summary),
330349
"suggested_tags": normalize_tags(cleaned_tags),
331350
}
@@ -510,6 +529,20 @@ def normalize_sentence(value: str) -> str:
510529
return collapsed
511530

512531

532+
def normalize_clip_title(value: str, *, max_len: int = 100) -> str:
    """Normalize a clip title and cap it at ``max_len`` characters.

    The value is first passed through ``normalize_sentence``. A result that
    already fits is returned unchanged. Otherwise the title is cut at the last
    space that keeps it within ``max_len``, provided that space is not in the
    first half of the allowed length; failing that, the text is hard-cut at
    ``max_len``. Trailing spaces and ``,.;:!?`` left behind by the cut are
    stripped.

    Args:
        value: Raw title text.
        max_len: Maximum length of the returned title. A non-positive value
            yields an empty string for any non-empty title.

    Returns:
        The normalized title, never longer than ``max_len``. May be empty.
    """
    text = normalize_sentence(value)
    if not text or len(text) <= max_len:
        return text
    if max_len <= 0:
        # A negative bound would make the slices below count from the end of
        # the string and return far more than the requested cap.
        return ""
    # Keep one extra character so a space sitting exactly at index max_len
    # still counts as a usable word boundary.
    window = text[: max_len + 1]
    cut = window.rfind(" ")
    if cut >= max(1, max_len // 2):
        shortened = window[:cut]
    else:
        # No boundary late enough to keep a reasonably long title: hard cut.
        shortened = text[:max_len]
    return shortened.rstrip(" ,.;:!?")
544+
545+
513546
def normalize_tags(values: list[str]) -> list[str]:
514547
normalized: list[str] = []
515548
seen: set[str] = set()

src/argus/cli.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,7 @@ def main(argv: list[str] | None = None) -> int:
410410
return 0
411411
for index, result in enumerate(results, start=1):
412412
tags = ", ".join(result["suggested_tags"][:6])
413+
title = result.get("title") or ""
413414
summary = result["summary"] or ""
414415
match_text = result["match_text"] or ""
415416
print(f"{index}. {result['filename']}")
@@ -422,6 +423,8 @@ def main(argv: list[str] | None = None) -> int:
422423
)
423424
if tags:
424425
print(f" Tags: {tags}")
426+
if title:
427+
print(f" Title: {title}")
425428
if summary:
426429
print(f" Summary: {summary}")
427430
if match_text:

src/argus/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from __future__ import annotations
22

33
DEFAULT_OLLAMA_HOST = "http://localhost:11434"
4-
DEFAULT_VISION_MODEL = "gemma3"
4+
DEFAULT_VISION_MODEL = "gemma4:e4b"

src/argus/database.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def search_index(db_path: Path, query: str, *, limit: int = 10) -> list[dict]:
5252
videos.filename,
5353
videos.path,
5454
videos.classification_status,
55+
videos.title,
5556
videos.summary,
5657
videos.suggested_tags_json,
5758
videos.duration_seconds,
@@ -78,6 +79,7 @@ def search_index(db_path: Path, query: str, *, limit: int = 10) -> list[dict]:
7879
"filename": row["filename"],
7980
"path": row["path"],
8081
"classification_status": row["classification_status"],
82+
"title": row["title"],
8183
"summary": row["summary"],
8284
"suggested_tags": json.loads(row["suggested_tags_json"] or "[]"),
8385
"duration_seconds": row["duration_seconds"],
@@ -118,6 +120,7 @@ def query_videos(
118120
filename,
119121
path,
120122
classification_status,
123+
title,
121124
summary,
122125
suggested_tags_json,
123126
duration_seconds,
@@ -191,6 +194,7 @@ def create_schema(connection: sqlite3.Connection) -> None:
191194
height INTEGER,
192195
frame_rate REAL,
193196
has_audio INTEGER,
197+
title TEXT,
194198
summary TEXT,
195199
suggested_tags_json TEXT NOT NULL,
196200
classification_json TEXT,
@@ -217,6 +221,15 @@ def create_schema(connection: sqlite3.Connection) -> None:
217221
);
218222
"""
219223
)
224+
ensure_videos_title_column(connection)
225+
226+
227+
def ensure_videos_title_column(connection: sqlite3.Connection) -> None:
    """Add the ``title`` column to ``videos`` if the table does not have it yet.

    Used when opening a database whose ``videos`` table predates the column.
    Does nothing when the column is already present.
    """
    table_info = connection.execute("PRAGMA table_info(videos)").fetchall()
    # Field 1 of each table_info row is the column name.
    if any(column[1] == "title" for column in table_info):
        return
    connection.execute("ALTER TABLE videos ADD COLUMN title TEXT")
220233

221234

222235
def reset_index(connection: sqlite3.Connection) -> None:
@@ -235,9 +248,9 @@ def index_item_record(connection: sqlite3.Connection, record: dict) -> int:
235248
INSERT INTO videos (
236249
id, filename, path, extension, file_created_at, file_modified_at,
237250
classification_status, audio_required, duration_seconds, codec,
238-
width, height, frame_rate, has_audio, summary, suggested_tags_json,
251+
width, height, frame_rate, has_audio, title, summary, suggested_tags_json,
239252
classification_json, raw_json
240-
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
253+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
241254
""",
242255
(
243256
record.get("id"),
@@ -254,6 +267,7 @@ def index_item_record(connection: sqlite3.Connection, record: dict) -> int:
254267
video.get("height"),
255268
video.get("frame_rate"),
256269
none_to_int(media.get("has_audio")),
270+
record.get("title"),
257271
record.get("summary"),
258272
json.dumps(suggested_tags),
259273
json.dumps(record.get("classification", {})),
@@ -266,6 +280,7 @@ def index_item_record(connection: sqlite3.Connection, record: dict) -> int:
266280
searchable_chunks = [
267281
record.get("filename", ""),
268282
record.get("path", ""),
283+
record.get("title", "") or "",
269284
record.get("summary", ""),
270285
" ".join(suggested_tags),
271286
]
@@ -375,6 +390,7 @@ def row_to_result(row: sqlite3.Row, *, match_text: str) -> dict:
375390
"filename": row["filename"],
376391
"path": row["path"],
377392
"classification_status": row["classification_status"],
393+
"title": row["title"],
378394
"summary": row["summary"],
379395
"suggested_tags": json.loads(row["suggested_tags_json"] or "[]"),
380396
"duration_seconds": row["duration_seconds"],

src/argus/serve.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"filename": "clip-001.mp4",
1717
"path": "/Volumes/Media/Project-A/clip-001.mp4",
1818
"classification_status": "captions_ready",
19+
"title": "Office hallway walk with laptop in hand",
1920
"summary": "A person walks through a bright office hallway while carrying a laptop.",
2021
"suggested_tags": ["office", "hallway", "person", "walking", "laptop"],
2122
"duration_seconds": 14.2,
@@ -29,6 +30,7 @@
2930
"filename": "clip-002.mp4",
3031
"path": "/Volumes/Media/Project-B/clip-002.mp4",
3132
"classification_status": "captions_ready",
33+
"title": "Hands packing product boxes on a worktable",
3234
"summary": "Close-up footage of hands arranging product boxes on a worktable.",
3335
"suggested_tags": ["close-up", "hands", "boxes", "table", "product"],
3436
"duration_seconds": 9.6,
@@ -42,6 +44,7 @@
4244
"filename": "clip-003.mp4",
4345
"path": "/Volumes/Media/Project-C/clip-003.mp4",
4446
"classification_status": "captions_ready",
47+
"title": "Busy storefront exterior with shoppers coming and going",
4548
"summary": "Wide exterior shot of a storefront with people entering and leaving.",
4649
"suggested_tags": ["exterior", "wide shot", "storefront", "people"],
4750
"duration_seconds": 22.8,
@@ -346,6 +349,14 @@ def render_index_html(*, demo_mode: bool = False) -> str:
346349
font-size: 0.82rem;
347350
word-break: break-all;
348351
}
352+
.clip-title {
353+
margin: 0 0 0.35rem;
354+
font-size: 1.05rem;
355+
font-weight: 600;
356+
line-height: 1.35;
357+
color: var(--ink);
358+
letter-spacing: -0.01em;
359+
}
349360
.summary, .match {
350361
margin: 0;
351362
line-height: 1.55;
@@ -459,7 +470,7 @@ def render_index_html(*, demo_mode: bool = False) -> str:
459470
<section class="controls panel">
460471
<div class="control">
461472
<label for="query">Search</label>
462-
<input id="query" type="search" placeholder="Search by filename, tag, summary, caption, or visible text" autocomplete="off">
473+
<input id="query" type="search" placeholder="Search by filename, title, tag, summary, caption, or visible text" autocomplete="off">
463474
</div>
464475
<div class="control">
465476
<label for="status">Status</label>
@@ -544,6 +555,7 @@ def render_index_html(*, demo_mode: bool = False) -> str:
544555
545556
resultsEl.innerHTML = results.map((result) => {
546557
const tags = (result.suggested_tags || []).map((tag) => `<span class="tag">${tag}</span>`).join("");
558+
const clipTitle = result.title ? `<p class="clip-title">${result.title}</p>` : "";
547559
const summary = result.summary ? `<p class="summary">${result.summary}</p>` : "";
548560
const match = result.match_text ? `<p class="match">${highlightBrackets(result.match_text)}</p>` : "";
549561
const duration = typeof result.duration_seconds === "number" ? `${result.duration_seconds.toFixed(2)}s` : "unknown";
@@ -561,6 +573,7 @@ def render_index_html(*, demo_mode: bool = False) -> str:
561573
<span>${duration}</span>
562574
<span>${resolution}</span>
563575
</div>
576+
${clipTitle}
564577
${summary}
565578
${match}
566579
<div class="tags">${tags}</div>
@@ -584,6 +597,7 @@ def render_index_html(*, demo_mode: bool = False) -> str:
584597
const haystack = [
585598
result.filename,
586599
result.path,
600+
result.title,
587601
result.summary,
588602
...(result.suggested_tags || []),
589603
result.match_text

tests/test_scanner.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from argus.captioner import (
1111
caption_output_items,
1212
match_ollama_model,
13+
normalize_clip_title,
1314
normalize_tags,
1415
summarize_captions,
1516
)
@@ -122,7 +123,10 @@ def test_dependency_report_handles_missing_ollama_api(self, urlopen_mock) -> Non
122123
def test_summarize_captions_parses_json_response(self, ollama_chat_mock) -> None:
123124
ollama_chat_mock.return_value = {
124125
"message": {
125-
"content": '{"summary":"Wide exterior drone footage.","suggested_tags":["Drone","aerial","drone"]}'
126+
"content": (
127+
'{"title":"Aerial drone view over roads and fields.",'
128+
'"summary":"Wide exterior drone footage.","suggested_tags":["Drone","aerial","drone"]}'
129+
)
126130
}
127131
}
128132

@@ -133,9 +137,33 @@ def test_summarize_captions_parses_json_response(self, ollama_chat_mock) -> None
133137
)
134138

135139
self.assertEqual(result["status"], "ok")
140+
self.assertEqual(result["title"], "Aerial drone view over roads and fields.")
141+
self.assertLessEqual(len(result["title"]), 100)
136142
self.assertEqual(result["summary"], "Wide exterior drone footage.")
137143
self.assertEqual(result["suggested_tags"], ["drone", "aerial"])
138144

145+
@patch("argus.captioner.ollama_chat")
def test_summarize_captions_rejects_missing_title(self, ollama_chat_mock) -> None:
    """A model reply without a ``title`` key must come back as an error result."""
    reply_without_title = '{"summary":"Wide exterior drone footage.","suggested_tags":["drone"]}'
    ollama_chat_mock.return_value = {"message": {"content": reply_without_title}}
    frame_captions = [{"timestamp_seconds": 1.0, "caption": "Drone shot over a road."}]

    outcome = summarize_captions(
        frame_captions,
        model="gemma3",
        ollama_host="http://localhost:11434",
    )

    self.assertEqual(outcome["status"], "error")
    self.assertIn("required fields", outcome["reason"])
161+
162+
def test_normalize_clip_title_truncates_to_max_length(self) -> None:
    """Over-long titles are shortened to the cap without being emptied."""
    long_title = "word " * 40
    out = normalize_clip_title(long_title, max_len=100)
    self.assertLessEqual(len(out), 100)
    # An upper-bound check alone would also pass for "", so additionally pin
    # that truncation keeps content and leaves no dangling space at the cut.
    self.assertTrue(out)
    self.assertFalse(out.endswith(" "))
166+
139167
def test_normalize_tags_lowercases_and_deduplicates(self) -> None:
140168
result = normalize_tags(["Drone", " aerial ", "drone", ""])
141169

@@ -276,6 +304,7 @@ def test_index_output_items_and_search_index(self) -> None:
276304
"frame_rate": 24.0,
277305
},
278306
},
307+
"title": "Warehouse aisle chat beside stacked boxes",
279308
"summary": "Two men talk in a warehouse aisle beside stacked cardboard boxes.",
280309
"suggested_tags": ["warehouse", "boxes", "conversation"],
281310
"classification": {"model": "gemma3"},
@@ -299,11 +328,15 @@ def test_index_output_items_and_search_index(self) -> None:
299328

300329
report = index_output_items(output)
301330
results = search_index(Path(report["db_path"]), "warehouse", limit=5)
331+
title_hits = search_index(Path(report["db_path"]), "aisle", limit=5)
302332

303333
self.assertEqual(report["indexed_videos"], 1)
304334
self.assertEqual(report["indexed_frames"], 1)
305335
self.assertEqual(len(results), 1)
306336
self.assertEqual(results[0]["filename"], "warehouse.mp4")
337+
self.assertEqual(results[0]["title"], "Warehouse aisle chat beside stacked boxes")
338+
self.assertEqual(len(title_hits), 1)
339+
self.assertEqual(title_hits[0]["title"], "Warehouse aisle chat beside stacked boxes")
307340

308341
@patch("argus.cli.index_output_items")
309342
@patch("argus.cli.caption_output_items")
@@ -354,6 +387,7 @@ def test_query_videos_without_search_returns_recent_rows(self) -> None:
354387
"classification_status": "captions_ready",
355388
"audio_required": False,
356389
"media": {"video": {}},
390+
"title": f"clip title {index}",
357391
"summary": f"clip {index}",
358392
"suggested_tags": [f"tag-{index}"],
359393
"sample_frames": {"frames": []},
@@ -367,6 +401,7 @@ def test_query_videos_without_search_returns_recent_rows(self) -> None:
367401

368402
self.assertEqual(len(results), 2)
369403
self.assertEqual(results[0]["filename"], "clip-1.mp4")
404+
self.assertEqual(results[0]["title"], "clip title 1")
370405

371406
@patch("argus.captioner.captioning_preflight")
372407
@patch("argus.captioner.caption_item_record")

uv.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)