Skip to content

Commit b06553a

Browse files
committed
fix(view-browser): parse argus.log as JSON-lines, not plain text
The /log viewer's parser assumed argus.log used the same human- readable text format the *console* handler emits (HH:MM:SS LEVEL logger msg). It doesn't — the *file* handler in ``argus.audit.logger.JsonLogFormatter`` writes structured JSON, one object per line. So the regex matched zero lines on every real log file and the viewer rendered an empty pane. Empirical: a real 11KB scan log (45 entries) parsed as 0 entries before, 45 after. Rewrites parse_log to read JSON-lines: - One ``json.loads`` per line; malformed lines (partial flushes, e.g. when reading a log mid-scan) silently skipped rather than 500ing. - Records with a non-Python-logging ``level`` are dropped instead of being assigned an arbitrary rank that would warp filters. - ``_extract_time`` pulls HH:MM:SS from the ISO timestamp; tolerant of microseconds and any TZ form (``+00:00``, ``-05:00``, ``Z``). - The continuation-line concept that mattered for plain-text logs is gone — multi-line messages live inside the JSON ``message`` string and the ``<pre>`` template renders embedded newlines as-is. Tests: replaced the regex-format ``_SAMPLE_LOG`` with a JSON-lines fixture that mirrors what JsonLogFormatter actually writes, and added new parser cases for the JSON-specific edge paths (malformed lines skipped, unknown levels dropped, missing module falls back to "argus", Z-suffix timestamp, empty lines ignored). 31 tests pass (up from 27). LogEntry's display shape is unchanged so the template needs no edits — only the format being parsed changed.
1 parent 25a115d commit b06553a

2 files changed

Lines changed: 191 additions & 86 deletions

File tree

argus/tests/viewers/browser/test_log.py

Lines changed: 109 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,42 @@
3030
# Fixtures shared across route + parser tests
3131
# ───────────────────────────────────────────────
3232

33-
_SAMPLE_LOG = (
34-
"07:13:58 DEBUG argus Full exclusion set: ['node_modules', '.git']\n"
35-
"07:13:58 INFO argus Loaded 66 exclusion pattern(s) from .gitignore\n"
36-
"07:13:59 WARNING argus Native pull failed for clamav/clamav:1.5\n"
37-
" continuation line for the warning above\n"
38-
"07:13:59 ERROR viewers.browser Could not connect to docker.sock\n"
39-
"07:13:59 INFO argus Scanner 'gitleaks' finished in 11722ms: 0 finding(s)\n"
40-
)
33+
# JSON-lines format — matches what JsonLogFormatter (in
34+
# argus/audit/logger.py) actually writes to disk. The console
35+
# handler emits human-readable HH:MM:SS lines but the file handler
36+
# always emits structured JSON, so the parser only handles JSON.
37+
_SAMPLE_LOG = "\n".join([
38+
json.dumps({
39+
"timestamp": "2026-05-04T07:13:58.531038+00:00",
40+
"level": "DEBUG", "module": "argus",
41+
"function": "_load_exclusions", "line": 42,
42+
"message": "Full exclusion set: ['node_modules', '.git']",
43+
}),
44+
json.dumps({
45+
"timestamp": "2026-05-04T07:13:58.612001+00:00",
46+
"level": "INFO", "module": "argus",
47+
"function": "_load_exclusions", "line": 51,
48+
"message": "Loaded 66 exclusion pattern(s) from .gitignore",
49+
}),
50+
json.dumps({
51+
"timestamp": "2026-05-04T07:13:59.001234+00:00",
52+
"level": "WARNING", "module": "argus",
53+
"function": "pull_image", "line": 542,
54+
"message": "Native pull failed for clamav/clamav:1.5",
55+
}),
56+
json.dumps({
57+
"timestamp": "2026-05-04T07:13:59.105678+00:00",
58+
"level": "ERROR", "module": "viewers.browser",
59+
"function": "_resolve_scan", "line": 99,
60+
"message": "Could not connect to docker.sock",
61+
}),
62+
json.dumps({
63+
"timestamp": "2026-05-04T07:13:59.205678+00:00",
64+
"level": "INFO", "module": "argus",
65+
"function": "_run_scanner", "line": 712,
66+
"message": "Scanner 'gitleaks' finished in 11722ms: 0 finding(s)",
67+
}),
68+
]) + "\n"
4169

4270

4371
def _sample_payload() -> dict:
@@ -74,30 +102,93 @@ def _write_scan(tmp_path, log_contents: str | None = _SAMPLE_LOG) -> str:
74102

75103

76104
class TestParseLog:
    """Parser-level tests: JSON-lines text in, LogEntry display rows out."""

    def test_parses_each_json_line(self):
        assert len(parse_log(_SAMPLE_LOG)) == 5

    def test_canonicalizes_warn_to_warning(self):
        raw = json.dumps({
            "timestamp": "2026-05-04T07:00:00+00:00",
            "level": "WARN", "module": "argus",
            "message": "short-warn form",
        }) + "\n"
        parsed = parse_log(raw)
        assert len(parsed) == 1
        assert parsed[0].level == "WARNING"

    def test_extracts_hhmmss_from_iso_timestamp(self):
        warning = next(
            entry for entry in parse_log(_SAMPLE_LOG)
            if entry.level == "WARNING"
        )
        assert warning.time == "07:13:59"

    def test_extracts_time_with_z_suffix(self):
        raw = json.dumps({
            "timestamp": "2026-05-04T09:30:15.123Z",
            "level": "INFO", "module": "argus",
            "message": "z-suffixed",
        }) + "\n"
        parsed = parse_log(raw)
        assert len(parsed) == 1
        assert parsed[0].time == "09:30:15"

    def test_skips_malformed_json_lines(self):
        # A partially-flushed trailing line (log read mid-scan) must be
        # skipped, not turned into a 500.
        raw = (
            json.dumps({
                "timestamp": "2026-05-04T07:00:00+00:00",
                "level": "INFO", "module": "argus",
                "message": "first",
            }) + "\n"
            + "{not valid json\n"
            + json.dumps({
                "timestamp": "2026-05-04T07:00:01+00:00",
                "level": "INFO", "module": "argus",
                "message": "third",
            }) + "\n"
        )
        assert [entry.msg for entry in parse_log(raw)] == ["first", "third"]

    def test_skips_records_with_unknown_level(self):
        raw = (
            json.dumps({
                "timestamp": "2026-05-04T07:00:00+00:00",
                "level": "INFO", "module": "argus", "message": "kept",
            }) + "\n"
            + json.dumps({
                "timestamp": "2026-05-04T07:00:01+00:00",
                "level": "TRACE", "module": "argus", "message": "dropped",
            }) + "\n"
        )
        assert [entry.msg for entry in parse_log(raw)] == ["kept"]

    def test_missing_module_falls_back_to_argus(self):
        raw = json.dumps({
            "timestamp": "2026-05-04T07:00:00+00:00",
            "level": "INFO", "message": "no-module",
        }) + "\n"
        assert parse_log(raw)[0].logger == "argus"

    def test_empty_lines_ignored(self):
        raw = (
            "\n\n"
            + json.dumps({
                "timestamp": "2026-05-04T07:00:00+00:00",
                "level": "INFO", "module": "argus", "message": "lonely",
            }) + "\n"
            + "\n\n"
        )
        parsed = parse_log(raw)
        assert len(parsed) == 1
        assert parsed[0].msg == "lonely"

    def test_line_no_points_at_source_line(self):
        # The WARNING record sits on line 3 of the sample file.
        warning = next(
            entry for entry in parse_log(_SAMPLE_LOG)
            if entry.level == "WARNING"
        )
        assert warning.line_no == 3

argus/viewers/browser/log_view.py

Lines changed: 82 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,28 @@
44
parsing and filtering pure so route handlers, templates, and tests can
55
all share the same code path.
66
7-
Argus emits log lines in the standard Python logging shape::
8-
9-
07:13:58 DEBUG argus Container exited: code=0, duration=701ms
10-
07:13:59 INFO viewers.browser argus view browser listening on …
11-
12-
A regex extracts the four fields. Lines that don't match are treated as
13-
continuations of the previous entry's message — common when a scanner
14-
dumps a multi-line stderr blob that the engine forwards verbatim.
7+
argus writes ``argus.log`` as one JSON object per line via
8+
:class:`argus.audit.logger.JsonLogFormatter`. Each entry looks like::
9+
10+
{"timestamp": "2026-05-04T11:13:58.531038+00:00",
11+
"level": "INFO", "module": "argus", "function": "_cmd_source_scan",
12+
"line": 1093, "message": "Argus scan starting"}
13+
14+
The parser reads JSON-lines, dropping any malformed line silently
15+
(rather than 500ing on a partially-flushed log file). Continuation
16+
handling that was needed for plain-text logs is unnecessary here:
17+
multi-line messages live inside the ``message`` string and the
18+
``<pre>`` template renders the embedded newlines as-is.
1519
"""
1620

1721
from __future__ import annotations
1822

19-
import re
23+
import json
2024
from dataclasses import dataclass
2125
from pathlib import Path
2226
from typing import Iterable
2327

2428

25-
# The structured-line regex. Argus uses Python logging's default time
26-
# format ``%H:%M:%S`` plus a level + logger + message tail. Levels
27-
# include both Python's canonical names (DEBUG/INFO/WARNING/ERROR/
28-
# CRITICAL) and the shortened ``WARN`` we sometimes see in container
29-
# stderr that's been forwarded through.
30-
_LOG_LINE_RE = re.compile(
31-
r"^(?P<time>\d{2}:\d{2}:\d{2})\s+"
32-
r"(?P<level>DEBUG|INFO|WARNING|WARN|ERROR|CRITICAL)\s+"
33-
r"(?P<logger>\S+)\s+"
34-
r"(?P<msg>.*)$"
35-
)
36-
37-
3829
# Severity ranking for ``min_level`` filtering. Matches Python's
3930
# ``logging`` module values so "WARN and above" is the obvious thing.
4031
LEVEL_RANK: dict[str, int] = {
@@ -49,68 +40,91 @@
4940

5041
@dataclass(frozen=True)
class LogEntry:
    """One parsed log record, reduced to its display shape.

    The on-disk JSON record carries more fields (function, line,
    optional scanner / phase / image / duration_ms), but the viewer
    renders only these five. Keeping the dataclass narrow means
    future renderer changes never have to know about the file format.
    """

    line_no: int  # 1-based line number in the source file (for reference)
    time: str  # "07:13:58" — extracted from the ISO timestamp
    level: str  # canonical: DEBUG / INFO / WARNING / ERROR / CRITICAL
    logger: str  # "argus", "viewers.browser", etc. (the JSON ``module`` field)
    msg: str  # the rendered message string
6456

6557

6658
def _canonicalize_level(level: str) -> str:
6759
"""Fold the short ``WARN`` form onto Python's canonical ``WARNING``.
6860
69-
Other levels passthrough. Centralizing the rule keeps filter
70-
comparisons (``LEVEL_RANK[entry.level]``) consistent regardless of
71-
which form the underlying logger emits.
61+
``JsonLogFormatter`` emits ``WARNING`` directly, but defensive in
62+
case future scanner-forwarded entries use the short form. Other
63+
levels passthrough.
7264
"""
73-
return "WARNING" if level == "WARN" else level
65+
upper = (level or "").upper()
66+
return "WARNING" if upper == "WARN" else upper
7467

7568

76-
def parse_log(text: str) -> list[LogEntry]:
77-
"""Parse the contents of an ``argus.log`` file into structured entries.
69+
def _extract_time(iso_timestamp: str) -> str:
70+
"""Extract the ``HH:MM:SS`` portion from an ISO 8601 timestamp.
7871
79-
Lines that don't start with the standard timestamp+level prefix are
80-
treated as continuations of the previous header line — that's how
81-
argus wraps multi-line scanner output today. Continuation text
82-
before any header line at all is silently dropped (we don't have a
83-
reasonable level/logger to assign it).
72+
Returns an empty string if the input is missing or unparsable —
73+
we'd rather render a blank time field than crash a whole render
74+
on one weird line. Tolerant of microseconds and any timezone
75+
suffix (``+00:00``, ``-05:00``, ``Z``).
8476
"""
85-
entries: list[LogEntry] = []
86-
current_match: re.Match[str] | None = None
87-
current_line_no = 0
88-
current_lines: list[str] = []
77+
if not iso_timestamp or "T" not in iso_timestamp:
78+
return ""
79+
time_part = iso_timestamp.split("T", 1)[1]
80+
# Trim microseconds before timezone matters since ``.`` always
81+
# precedes ``+/-`` / ``Z`` when present.
82+
if "." in time_part:
83+
time_part = time_part.split(".", 1)[0]
84+
elif time_part.endswith("Z"):
85+
time_part = time_part[:-1]
86+
elif "+" in time_part:
87+
time_part = time_part.split("+", 1)[0]
88+
elif "-" in time_part:
89+
time_part = time_part.split("-", 1)[0]
90+
return time_part
8991

90-
def _flush() -> None:
91-
if current_match is None:
92-
return
92+
93+
def parse_log(text: str) -> list[LogEntry]:
    """Turn the JSON-lines contents of ``argus.log`` into LogEntry rows.

    A line is silently dropped (never raised on) when it is:

    - empty,
    - not valid JSON (e.g. a partially-flushed final line read
      mid-scan),
    - a JSON value that isn't an object (shouldn't happen with
      ``JsonLogFormatter``; defensive),
    - a record whose ``level`` we don't recognize (assigning it an
      arbitrary rank would warp the min-level filter).

    Field presence is otherwise not enforced: a missing ``module``
    falls back to ``"argus"`` and a missing ``message`` to ``""``.
    """
    parsed: list[LogEntry] = []
    for source_line_no, raw_line in enumerate(text.splitlines(), start=1):
        stripped = raw_line.strip()
        if not stripped:
            continue
        try:
            record = json.loads(stripped)
        except json.JSONDecodeError:
            continue
        if not isinstance(record, dict):
            continue
        canonical = _canonicalize_level(record.get("level", ""))
        if canonical not in LEVEL_RANK:
            continue
        parsed.append(
            LogEntry(
                line_no=source_line_no,
                time=_extract_time(record.get("timestamp", "")),
                level=canonical,
                logger=str(record.get("module") or "argus"),
                msg=str(record.get("message", "")),
            )
        )
    return parsed
115129

116130

0 commit comments

Comments
 (0)