Skip to content

Commit b06553a

Browse files
committed
fix(view-browser): parse argus.log as JSON-lines, not plain text
The /log viewer's parser assumed argus.log used the same human- readable text format the *console* handler emits (HH:MM:SS LEVEL logger msg). It doesn't — the *file* handler in ``argus.audit.logger.JsonLogFormatter`` writes structured JSON, one object per line. So the regex matched zero lines on every real log file and the viewer rendered an empty pane. Empirical: a real 11KB scan log (45 entries) parsed as 0 entries before, 45 after. Rewrites parse_log to read JSON-lines: - One ``json.loads`` per line; malformed lines (partial flushes, e.g. when reading a log mid-scan) silently skipped rather than 500ing. - Records with a non-Python-logging ``level`` are dropped instead of being assigned an arbitrary rank that would warp filters. - ``_extract_time`` pulls HH:MM:SS from the ISO timestamp; tolerant of microseconds and any TZ form (``+00:00``, ``-05:00``, ``Z``). - The continuation-line concept that mattered for plain-text logs is gone — multi-line messages live inside the JSON ``message`` string and the ``<pre>`` template renders embedded newlines as-is. Tests: replaced the regex-format ``_SAMPLE_LOG`` with a JSON-lines fixture that mirrors what JsonLogFormatter actually writes, and added new parser cases for the JSON-specific edge paths (malformed lines skipped, unknown levels dropped, missing module falls back to "argus", Z-suffix timestamp, empty lines ignored). 31 tests pass (up from 27). LogEntry's display shape is unchanged so the template needs no edits — only the format being parsed changed.
1 parent 25a115d commit b06553a

2 files changed

Lines changed: 191 additions & 86 deletions

File tree

argus/tests/viewers/browser/test_log.py

Lines changed: 109 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,42 @@
3030
# Fixtures shared across route + parser tests
3131
# ───────────────────────────────────────────────
3232

33-
_SAMPLE_LOG = (
34-
"07:13:58 DEBUG argus Full exclusion set: ['node_modules', '.git']\n"
35-
"07:13:58 INFO argus Loaded 66 exclusion pattern(s) from .gitignore\n"
36-
"07:13:59 WARNING argus Native pull failed for clamav/clamav:1.5\n"
37-
" continuation line for the warning above\n"
38-
"07:13:59 ERROR viewers.browser Could not connect to docker.sock\n"
39-
"07:13:59 INFO argus Scanner 'gitleaks' finished in 11722ms: 0 finding(s)\n"
40-
)
33+
# JSON-lines format — matches what JsonLogFormatter (in
34+
# argus/audit/logger.py) actually writes to disk. The console
35+
# handler emits human-readable HH:MM:SS lines but the file handler
36+
# always emits structured JSON, so the parser only handles JSON.
37+
_SAMPLE_LOG = "\n".join([
38+
json.dumps({
39+
"timestamp": "2026-05-04T07:13:58.531038+00:00",
40+
"level": "DEBUG", "module": "argus",
41+
"function": "_load_exclusions", "line": 42,
42+
"message": "Full exclusion set: ['node_modules', '.git']",
43+
}),
44+
json.dumps({
45+
"timestamp": "2026-05-04T07:13:58.612001+00:00",
46+
"level": "INFO", "module": "argus",
47+
"function": "_load_exclusions", "line": 51,
48+
"message": "Loaded 66 exclusion pattern(s) from .gitignore",
49+
}),
50+
json.dumps({
51+
"timestamp": "2026-05-04T07:13:59.001234+00:00",
52+
"level": "WARNING", "module": "argus",
53+
"function": "pull_image", "line": 542,
54+
"message": "Native pull failed for clamav/clamav:1.5",
55+
}),
56+
json.dumps({
57+
"timestamp": "2026-05-04T07:13:59.105678+00:00",
58+
"level": "ERROR", "module": "viewers.browser",
59+
"function": "_resolve_scan", "line": 99,
60+
"message": "Could not connect to docker.sock",
61+
}),
62+
json.dumps({
63+
"timestamp": "2026-05-04T07:13:59.205678+00:00",
64+
"level": "INFO", "module": "argus",
65+
"function": "_run_scanner", "line": 712,
66+
"message": "Scanner 'gitleaks' finished in 11722ms: 0 finding(s)",
67+
}),
68+
]) + "\n"
4169

4270

4371
def _sample_payload() -> dict:
@@ -74,30 +102,93 @@ def _write_scan(tmp_path, log_contents: str | None = _SAMPLE_LOG) -> str:
74102

75103

76104
class TestParseLog:
    """Parser-level tests: JSON-lines text in, LogEntry display rows out."""

    def test_parses_each_json_line(self):
        assert len(parse_log(_SAMPLE_LOG)) == 5

    def test_canonicalizes_warn_to_warning(self):
        raw = json.dumps({
            "timestamp": "2026-05-04T07:00:00+00:00",
            "level": "WARN", "module": "argus",
            "message": "short-warn form",
        }) + "\n"
        parsed = parse_log(raw)
        assert len(parsed) == 1
        assert parsed[0].level == "WARNING"

    def test_extracts_hhmmss_from_iso_timestamp(self):
        warning = next(
            entry for entry in parse_log(_SAMPLE_LOG)
            if entry.level == "WARNING"
        )
        assert warning.time == "07:13:59"

    def test_extracts_time_with_z_suffix(self):
        raw = json.dumps({
            "timestamp": "2026-05-04T09:30:15.123Z",
            "level": "INFO", "module": "argus",
            "message": "z-suffixed",
        }) + "\n"
        parsed = parse_log(raw)
        assert len(parsed) == 1
        assert parsed[0].time == "09:30:15"

    def test_skips_malformed_json_lines(self):
        # A partially-flushed trailing line (log read mid-scan) must be
        # skipped, not turned into a 500.
        raw = (
            json.dumps({
                "timestamp": "2026-05-04T07:00:00+00:00",
                "level": "INFO", "module": "argus",
                "message": "first",
            }) + "\n"
            + "{not valid json\n"
            + json.dumps({
                "timestamp": "2026-05-04T07:00:01+00:00",
                "level": "INFO", "module": "argus",
                "message": "third",
            }) + "\n"
        )
        assert [entry.msg for entry in parse_log(raw)] == ["first", "third"]

    def test_skips_records_with_unknown_level(self):
        raw = (
            json.dumps({
                "timestamp": "2026-05-04T07:00:00+00:00",
                "level": "INFO", "module": "argus", "message": "kept",
            }) + "\n"
            + json.dumps({
                "timestamp": "2026-05-04T07:00:01+00:00",
                "level": "TRACE", "module": "argus", "message": "dropped",
            }) + "\n"
        )
        assert [entry.msg for entry in parse_log(raw)] == ["kept"]

    def test_missing_module_falls_back_to_argus(self):
        raw = json.dumps({
            "timestamp": "2026-05-04T07:00:00+00:00",
            "level": "INFO", "message": "no-module",
        }) + "\n"
        assert parse_log(raw)[0].logger == "argus"

    def test_empty_lines_ignored(self):
        raw = (
            "\n\n"
            + json.dumps({
                "timestamp": "2026-05-04T07:00:00+00:00",
                "level": "INFO", "module": "argus", "message": "lonely",
            }) + "\n"
            + "\n\n"
        )
        parsed = parse_log(raw)
        assert len(parsed) == 1
        assert parsed[0].msg == "lonely"

    def test_line_no_points_at_source_line(self):
        # The WARNING record sits on line 3 of the sample file.
        warning = next(
            entry for entry in parse_log(_SAMPLE_LOG)
            if entry.level == "WARNING"
        )
        assert warning.line_no == 3

argus/viewers/browser/log_view.py

Lines changed: 82 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,28 @@
44
parsing and filtering pure so route handlers, templates, and tests can
55
all share the same code path.
66
7-
Argus emits log lines in the standard Python logging shape::
8-
9-
07:13:58 DEBUG argus Container exited: code=0, duration=701ms
10-
07:13:59 INFO viewers.browser argus view browser listening on …
11-
12-
A regex extracts the four fields. Lines that don't match are treated as
13-
continuations of the previous entry's message — common when a scanner
14-
dumps a multi-line stderr blob that the engine forwards verbatim.
7+
argus writes ``argus.log`` as one JSON object per line via
8+
:class:`argus.audit.logger.JsonLogFormatter`. Each entry looks like::
9+
10+
{"timestamp": "2026-05-04T11:13:58.531038+00:00",
11+
"level": "INFO", "module": "argus", "function": "_cmd_source_scan",
12+
"line": 1093, "message": "Argus scan starting"}
13+
14+
The parser reads JSON-lines, dropping any malformed line silently
15+
(rather than 500ing on a partially-flushed log file). Continuation
16+
handling that was needed for plain-text logs is unnecessary here:
17+
multi-line messages live inside the ``message`` string and the
18+
``<pre>`` template renders the embedded newlines as-is.
1519
"""
1620

1721
from __future__ import annotations
1822

19-
import re
23+
import json
2024
from dataclasses import dataclass
2125
from pathlib import Path
2226
from typing import Iterable
2327

2428

25-
# The structured-line regex. Argus uses Python logging's default time
26-
# format ``%H:%M:%S`` plus a level + logger + message tail. Levels
27-
# include both Python's canonical names (DEBUG/INFO/WARNING/ERROR/
28-
# CRITICAL) and the shortened ``WARN`` we sometimes see in container
29-
# stderr that's been forwarded through.
30-
_LOG_LINE_RE = re.compile(
31-
r"^(?P<time>\d{2}:\d{2}:\d{2})\s+"
32-
r"(?P<level>DEBUG|INFO|WARNING|WARN|ERROR|CRITICAL)\s+"
33-
r"(?P<logger>\S+)\s+"
34-
r"(?P<msg>.*)$"
35-
)
36-
37-
3829
# Severity ranking for ``min_level`` filtering. Matches Python's
3930
# ``logging`` module values so "WARN and above" is the obvious thing.
4031
LEVEL_RANK: dict[str, int] = {
@@ -49,68 +40,91 @@
4940

5041
@dataclass(frozen=True)
class LogEntry:
    """One parsed log record, reduced to its display shape.

    The on-disk JSON record carries more fields (function, line,
    optional scanner / phase / image / duration_ms), but the viewer
    renders only these five. Keeping the dataclass narrow means
    future renderer changes never have to know about the file format.
    """

    line_no: int  # 1-based line number in the source file (for reference)
    time: str  # "07:13:58" — extracted from the ISO timestamp
    level: str  # canonical: DEBUG / INFO / WARNING / ERROR / CRITICAL
    logger: str  # "argus", "viewers.browser", etc. (the JSON ``module`` field)
    msg: str  # the rendered message string
6456

6557

6658
def _canonicalize_level(level: str) -> str:
6759
"""Fold the short ``WARN`` form onto Python's canonical ``WARNING``.
6860
69-
Other levels passthrough. Centralizing the rule keeps filter
70-
comparisons (``LEVEL_RANK[entry.level]``) consistent regardless of
71-
which form the underlying logger emits.
61+
``JsonLogFormatter`` emits ``WARNING`` directly, but defensive in
62+
case future scanner-forwarded entries use the short form. Other
63+
levels passthrough.
7264
"""
73-
return "WARNING" if level == "WARN" else level
65+
upper = (level or "").upper()
66+
return "WARNING" if upper == "WARN" else upper
7467

7568

76-
def parse_log(text: str) -> list[LogEntry]:
77-
"""Parse the contents of an ``argus.log`` file into structured entries.
69+
def _extract_time(iso_timestamp: str) -> str:
70+
"""Extract the ``HH:MM:SS`` portion from an ISO 8601 timestamp.
7871
79-
Lines that don't start with the standard timestamp+level prefix are
80-
treated as continuations of the previous header line — that's how
81-
argus wraps multi-line scanner output today. Continuation text
82-
before any header line at all is silently dropped (we don't have a
83-
reasonable level/logger to assign it).
72+
Returns an empty string if the input is missing or unparsable —
73+
we'd rather render a blank time field than crash a whole render
74+
on one weird line. Tolerant of microseconds and any timezone
75+
suffix (``+00:00``, ``-05:00``, ``Z``).
8476
"""
85-
entries: list[LogEntry] = []
86-
current_match: re.Match[str] | None = None
87-
current_line_no = 0
88-
current_lines: list[str] = []
77+
if not iso_timestamp or "T" not in iso_timestamp:
78+
return ""
79+
time_part = iso_timestamp.split("T", 1)[1]
80+
# Trim microseconds before timezone matters since ``.`` always
81+
# precedes ``+/-`` / ``Z`` when present.
82+
if "." in time_part:
83+
time_part = time_part.split(".", 1)[0]
84+
elif time_part.endswith("Z"):
85+
time_part = time_part[:-1]
86+
elif "+" in time_part:
87+
time_part = time_part.split("+", 1)[0]
88+
elif "-" in time_part:
89+
time_part = time_part.split("-", 1)[0]
90+
return time_part
8991

90-
def _flush() -> None:
91-
if current_match is None:
92-
return
92+
93+
def parse_log(text: str) -> list[LogEntry]:
    """Turn the JSON-lines contents of ``argus.log`` into LogEntry rows.

    A line is silently dropped (never raised on) when it is:

    - empty,
    - not valid JSON (e.g. a partially-flushed final line read
      mid-scan),
    - a JSON value that isn't an object (shouldn't happen with
      ``JsonLogFormatter``; defensive),
    - a record whose ``level`` we don't recognize (assigning it an
      arbitrary rank would warp the min-level filter).

    Field presence is otherwise not enforced: a missing ``module``
    falls back to ``"argus"`` and a missing ``message`` to ``""``.
    """
    parsed: list[LogEntry] = []
    for source_line_no, raw_line in enumerate(text.splitlines(), start=1):
        stripped = raw_line.strip()
        if not stripped:
            continue
        try:
            record = json.loads(stripped)
        except json.JSONDecodeError:
            continue
        if not isinstance(record, dict):
            continue
        canonical = _canonicalize_level(record.get("level", ""))
        if canonical not in LEVEL_RANK:
            continue
        parsed.append(
            LogEntry(
                line_no=source_line_no,
                time=_extract_time(record.get("timestamp", "")),
                level=canonical,
                logger=str(record.get("module") or "argus"),
                msg=str(record.get("message", "")),
            )
        )
    return parsed
115129

116130

0 commit comments

Comments
 (0)