
Commit f6bad66

feat(reporting): report metadata about chunks

Allow handlers to provide a dict value through the ValidChunk metadata attribute. That dictionary can contain any metadata the handler considers relevant, but we advise handler writers to report parsed information such as header values.

The metadata dict is later carried in our ChunkReports and is available in the JSON report file if the user requested one. The idea is to expose metadata to further analysis steps through the unblob report. For example, a binary analysis toolkit could read the load address and architecture from a uImage chunk and analyze the file extracted from that chunk with the right settings.

A note on the 'as_dict' implementation: the initial idea was to implement it in dissect.cstruct (see fox-it/dissect.cstruct#29), but due to expected changes in that project's API I chose to implement it in unblob so we are not dependent on another project.
1 parent 46cd4ce commit f6bad66
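
As a sketch of that downstream use, a consumer could load the JSON report and pick up chunk metadata along these lines. This is a minimal sketch, not unblob API: the "report.json" path is hypothetical, and the layout (a list of task results whose ChunkReport entries carry a "metadata" dict) follows the tests added in this commit:

import json

# Load a JSON report previously written by unblob (the path is hypothetical).
with open("report.json") as f:
    report = json.load(f)

for task_result in report:
    for rep in task_result["reports"]:
        # ChunkReport entries now carry a "metadata" dict (see the tests below).
        if rep["__typename__"] == "ChunkReport" and rep["metadata"]:
            # e.g. a uImage handler could expose load address and architecture here
            print(rep["handler_name"], rep["metadata"])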

File tree

5 files changed: +279 -59 lines changed

Diff for: tests/test_models.py

+71 -53

@@ -4,7 +4,15 @@
 import pytest
 
 from unblob.file_utils import InvalidInputFormat
-from unblob.models import Chunk, ProcessResult, Task, TaskResult, UnknownChunk, to_json
+from unblob.models import (
+    Chunk,
+    ProcessResult,
+    Task,
+    TaskResult,
+    UnknownChunk,
+    ValidChunk,
+    to_json,
+)
 from unblob.report import (
     ChunkReport,
     ExtractCommandFailedReport,
@@ -170,56 +178,57 @@ def test_process_result_conversion(self):
         decoded_report = json.loads(json_text)
         assert decoded_report == [
             {
-                "__typename__": "TaskResult",
+                "task": {
+                    "path": "/nonexistent",
+                    "depth": 0,
+                    "blob_id": "",
+                    "is_multi_file": False,
+                    "__typename__": "Task",
+                },
                 "reports": [
                     {
-                        "__typename__": "StatReport",
+                        "path": "/nonexistent",
+                        "size": 384,
                         "is_dir": False,
                         "is_file": True,
                         "is_link": False,
                         "link_target": None,
-                        "path": "/nonexistent",
-                        "size": 384,
+                        "__typename__": "StatReport",
                     },
                     {
-                        "__typename__": "FileMagicReport",
                         "magic": "Zip archive data, at least v2.0 to extract",
                         "mime_type": "application/zip",
+                        "__typename__": "FileMagicReport",
                     },
                     {
-                        "__typename__": "HashReport",
                         "md5": "9019fcece2433ad7f12c077e84537a74",
                         "sha1": "36998218d8f43b69ef3adcadf2e8979e81eed166",
                         "sha256": "7d7ca7e1410b702b0f85d18257aebb964ac34f7fad0a0328d72e765bfcb21118",
+                        "__typename__": "HashReport",
                     },
                     {
-                        "__typename__": "ChunkReport",
-                        "end_offset": 384,
-                        "extraction_reports": [],
-                        "handler_name": "zip",
                         "id": "test_basic_conversion:id",
-                        "is_encrypted": False,
-                        "size": 384,
+                        "handler_name": "zip",
                         "start_offset": 0,
+                        "end_offset": 384,
+                        "size": 384,
+                        "is_encrypted": False,
+                        "metadata": {},
+                        "extraction_reports": [],
+                        "__typename__": "ChunkReport",
                     },
                 ],
                 "subtasks": [
                     {
-                        "__typename__": "Task",
-                        "blob_id": "test_basic_conversion:id",
+                        "path": "/extractions/nonexistent_extract",
                         "depth": 314,
+                        "blob_id": "test_basic_conversion:id",
                         "is_multi_file": False,
-                        "path": "/extractions/nonexistent_extract",
+                        "__typename__": "Task",
                     }
                 ],
-                "task": {
-                    "__typename__": "Task",
-                    "blob_id": "",
-                    "depth": 0,
-                    "is_multi_file": False,
-                    "path": "/nonexistent",
-                },
-            },
+                "__typename__": "TaskResult",
+            }
         ]
 
     def test_exotic_command_output(self):
@@ -235,35 +244,44 @@ def test_exotic_command_output(self):
         decoded_report = json.loads(json_text)
 
         assert decoded_report == {
-            "__typename__": "ExtractCommandFailedReport",
-            "command": "dump all bytes",
-            "exit_code": 1,
             "severity": "WARNING",
+            "command": "dump all bytes",
+            "stdout": "\x00\x01\x02\x03\x04\x05\x06\x07\x08"
+            "\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16"
+            "\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,"
+            "-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]"
+            "^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81"
+            "\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89"
+            "\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91"
+            "\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99"
+            "\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1"
+            "\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9"
+            "\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1"
+            "\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9"
+            "\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1"
+            "\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9"
+            "\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1"
+            "\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9"
+            "\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1"
+            "\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9"
+            "\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1"
+            "\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9"
+            "\udcfa\udcfb\udcfc\udcfd\udcfe\udcff",
             "stderr": "stdout is pretty strange ;)",
-            "stdout": (
-                "b'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07"
-                "\\x08\\t\\n\\x0b\\x0c\\r\\x0e\\x0f"
-                "\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17"
-                '\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f !"#'
-                "$%&\\'()*+,-./0123456789:;<=>?@AB"
-                "CDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`a"
-                "bcdefghijklmnopqrstuvwxyz{|}~\\x7f"
-                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87"
-                "\\x88\\x89\\x8a\\x8b\\x8c\\x8d\\x8e\\x8f"
-                "\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97"
-                "\\x98\\x99\\x9a\\x9b\\x9c\\x9d\\x9e\\x9f"
-                "\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7"
-                "\\xa8\\xa9\\xaa\\xab\\xac\\xad\\xae\\xaf"
-                "\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
-                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf"
-                "\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5\\xc6\\xc7"
-                "\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf"
-                "\\xd0\\xd1\\xd2\\xd3\\xd4\\xd5\\xd6\\xd7"
-                "\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf"
-                "\\xe0\\xe1\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7"
-                "\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
-                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7"
-                "\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff"
-                "'"
-            ),
+            "exit_code": 1,
+            "__typename__": "ExtractCommandFailedReport",
         }
+
+    @pytest.mark.parametrize(
+        "metadata",
+        [
+            pytest.param(1, id="metadata_int"),
+            pytest.param(0.2, id="metadata_float"),
+            pytest.param(True, id="metadata_bool"),
+            pytest.param([1, 2], id="metadata_list"),
+            pytest.param((1, 2), id="metadata_tuple"),
+        ],
+    )
+    def test_invalid_metadata(self, metadata):
+        with pytest.raises(ValueError, match="Can only convert dict or Instance"):
+            ValidChunk(start_offset=0, end_offset=100, metadata=metadata)
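
For contrast with the rejected values above, a minimal sketch of accepted usage, a plain dict (offsets and header values are illustrative; per the error message, a dissect.cstruct Instance is also accepted and converted):

from unblob.models import ValidChunk

# metadata must be a dict (or a dissect.cstruct Instance); scalars,
# lists and tuples raise ValueError, as the parametrized test shows.
chunk = ValidChunk(
    start_offset=0,
    end_offset=100,
    metadata={"version_maj": 0, "version_min": 4},  # e.g. parsed header fields
)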

Diff for: tests/test_report.py

+173 -0

@@ -11,6 +11,7 @@
 from unblob.report import (
     CarveDirectoryReport,
     ChunkReport,
+    ExtractCommandFailedReport,
     FileMagicReport,
     HashReport,
     StatReport,
@@ -48,6 +49,178 @@ def test_process_file_report_output_is_valid_json(
     assert len(report)
 
 
+class Test_ProcessResult_to_json:  # noqa: N801
+    def test_simple_conversion(self):
+        task = Task(path=Path("/nonexistent"), depth=0, blob_id="")
+        task_result = TaskResult(task)
+        chunk_id = "test_basic_conversion:id"
+
+        task_result.add_report(
+            StatReport(
+                path=task.path,
+                size=384,
+                is_dir=False,
+                is_file=True,
+                is_link=False,
+                link_target=None,
+            )
+        )
+        task_result.add_report(
+            FileMagicReport(
+                magic="Zip archive data, at least v2.0 to extract",
+                mime_type="application/zip",
+            )
+        )
+        task_result.add_report(
+            HashReport(
+                md5="9019fcece2433ad7f12c077e84537a74",
+                sha1="36998218d8f43b69ef3adcadf2e8979e81eed166",
+                sha256="7d7ca7e1410b702b0f85d18257aebb964ac34f7fad0a0328d72e765bfcb21118",
+            )
+        )
+        task_result.add_report(
+            ChunkReport(
+                id=chunk_id,
+                handler_name="zip",
+                start_offset=0,
+                end_offset=384,
+                size=384,
+                is_encrypted=False,
+                metadata={},
+                extraction_reports=[],
+            )
+        )
+        task_result.add_subtask(
+            Task(
+                path=Path("/extractions/nonexistent_extract"),
+                depth=314,
+                blob_id=chunk_id,
+            )
+        )
+
+        json_text = ProcessResult(results=[task_result]).to_json()
+
+        # output must be a valid json string
+        assert isinstance(json_text, str)
+
+        # that can be loaded back
+        decoded_report = json.loads(json_text)
+        assert decoded_report == [
+            {
+                "task": {
+                    "path": "/nonexistent",
+                    "depth": 0,
+                    "blob_id": "",
+                    "is_multi_file": False,
+                    "__typename__": "Task",
+                },
+                "reports": [
+                    {
+                        "path": "/nonexistent",
+                        "size": 384,
+                        "is_dir": False,
+                        "is_file": True,
+                        "is_link": False,
+                        "link_target": None,
+                        "__typename__": "StatReport",
+                    },
+                    {
+                        "magic": "Zip archive data, at least v2.0 to extract",
+                        "mime_type": "application/zip",
+                        "__typename__": "FileMagicReport",
+                    },
+                    {
+                        "md5": "9019fcece2433ad7f12c077e84537a74",
+                        "sha1": "36998218d8f43b69ef3adcadf2e8979e81eed166",
+                        "sha256": "7d7ca7e1410b702b0f85d18257aebb964ac34f7fad0a0328d72e765bfcb21118",
+                        "__typename__": "HashReport",
+                    },
+                    {
+                        "id": "test_basic_conversion:id",
+                        "handler_name": "zip",
+                        "start_offset": 0,
+                        "end_offset": 384,
+                        "size": 384,
+                        "is_encrypted": False,
+                        "metadata": {},
+                        "extraction_reports": [],
+                        "__typename__": "ChunkReport",
+                    },
+                ],
+                "subtasks": [
+                    {
+                        "path": "/extractions/nonexistent_extract",
+                        "depth": 314,
+                        "blob_id": "test_basic_conversion:id",
+                        "is_multi_file": False,
+                        "__typename__": "Task",
+                    }
+                ],
+                "__typename__": "TaskResult",
+            }
+        ]
+
+    def test_exotic_command_output(self):
+        task = Task(path=Path("/nonexistent"), depth=0, blob_id="")
+        task_result = TaskResult(task)
+        report = ExtractCommandFailedReport(
+            command="dump all bytes",
+            stdout=bytes(range(256)),
+            stderr=b"stdout is pretty strange ;)",
+            exit_code=1,
+        )
+
+        task_result.add_report(
+            ChunkReport(
+                id="test",
+                handler_name="fail",
+                start_offset=0,
+                end_offset=256,
+                size=256,
+                is_encrypted=False,
+                extraction_reports=[report],
+            )
+        )
+        json_text = ProcessResult(results=[task_result]).to_json()
+
+        decoded_report = json.loads(json_text)
+        assert decoded_report == [
+            {
+                "task": {
+                    "path": "/nonexistent",
+                    "depth": 0,
+                    "blob_id": "",
+                    "is_multi_file": False,
+                    "__typename__": "Task",
+                },
+                "reports": [
+                    {
+                        "id": "test",
+                        "handler_name": "fail",
+                        "start_offset": 0,
+                        "end_offset": 256,
+                        "size": 256,
+                        "is_encrypted": False,
+                        "metadata": {},
+                        "extraction_reports": [
+                            {
+                                "severity": "WARNING",
+                                "command": "dump all bytes",
+                                "stdout": "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff",
+                                "stderr": "stdout is pretty strange ;)",
+                                "exit_code": 1,
+                                "__typename__": "ExtractCommandFailedReport",
+                            }
+                        ],
+                        "__typename__": "ChunkReport",
+                    }
+                ],
+                "subtasks": [],
+                "__typename__": "TaskResult",
+            }
+        ]
+
+
 @pytest.fixture
 def hello_kitty(tmp_path: Path) -> Path:
     """Generate an input file with 3 unknown chunks and 2 zip files."""

Diff for: unblob/handlers/archive/sevenzip.py

+9 -1

@@ -22,6 +22,7 @@
 from pathlib import Path
 from typing import Optional
 
+from dissect.cstruct import Instance
 from structlog import get_logger
 
 from unblob.extractors import Command
@@ -90,14 +91,21 @@ class SevenZipHandler(StructHandler):
     HEADER_STRUCT = HEADER_STRUCT
     EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")
 
+    def get_metadata(self, header: Instance) -> dict:
+        return {"version_maj": header.version_maj, "version_min": header.version_min}
+
     def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
         header = self.parse_header(file)
 
         check_header_crc(header)
 
         size = calculate_sevenzip_size(header)
 
-        return ValidChunk(start_offset=start_offset, end_offset=start_offset + size)
+        metadata = self.get_metadata(header)
+
+        return ValidChunk(
+            start_offset=start_offset, end_offset=start_offset + size, metadata=metadata
+        )
 
 
 class MultiVolumeSevenZipHandler(DirectoryHandler):
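
With this change, a 7-Zip chunk's ChunkReport in the JSON report should carry the parsed header version. A sketch of the expected shape, written as the dict a test like the ones above would compare against; the id, offsets, and version values are illustrative (0.4 being the header version current 7z tools write), and the handler name is an assumption:

{
    "id": "example:id",
    "handler_name": "sevenzip",
    "start_offset": 0,
    "end_offset": 1024,
    "size": 1024,
    "is_encrypted": False,
    "metadata": {"version_maj": 0, "version_min": 4},
    "extraction_reports": [],
    "__typename__": "ChunkReport",
}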
