Skip to content

Commit 0b150ba

Browse files
committed
feat(reporting): report meta-data information about chunks.
Allow handlers to provide a dict value as part of a ValidChunk metadata attribute. That dictionary can contain any relevant metadata information from the perspective of the handler, but we advise handler writers to report parsed information such as header values. This metadata dict is later reported as part of our ChunkReports and available in the JSON report file if the user requested one. The idea is to expose metadata to further analysis steps through the unblob report. For example, a binary analysis toolkit would read the load address and architecture from a uImage chunk to analyze the file extracted from that chunk with the right settings. A note on the 'as_dict' implementation. The initial idea was to implement it in dissect.cstruct (see fox-it/dissect.cstruct#29), but due to expected changes in the project's API I chose to implement it in unblob so we're not dependent on another project.
1 parent f7f32fa commit 0b150ba

File tree

4 files changed

+33
-4
lines changed

4 files changed

+33
-4
lines changed

Diff for: unblob/file_utils.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99
from typing import Iterator, Tuple
1010

11-
from dissect.cstruct import cstruct
11+
from dissect.cstruct import Instance, cstruct
1212
from pyperscan import Scan
1313

1414
from .logging import format_hex
@@ -311,3 +311,26 @@ def read_until_past(file: File, pattern: bytes):
311311
return file.tell()
312312
if next_byte not in pattern:
313313
return file.tell() - 1
314+
315+
316+
def as_dict(obj):
    """Convert a parsed structure or plain object into basic containers.

    Conversion rules, applied in order:

    * ``None`` and scalar values (``str``, ``bytes``, ``bytearray``, ``int``,
      ``float``, ``bool``) are returned unchanged.
    * A ``dict`` is returned as-is (same object, not copied).
    * A ``list`` or ``tuple`` becomes a new ``list`` whose items are
      converted recursively.
    * A ``dissect.cstruct`` ``Instance`` becomes a new ``dict`` holding a
      shallow copy of its parsed field values.
    * Any other object becomes a ``dict`` of its public attributes (names
      not starting with ``_``), each converted recursively.  Objects that
      expose no ``__dict__`` are returned unchanged.
    """
    # Scalars carry no ``__dict__``.  Without this guard the recursive calls
    # below raise AttributeError as soon as an attribute or a list item is a
    # plain int/str/bytes value.
    if obj is None or isinstance(obj, (str, bytes, bytearray, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return obj
    if isinstance(obj, (list, tuple)):
        return [as_dict(item) for item in obj]
    if isinstance(obj, Instance):
        # The parsed fields live in the private ``_values`` mapping; copy it
        # so callers cannot mutate the structure through the result.
        return dict(obj._values)  # noqa: SLF001

    attributes = getattr(obj, "__dict__", None)
    if attributes is None:
        # Nothing to introspect (e.g. ``__slots__``-only objects).
        return obj
    return {
        name: as_dict(value)
        for name, value in attributes.items()
        if not name.startswith("_")
    }

Diff for: unblob/handlers/archive/sevenzip.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from structlog import get_logger
2424

2525
from unblob.extractors import Command
26+
from unblob.file_utils import as_dict
2627

2728
from ...models import File, HexString, StructHandler, ValidChunk
2829

@@ -70,4 +71,6 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
7071
# We read the signature header here to get the offset to the header database
7172
first_db_header = start_offset + len(header) + header.next_header_offset
7273
end_offset = first_db_header + header.next_header_size
73-
return ValidChunk(start_offset=start_offset, end_offset=end_offset)
74+
return ValidChunk(
75+
start_offset=start_offset, end_offset=end_offset, metadata=as_dict(header)
76+
)

Diff for: unblob/models.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ class ValidChunk(Chunk):
8888

8989
handler: "Handler" = attr.ib(init=False, eq=False)
9090
is_encrypted: bool = attr.ib(default=False)
91+
metadata: dict = attr.ib(default={})
9192

9293
def extract(self, inpath: Path, outdir: Path):
9394
if self.is_encrypted:
@@ -108,6 +109,7 @@ def as_report(self, extraction_reports: List[Report]) -> ChunkReport:
108109
size=self.size,
109110
handler_name=self.handler.NAME,
110111
is_encrypted=self.is_encrypted,
112+
metadata=self.metadata,
111113
extraction_reports=extraction_reports,
112114
)
113115

@@ -188,7 +190,7 @@ def default(self, obj):
188190

189191
if isinstance(obj, bytes):
190192
try:
191-
return obj.decode()
193+
return obj.decode("utf-8", errors="surrogateescape")
192194
except UnicodeDecodeError:
193195
return str(obj)
194196

Diff for: unblob/report.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import traceback
55
from enum import Enum
66
from pathlib import Path
7-
from typing import List, Optional, Union, final
7+
from typing import Dict, List, Optional, Union, final
88

99
import attr
1010

@@ -181,6 +181,7 @@ class ChunkReport(Report):
181181
end_offset: int
182182
size: int
183183
is_encrypted: bool
184+
metadata: Dict
184185
extraction_reports: List[Report]
185186

186187

0 commit comments

Comments
 (0)