Skip to content

Commit 4ae9395

Browse files
committed
perf(loaders): share a single lief.parse across all loader accessors
PE/ELF/MachO file loaders previously each called lief.parse(binary) inside several of their static accessors. FileLoader._loadFile calls 6 of these accessors in sequence (mapBinary, getBaseAddress, getBitness, getCodeAreas, getArchitecture, getAbi), so a single PE binary was parsed up to 2 times, ELF up to 4 times, and MachO up to 5 times on every load. lief.parse is the single most expensive step for any non-trivial binary.

Refactor:
- Each LIEF-aware loader now exposes parseBinary(binary) -> lief obj | None as a single parse entry point.
- Every accessor on PE / ELF / MachO accepts an optional parsed=None kwarg and reuses it when supplied (falling back to lief.parse(binary) when it is None, preserving the legacy direct-call contract).
- ELF's existing elffile= and MachO's macho_file= kwargs are renamed to parsed= for a uniform API. A grep across src/ and tests/ confirms no external caller passed those by name; the only callers were in-file (mapBinary -> getBaseAddress) and were updated.
- FileLoader._loadFile parses once via loader.parseBinary(self._raw_data) and threads the parsed object through every accessor via **kw. Delphi/Dex loaders don't have parseBinary, so kw stays empty for them.
- PE accessors that never used lief (mapBinary/getBaseAddress/getBitness/getAbi) accept parsed= for API uniformity and immediately discard it with `del parsed`. It is cheap to pass and keeps the FileLoader call site uniform.

Net effect: 1 lief.parse per binary load instead of 2/4/5.

Validation:
- pytest tests/test* -> 111 passed, 79 subtests passed
- testFileFormatParsers (PE/ELF/MachO/CIL paths) all pass
- testPeFileLoader passes
- ruff check + format --check clean
- No external callers reference elffile= or macho_file= by keyword
1 parent 8b6e117 commit 4ae9395

4 files changed

Lines changed: 75 additions & 42 deletions

File tree

src/smda/utility/ElfFileLoader.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,14 @@ def isCompatible(data):
3232
return data[:4] == b"\x7fELF"
3333

3434
@staticmethod
35-
def getBaseAddress(binary, elffile=None):
36-
if elffile is None:
37-
elffile = lief.parse(binary)
35+
def parseBinary(binary):
36+
# Single lief.parse entry point so FileLoader can share one parse
37+
# across all accessors instead of each accessor re-parsing.
38+
return lief.parse(binary)
39+
40+
@staticmethod
41+
def getBaseAddress(binary, parsed=None):
42+
elffile = parsed if parsed is not None else lief.parse(binary)
3843
# Determine base address of binary
3944
#
4045
base_addr = 0
@@ -152,15 +157,15 @@ def _map_sections(elffile, mapped_binary, base_addr):
152157
mapped_binary[rva : rva + section.size] = content_to_be_mapped
153158

154159
@staticmethod
155-
def mapBinary(binary):
160+
def mapBinary(binary, parsed=None):
156161
"""
157162
map the ELF file sections and segments into a contiguous bytearray
158163
as if into virtual memory with the given base address.
159164
"""
160-
elffile = lief.parse(binary)
165+
elffile = parsed if parsed is not None else lief.parse(binary)
161166
if not elffile:
162167
return b""
163-
base_addr = ElfFileLoader.getBaseAddress(binary, elffile=elffile)
168+
base_addr = ElfFileLoader.getBaseAddress(binary, parsed=elffile)
164169

165170
LOGGER.debug("ELF: base address: 0x%x", base_addr)
166171

@@ -208,25 +213,25 @@ def mapBinary(binary):
208213
return bytes(mapped_binary)
209214

210215
@staticmethod
211-
def getAbi(binary):
216+
def getAbi(binary, parsed=None):
212217
abi = ""
213218
try:
214-
elffile = lief.parse(binary)
219+
elffile = parsed if parsed is not None else lief.parse(binary)
215220
if elffile:
216221
abi = elffile.header.identity_os_abi.name
217222
except lief.bad_file as exc:
218223
LOGGER.warning("Failed to determine ELF ABI: %s", exc)
219224
return abi
220225

221226
@staticmethod
222-
def getArchitecture(binary):
223-
architecture = "intel"
224-
return architecture
227+
def getArchitecture(binary, parsed=None):
228+
del binary, parsed
229+
return "intel"
225230

226231
@staticmethod
227-
def getBitness(binary):
232+
def getBitness(binary, parsed=None):
228233
# TODO add machine types whenever we add more architectures
229-
elffile = lief.parse(binary)
234+
elffile = parsed if parsed is not None else lief.parse(binary)
230235
if not elffile:
231236
return 0
232237
machine_type = elffile.header.machine_type
@@ -254,9 +259,9 @@ def mergeCodeAreas(code_areas):
254259
return merged_code_areas
255260

256261
@staticmethod
257-
def getCodeAreas(binary):
262+
def getCodeAreas(binary, parsed=None):
258263
# TODO add machine types whenever we add more architectures
259-
elffile = lief.parse(binary)
264+
elffile = parsed if parsed is not None else lief.parse(binary)
260265
if elffile is None:
261266
return []
262267
code_areas = []

src/smda/utility/FileLoader.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,18 @@ def _loadFile(self, buffer=None):
3838
if self._map_file:
3939
for loader in self.file_loaders:
4040
if loader.isCompatible(self._raw_data):
41-
self._data = loader.mapBinary(self._raw_data)
42-
self._base_addr = loader.getBaseAddress(self._raw_data)
43-
self._bitness = loader.getBitness(self._raw_data)
44-
self._code_areas = loader.getCodeAreas(self._raw_data)
45-
self._architecture = loader.getArchitecture(self._raw_data)
46-
self._abi = loader.getAbi(self._raw_data)
41+
# PE/ELF/MachO loaders expose parseBinary() so we can
42+
# share a single lief.parse(...) across every accessor
43+
# and skip multiple redundant re-parses per binary
44+
# load. Delphi/Dex loaders don't need lief, so kw
45+
# stays empty for them.
46+
kw = {"parsed": loader.parseBinary(self._raw_data)} if hasattr(loader, "parseBinary") else {}
47+
self._data = loader.mapBinary(self._raw_data, **kw)
48+
self._base_addr = loader.getBaseAddress(self._raw_data, **kw)
49+
self._bitness = loader.getBitness(self._raw_data, **kw)
50+
self._code_areas = loader.getCodeAreas(self._raw_data, **kw)
51+
self._architecture = loader.getArchitecture(self._raw_data, **kw)
52+
self._abi = loader.getAbi(self._raw_data, **kw)
4753
break
4854
else:
4955
self._data = self._raw_data

src/smda/utility/MachoFileLoader.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,14 @@ def isCompatible(data):
3030
return data[:4] == b"\xce\xfa\xed\xfe" or data[:4] == b"\xcf\xfa\xed\xfe"
3131

3232
@staticmethod
33-
def getBaseAddress(binary, macho_file=None):
34-
if macho_file is None:
35-
macho_file = lief.parse(binary)
33+
def parseBinary(binary):
34+
# Single lief.parse entry point so FileLoader can share one parse
35+
# across all accessors instead of each accessor re-parsing.
36+
return lief.parse(binary)
37+
38+
@staticmethod
39+
def getBaseAddress(binary, parsed=None):
40+
macho_file = parsed if parsed is not None else lief.parse(binary)
3641
# Determine base address of binary
3742
#
3843
base_addr = 0
@@ -47,17 +52,17 @@ def getBaseAddress(binary, macho_file=None):
4752
return base_addr
4853

4954
@staticmethod
50-
def mapBinary(binary):
55+
def mapBinary(binary, parsed=None):
5156
"""
5257
map the MachO file sections and segments into a contiguous bytearray
5358
as if into virtual memory with the given base address.
5459
"""
5560
# MachO needs a file-like object...
5661
# Attention: for Python 2.x use the cStringIO package for StringIO
57-
macho_file = lief.parse(binary)
62+
macho_file = parsed if parsed is not None else lief.parse(binary)
5863
if not macho_file:
5964
return b""
60-
base_addr = MachoFileLoader.getBaseAddress(binary, macho_file=macho_file)
65+
base_addr = MachoFileLoader.getBaseAddress(binary, parsed=macho_file)
6166

6267
LOGGER.debug("MachO: base address: 0x%x", base_addr)
6368

@@ -162,13 +167,14 @@ def mapBinary(binary):
162167
return bytes(mapped_binary)
163168

164169
@staticmethod
165-
def getAbi(binary):
170+
def getAbi(binary, parsed=None):
171+
del binary, parsed
166172
return ""
167173

168174
@staticmethod
169-
def getArchitecture(binary):
175+
def getArchitecture(binary, parsed=None):
170176
# TODO add machine types whenever we add more architectures
171-
macho_file = lief.parse(binary)
177+
macho_file = parsed if parsed is not None else lief.parse(binary)
172178
if not macho_file:
173179
return ""
174180
machine_type = macho_file.header.cpu_type
@@ -185,9 +191,9 @@ def getArchitecture(binary):
185191
raise NotImplementedError("SMDA does not support this architecture yet.")
186192

187193
@staticmethod
188-
def getBitness(binary):
194+
def getBitness(binary, parsed=None):
189195
# TODO add machine types whenever we add more architectures
190-
macho_file = lief.parse(binary)
196+
macho_file = parsed if parsed is not None else lief.parse(binary)
191197
if not macho_file:
192198
return 0
193199
machine_type = macho_file.header.cpu_type
@@ -217,9 +223,9 @@ def mergeCodeAreas(code_areas):
217223
return merged_code_areas
218224

219225
@staticmethod
220-
def getCodeAreas(binary):
226+
def getCodeAreas(binary, parsed=None):
221227
# TODO add machine types whenever we add more architectures
222-
macho_file = lief.parse(binary)
228+
macho_file = parsed if parsed is not None else lief.parse(binary)
223229
if not macho_file:
224230
return []
225231
ins_flags = (

src/smda/utility/PeFileLoader.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,16 @@ def isCompatible(data):
1616
return data[:2] == b"MZ"
1717

1818
@staticmethod
19-
def mapBinary(binary):
19+
def parseBinary(binary):
20+
# Single lief.parse entry point so FileLoader can share one parse
21+
# across all accessors instead of each accessor re-parsing.
22+
return lief.parse(binary)
23+
24+
@staticmethod
25+
def mapBinary(binary, parsed=None):
26+
# parsed accepted for API uniformity with ELF/MachO; PE mapping is
27+
# done via struct.unpack on the raw bytes and never needs lief.
28+
del parsed
2029
# This is a pretty rough implementation but does the job for now
2130
mapped_binary = bytearray([])
2231
pe_offset = PeFileLoader.getPeOffset(binary)
@@ -88,15 +97,21 @@ def mapBinary(binary):
8897
return bytes(mapped_binary)
8998

9099
@staticmethod
91-
def getBitness(binary):
100+
def getBitness(binary, parsed=None):
101+
# parsed accepted for API uniformity; PE bitness is read from the
102+
# COFF header via struct.unpack.
103+
del parsed
92104
bitness_id = 0
93105
pe_offset = PeFileLoader.getPeOffset(binary)
94106
if pe_offset and len(binary) >= pe_offset + 0x6:
95107
bitness_id = struct.unpack("H", binary[pe_offset + 0x4 : pe_offset + 0x6])[0]
96108
return PeFileLoader.BITNESS_MAP.get(bitness_id, 0)
97109

98110
@staticmethod
99-
def getBaseAddress(binary):
111+
def getBaseAddress(binary, parsed=None):
112+
# parsed accepted for API uniformity; PE base address comes from the
113+
# optional header via struct.unpack.
114+
del parsed
100115
base_addr = 0
101116
pe_offset = PeFileLoader.getPeOffset(binary)
102117
if pe_offset and len(binary) >= pe_offset + 0x38:
@@ -128,13 +143,14 @@ def getOEP(binary):
128143
return oep_rva
129144

130145
@staticmethod
131-
def getAbi(binary):
146+
def getAbi(binary, parsed=None):
147+
del binary, parsed
132148
return ""
133149

134150
@staticmethod
135-
def getArchitecture(binary):
151+
def getArchitecture(binary, parsed=None):
136152
architecture = "intel"
137-
pefile = lief.parse(binary)
153+
pefile = parsed if parsed is not None else lief.parse(binary)
138154
if pefile:
139155
for d in pefile.data_directories:
140156
if d.type == lief.PE.DataDirectory.TYPES.CLR_RUNTIME_HEADER and d.size > 0:
@@ -150,8 +166,8 @@ def checkPe(binary):
150166
return False
151167

152168
@staticmethod
153-
def getCodeAreas(binary):
154-
pefile = lief.parse(binary)
169+
def getCodeAreas(binary, parsed=None):
170+
pefile = parsed if parsed is not None else lief.parse(binary)
155171
code_areas = []
156172
base_address = PeFileLoader.getBaseAddress(binary)
157173
if pefile and pefile.sections:

0 commit comments

Comments
 (0)