Skip to content

Commit 6e8caf4

Browse files
claudedanielplohmann
authored andcommitted
perf(intel): cache block index on analysis_state, not the analyzer
Address Gemini review on PR #47: stash the {instruction_addr: block} index on analysis_state instead of self. analysis_state has the right lifetime (one per function analysis) so the cache is naturally re-entrancy-safe and can't outlive what it indexes, the analyzer keeps no transient state, and the explicit seed + try/finally in resolveRegisterCalls goes away. searchBlock now lazy-builds on first call, so the legacy fallback branch is also gone — every caller (including direct unit-test callers) gets the O(1) path automatically. contextlib.suppress(AttributeError) guards the cache write so that test doubles or hypothetical __slots__-locked states still work; the freshly built dict is returned in that case. Re-ran the focused micro-bench (80 blocks x 15 instructions, 1200 lookups): ~107x faster than the legacy scan, 0/1200 parity mismatches. End-to-end asprox sha256/num_instructions/function count unchanged. Validation: - pytest tests/test* -> 111 passed, 79 subtests passed - ruff check + format --check clean
1 parent d98bd5c commit 6e8caf4

1 file changed

Lines changed: 59 additions & 69 deletions

File tree

src/smda/intel/IndirectCallAnalyzer.py

Lines changed: 59 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import contextlib
12
import logging
23
import re
34
import struct
@@ -19,34 +20,32 @@ def __init__(self, disassembler):
1920
self.disassembly = self.disassembler.disassembly
2021
self.current_calling_addr = 0
2122
self.state = None
22-
# Lazy {instruction_addr: containing_block} index, populated for the
23-
# lifetime of a single resolveRegisterCalls() call. Cleared afterwards
24-
# so a reused analyzer instance never serves a stale index.
25-
self._block_index = None
26-
27-
@staticmethod
28-
def _buildBlockIndex(analysis_state):
29-
# Preserve "first matching block wins" — overlapping potential_starts
30-
# in FunctionAnalysisState.getBlocks() can place the same instruction
31-
# in more than one block; the legacy linear scan returned the first.
32-
index = {}
33-
for block in analysis_state.getBlocks():
34-
for ins in block:
35-
addr = ins[0]
36-
if addr not in index:
37-
index[addr] = block
38-
return index
3923

4024
def searchBlock(self, analysis_state, address):
41-
if self._block_index is not None:
42-
return self._block_index.get(address, [])
43-
# Fallback for direct callers (e.g. unit tests) that bypass
44-
# resolveRegisterCalls and never seed the index.
45-
for block in analysis_state.getBlocks():
46-
for ins in block:
47-
if ins[0] == address:
48-
return block
49-
return []
25+
# Lazy-cache an {instruction_addr: containing_block} index on the
26+
# analysis_state so subsequent lookups during the same function
27+
# analysis are O(1) instead of O(blocks * instructions). The cache
28+
# lives on the state (not on self) so the analyzer stays
29+
# re-entrancy-safe and the index can't outlive the function being
30+
# analyzed.
31+
block_index = getattr(analysis_state, "_block_index", None)
32+
if not isinstance(block_index, dict):
33+
block_index = {}
34+
# Preserve "first matching block wins" — overlapping
35+
# potential_starts in FunctionAnalysisState.getBlocks() can
36+
# place the same instruction in more than one block; the
37+
# legacy linear scan returned the first.
38+
for block in analysis_state.getBlocks():
39+
for ins in block:
40+
addr = ins[0]
41+
if addr not in block_index:
42+
block_index[addr] = block
43+
# Objects with __slots__ or read-only attribute surfaces (and some
44+
# test doubles) reject the assignment; the lookup below still works
45+
# on the freshly built index.
46+
with contextlib.suppress(AttributeError):
47+
analysis_state._block_index = block_index
48+
return block_index.get(address, [])
5049

5150
def getDword(self, addr):
5251
if not self.disassembly.isAddrWithinMemoryImage(addr):
@@ -231,47 +230,38 @@ def resolveRegisterCalls(self, analysis_state, block_depth=3):
231230
len(analysis_state.call_register_ins),
232231
analysis_state.start_addr,
233232
)
234-
# Build the instruction->block index once per function. The previous
235-
# implementation scanned every block linearly inside searchBlock for
236-
# every calling address AND recursively for every incoming ref up to
237-
# block_depth — O(N**2) on call-heavy functions (e.g. Go binaries with
238-
# many register calls). With the index, each lookup is O(1).
239-
self._block_index = self._buildBlockIndex(analysis_state)
240-
try:
241-
max_calls_per_block = 10
242-
calls_per_block = {}
243-
for calling_addr in analysis_state.call_register_ins:
244-
LOGGER.debug("#" * 20)
245-
self.current_calling_addr = calling_addr
246-
self.state = analysis_state
247-
start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
248-
if not start_block:
249-
return
250-
# we only process at most 10 register-calls per block to avoid extreme cases
251-
# found one Go sample with 130k register calls.
252-
if start_block[0] not in calls_per_block:
253-
calls_per_block[start_block[0]] = 0
254-
calls_per_block[start_block[0]] += 1
255-
# if we have an old config, default to 50
256-
max_calls = (
257-
self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK
258-
if hasattr(self.disassembler.config, "MAX_INDIRECT_CALLS_PER_BASIC_BLOCK")
259-
else 50
260-
)
261-
if calls_per_block[start_block[0]] > max_calls:
262-
break
263-
LOGGER.debug(
264-
"For this block, we can still analyze %d indirect calls.",
265-
max_calls_per_block - calls_per_block[start_block[0]],
233+
max_calls_per_block = 10
234+
calls_per_block = {}
235+
for calling_addr in analysis_state.call_register_ins:
236+
LOGGER.debug("#" * 20)
237+
self.current_calling_addr = calling_addr
238+
self.state = analysis_state
239+
start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
240+
if not start_block:
241+
return
242+
# we only process at most 10 register-calls per block to avoid extreme cases
243+
# found one Go sample with 130k register calls.
244+
if start_block[0] not in calls_per_block:
245+
calls_per_block[start_block[0]] = 0
246+
calls_per_block[start_block[0]] += 1
247+
# if we have an old config, default to 50
248+
max_calls = (
249+
self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK
250+
if hasattr(self.disassembler.config, "MAX_INDIRECT_CALLS_PER_BASIC_BLOCK")
251+
else 50
252+
)
253+
if calls_per_block[start_block[0]] > max_calls:
254+
break
255+
LOGGER.debug(
256+
"For this block, we can still analyze %d indirect calls.",
257+
max_calls_per_block - calls_per_block[start_block[0]],
258+
)
259+
if start_block:
260+
self.processBlock(
261+
analysis_state,
262+
start_block,
263+
{},
264+
start_block[-1][3],
265+
[],
266+
block_depth,
266267
)
267-
if start_block:
268-
self.processBlock(
269-
analysis_state,
270-
start_block,
271-
{},
272-
start_block[-1][3],
273-
[],
274-
block_depth,
275-
)
276-
finally:
277-
self._block_index = None

0 commit comments

Comments
 (0)