Skip to content

Commit a69978f

Browse files
committed
perf(intel): index-backed searchBlock in IndirectCallAnalyzer
resolveRegisterCalls() resolves each "call <register>" by walking the CFG backward through up to block_depth (=3) levels of incoming refs. At every level, searchBlock was doing a linear scan over every block in the function and, for each block, a list comprehension over every instruction:

    for block in analysis_state.getBlocks():
        if address in [i[0] for i in block]:
            return block

So one call to searchBlock is O(B*I), with B the number of blocks and I the number of instructions per block — and the recursive descent into processBlock calls it once per incoming ref at every depth. Functions with many register calls (the file already mentions a Go sample with 130k of them) hit this hot path hard.

This commit:

* Seeds an {instruction_addr: containing_block} dict once at the start of resolveRegisterCalls(), so every searchBlock lookup is O(1).
* Preserves "first matching block wins" by using `if addr not in index` during construction — important because FunctionAnalysisState.getBlocks can place the same instruction in multiple overlapping blocks via the sorted potential_starts walk.
* Clears the index in a finally so a reused analyzer instance never serves a stale index after the function completes.
* Keeps a slim linear-scan fallback in searchBlock for direct callers (e.g. existing unit tests that drive processBlock without going through resolveRegisterCalls).

Microbench (80 blocks × 15 instructions, 1200 lookups):

    legacy linear scan:   17.04 ms
    indexed O(1) lookup:   0.18 ms
    -> 92x faster, bit-identical block-object references returned.

End-to-end runtime on asprox is unchanged (it has few register calls); the win scales with the number of indirect calls in the binary.

Validation:

- pytest tests/test* -> 111 passed, 79 subtests passed
- ruff check + format --check clean
- asprox sha256 / num_instructions / function count unchanged
1 parent 8b6e117 commit a69978f

1 file changed

Lines changed: 67 additions & 36 deletions

File tree

src/smda/intel/IndirectCallAnalyzer.py

Lines changed: 67 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,33 @@ def __init__(self, disassembler):
1919
self.disassembly = self.disassembler.disassembly
2020
self.current_calling_addr = 0
2121
self.state = None
22+
# Lazy {instruction_addr: containing_block} index, populated for the
23+
# lifetime of a single resolveRegisterCalls() call. Cleared afterwards
24+
# so a reused analyzer instance never serves a stale index.
25+
self._block_index = None
26+
27+
@staticmethod
28+
def _buildBlockIndex(analysis_state):
29+
# Preserve "first matching block wins" — overlapping potential_starts
30+
# in FunctionAnalysisState.getBlocks() can place the same instruction
31+
# in more than one block; the legacy linear scan returned the first.
32+
index = {}
33+
for block in analysis_state.getBlocks():
34+
for ins in block:
35+
addr = ins[0]
36+
if addr not in index:
37+
index[addr] = block
38+
return index
2239

2340
def searchBlock(self, analysis_state, address):
41+
if self._block_index is not None:
42+
return self._block_index.get(address, [])
43+
# Fallback for direct callers (e.g. unit tests) that bypass
44+
# resolveRegisterCalls and never seed the index.
2445
for block in analysis_state.getBlocks():
25-
if address in [i[0] for i in block]:
26-
return block
46+
for ins in block:
47+
if ins[0] == address:
48+
return block
2749
return []
2850

2951
def getDword(self, addr):
@@ -209,38 +231,47 @@ def resolveRegisterCalls(self, analysis_state, block_depth=3):
209231
len(analysis_state.call_register_ins),
210232
analysis_state.start_addr,
211233
)
212-
max_calls_per_block = 10
213-
calls_per_block = {}
214-
for calling_addr in analysis_state.call_register_ins:
215-
LOGGER.debug("#" * 20)
216-
self.current_calling_addr = calling_addr
217-
self.state = analysis_state
218-
start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
219-
if not start_block:
220-
return
221-
# we only process at most 10 register-calls per block to avoid extreme cases
222-
# found one Go sample with 130k register calls.
223-
if start_block[0] not in calls_per_block:
224-
calls_per_block[start_block[0]] = 0
225-
calls_per_block[start_block[0]] += 1
226-
# if we have an old config, default to 50
227-
max_calls = (
228-
self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK
229-
if hasattr(self.disassembler.config, "MAX_INDIRECT_CALLS_PER_BASIC_BLOCK")
230-
else 50
231-
)
232-
if calls_per_block[start_block[0]] > max_calls:
233-
break
234-
LOGGER.debug(
235-
"For this block, we can still analyze %d indirect calls.",
236-
max_calls_per_block - calls_per_block[start_block[0]],
237-
)
238-
if start_block:
239-
self.processBlock(
240-
analysis_state,
241-
start_block,
242-
{},
243-
start_block[-1][3],
244-
[],
245-
block_depth,
234+
# Build the instruction->block index once per function. The previous
235+
# implementation scanned every block linearly inside searchBlock for
236+
# every calling address AND recursively for every incoming ref up to
237+
# block_depth — O(N**2) on call-heavy functions (e.g. Go binaries with
238+
# many register calls). With the index, each lookup is O(1).
239+
self._block_index = self._buildBlockIndex(analysis_state)
240+
try:
241+
max_calls_per_block = 10
242+
calls_per_block = {}
243+
for calling_addr in analysis_state.call_register_ins:
244+
LOGGER.debug("#" * 20)
245+
self.current_calling_addr = calling_addr
246+
self.state = analysis_state
247+
start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
248+
if not start_block:
249+
return
250+
# we only process at most 10 register-calls per block to avoid extreme cases
251+
# found one Go sample with 130k register calls.
252+
if start_block[0] not in calls_per_block:
253+
calls_per_block[start_block[0]] = 0
254+
calls_per_block[start_block[0]] += 1
255+
# if we have an old config, default to 50
256+
max_calls = (
257+
self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK
258+
if hasattr(self.disassembler.config, "MAX_INDIRECT_CALLS_PER_BASIC_BLOCK")
259+
else 50
246260
)
261+
if calls_per_block[start_block[0]] > max_calls:
262+
break
263+
LOGGER.debug(
264+
"For this block, we can still analyze %d indirect calls.",
265+
max_calls_per_block - calls_per_block[start_block[0]],
266+
)
267+
if start_block:
268+
self.processBlock(
269+
analysis_state,
270+
start_block,
271+
{},
272+
start_block[-1][3],
273+
[],
274+
block_depth,
275+
)
276+
finally:
277+
self._block_index = None

0 commit comments

Comments
 (0)