perf(intel): index-backed searchBlock in IndirectCallAnalyzer

claude · claude · commit a69978f675e3 · 2026-05-26T15:20:30.000Z
resolveRegisterCalls() resolves each "call <register>" by walking the
CFG backward through up to block_depth (=3) levels of incoming refs.
At every level, searchBlock was doing a linear scan over every block in
the function and, for each block, a list comprehension over every
instruction:

    for block in analysis_state.getBlocks():
        if address in [i[0] for i in block]:
            return block

So one call to searchBlock is O(B*I) for B blocks of I instructions each
— and the recursive descent into processBlock calls it once per incoming
ref at every depth. Functions with many register calls (the file already
mentions a Go sample with 130k of them) hit this hot path hard.

This commit:

* Seeds an {instruction_addr: containing_block} dict once at the start
  of resolveRegisterCalls(), so every searchBlock lookup is O(1).
* Preserves "first matching block wins" by using `if addr not in index`
  during construction — important because FunctionAnalysisState.getBlocks
  can place the same instruction in multiple overlapping blocks via
  the sorted potential_starts walk.
* Clears the index in a finally so a reused analyzer instance never
  serves a stale index after the function completes.
* Keeps a slim linear-scan fallback in searchBlock for direct callers
  (e.g. existing unit tests that drive processBlock without going
  through resolveRegisterCalls).

Microbench (80 blocks × 15 instructions, 1200 lookups):
  legacy linear scan:    17.04 ms
  indexed O(1) lookup:    0.18 ms
  -> 92x faster, bit-identical block-object references returned.

End-to-end on asprox is unchanged (it has few register calls); the win
scales with the number of indirect calls in the binary.

Validation:
- pytest tests/test* -> 111 passed, 79 subtests passed
- ruff check + format --check clean
- asprox sha256 / num_instructions / function count unchanged
diff --git a/src/smda/intel/IndirectCallAnalyzer.py b/src/smda/intel/IndirectCallAnalyzer.py
@@ -19,11 +19,33 @@ def __init__(self, disassembler):
         self.disassembly = self.disassembler.disassembly
         self.current_calling_addr = 0
         self.state = None
+        # Lazy {instruction_addr: containing_block} index, populated for the
+        # lifetime of a single resolveRegisterCalls() call. Cleared afterwards
+        # so a reused analyzer instance never serves a stale index.
+        self._block_index = None
+
+    @staticmethod
+    def _buildBlockIndex(analysis_state):
+        # Preserve "first matching block wins" — overlapping potential_starts
+        # in FunctionAnalysisState.getBlocks() can place the same instruction
+        # in more than one block; the legacy linear scan returned the first.
+        index = {}
+        for block in analysis_state.getBlocks():
+            for ins in block:
+                addr = ins[0]
+                if addr not in index:
+                    index[addr] = block
+        return index
 
     def searchBlock(self, analysis_state, address):
+        if self._block_index is not None:
+            return self._block_index.get(address, [])
+        # Fallback for direct callers (e.g. unit tests) that bypass
+        # resolveRegisterCalls and never seed the index.
         for block in analysis_state.getBlocks():
-            if address in [i[0] for i in block]:
-                return block
+            for ins in block:
+                if ins[0] == address:
+                    return block
         return []
 
     def getDword(self, addr):
@@ -209,38 +231,47 @@ def resolveRegisterCalls(self, analysis_state, block_depth=3):
                 len(analysis_state.call_register_ins),
                 analysis_state.start_addr,
             )
-        max_calls_per_block = 10
-        calls_per_block = {}
-        for calling_addr in analysis_state.call_register_ins:
-            LOGGER.debug("#" * 20)
-            self.current_calling_addr = calling_addr
-            self.state = analysis_state
-            start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
-            if not start_block:
-                return
-            # we only process at most 10 register-calls per block to avoid extreme cases
-            # found one Go sample with 130k register calls.
-            if start_block[0] not in calls_per_block:
-                calls_per_block[start_block[0]] = 0
-            calls_per_block[start_block[0]] += 1
-            # if we have an old config, default to 50
-            max_calls = (
-                self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK
-                if hasattr(self.disassembler.config, "MAX_INDIRECT_CALLS_PER_BASIC_BLOCK")
-                else 50
-            )
-            if calls_per_block[start_block[0]] > max_calls:
-                break
-            LOGGER.debug(
-                "For this block, we can still analyze %d indirect calls.",
-                max_calls_per_block - calls_per_block[start_block[0]],
-            )
-            if start_block:
-                self.processBlock(
-                    analysis_state,
-                    start_block,
-                    {},
-                    start_block[-1][3],
-                    [],
-                    block_depth,
+        # Build the instruction->block index once per function. The previous
+        # implementation scanned every block linearly inside searchBlock for
+        # every calling address AND recursively for every incoming ref up to
+        # block_depth — O(N**2) on call-heavy functions (e.g. Go binaries with
+        # many register calls). With the index, each lookup is O(1).
+        self._block_index = self._buildBlockIndex(analysis_state)
+        try:
+            max_calls_per_block = 10
+            calls_per_block = {}
+            for calling_addr in analysis_state.call_register_ins:
+                LOGGER.debug("#" * 20)
+                self.current_calling_addr = calling_addr
+                self.state = analysis_state
+                start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
+                if not start_block:
+                    return
+                # we only process at most 10 register-calls per block to avoid extreme cases
+                # found one Go sample with 130k register calls.
+                if start_block[0] not in calls_per_block:
+                    calls_per_block[start_block[0]] = 0
+                calls_per_block[start_block[0]] += 1
+                # if we have an old config, default to 50
+                max_calls = (
+                    self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK
+                    if hasattr(self.disassembler.config, "MAX_INDIRECT_CALLS_PER_BASIC_BLOCK")
+                    else 50
                 )
+                if calls_per_block[start_block[0]] > max_calls:
+                    break
+                LOGGER.debug(
+                    "For this block, we can still analyze %d indirect calls.",
+                    max_calls_per_block - calls_per_block[start_block[0]],
+                )
+                if start_block:
+                    self.processBlock(
+                        analysis_state,
+                        start_block,
+                        {},
+                        start_block[-1][3],
+                        [],
+                        block_depth,
+                    )
+        finally:
+            self._block_index = None