perf(intel): cache block index on analysis_state, not the analyzer

claude · danielplohmann · commit 6e8caf4bf92a · 2026-06-10T11:48:17.000+02:00
Address Gemini review on PR #47: stash the {instruction_addr: block} index on analysis_state instead of self.

analysis_state has the right lifetime (one per function analysis), so the cache is naturally re-entrancy-safe and can't outlive what it indexes, the analyzer keeps no transient state, and the explicit seed + try/finally in resolveRegisterCalls goes away.

searchBlock now lazy-builds the index on first call, so the legacy fallback branch is also gone — every caller (including direct unit-test callers) gets the O(1) path automatically.

contextlib.suppress(AttributeError) guards the cache write so that test doubles or hypothetical __slots__-locked states still work; in that case the freshly built dict is simply not cached and is still used to answer the current lookup.

Re-ran the focused micro-bench (80 blocks x 15 instructions, 1200 lookups): ~107x faster than the legacy scan, 0/1200 parity mismatches. End-to-end asprox sha256/num_instructions/function count unchanged.

Validation:
- pytest tests/test* -> 111 passed, 79 subtests passed
- ruff check + format --check clean
diff --git a/src/smda/intel/IndirectCallAnalyzer.py b/src/smda/intel/IndirectCallAnalyzer.py
@@ -1,3 +1,4 @@
+import contextlib
 import logging
 import re
 import struct
@@ -19,34 +20,32 @@ def __init__(self, disassembler):
         self.disassembly = self.disassembler.disassembly
         self.current_calling_addr = 0
         self.state = None
-        # Lazy {instruction_addr: containing_block} index, populated for the
-        # lifetime of a single resolveRegisterCalls() call. Cleared afterwards
-        # so a reused analyzer instance never serves a stale index.
-        self._block_index = None
-
-    @staticmethod
-    def _buildBlockIndex(analysis_state):
-        # Preserve "first matching block wins" — overlapping potential_starts
-        # in FunctionAnalysisState.getBlocks() can place the same instruction
-        # in more than one block; the legacy linear scan returned the first.
-        index = {}
-        for block in analysis_state.getBlocks():
-            for ins in block:
-                addr = ins[0]
-                if addr not in index:
-                    index[addr] = block
-        return index
 
     def searchBlock(self, analysis_state, address):
-        if self._block_index is not None:
-            return self._block_index.get(address, [])
-        # Fallback for direct callers (e.g. unit tests) that bypass
-        # resolveRegisterCalls and never seed the index.
-        for block in analysis_state.getBlocks():
-            for ins in block:
-                if ins[0] == address:
-                    return block
-        return []
+        # Lazy-cache an {instruction_addr: containing_block} index on the
+        # analysis_state so subsequent lookups during the same function
+        # analysis are O(1) instead of O(blocks * instructions). The cache
+        # lives on the state (not on self) so the analyzer stays
+        # re-entrancy-safe and the index can't outlive the function being
+        # analyzed.
+        block_index = getattr(analysis_state, "_block_index", None)
+        if not isinstance(block_index, dict):
+            block_index = {}
+            # Preserve "first matching block wins" — overlapping
+            # potential_starts in FunctionAnalysisState.getBlocks() can
+            # place the same instruction in more than one block; the
+            # legacy linear scan returned the first.
+            for block in analysis_state.getBlocks():
+                for ins in block:
+                    addr = ins[0]
+                    if addr not in block_index:
+                        block_index[addr] = block
+            # Objects with __slots__ or read-only attribute surfaces (and some
+            # test doubles) reject the assignment; the lookup below still works
+            # on the freshly built index.
+            with contextlib.suppress(AttributeError):
+                analysis_state._block_index = block_index
+        return block_index.get(address, [])
 
     def getDword(self, addr):
         if not self.disassembly.isAddrWithinMemoryImage(addr):
@@ -231,47 +230,38 @@ def resolveRegisterCalls(self, analysis_state, block_depth=3):
                 len(analysis_state.call_register_ins),
                 analysis_state.start_addr,
             )
-        # Build the instruction->block index once per function. The previous
-        # implementation scanned every block linearly inside searchBlock for
-        # every calling address AND recursively for every incoming ref up to
-        # block_depth — O(N**2) on call-heavy functions (e.g. Go binaries with
-        # many register calls). With the index, each lookup is O(1).
-        self._block_index = self._buildBlockIndex(analysis_state)
-        try:
-            max_calls_per_block = 10
-            calls_per_block = {}
-            for calling_addr in analysis_state.call_register_ins:
-                LOGGER.debug("#" * 20)
-                self.current_calling_addr = calling_addr
-                self.state = analysis_state
-                start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
-                if not start_block:
-                    return
-                # we only process at most 10 register-calls per block to avoid extreme cases
-                # found one Go sample with 130k register calls.
-                if start_block[0] not in calls_per_block:
-                    calls_per_block[start_block[0]] = 0
-                calls_per_block[start_block[0]] += 1
-                # if we have an old config, default to 50
-                max_calls = (
-                    self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK
-                    if hasattr(self.disassembler.config, "MAX_INDIRECT_CALLS_PER_BASIC_BLOCK")
-                    else 50
-                )
-                if calls_per_block[start_block[0]] > max_calls:
-                    break
-                LOGGER.debug(
-                    "For this block, we can still analyze %d indirect calls.",
-                    max_calls_per_block - calls_per_block[start_block[0]],
+        max_calls_per_block = 10
+        calls_per_block = {}
+        for calling_addr in analysis_state.call_register_ins:
+            LOGGER.debug("#" * 20)
+            self.current_calling_addr = calling_addr
+            self.state = analysis_state
+            start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
+            if not start_block:
+                return
+            # we only process at most 10 register-calls per block to avoid extreme cases
+            # found one Go sample with 130k register calls.
+            if start_block[0] not in calls_per_block:
+                calls_per_block[start_block[0]] = 0
+            calls_per_block[start_block[0]] += 1
+            # if we have an old config, default to 50
+            max_calls = (
+                self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK
+                if hasattr(self.disassembler.config, "MAX_INDIRECT_CALLS_PER_BASIC_BLOCK")
+                else 50
+            )
+            if calls_per_block[start_block[0]] > max_calls:
+                break
+            LOGGER.debug(
+                "For this block, we can still analyze %d indirect calls.",
+                max_calls_per_block - calls_per_block[start_block[0]],
+            )
+            if start_block:
+                self.processBlock(
+                    analysis_state,
+                    start_block,
+                    {},
+                    start_block[-1][3],
+                    [],
+                    block_depth,
                 )
-                if start_block:
-                    self.processBlock(
-                        analysis_state,
-                        start_block,
-                        {},
-                        start_block[-1][3],
-                        [],
-                        block_depth,
-                    )
-        finally:
-            self._block_index = None