perf(core): cut per-call overhead in candidate scoring and hash sequences

claude · r0ny123 · commit ab0a3326c531 · 2026-05-26T20:00:52.000+05:30
Three related hot-path tweaks, all behavior-preserving:

1. SmdaFunction / SmdaBasicBlock: replace
   `bytes([ord(c) for c in "".join(seqs)])` with
   `"".join(seqs).encode("ascii")` in the four PIC/OPC hash sequence
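   helpers. The output is byte-identical because the escaper emits
   ASCII-only strings (the old form tolerated code points up to 255,
   whereas `.encode("ascii")` raises UnicodeEncodeError above 127), and
   the per-character Python loop is gone. Microbench on a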
   ~3.7 KB escaped sequence shows ~600x speedup for the conversion step
   alone; on the asprox fixture (105 funcs, 2140 blocks) block hash
   sequence assembly drops ~15%.

2. FunctionCandidate: hoist
   `sorted([int(k) for k in COMMON_PROLOGUES], reverse=True)` out of
   `hasCommonFunctionStart` / `getFunctionStartScore` to a module-level
   constant. Both methods are called from `calculateScore` /
   `getCharacteristics` / `__str__` / `toJson`, so on every candidate
   the prologue length list was being rebuilt and re-sorted from
   scratch. calculateScore over 200k iterations drops from 322ms to
   180ms (~44%) in a focused bench.

3. FunctionCandidate.call_ref_sources: switch from list to set. The
   inner CFG-recovery loop calls `addCallRef`, which did
   `source_addr not in self.call_ref_sources` on every call instruction;
   with a list this is O(n) per call and quadratic for hot targets
   (popular runtime stubs can accumulate many sources). With a set,
   add/discard/membership are O(1) on average. The only order-sensitive
   read was the single-element branch in `__str__`, which now uses
   `next(iter(...))`. No external code depends on ordering — only `len`
   and truthiness (verified across src/ and tests/).

Validation:
- `make lint` (ruff check + format check) clean.
- `pytest tests/test*` 90 passed, 43 subtests passed.
- pic_hash / opc_hash / serialized report sha256 unchanged on asprox.
diff --git a/src/smda/common/SmdaBasicBlock.py b/src/smda/common/SmdaBasicBlock.py
@@ -59,7 +59,7 @@ def getPicBlockHashSequence(self):
                         + self.smda_function.smda_report.binary_size,
                     )
                 )
-            return bytes([ord(c) for c in "".join(escaped_binary_seqs)])
+            return "".join(escaped_binary_seqs).encode("ascii")
 
     def getOpcBlockHash(self):
         if self.opcblockhash is not None:
@@ -76,7 +76,7 @@ def getOpcBlockHashSequence(self):
             escaped_binary_seqs = []
             for instruction in self.getInstructions():
                 escaped_binary_seqs.append(instruction.getEscapedToOpcodeOnly(self.smda_function._escaper))
-            return bytes([ord(c) for c in "".join(escaped_binary_seqs)])
+            return "".join(escaped_binary_seqs).encode("ascii")
 
     def getPredecessors(self):
         predecessors = []
diff --git a/src/smda/common/SmdaFunction.py b/src/smda/common/SmdaFunction.py
@@ -200,7 +200,7 @@ def getPicHashSequence(self, binary_info):
                         upper_addr=binary_info.base_addr + binary_info.binary_size,
                     )
                 )
-        return bytes([ord(c) for c in "".join(escaped_binary_seqs)])
+        return "".join(escaped_binary_seqs).encode("ascii")
 
     def getOpcHash(self):
         return struct.unpack("<Q", hashlib.sha256(self.getOpcHashSequence()).digest()[:8])[0]
@@ -210,7 +210,7 @@ def getOpcHashSequence(self):
         for _, block in sorted(self.blocks.items()):
             for instruction in block:
                 escaped_binary_seqs.append(instruction.getEscapedToOpcodeOnly(self._escaper))
-        return bytes([ord(c) for c in "".join(escaped_binary_seqs)])
+        return "".join(escaped_binary_seqs).encode("ascii")
 
     def _parseBlocks(self, block_dict):
         self.blocks = {}
diff --git a/src/smda/intel/FunctionCandidate.py b/src/smda/intel/FunctionCandidate.py
@@ -2,6 +2,10 @@
 
 from .definitions import COMMON_PROLOGUES
 
+# Hoisted: prologue lengths are checked longest-first on every candidate scoring call.
+# Pre-sort once at import time instead of re-sorting per call (hot path during CFG recovery).
+_COMMON_PROLOGUE_LENGTHS = sorted((int(k) for k in COMMON_PROLOGUES), reverse=True)
+
 
 class FunctionCandidate:
     def __init__(self, binary_info, addr):
@@ -10,7 +14,9 @@ def __init__(self, binary_info, addr):
         rel_start_addr = addr - binary_info.base_addr
         self.bytes = binary_info.binary[rel_start_addr : rel_start_addr + 5]
         self.lang_spec = None
-        self.call_ref_sources = []
+        # set, not list: addCallRef / removeCallRefs do membership tests in the inner
+        # CFG-recovery loop. Order is never read externally (only len + truthiness).
+        self.call_ref_sources = set()
         self.finished = False
         self.is_symbol = False
         self.is_gap_candidate = False
@@ -61,15 +67,15 @@ def getConfidence(self):
         return self._confidence
 
     def hasCommonFunctionStart(self):
-        for length in sorted([int(length_str) for length_str in COMMON_PROLOGUES], reverse=True):
+        for length in _COMMON_PROLOGUE_LENGTHS:
             byte_sequence = self.bytes[:length]
             if byte_sequence in COMMON_PROLOGUES[f"{length}"][self.bitness]:
                 return True
         return False
 
     def getFunctionStartScore(self):
         if self.function_start_score is None:
-            for length in sorted([int(length_str) for length_str in COMMON_PROLOGUES], reverse=True):
+            for length in _COMMON_PROLOGUE_LENGTHS:
                 byte_sequence = self.bytes[:length]
                 if byte_sequence in COMMON_PROLOGUES[f"{length}"][self.bitness]:
                     self.function_start_score = COMMON_PROLOGUES[f"{length}"][self.bitness][byte_sequence]
@@ -78,14 +84,12 @@ def getFunctionStartScore(self):
         return self.function_start_score
 
     def addCallRef(self, source_addr):
-        if source_addr not in self.call_ref_sources:
-            self.call_ref_sources.append(source_addr)
+        self.call_ref_sources.add(source_addr)
         self._score = None
 
     def removeCallRefs(self, source_addrs):
         for addr in source_addrs:
-            if addr in self.call_ref_sources:
-                self.call_ref_sources.remove(addr)
+            self.call_ref_sources.discard(addr)
         self._score = None
 
     def setIsTailcallCandidate(self, is_tailcall):
@@ -179,11 +183,10 @@ def getCharacteristics(self):
     def __str__(self):
         characteristics = self.getCharacteristics()
         prologue_score = f"{self.getFunctionStartScore()}"
-        ref_summary = (
-            f"{len(self.call_ref_sources)}"
-            if len(self.call_ref_sources) != 1
-            else f"{len(self.call_ref_sources)}: 0x{self.call_ref_sources[0]:x}"
-        )
+        if len(self.call_ref_sources) == 1:
+            ref_summary = f"1: 0x{next(iter(self.call_ref_sources)):x}"
+        else:
+            ref_summary = f"{len(self.call_ref_sources)}"
         return f"0x{self.addr:x}: {hexlify(self.bytes)} -> {prologue_score} (total score: {self.getScore()}), inref: {ref_summary} | {characteristics}"
 
     def toJson(self):