Skip to content

Commit 3f6a2f7

Browse files
committed
Pre-compile regex patterns in ExecutionSafetyManager
Convert static lists of raw-string regex patterns to tuples of pre-compiled `re.Pattern` objects (via `re.compile`) to reduce the overhead of repeated `re.search` calls in `ExecutionSafetyManager` methods. This affects: `_WRITE_PATTERNS`, `_WRITE_ON_HANDLE_PATTERNS`, `_SENSITIVE_POSIX_PREFIXES`, `_DESTRUCTIVE_PATTERNS`, and `_SHELL_PATTERNS`.
1 parent 2a47494 commit 3f6a2f7

2 files changed

Lines changed: 20 additions & 12 deletions

File tree

.jules/bolt.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
## 2024-05-29 - Pre-compiling Regex Patterns for Performance
2+
**Learning:** In `libs/safety_manager.py`, repeated string-based `re.search()` calls inside hot paths create significant overhead compared to using pre-compiled regex objects (`re.compile`), because every call must first look the pattern string up in the `re` module's internal cache. While building a single massive `|`-joined regex might theoretically be faster, in Python 3, pre-compiling individual patterns into a tuple of `re.Pattern` objects provides the most reliable and readable speedup for generator expressions passed to `any()`.
3+
**Action:** Pre-compile regex lists as class attributes using `tuple(re.compile(p) for p in RAW_PATTERNS)` to optimize hot paths without sacrificing readability.

libs/safety_manager.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import shutil
55
import tempfile
66
from dataclasses import dataclass, field
7-
from typing import Dict, List, Optional
7+
from typing import Dict, Optional
88

99

1010
# =========================
@@ -74,7 +74,7 @@ class ExecutionSafetyManager:
7474
# single-quoted raw strings r'...' so that ['"] is unambiguous.
7575
# Using r"...['\""]..." caused the bare trailing `"` to prematurely close
7676
# the outer double-quoted string → E999 SyntaxError at line 74.
77-
_WRITE_PATTERNS = [
77+
_WRITE_PATTERNS_RAW = [
7878
# open() explicit write modes — text and binary variants with optional '+'
7979
r'open\s*\([^)]*[\'"]w[btax]?\+?[\'"]',
8080
r'open\s*\([^)]*[\'"]a[btx]?\+?[\'"]',
@@ -100,6 +100,7 @@ class ExecutionSafetyManager:
100100
r'\.to_excel\s*\([^)]*[\'"/]',
101101
r'\.to_parquet\s*\([^)]*[\'"/]',
102102
]
103+
_WRITE_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in _WRITE_PATTERNS_RAW)
103104

104105
# BUG FIX (test_blocks_write_function_with_absolute_path):
105106
# When code opens a file handle (any mode, including 'r') and then calls
@@ -108,19 +109,21 @@ class ExecutionSafetyManager:
108109
# _WRITE_PATTERNS so it is only evaluated in the combined absolute-path
109110
# write check — preventing false positives like sys.stdout.write() on
110111
# purely relative / non-file code paths.
111-
_WRITE_ON_HANDLE_PATTERNS = [
112+
_WRITE_ON_HANDLE_PATTERNS_RAW = [
112113
r"\.write\s*\(",
113114
]
115+
_WRITE_ON_HANDLE_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in _WRITE_ON_HANDLE_PATTERNS_RAW)
114116

115117
# Sensitive POSIX system path prefixes that are ALWAYS blocked (even for reads).
116-
_SENSITIVE_POSIX_PREFIXES = [
118+
_SENSITIVE_POSIX_PREFIXES_RAW = [
117119
r"/etc/\w+",
118120
r"/root/\w+",
119121
r"/proc/\w+",
120122
r"/sys/\w+",
121123
r"/dev/\w+",
122124
r"/boot/\w+",
123125
]
126+
_SENSITIVE_POSIX_PREFIXES = tuple(re.compile(p, re.IGNORECASE) for p in _SENSITIVE_POSIX_PREFIXES_RAW)
124127

125128
# Known-dangerous call targets for .remove() / .unlink() / .rmtree().
126129
_DANGEROUS_ATTR_OWNERS = frozenset({"os", "shutil", "pathlib", "path"})
@@ -142,7 +145,7 @@ class ExecutionSafetyManager:
142145
# false-positives on SQL DELETE keyword used as a string literal in
143146
# data-analysis code (e.g. cursor.execute("DELETE FROM ...")).
144147
# =========================
145-
_DESTRUCTIVE_PATTERNS = [
148+
_DESTRUCTIVE_PATTERNS_RAW = [
146149
# Filesystem deletes
147150
r"\bunlink\b",
148151
r"\bunlinksync\b",
@@ -166,19 +169,21 @@ class ExecutionSafetyManager:
166169
r"\bformat\s+[a-z]:",
167170
r"\bdiskpart\b",
168171
]
172+
_DESTRUCTIVE_PATTERNS = tuple(re.compile(p) for p in _DESTRUCTIVE_PATTERNS_RAW)
169173

170174
# =========================
171175
# BUG FIX #2: Shell patterns now use re.search() with \b word boundaries
172176
# instead of plain `in` substring matching. Previously "bash" matched
173177
# any identifier containing "bash" (e.g. "rehash", "bashful").
174178
# =========================
175-
_SHELL_PATTERNS = [
179+
_SHELL_PATTERNS_RAW = [
176180
r"\bsubprocess\b",
177181
r"\bos\.system\b",
178182
r"\bpowershell\b",
179183
r"\bcmd\.exe\b",
180184
r"\bbash\b",
181185
]
186+
_SHELL_PATTERNS = tuple(re.compile(p) for p in _SHELL_PATTERNS_RAW)
182187

183188
def __init__(self, unsafe_mode: bool = False):
184189
self.unsafe_mode = unsafe_mode
@@ -228,7 +233,7 @@ def _has_write_operation(self, code: str) -> bool:
228233
"""Return True if *code* contains any write operation that must be
229234
blocked in SAFE mode.
230235
"""
231-
return any(re.search(p, code, re.IGNORECASE) for p in self._WRITE_PATTERNS)
236+
return any(p.search(code) for p in self._WRITE_PATTERNS)
232237

233238
# =========================
234239
# WRITE-ON-HANDLE DETECTION
@@ -240,7 +245,7 @@ def _has_write_on_handle(self, code: str) -> bool:
240245
"""Return True if *code* calls .write() on any object (handle check).
241246
This is intentionally only evaluated when an absolute path is present.
242247
"""
243-
return any(re.search(p, code, re.IGNORECASE) for p in self._WRITE_ON_HANDLE_PATTERNS)
248+
return any(p.search(code) for p in self._WRITE_ON_HANDLE_PATTERNS)
244249

245250
# =========================
246251
# HOST ABSOLUTE PATH CHECK
@@ -285,7 +290,7 @@ def _is_host_absolute_path(self, code: str) -> bool:
285290

286291
def _is_sensitive_posix_path(self, code: str) -> bool:
287292
"""Return True if *code* references a sensitive POSIX system path."""
288-
return any(re.search(p, code, re.IGNORECASE) for p in self._SENSITIVE_POSIX_PREFIXES)
293+
return any(p.search(code) for p in self._SENSITIVE_POSIX_PREFIXES)
289294

290295
# =========================
291296
# MAIN CHECK
@@ -326,15 +331,15 @@ def assess_execution(self, code: str, mode: str) -> Decision:
326331
# (shutdown, reboot, mkfs, dd, format, diskpart) in addition to
327332
# filesystem deletes.
328333
# =========================
329-
if any(re.search(p, code_lower) for p in self._DESTRUCTIVE_PATTERNS):
334+
if any(p.search(code_lower) for p in self._DESTRUCTIVE_PATTERNS):
330335
return Decision(False, ["Destructive operation blocked."])
331336

332337
# =========================
333338
# SHELL BLOCK
334339
# BUG FIX #2: Uses _SHELL_PATTERNS with \b word-boundary regex instead
335340
# of plain substring `in` check to avoid false positives.
336341
# =========================
337-
if any(re.search(p, code_lower) for p in self._SHELL_PATTERNS):
342+
if any(p.search(code_lower) for p in self._SHELL_PATTERNS):
338343
return Decision(False, ["Shell execution is blocked."])
339344

340345
# =========================
@@ -370,7 +375,7 @@ def is_dangerous_operation(self, code: str) -> bool:
370375
if not code or not code.strip():
371376
return False
372377
code_lower = code.lower()
373-
return any(re.search(p, code_lower) for p in self._DESTRUCTIVE_PATTERNS)
378+
return any(p.search(code_lower) for p in self._DESTRUCTIVE_PATTERNS)
374379

375380
# =========================
376381
# ARTIFACT EXPORT

0 commit comments

Comments
 (0)