-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathregex-bench.py
More file actions
103 lines (85 loc) · 3.51 KB
/
regex-bench.py
File metadata and controls
103 lines (85 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
"""
regex_mt_bench.py – does .search() from the third-party `regex` module (imported here as `re`) scale across threads?
• Creates a single master regex (similar to the RegexValidator patch).
• Launches N threads; each thread calls .search() a fixed number of times.
• Reports wall-time vs process-CPU-time so you can see how many cores
the regex engine actually used.
"""
from __future__ import annotations
import regex as re, threading, time, os, sys, random, math
# ---------------------------------------------------------------------
# 1. Prepare synthetic workload
# ---------------------------------------------------------------------
NUM_PATTERNS = 120        # number of alternatives fed into the master regex
ITERATIONS = 80_000       # search() calls performed by each worker thread
TEXT_LEN_CHARS = 800_000  # sizing knob for the synthetic text
random.seed(42)           # fixed seed so every run draws the same keywords

# One raw pattern per index, cycling through three shapes:
#   n % 3 == 0 -> whole-word literal
#   n % 3 == 1 -> literal prefix, a run of non-space characters, then "end"
#   n % 3 == 2 -> two literal tokens joined by a lazy wildcard
_PATTERNS: list[str] = [
    fr"\bword{n}\b" if n % 3 == 0
    else fr"phrase{n}[^ ]+end" if n % 3 == 1
    else fr"token{n}.*?token{n + 1}"
    for n in range(NUM_PATTERNS)
]
# Wrap every raw pattern in its own named group (P0, P1, ...) and OR them
# together into a single compiled regex.  _group2raw maps each group name
# back to the raw pattern it wraps.
_group2raw = {f"P{idx}": raw for idx, raw in enumerate(_PATTERNS)}
parts = [f"(?P<{name}>{raw})" for name, raw in _group2raw.items()]
_BIG_RE = re.compile(
    "|".join(parts),
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)
# Build the haystack: TEXT_LEN_CHARS // 50 space-separated chunks of filler,
# with every 20th chunk replaced by a randomly chosen keyword token.
# Each chunk is at most 14 characters, so the joined text is considerably
# shorter than TEXT_LEN_CHARS.
# NOTE(review): slot 0 is always a keyword slot, so _TEXT begins with a
# keyword token; a search() that matches there returns without scanning the
# rest of the text - confirm this is the intended workload.
_chunks = ["loremipsum"] * (TEXT_LEN_CHARS // 50)
for slot in range(0, len(_chunks), 20):
    pick = random.randrange(NUM_PATTERNS)
    # Only "word" and "phrase" style tokens are planted.
    _chunks[slot] = f"word{pick}" if pick % 3 == 0 else f"phrase{pick}xxend"
_TEXT = " ".join(_chunks)
# ---------------------------------------------------------------------
# 2. Benchmark helper
# ---------------------------------------------------------------------
def run_threads(n_threads: int) -> tuple[float, float]:
    """
    Run the regex workload on ``n_threads`` threads and time it.

    Each thread calls ``_BIG_RE.search(_TEXT)`` ``ITERATIONS`` times.  All
    threads are started, then joined, and the elapsed wall-clock time is
    compared with the process CPU time (user + system) consumed meanwhile.

    Args:
        n_threads: Number of worker threads to launch.  Zero or a negative
            value launches no workers and simply times an empty run.

    Returns:
        ``(wall_seconds, cpu_seconds)`` for the whole job.
    """
    def worker() -> None:
        # Bind the globals to locals once so the hot loop avoids repeated
        # global lookups.
        search = _BIG_RE.search
        text = _TEXT
        for _ in range(ITERATIONS):
            search(text)

    # Daemon threads do not block interpreter exit; they are all joined below.
    threads = [threading.Thread(target=worker, daemon=True)
               for _ in range(n_threads)]

    cpu_start = os.times()
    t0 = time.perf_counter()
    for th in threads:
        th.start()
    for th in threads:
        th.join()
    wall = time.perf_counter() - t0

    # Take ONE end-of-run snapshot so user and system time come from the
    # same instant (calling os.times() once per field mixes two samples).
    cpu_end = os.times()
    cpu = (cpu_end.user + cpu_end.system) - (cpu_start.user + cpu_start.system)
    return wall, cpu
# ---------------------------------------------------------------------
# 3. Run for several thread counts
# ---------------------------------------------------------------------
def main() -> None:
    """
    Print the workload parameters, then benchmark 1, 2, 4 and 8 threads.

    For each thread count one line is printed with the wall time, the CPU
    time and their ratio (cpu / wall, or NaN when the wall time is zero),
    followed by a short guide on how to read the ratio.
    """
    banner = (
        f"patterns : {NUM_PATTERNS}",
        f"text length (chars) : {len(_TEXT):,}",
        f"regex searches/thread: {ITERATIONS}",
        "",
    )
    for line in banner:
        print(line)

    for thread_count in (1, 2, 4, 8):
        wall, cpu = run_threads(thread_count)
        # Guard the division: a zero wall time yields NaN instead of raising.
        if wall:
            util = cpu / wall
        else:
            util = math.nan
        print(f"{thread_count:>2} threads → wall {wall:6.2f} s "
              f"CPU {cpu:6.2f} s ratio {util:4.2f}")

    legend = (
        "\nInterpretation:",
        " • ratio ≈ 1.0 → work is effectively single-core.",
        " • ratio → N → regex scanning scales across N cores.",
    )
    for line in legend:
        print(line)


if __name__ == "__main__":
    main()